Fix a regression in the handling of some combining characters such as zero width joiners

Fixes #4439
This commit is contained in:
Kovid Goyal 2022-01-05 08:35:46 +05:30
parent 9aefcfe56f
commit d875615c03
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
6 changed files with 435 additions and 422 deletions

View File

@ -49,11 +49,10 @@ class_maps: Dict[str, Set[int]] = {}
all_symbols: Set[int] = set()
name_map: Dict[int, str] = {}
word_search_map: DefaultDict[str, Set[int]] = defaultdict(set)
zwj = 0x200d
soft_hyphen = 0xad
flag_codepoints = frozenset(range(0x1F1E6, 0x1F1E6 + 26))
# See https://github.com/harfbuzz/harfbuzz/issues/169
marks = set(emoji_skin_tone_modifiers) | {zwj} | flag_codepoints
marks = set(emoji_skin_tone_modifiers) | flag_codepoints
not_assigned = set(range(0, sys.maxunicode))
property_maps: Dict[str, Set[int]] = defaultdict(set)
@ -69,7 +68,6 @@ def parse_prop_list() -> None:
property_maps[name] |= chars
# see https://www.unicode.org/faq/unsup_char.html#3
marks |= property_maps['Other_Default_Ignorable_Code_Point']
marks.add(soft_hyphen)
def parse_ucd() -> None:
@ -113,6 +111,8 @@ def parse_ucd() -> None:
marks.add(codepoint)
elif category.startswith('S'):
all_symbols.add(codepoint)
elif category == 'Cf':
marks.add(codepoint)
with open('nerd-fonts-glyphs.txt') as f:
for line in f:
@ -382,9 +382,9 @@ def gen_ucd() -> None:
ascii_range='false'
)
category_test(
'is_non_rendered_char', p, 'Cc Cs'.split(),
'is_non_rendered_char', p, 'Cc Cs Cf'.split(),
'Other_Default_Ignorable_Code_Point and soft hyphen',
extra_chars=property_maps['Other_Default_Ignorable_Code_Point'] | {soft_hyphen},
extra_chars=property_maps['Other_Default_Ignorable_Code_Point'],
ascii_range='false'
)
category_test('is_word_char', p, {c for c in class_maps if c[0] in 'LN'}, 'L and N categories')

View File

@ -1,4 +1,4 @@
// unicode data, built from the unicode standard on: 2021-10-07
// unicode data, built from the unicode standard on: 2022-01-05
// see gen-wcwidth.py
#pragma once
#include "data-types.h"

26
kitty/emoji.h generated
View File

@ -1,4 +1,4 @@
// unicode data, built from the unicode standard on: 2021-10-07
// unicode data, built from the unicode standard on: 2022-01-05
// see gen-wcwidth.py
#pragma once
#include "data-types.h"
@ -276,7 +276,7 @@ is_emoji(char_type code) {
return true;
case 0x1f6d5 ... 0x1f6d7:
return true;
case 0x1f6e0 ... 0x1f6e5:
case 0x1f6dd ... 0x1f6e5:
return true;
case 0x1f6e9:
return true;
@ -288,29 +288,31 @@ is_emoji(char_type code) {
return true;
case 0x1f7e0 ... 0x1f7eb:
return true;
case 0x1f7f0:
return true;
case 0x1f90c ... 0x1f93a:
return true;
case 0x1f93c ... 0x1f945:
return true;
case 0x1f947 ... 0x1f978:
return true;
case 0x1f97a ... 0x1f9cb:
return true;
case 0x1f9cd ... 0x1f9ff:
case 0x1f947 ... 0x1f9ff:
return true;
case 0x1fa70 ... 0x1fa74:
return true;
case 0x1fa78 ... 0x1fa7a:
case 0x1fa78 ... 0x1fa7c:
return true;
case 0x1fa80 ... 0x1fa86:
return true;
case 0x1fa90 ... 0x1faa8:
case 0x1fa90 ... 0x1faac:
return true;
case 0x1fab0 ... 0x1fab6:
case 0x1fab0 ... 0x1faba:
return true;
case 0x1fac0 ... 0x1fac2:
case 0x1fac0 ... 0x1fac5:
return true;
case 0x1fad0 ... 0x1fad6:
case 0x1fad0 ... 0x1fad9:
return true;
case 0x1fae0 ... 0x1fae7:
return true;
case 0x1faf0 ... 0x1faf6:
return true;
default: return false;
}

699
kitty/unicode-data.c generated

File diff suppressed because one or more lines are too long

View File

@ -2,7 +2,7 @@
#include "data-types.h"
#include "state.h"
// START_KNOWN_MARKS
static const combining_type VS15 = 1325, VS16 = 1326;
static const combining_type VS15 = 1362, VS16 = 1363;
// END_KNOWN_MARKS
bool is_combining_char(char_type ch);

118
kitty/wcwidth-std.h generated
View File

@ -1,4 +1,4 @@
// unicode data, built from the unicode standard on: 2021-10-07
// unicode data, built from the unicode standard on: 2022-01-05
// see gen-wcwidth.py
#pragma once
#include "data-types.h"
@ -14,7 +14,7 @@ wcwidth_std(int32_t code) {
return 2;
// }}}
// Marks (6189 codepoints) {{{
// Marks (6350 codepoints) {{{
case 0x0:
return 0;
case 0xad:
@ -33,13 +33,17 @@ wcwidth_std(int32_t code) {
return 0;
case 0x5c7:
return 0;
case 0x600 ... 0x605:
return 0;
case 0x610 ... 0x61a:
return 0;
case 0x61c:
return 0;
case 0x64b ... 0x65f:
return 0;
case 0x670:
return 0;
case 0x6d6 ... 0x6dc:
case 0x6d6 ... 0x6dd:
return 0;
case 0x6df ... 0x6e4:
return 0;
@ -47,6 +51,8 @@ wcwidth_std(int32_t code) {
return 0;
case 0x6ea ... 0x6ed:
return 0;
case 0x70f:
return 0;
case 0x711:
return 0;
case 0x730 ... 0x74a:
@ -67,11 +73,11 @@ wcwidth_std(int32_t code) {
return 0;
case 0x859 ... 0x85b:
return 0;
case 0x890 ... 0x891:
return 0;
case 0x898 ... 0x89f:
return 0;
case 0x8ca ... 0x8e1:
return 0;
case 0x8e3 ... 0x903:
case 0x8ca ... 0x903:
return 0;
case 0x93a ... 0x93c:
return 0;
@ -271,9 +277,7 @@ wcwidth_std(int32_t code) {
return 0;
case 0x17dd:
return 0;
case 0x180b ... 0x180d:
return 0;
case 0x180f:
case 0x180b ... 0x180f:
return 0;
case 0x1885 ... 0x1886:
return 0;
@ -319,9 +323,11 @@ wcwidth_std(int32_t code) {
return 0;
case 0x1dc0 ... 0x1dff:
return 0;
case 0x200d:
case 0x200b ... 0x200f:
return 0;
case 0x2065:
case 0x202a ... 0x202e:
return 0;
case 0x2060 ... 0x206f:
return 0;
case 0x20d0 ... 0x20f0:
return 0;
@ -405,9 +411,11 @@ wcwidth_std(int32_t code) {
return 0;
case 0xfe20 ... 0xfe2f:
return 0;
case 0xfeff:
return 0;
case 0xffa0:
return 0;
case 0xfff0 ... 0xfff8:
case 0xfff0 ... 0xfffb:
return 0;
case 0x101fd:
return 0;
@ -447,8 +455,12 @@ wcwidth_std(int32_t code) {
return 0;
case 0x110b0 ... 0x110ba:
return 0;
case 0x110bd:
return 0;
case 0x110c2:
return 0;
case 0x110cd:
return 0;
case 0x11100 ... 0x11102:
return 0;
case 0x11127 ... 0x11134:
@ -563,6 +575,8 @@ wcwidth_std(int32_t code) {
return 0;
case 0x11ef3 ... 0x11ef6:
return 0;
case 0x13430 ... 0x13438:
return 0;
case 0x16af0 ... 0x16af4:
return 0;
case 0x16b30 ... 0x16b36:
@ -579,15 +593,15 @@ wcwidth_std(int32_t code) {
return 0;
case 0x1bc9d ... 0x1bc9e:
return 0;
case 0x1bca0 ... 0x1bca3:
return 0;
case 0x1cf00 ... 0x1cf2d:
return 0;
case 0x1cf30 ... 0x1cf46:
return 0;
case 0x1d165 ... 0x1d169:
return 0;
case 0x1d16d ... 0x1d172:
return 0;
case 0x1d17b ... 0x1d182:
case 0x1d16d ... 0x1d182:
return 0;
case 0x1d185 ... 0x1d18b:
return 0;
@ -629,63 +643,17 @@ wcwidth_std(int32_t code) {
return 0;
case 0x1f3fb ... 0x1f3ff:
return 0;
case 0xe0000:
return 0;
case 0xe0002 ... 0xe001f:
return 0;
case 0xe0080 ... 0xe0fff:
case 0xe0000 ... 0xe0fff:
return 0;
// }}}
// Non-printing characters (2273 codepoints) {{{
// Non-printing characters (2112 codepoints) {{{
case 0x1 ... 0x1f:
return -1;
case 0x7f ... 0x9f:
return -1;
case 0x600 ... 0x605:
return -1;
case 0x61c:
return -1;
case 0x6dd:
return -1;
case 0x70f:
return -1;
case 0x890 ... 0x891:
return -1;
case 0x8e2:
return -1;
case 0x180e:
return -1;
case 0x200b ... 0x200c:
return -1;
case 0x200e ... 0x200f:
return -1;
case 0x202a ... 0x202e:
return -1;
case 0x2060 ... 0x2064:
return -1;
case 0x2066 ... 0x206f:
return -1;
case 0xd800 ... 0xdfff:
return -1;
case 0xfeff:
return -1;
case 0xfff9 ... 0xfffb:
return -1;
case 0x110bd:
return -1;
case 0x110cd:
return -1;
case 0x13430 ... 0x13438:
return -1;
case 0x1bca0 ... 0x1bca3:
return -1;
case 0x1d173 ... 0x1d17a:
return -1;
case 0xe0001:
return -1;
case 0xe0020 ... 0xe007f:
return -1;
// }}}
// Private use (137468 codepoints) {{{
@ -3203,7 +3171,7 @@ is_emoji_presentation_base(uint32_t code) {
return true;
case 0x1f6d5 ... 0x1f6d7:
return true;
case 0x1f6e0 ... 0x1f6e5:
case 0x1f6dd ... 0x1f6e5:
return true;
case 0x1f6e9:
return true;
@ -3215,29 +3183,31 @@ is_emoji_presentation_base(uint32_t code) {
return true;
case 0x1f7e0 ... 0x1f7eb:
return true;
case 0x1f7f0:
return true;
case 0x1f90c ... 0x1f93a:
return true;
case 0x1f93c ... 0x1f945:
return true;
case 0x1f947 ... 0x1f978:
return true;
case 0x1f97a ... 0x1f9cb:
return true;
case 0x1f9cd ... 0x1f9ff:
case 0x1f947 ... 0x1f9ff:
return true;
case 0x1fa70 ... 0x1fa74:
return true;
case 0x1fa78 ... 0x1fa7a:
case 0x1fa78 ... 0x1fa7c:
return true;
case 0x1fa80 ... 0x1fa86:
return true;
case 0x1fa90 ... 0x1faa8:
case 0x1fa90 ... 0x1faac:
return true;
case 0x1fab0 ... 0x1fab6:
case 0x1fab0 ... 0x1faba:
return true;
case 0x1fac0 ... 0x1fac2:
case 0x1fac0 ... 0x1fac5:
return true;
case 0x1fad0 ... 0x1fad6:
case 0x1fad0 ... 0x1fad9:
return true;
case 0x1fae0 ... 0x1fae7:
return true;
case 0x1faf0 ... 0x1faf6:
return true;
default: return false;
}