Fix soft hyphens not being preserved when round tripping text through the terminal

Also round-trip all characters in the Cf category. Characters with the DI (Default Ignorable) property are now preserved but not rendered, and are treated as zero-width, as per the Unicode standard. See https://www.unicode.org/faq/unsup_char.html
2021-10-07 10:26:57 +05:30 · 2021-10-07 10:26:57 +05:30 · fbf47f75d5
commit fbf47f75d5
parent 1b42f69119
10 changed files with 465 additions and 392 deletions
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@ -74,6 +74,9 @@ To update |kitty|, :doc:`follow the instructions <binary>`.
  Applications must use the dedicated escape code to turn on the protocol.
  (:iss:`4075`)
 - Fix soft hyphens not being preserved when round tripping text through the
  terminal
 0.23.1 [2021-08-17]
 ----------------------
--- a/gen-wcwidth.py
+++ b/gen-wcwidth.py
@ -51,9 +51,26 @@ all_symbols: Set[int] = set()
 name_map: Dict[int, str] = {}
 word_search_map: DefaultDict[str, Set[int]] = defaultdict(set)
 zwj = 0x200d
 soft_hyphen = 0xad
 flag_codepoints = frozenset(range(0x1F1E6, 0x1F1E6 + 26))
 # See https://github.com/harfbuzz/harfbuzz/issues/169
 marks = set(emoji_skin_tone_modifiers) | {zwj} | flag_codepoints
 not_assigned = set(range(0, sys.maxunicode))
 property_maps: Dict[str, Set[int]] = defaultdict(set)
 def parse_prop_list() -> None:
    # Read ucd/PropList.txt, record every listed property in property_maps
    # (property name -> set of code points), then extend the module-level
    # `marks` set with the Other_Default_Ignorable_Code_Point code points and
    # the soft hyphen.
    #
    # `global` is required because the augmented assignment `marks |= ...`
    # below rebinds the module-level name.
    global marks
    for line in get_data('ucd/PropList.txt'):
        # Skip lines that are entirely comments.
        # NOTE(review): presumably get_data() already drops blank lines; a
        # line without a ';' would fail at the tuple unpacking below -- confirm.
        if line.startswith('#'):
            continue
        # Text before the first ';' is a code point or range spec, the rest
        # carries the property name.
        cp_or_range, rest = line.split(';', 1)
        chars = parse_range_spec(cp_or_range.strip())
        # Only the first whitespace-separated token is used as the property
        # name; anything after it (presumably a trailing '# ...' comment) is
        # ignored.
        name = rest.strip().split()[0]
        property_maps[name] |= chars
    # see https://www.unicode.org/faq/unsup_char.html#3
    marks |= property_maps['Other_Default_Ignorable_Code_Point']
    # The soft hyphen is added explicitly in addition to the property set above.
    marks.add(soft_hyphen)
 def parse_ucd() -> None:
@ -354,16 +371,21 @@ def gen_ucd() -> None:
        p('#include "unicode-data.h"')
        category_test(
                'is_combining_char', p,
-                {c for c in class_maps if c.startswith('M')},
+                (),
-                'M category (marks)',
+                'Combining and default ignored characters',
-                # See https://github.com/harfbuzz/harfbuzz/issues/169
+                extra_chars=marks,
                extra_chars=emoji_skin_tone_modifiers | {zwj},
                least_check_return='false'
        )
        category_test(
-            'is_ignored_char', p, 'Cc Cf Cs'.split(),
+            'is_ignored_char', p, 'Cc Cs'.split(),
            'Control characters and non-characters',
-            extra_chars=non_characters, exclude={zwj},
+            extra_chars=non_characters,
            ascii_range='false'
        )
        category_test(
            'is_non_rendered_char', p, 'Cc Cs'.split(),
            'Other_Default_Ignorable_Code_Point and soft hyphen',
            extra_chars=property_maps['Other_Default_Ignorable_Code_Point'] | {soft_hyphen},
            ascii_range='false'
        )
        category_test('is_word_char', p, {c for c in class_maps if c[0] in 'LN'}, 'L and N categories')
@ -378,15 +400,19 @@ def gen_ucd() -> None:
        p('combining_type mark_for_codepoint(char_type c) {')
        rmap = codepoint_to_mark_map(p, mark_map)
        p('}\n')
-        with open('kitty/unicode-data.h') as f:
+        with open('kitty/unicode-data.h', 'r+') as f:
-            unicode_data = f.read()
+            raw = f.read()
-        m = re.search(r'^#define VS15 (\d+)', unicode_data, re.M)
+            f.seek(0)
-        if m is not None:
+            raw, num = re.subn(
-            expected = int(m.group(1))
+                r'^// START_KNOWN_MARKS.+?^// END_KNOWN_MARKS',
-        if rmap[0xfe0e] != expected:
+                '// START_KNOWN_MARKS\nstatic const combining_type '
-            raise ValueError('The mark for 0xfe0e has changed, you have to update VS15 to {} and VS16 to {} in unicode-data.h'.format(
+                f'VS15 = {rmap[0xfe0e]}, VS16 = {rmap[0xfe0f]};'
-                rmap[0xfe0e], rmap[0xfe0f]
+                '\n// END_KNOWN_MARKS', raw, flags=re.MULTILINE | re.DOTALL)
-            ))
+            if not num:
                raise SystemExit('Faile dto patch mark definitions in unicode-data.h')
            f.truncate()
            f.write(raw)
    with open('kittens/hints/url_regex.py', 'w') as f:
        f.write('# generated by gen-wcwidth.py, do not edit\n\n')
        f.write("url_delimiters = '{}'  # noqa".format(''.join(classes_to_regex(cz, exclude='\n\r'))))
@ -537,6 +563,7 @@ def gen_wcwidth() -> None:
 parse_ucd()
 parse_prop_list()
 parse_emoji()
 parse_eaw()
 gen_ucd()
--- a/kittens/unicode_input/names.h
+++ b/kittens/unicode_input/names.h
@ -1,4 +1,4 @@
-// unicode data, built from the unicode standard on: 2021-10-04
+// unicode data, built from the unicode standard on: 2021-10-07
 // see gen-wcwidth.py
 #pragma once
 #include "data-types.h"
--- a/kitty/emoji.h
+++ b/kitty/emoji.h
@ -1,4 +1,4 @@
-// unicode data, built from the unicode standard on: 2021-10-04
+// unicode data, built from the unicode standard on: 2021-10-07
 // see gen-wcwidth.py
 #pragma once
 #include "data-types.h"
--- a/kitty/fonts.c
+++ b/kitty/fonts.c
@ -387,8 +387,7 @@ has_cell_text(Font *self, CPUCell *cell) {
    char_type combining_chars[arraysz(cell->cc_idx)];
    unsigned num_cc = 0;
    for (unsigned i = 0; i < arraysz(cell->cc_idx) && cell->cc_idx[i]; i++) {
-        if (cell->cc_idx[i] == VS15 || cell->cc_idx[i] == VS16) continue;
+        if (!is_non_rendered_char(cell->cc_idx[i])) combining_chars[num_cc++] = codepoint_for_mark(cell->cc_idx[i]);
        combining_chars[num_cc++] = codepoint_for_mark(cell->cc_idx[i]);
    }
    if (num_cc == 0) return true;
    if (num_cc == 1) {
--- a/kitty/unicode-data.c
+++ b/kitty/unicode-data.c
--- a/kitty/unicode-data.h
+++ b/kitty/unicode-data.h
@ -1,14 +1,16 @@
 #pragma once
 #include "data-types.h"
 #include "state.h"
-#define VS15 1320
+// START_KNOWN_MARKS
-#define VS16 1321
+static const combining_type VS15 = 1325, VS16 = 1326;
 // END_KNOWN_MARKS
 bool is_combining_char(char_type ch);
 bool is_ignored_char(char_type ch);
 bool is_word_char(char_type ch);
 bool is_CZ_category(char_type);
 bool is_P_category(char_type);
 bool is_non_rendered_char(char_type);
 char_type codepoint_for_mark(combining_type m);
 combining_type mark_for_codepoint(char_type c);
--- a/kitty/wcwidth-std.h
+++ b/kitty/wcwidth-std.h
@ -1,4 +1,4 @@
-// unicode data, built from the unicode standard on: 2021-10-04
+// unicode data, built from the unicode standard on: 2021-10-07
 // see gen-wcwidth.py
 #pragma once
 #include "data-types.h"
@ -14,9 +14,11 @@ wcwidth_std(int32_t code) {
 			return 2;
 		// }}}
-		// Marks (2415 codepoints) {{{
+		// Marks (6189 codepoints) {{{
 		case 0x0:
 			return 0;
 		case 0xad:
 			return 0;
 		case 0x300 ... 0x36f:
 			return 0;
 		case 0x483 ... 0x489:
@ -253,6 +255,8 @@ wcwidth_std(int32_t code) {
 			return 0;
 		case 0x109a ... 0x109d:
 			return 0;
 		case 0x115f ... 0x1160:
 			return 0;
 		case 0x135d ... 0x135f:
 			return 0;
 		case 0x1712 ... 0x1715:
@ -317,6 +321,8 @@ wcwidth_std(int32_t code) {
 			return 0;
 		case 0x200d:
 			return 0;
 		case 0x2065:
 			return 0;
 		case 0x20d0 ... 0x20f0:
 			return 0;
 		case 0x2cef ... 0x2cf1:
@ -329,6 +335,8 @@ wcwidth_std(int32_t code) {
 			return 0;
 		case 0x3099 ... 0x309a:
 			return 0;
 		case 0x3164:
 			return 0;
 		case 0xa66f ... 0xa672:
 			return 0;
 		case 0xa674 ... 0xa67d:
@ -397,6 +405,10 @@ wcwidth_std(int32_t code) {
 			return 0;
 		case 0xfe20 ... 0xfe2f:
 			return 0;
 		case 0xffa0:
 			return 0;
 		case 0xfff0 ... 0xfff8:
 			return 0;
 		case 0x101fd:
 			return 0;
 		case 0x102e0:
@ -617,17 +629,19 @@ wcwidth_std(int32_t code) {
 			return 0;
 		case 0x1f3fb ... 0x1f3ff:
 			return 0;
-		case 0xe0100 ... 0xe01ef:
+		case 0xe0000:
 			return 0;
 		case 0xe0002 ... 0xe001f:
 			return 0;
 		case 0xe0080 ... 0xe0fff:
 			return 0;
 		// }}}
-		// Non-printing characters (2274 codepoints) {{{
+		// Non-printing characters (2273 codepoints) {{{
 		case 0x1 ... 0x1f:
 			return -1;
 		case 0x7f ... 0x9f:
 			return -1;
 		case 0xad:
 			return -1;
 		case 0x600 ... 0x605:
 			return -1;
 		case 0x61c:
@ -1275,8 +1289,8 @@ wcwidth_std(int32_t code) {
 			return -2;
 		// }}}
-		// East Asian double width (182472 codepoints) {{{
+		// East Asian double width (182470 codepoints) {{{
-		case 0x1100 ... 0x115f:
+		case 0x1100 ... 0x115e:
 			return 2;
 		case 0x231a ... 0x231b:
 			return 2;
@ -1366,7 +1380,9 @@ wcwidth_std(int32_t code) {
 			return 2;
 		case 0x3105 ... 0x312f:
 			return 2;
-		case 0x3131 ... 0x318e:
+		case 0x3131 ... 0x3163:
 			return 2;
 		case 0x3165 ... 0x318e:
 			return 2;
 		case 0x3190 ... 0x31e3:
 			return 2;
@ -1533,7 +1549,7 @@ wcwidth_std(int32_t code) {
 		// Emoji Presentation (0 codepoints) {{{
 		// }}}
-		// Not assigned in the unicode character database (764536 codepoints) {{{
+		// Not assigned in the unicode character database (760767 codepoints) {{{
 		case 0x378 ... 0x379:
 			return -4;
 		case 0x380 ... 0x383:
@ -2012,8 +2028,6 @@ wcwidth_std(int32_t code) {
 			return -4;
 		case 0x1fff:
 			return -4;
 		case 0x2065:
 			return -4;
 		case 0x2072 ... 0x2073:
 			return -4;
 		case 0x208f:
@ -2204,7 +2218,7 @@ wcwidth_std(int32_t code) {
 			return -4;
 		case 0xffe7:
 			return -4;
-		case 0xffef ... 0xfff8:
+		case 0xffef:
 			return -4;
 		case 0xfffe ... 0xffff:
 			return -4;
@ -2904,13 +2918,9 @@ wcwidth_std(int32_t code) {
 			return -4;
 		case 0x2fffe ... 0x2ffff:
 			return -4;
-		case 0x3fffe ... 0xe0000:
+		case 0x3fffe ... 0xdffff:
 			return -4;
-		case 0xe0002 ... 0xe001f:
+		case 0xe1000 ... 0xeffff:
 			return -4;
 		case 0xe0080 ... 0xe00ff:
 			return -4;
 		case 0xe01f0 ... 0xeffff:
 			return -4;
 		case 0xffffe ... 0xfffff:
 			return -4;
--- a/kitty_tests/datatypes.py
+++ b/kitty_tests/datatypes.py
@ -369,6 +369,7 @@ class TestDataTypes(BaseTest):
        self.ae(wcswidth('\U0001F1E6a\U0001F1E8a'), 6)
        self.ae(wcswidth('\U0001F1E6\U0001F1E8a'), 3)
        self.ae(wcswidth('\U0001F1E6\U0001F1E8\U0001F1E6'), 4)
        self.ae(wcswidth('a\u00adb'), 2)
        # Regional indicator symbols (unicode flags) are defined as having
        # Emoji_Presentation so must have width 2
        self.ae(tuple(map(w, '\U0001f1ee\U0001f1f3')), (2, 2))
--- a/kitty_tests/screen.py
+++ b/kitty_tests/screen.py
@ -513,6 +513,15 @@ class TestScreen(BaseTest):
        self.ae(s.text_for_selection(), expected)
        s.scroll(2, True)
        self.ae(s.text_for_selection(), expected)
        s.reset()
    def test_soft_hyphen(self):
        # Check that a soft hyphen (U+00AD) drawn between two letters does not
        # advance the cursor and is still returned when the text is selected.
        s = self.create_screen()
        s.draw('a\u00adb')
        # Three code points were drawn but the cursor is expected at column 2,
        # i.e. the soft hyphen took no column of its own.
        self.ae(s.cursor.x, 2)
        # Select from x=0 to x=2 on line 0
        # (NOTE(review): argument order assumed to be (x, y) -- confirm).
        s.start_selection(0, 0)
        s.update_selection(2, 0)
        # The selected text must still contain the soft hyphen.
        self.ae(s.text_for_selection(), ('a\u00adb',))
    def test_variation_selectors(self):
        s = self.create_screen()