Fix soft hyphens not being preserved when round tripping text through the terminal
Also round-trip all characters in the Cf category. Characters with the DI (Default Ignorable) property are now preserved but not rendered, and are treated as zero-width, as per the Unicode standard. See https://www.unicode.org/faq/unsup_char.html
This commit is contained in:
parent
1b42f69119
commit
fbf47f75d5
@ -74,6 +74,9 @@ To update |kitty|, :doc:`follow the instructions <binary>`.
|
|||||||
Applications must use the dedicated escape code to turn on the protocol.
|
Applications must use the dedicated escape code to turn on the protocol.
|
||||||
(:iss:`4075`)
|
(:iss:`4075`)
|
||||||
|
|
||||||
|
- Fix soft hyphens not being preserved when round tripping text through the
|
||||||
|
terminal
|
||||||
|
|
||||||
|
|
||||||
0.23.1 [2021-08-17]
|
0.23.1 [2021-08-17]
|
||||||
----------------------
|
----------------------
|
||||||
|
|||||||
@ -51,9 +51,26 @@ all_symbols: Set[int] = set()
|
|||||||
name_map: Dict[int, str] = {}
|
name_map: Dict[int, str] = {}
|
||||||
word_search_map: DefaultDict[str, Set[int]] = defaultdict(set)
|
word_search_map: DefaultDict[str, Set[int]] = defaultdict(set)
|
||||||
zwj = 0x200d
|
zwj = 0x200d
|
||||||
|
soft_hyphen = 0xad
|
||||||
flag_codepoints = frozenset(range(0x1F1E6, 0x1F1E6 + 26))
|
flag_codepoints = frozenset(range(0x1F1E6, 0x1F1E6 + 26))
|
||||||
|
# See https://github.com/harfbuzz/harfbuzz/issues/169
|
||||||
marks = set(emoji_skin_tone_modifiers) | {zwj} | flag_codepoints
|
marks = set(emoji_skin_tone_modifiers) | {zwj} | flag_codepoints
|
||||||
not_assigned = set(range(0, sys.maxunicode))
|
not_assigned = set(range(0, sys.maxunicode))
|
||||||
|
property_maps: Dict[str, Set[int]] = defaultdict(set)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_prop_list() -> None:
|
||||||
|
global marks
|
||||||
|
for line in get_data('ucd/PropList.txt'):
|
||||||
|
if line.startswith('#'):
|
||||||
|
continue
|
||||||
|
cp_or_range, rest = line.split(';', 1)
|
||||||
|
chars = parse_range_spec(cp_or_range.strip())
|
||||||
|
name = rest.strip().split()[0]
|
||||||
|
property_maps[name] |= chars
|
||||||
|
# see https://www.unicode.org/faq/unsup_char.html#3
|
||||||
|
marks |= property_maps['Other_Default_Ignorable_Code_Point']
|
||||||
|
marks.add(soft_hyphen)
|
||||||
|
|
||||||
|
|
||||||
def parse_ucd() -> None:
|
def parse_ucd() -> None:
|
||||||
@ -354,16 +371,21 @@ def gen_ucd() -> None:
|
|||||||
p('#include "unicode-data.h"')
|
p('#include "unicode-data.h"')
|
||||||
category_test(
|
category_test(
|
||||||
'is_combining_char', p,
|
'is_combining_char', p,
|
||||||
{c for c in class_maps if c.startswith('M')},
|
(),
|
||||||
'M category (marks)',
|
'Combining and default ignored characters',
|
||||||
# See https://github.com/harfbuzz/harfbuzz/issues/169
|
extra_chars=marks,
|
||||||
extra_chars=emoji_skin_tone_modifiers | {zwj},
|
|
||||||
least_check_return='false'
|
least_check_return='false'
|
||||||
)
|
)
|
||||||
category_test(
|
category_test(
|
||||||
'is_ignored_char', p, 'Cc Cf Cs'.split(),
|
'is_ignored_char', p, 'Cc Cs'.split(),
|
||||||
'Control characters and non-characters',
|
'Control characters and non-characters',
|
||||||
extra_chars=non_characters, exclude={zwj},
|
extra_chars=non_characters,
|
||||||
|
ascii_range='false'
|
||||||
|
)
|
||||||
|
category_test(
|
||||||
|
'is_non_rendered_char', p, 'Cc Cs'.split(),
|
||||||
|
'Other_Default_Ignorable_Code_Point and soft hyphen',
|
||||||
|
extra_chars=property_maps['Other_Default_Ignorable_Code_Point'] | {soft_hyphen},
|
||||||
ascii_range='false'
|
ascii_range='false'
|
||||||
)
|
)
|
||||||
category_test('is_word_char', p, {c for c in class_maps if c[0] in 'LN'}, 'L and N categories')
|
category_test('is_word_char', p, {c for c in class_maps if c[0] in 'LN'}, 'L and N categories')
|
||||||
@ -378,15 +400,19 @@ def gen_ucd() -> None:
|
|||||||
p('combining_type mark_for_codepoint(char_type c) {')
|
p('combining_type mark_for_codepoint(char_type c) {')
|
||||||
rmap = codepoint_to_mark_map(p, mark_map)
|
rmap = codepoint_to_mark_map(p, mark_map)
|
||||||
p('}\n')
|
p('}\n')
|
||||||
with open('kitty/unicode-data.h') as f:
|
with open('kitty/unicode-data.h', 'r+') as f:
|
||||||
unicode_data = f.read()
|
raw = f.read()
|
||||||
m = re.search(r'^#define VS15 (\d+)', unicode_data, re.M)
|
f.seek(0)
|
||||||
if m is not None:
|
raw, num = re.subn(
|
||||||
expected = int(m.group(1))
|
r'^// START_KNOWN_MARKS.+?^// END_KNOWN_MARKS',
|
||||||
if rmap[0xfe0e] != expected:
|
'// START_KNOWN_MARKS\nstatic const combining_type '
|
||||||
raise ValueError('The mark for 0xfe0e has changed, you have to update VS15 to {} and VS16 to {} in unicode-data.h'.format(
|
f'VS15 = {rmap[0xfe0e]}, VS16 = {rmap[0xfe0f]};'
|
||||||
rmap[0xfe0e], rmap[0xfe0f]
|
'\n// END_KNOWN_MARKS', raw, flags=re.MULTILINE | re.DOTALL)
|
||||||
))
|
if not num:
|
||||||
|
raise SystemExit('Faile dto patch mark definitions in unicode-data.h')
|
||||||
|
f.truncate()
|
||||||
|
f.write(raw)
|
||||||
|
|
||||||
with open('kittens/hints/url_regex.py', 'w') as f:
|
with open('kittens/hints/url_regex.py', 'w') as f:
|
||||||
f.write('# generated by gen-wcwidth.py, do not edit\n\n')
|
f.write('# generated by gen-wcwidth.py, do not edit\n\n')
|
||||||
f.write("url_delimiters = '{}' # noqa".format(''.join(classes_to_regex(cz, exclude='\n\r'))))
|
f.write("url_delimiters = '{}' # noqa".format(''.join(classes_to_regex(cz, exclude='\n\r'))))
|
||||||
@ -537,6 +563,7 @@ def gen_wcwidth() -> None:
|
|||||||
|
|
||||||
|
|
||||||
parse_ucd()
|
parse_ucd()
|
||||||
|
parse_prop_list()
|
||||||
parse_emoji()
|
parse_emoji()
|
||||||
parse_eaw()
|
parse_eaw()
|
||||||
gen_ucd()
|
gen_ucd()
|
||||||
|
|||||||
2
kittens/unicode_input/names.h
generated
2
kittens/unicode_input/names.h
generated
@ -1,4 +1,4 @@
|
|||||||
// unicode data, built from the unicode standard on: 2021-10-04
|
// unicode data, built from the unicode standard on: 2021-10-07
|
||||||
// see gen-wcwidth.py
|
// see gen-wcwidth.py
|
||||||
#pragma once
|
#pragma once
|
||||||
#include "data-types.h"
|
#include "data-types.h"
|
||||||
|
|||||||
2
kitty/emoji.h
generated
2
kitty/emoji.h
generated
@ -1,4 +1,4 @@
|
|||||||
// unicode data, built from the unicode standard on: 2021-10-04
|
// unicode data, built from the unicode standard on: 2021-10-07
|
||||||
// see gen-wcwidth.py
|
// see gen-wcwidth.py
|
||||||
#pragma once
|
#pragma once
|
||||||
#include "data-types.h"
|
#include "data-types.h"
|
||||||
|
|||||||
@ -387,8 +387,7 @@ has_cell_text(Font *self, CPUCell *cell) {
|
|||||||
char_type combining_chars[arraysz(cell->cc_idx)];
|
char_type combining_chars[arraysz(cell->cc_idx)];
|
||||||
unsigned num_cc = 0;
|
unsigned num_cc = 0;
|
||||||
for (unsigned i = 0; i < arraysz(cell->cc_idx) && cell->cc_idx[i]; i++) {
|
for (unsigned i = 0; i < arraysz(cell->cc_idx) && cell->cc_idx[i]; i++) {
|
||||||
if (cell->cc_idx[i] == VS15 || cell->cc_idx[i] == VS16) continue;
|
if (!is_non_rendered_char(cell->cc_idx[i])) combining_chars[num_cc++] = codepoint_for_mark(cell->cc_idx[i]);
|
||||||
combining_chars[num_cc++] = codepoint_for_mark(cell->cc_idx[i]);
|
|
||||||
}
|
}
|
||||||
if (num_cc == 0) return true;
|
if (num_cc == 0) return true;
|
||||||
if (num_cc == 1) {
|
if (num_cc == 1) {
|
||||||
|
|||||||
726
kitty/unicode-data.c
generated
726
kitty/unicode-data.c
generated
File diff suppressed because one or more lines are too long
@ -1,14 +1,16 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
#include "data-types.h"
|
#include "data-types.h"
|
||||||
#include "state.h"
|
#include "state.h"
|
||||||
#define VS15 1320
|
// START_KNOWN_MARKS
|
||||||
#define VS16 1321
|
static const combining_type VS15 = 1325, VS16 = 1326;
|
||||||
|
// END_KNOWN_MARKS
|
||||||
|
|
||||||
bool is_combining_char(char_type ch);
|
bool is_combining_char(char_type ch);
|
||||||
bool is_ignored_char(char_type ch);
|
bool is_ignored_char(char_type ch);
|
||||||
bool is_word_char(char_type ch);
|
bool is_word_char(char_type ch);
|
||||||
bool is_CZ_category(char_type);
|
bool is_CZ_category(char_type);
|
||||||
bool is_P_category(char_type);
|
bool is_P_category(char_type);
|
||||||
|
bool is_non_rendered_char(char_type);
|
||||||
char_type codepoint_for_mark(combining_type m);
|
char_type codepoint_for_mark(combining_type m);
|
||||||
combining_type mark_for_codepoint(char_type c);
|
combining_type mark_for_codepoint(char_type c);
|
||||||
|
|
||||||
|
|||||||
48
kitty/wcwidth-std.h
generated
48
kitty/wcwidth-std.h
generated
@ -1,4 +1,4 @@
|
|||||||
// unicode data, built from the unicode standard on: 2021-10-04
|
// unicode data, built from the unicode standard on: 2021-10-07
|
||||||
// see gen-wcwidth.py
|
// see gen-wcwidth.py
|
||||||
#pragma once
|
#pragma once
|
||||||
#include "data-types.h"
|
#include "data-types.h"
|
||||||
@ -14,9 +14,11 @@ wcwidth_std(int32_t code) {
|
|||||||
return 2;
|
return 2;
|
||||||
// }}}
|
// }}}
|
||||||
|
|
||||||
// Marks (2415 codepoints) {{{
|
// Marks (6189 codepoints) {{{
|
||||||
case 0x0:
|
case 0x0:
|
||||||
return 0;
|
return 0;
|
||||||
|
case 0xad:
|
||||||
|
return 0;
|
||||||
case 0x300 ... 0x36f:
|
case 0x300 ... 0x36f:
|
||||||
return 0;
|
return 0;
|
||||||
case 0x483 ... 0x489:
|
case 0x483 ... 0x489:
|
||||||
@ -253,6 +255,8 @@ wcwidth_std(int32_t code) {
|
|||||||
return 0;
|
return 0;
|
||||||
case 0x109a ... 0x109d:
|
case 0x109a ... 0x109d:
|
||||||
return 0;
|
return 0;
|
||||||
|
case 0x115f ... 0x1160:
|
||||||
|
return 0;
|
||||||
case 0x135d ... 0x135f:
|
case 0x135d ... 0x135f:
|
||||||
return 0;
|
return 0;
|
||||||
case 0x1712 ... 0x1715:
|
case 0x1712 ... 0x1715:
|
||||||
@ -317,6 +321,8 @@ wcwidth_std(int32_t code) {
|
|||||||
return 0;
|
return 0;
|
||||||
case 0x200d:
|
case 0x200d:
|
||||||
return 0;
|
return 0;
|
||||||
|
case 0x2065:
|
||||||
|
return 0;
|
||||||
case 0x20d0 ... 0x20f0:
|
case 0x20d0 ... 0x20f0:
|
||||||
return 0;
|
return 0;
|
||||||
case 0x2cef ... 0x2cf1:
|
case 0x2cef ... 0x2cf1:
|
||||||
@ -329,6 +335,8 @@ wcwidth_std(int32_t code) {
|
|||||||
return 0;
|
return 0;
|
||||||
case 0x3099 ... 0x309a:
|
case 0x3099 ... 0x309a:
|
||||||
return 0;
|
return 0;
|
||||||
|
case 0x3164:
|
||||||
|
return 0;
|
||||||
case 0xa66f ... 0xa672:
|
case 0xa66f ... 0xa672:
|
||||||
return 0;
|
return 0;
|
||||||
case 0xa674 ... 0xa67d:
|
case 0xa674 ... 0xa67d:
|
||||||
@ -397,6 +405,10 @@ wcwidth_std(int32_t code) {
|
|||||||
return 0;
|
return 0;
|
||||||
case 0xfe20 ... 0xfe2f:
|
case 0xfe20 ... 0xfe2f:
|
||||||
return 0;
|
return 0;
|
||||||
|
case 0xffa0:
|
||||||
|
return 0;
|
||||||
|
case 0xfff0 ... 0xfff8:
|
||||||
|
return 0;
|
||||||
case 0x101fd:
|
case 0x101fd:
|
||||||
return 0;
|
return 0;
|
||||||
case 0x102e0:
|
case 0x102e0:
|
||||||
@ -617,17 +629,19 @@ wcwidth_std(int32_t code) {
|
|||||||
return 0;
|
return 0;
|
||||||
case 0x1f3fb ... 0x1f3ff:
|
case 0x1f3fb ... 0x1f3ff:
|
||||||
return 0;
|
return 0;
|
||||||
case 0xe0100 ... 0xe01ef:
|
case 0xe0000:
|
||||||
|
return 0;
|
||||||
|
case 0xe0002 ... 0xe001f:
|
||||||
|
return 0;
|
||||||
|
case 0xe0080 ... 0xe0fff:
|
||||||
return 0;
|
return 0;
|
||||||
// }}}
|
// }}}
|
||||||
|
|
||||||
// Non-printing characters (2274 codepoints) {{{
|
// Non-printing characters (2273 codepoints) {{{
|
||||||
case 0x1 ... 0x1f:
|
case 0x1 ... 0x1f:
|
||||||
return -1;
|
return -1;
|
||||||
case 0x7f ... 0x9f:
|
case 0x7f ... 0x9f:
|
||||||
return -1;
|
return -1;
|
||||||
case 0xad:
|
|
||||||
return -1;
|
|
||||||
case 0x600 ... 0x605:
|
case 0x600 ... 0x605:
|
||||||
return -1;
|
return -1;
|
||||||
case 0x61c:
|
case 0x61c:
|
||||||
@ -1275,8 +1289,8 @@ wcwidth_std(int32_t code) {
|
|||||||
return -2;
|
return -2;
|
||||||
// }}}
|
// }}}
|
||||||
|
|
||||||
// East Asian double width (182472 codepoints) {{{
|
// East Asian double width (182470 codepoints) {{{
|
||||||
case 0x1100 ... 0x115f:
|
case 0x1100 ... 0x115e:
|
||||||
return 2;
|
return 2;
|
||||||
case 0x231a ... 0x231b:
|
case 0x231a ... 0x231b:
|
||||||
return 2;
|
return 2;
|
||||||
@ -1366,7 +1380,9 @@ wcwidth_std(int32_t code) {
|
|||||||
return 2;
|
return 2;
|
||||||
case 0x3105 ... 0x312f:
|
case 0x3105 ... 0x312f:
|
||||||
return 2;
|
return 2;
|
||||||
case 0x3131 ... 0x318e:
|
case 0x3131 ... 0x3163:
|
||||||
|
return 2;
|
||||||
|
case 0x3165 ... 0x318e:
|
||||||
return 2;
|
return 2;
|
||||||
case 0x3190 ... 0x31e3:
|
case 0x3190 ... 0x31e3:
|
||||||
return 2;
|
return 2;
|
||||||
@ -1533,7 +1549,7 @@ wcwidth_std(int32_t code) {
|
|||||||
// Emoji Presentation (0 codepoints) {{{
|
// Emoji Presentation (0 codepoints) {{{
|
||||||
// }}}
|
// }}}
|
||||||
|
|
||||||
// Not assigned in the unicode character database (764536 codepoints) {{{
|
// Not assigned in the unicode character database (760767 codepoints) {{{
|
||||||
case 0x378 ... 0x379:
|
case 0x378 ... 0x379:
|
||||||
return -4;
|
return -4;
|
||||||
case 0x380 ... 0x383:
|
case 0x380 ... 0x383:
|
||||||
@ -2012,8 +2028,6 @@ wcwidth_std(int32_t code) {
|
|||||||
return -4;
|
return -4;
|
||||||
case 0x1fff:
|
case 0x1fff:
|
||||||
return -4;
|
return -4;
|
||||||
case 0x2065:
|
|
||||||
return -4;
|
|
||||||
case 0x2072 ... 0x2073:
|
case 0x2072 ... 0x2073:
|
||||||
return -4;
|
return -4;
|
||||||
case 0x208f:
|
case 0x208f:
|
||||||
@ -2204,7 +2218,7 @@ wcwidth_std(int32_t code) {
|
|||||||
return -4;
|
return -4;
|
||||||
case 0xffe7:
|
case 0xffe7:
|
||||||
return -4;
|
return -4;
|
||||||
case 0xffef ... 0xfff8:
|
case 0xffef:
|
||||||
return -4;
|
return -4;
|
||||||
case 0xfffe ... 0xffff:
|
case 0xfffe ... 0xffff:
|
||||||
return -4;
|
return -4;
|
||||||
@ -2904,13 +2918,9 @@ wcwidth_std(int32_t code) {
|
|||||||
return -4;
|
return -4;
|
||||||
case 0x2fffe ... 0x2ffff:
|
case 0x2fffe ... 0x2ffff:
|
||||||
return -4;
|
return -4;
|
||||||
case 0x3fffe ... 0xe0000:
|
case 0x3fffe ... 0xdffff:
|
||||||
return -4;
|
return -4;
|
||||||
case 0xe0002 ... 0xe001f:
|
case 0xe1000 ... 0xeffff:
|
||||||
return -4;
|
|
||||||
case 0xe0080 ... 0xe00ff:
|
|
||||||
return -4;
|
|
||||||
case 0xe01f0 ... 0xeffff:
|
|
||||||
return -4;
|
return -4;
|
||||||
case 0xffffe ... 0xfffff:
|
case 0xffffe ... 0xfffff:
|
||||||
return -4;
|
return -4;
|
||||||
|
|||||||
@ -369,6 +369,7 @@ class TestDataTypes(BaseTest):
|
|||||||
self.ae(wcswidth('\U0001F1E6a\U0001F1E8a'), 6)
|
self.ae(wcswidth('\U0001F1E6a\U0001F1E8a'), 6)
|
||||||
self.ae(wcswidth('\U0001F1E6\U0001F1E8a'), 3)
|
self.ae(wcswidth('\U0001F1E6\U0001F1E8a'), 3)
|
||||||
self.ae(wcswidth('\U0001F1E6\U0001F1E8\U0001F1E6'), 4)
|
self.ae(wcswidth('\U0001F1E6\U0001F1E8\U0001F1E6'), 4)
|
||||||
|
self.ae(wcswidth('a\u00adb'), 2)
|
||||||
# Regional indicator symbols (unicode flags) are defined as having
|
# Regional indicator symbols (unicode flags) are defined as having
|
||||||
# Emoji_Presentation so must have width 2
|
# Emoji_Presentation so must have width 2
|
||||||
self.ae(tuple(map(w, '\U0001f1ee\U0001f1f3')), (2, 2))
|
self.ae(tuple(map(w, '\U0001f1ee\U0001f1f3')), (2, 2))
|
||||||
|
|||||||
@ -513,6 +513,15 @@ class TestScreen(BaseTest):
|
|||||||
self.ae(s.text_for_selection(), expected)
|
self.ae(s.text_for_selection(), expected)
|
||||||
s.scroll(2, True)
|
s.scroll(2, True)
|
||||||
self.ae(s.text_for_selection(), expected)
|
self.ae(s.text_for_selection(), expected)
|
||||||
|
s.reset()
|
||||||
|
|
||||||
|
def test_soft_hyphen(self):
|
||||||
|
s = self.create_screen()
|
||||||
|
s.draw('a\u00adb')
|
||||||
|
self.ae(s.cursor.x, 2)
|
||||||
|
s.start_selection(0, 0)
|
||||||
|
s.update_selection(2, 0)
|
||||||
|
self.ae(s.text_for_selection(), ('a\u00adb',))
|
||||||
|
|
||||||
def test_variation_selectors(self):
|
def test_variation_selectors(self):
|
||||||
s = self.create_screen()
|
s = self.create_screen()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user