Mapping that can be used to store Unicode mark symbols in only two bytes

This commit is contained in:
Kovid Goyal 2018-01-18 16:06:07 +05:30
parent 409bd37db5
commit 32632264ee
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 294 additions and 0 deletions

View File

@ -184,6 +184,24 @@ def gen_ucd():
category_test('is_word_char', p, {c for c in class_maps if c[0] in 'LN'}, 'L and N categories') category_test('is_word_char', p, {c for c in class_maps if c[0] in 'LN'}, 'L and N categories')
category_test('is_CZ_category', p, {c for c in class_maps if c[0] in 'CZ'}, 'C and Z categories') category_test('is_CZ_category', p, {c for c in class_maps if c[0] in 'CZ'}, 'C and Z categories')
category_test('is_P_category', p, {c for c in class_maps if c[0] == 'P'}, 'P category (punctuation)') category_test('is_P_category', p, {c for c in class_maps if c[0] == 'P'}, 'P category (punctuation)')
mark_map = [0] + list(sorted(marks))
p('char_type codepoint_for_mark(combining_type m) {')
p(f'\tstatic char_type map[{len(mark_map)}] =', '{', ', '.join(map(str, mark_map)), '}; // {{{ mapping }}}')
p('\tif (m < arraysz(map)) return map[m];')
p('\treturn 0;')
p('}\n')
p('combining_type mark_for_codepoint(char_type c) {')
p('\tswitch(c) { // {{{')
rmap = {c: m for m, c in enumerate(mark_map)}
for spec in get_ranges(mark_map):
if isinstance(spec, tuple):
s = rmap[spec[0]]
p(f'\t\tcase {spec[0]} ... {spec[1]}: return {s} + c - {spec[0]};')
else:
p(f'\t\tcase {spec}: return {rmap[spec]};')
p('default: return 0;')
p('\t} // }}}')
p('}\n')
def gen_wcwidth(): def gen_wcwidth():

File diff suppressed because one or more lines are too long