Add a script to generate wcwidth as well
The generated function is more efficient than the implementation from wcwidth9, and the generator makes it easy to update the tables when the Unicode standard changes.
parent 11ee317884
commit d1282b9f55
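Updating for a new Unicode release amounts to rerunning the generator. A minimal sketch of that workflow, not part of the commit, assuming the script sits at the repository root (like gen-emoji.py did) and using the /tmp cache path and file names from get_data in gen-wcwidth.py below:

    import os
    import subprocess

    # gen-wcwidth.py caches the downloaded data files under /tmp; removing them
    # forces a fresh download of the "latest" UCD and emoji data files.
    for cached in ('UnicodeData.txt', 'EastAsianWidth.txt', 'emoji-data.txt'):
        try:
            os.remove(os.path.join('/tmp', cached))
        except FileNotFoundError:
            pass

    # Rewrites kitty/wcwidth-std.h and kitty/emoji.h from the freshly fetched data.
    subprocess.run(['./gen-wcwidth.py'], check=True)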
.gitattributes (vendored): 2 lines changed

@@ -1,4 +1,4 @@
-kitty/wcwidth9.h linguist-generated=true
+kitty/wcwidth-std.h linguist-generated=true
 kitty/emoji.h linguist-generated=true
 kitty/keys.h linguist-generated=true
 kitty/charsets.c linguist-generated=true
(shell script, path not shown): 2 lines changed

@@ -1,2 +1,2 @@
 #!/bin/bash
-cloc --exclude-list-file <(echo -e 'kitty/wcwidth9.h\nkitty/glfw.c\nkitty/keys.h\nkitty/charsets.c\nkitty/key_encoding.py\nkitty/rgb.py\nkitty/gl.h\nkitty/gl-wrapper.h\nkitty/gl-wrapper.c\nkitty/khrplatform.h\nkitty/glfw-wrapper.h\nkitty/glfw-wrapper.c\nkitty/emoji.h') kitty
+cloc --exclude-list-file <(echo -e 'kitty/wcwidth-std.h\nkitty/glfw.c\nkitty/keys.h\nkitty/charsets.c\nkitty/key_encoding.py\nkitty/rgb.py\nkitty/gl.h\nkitty/gl-wrapper.h\nkitty/gl-wrapper.c\nkitty/khrplatform.h\nkitty/glfw-wrapper.h\nkitty/glfw-wrapper.c\nkitty/emoji.h') kitty
gen-emoji.py (deleted, 75 lines)

@@ -1,75 +0,0 @@
#!/usr/bin/env python3
# vim:fileencoding=utf-8
# License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>

import os
from collections import defaultdict
from functools import partial
from itertools import groupby
from operator import itemgetter
from urllib.request import urlopen

os.chdir(os.path.dirname(os.path.abspath(__file__)))

raw = urlopen('http://unicode.org/Public/emoji/5.0/emoji-data.txt').read().decode('utf-8')
seen = set()
cmap = defaultdict(set)
for line in raw.splitlines():
    line = line.strip()
    if not line or line.startswith('#'):
        continue
    spec, rest = line.partition(';')[::2]
    spec, rest = spec.strip(), rest.strip().split(' ', 1)[0].strip()
    if '.' in spec:
        spec = tuple(map(lambda x: int(x, 16), filter(None, spec.split('.'))))
        spec = set(range(spec[0], spec[1] + 1))
    else:
        spec = {int(spec, 16)}
    cmap[rest] |= spec
    seen |= spec
items = list(seen)


def get_ranges(items):
    items.sort()
    for k, g in groupby(enumerate(items), lambda m: m[0]-m[1]):
        group = tuple(map(itemgetter(1), g))
        a, b = group[0], group[-1]
        if a == b:
            yield a
        else:
            yield a, b


def write_case(spec, p):
    if isinstance(spec, tuple):
        p('\t\tcase 0x{:x} ... 0x{:x}:'.format(*spec))
    else:
        p('\t\tcase 0x{:x}:'.format(spec))


with open('kitty/emoji.h', 'w') as f:
    p = partial(print, file=f)
    p('#pragma once')
    p('#include "data-types.h"\n')
    p('START_ALLOW_CASE_RANGE')
    p('static inline bool is_emoji(uint32_t code) {')
    p('\tswitch(code) {')
    for spec in get_ranges(items):
        last = spec[1] if isinstance(spec, tuple) else spec
        if last < 0x231a:
            continue
        write_case(spec, p)
        p('\t\t\treturn true;')
    p('\t\tdefault: return false;')
    p('\t}')
    p('\treturn false; \n}')
    p('static inline bool is_emoji_modifier(uint32_t code) {')
    p('\tswitch(code) {')
    for spec in get_ranges(list(cmap['Emoji_Modifier'])):
        write_case(spec, p)
        p('\t\t\treturn true;')
    p('\t\tdefault: return false;')
    p('\t}')
    p('\treturn false; \n}')
    p('END_ALLOW_CASE_RANGE')
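Both the old script above and split_two in the new gen-wcwidth.py below parse the data files the same way. A self-contained sketch of that parsing, not part of the commit; the sample line only mimics the emoji-data.txt format:

    # How a codepoint range entry becomes a set of integers.
    line = '231A..231B    ; Emoji                #  1.1  [2] (⌚..⌛)'

    spec, rest = line.partition(';')[::2]
    spec, rest = spec.strip(), rest.strip().split(' ', 1)[0].strip()
    if '.' in spec:
        # '231A..231B'.split('.') -> ['231A', '', '231B']; filter(None, ...) drops the ''
        lo, hi = (int(x, 16) for x in filter(None, spec.split('.')))
        chars = set(range(lo, hi + 1))
    else:
        chars = {int(spec, 16)}

    print(rest, sorted(map(hex, chars)))  # Emoji ['0x231a', '0x231b']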
gen-wcwidth.py (new executable file, 199 lines)

@@ -0,0 +1,199 @@
#!/usr/bin/env python3
# vim:fileencoding=utf-8
# License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>

import os
import sys
from contextlib import contextmanager
from datetime import date
from functools import partial
from itertools import groupby
from operator import itemgetter
from urllib.request import urlopen

# We ignore the first few emojis as they are widely assumed to be single width
# in legacy applications
FIRST_EMOJI = 0x2194
os.chdir(os.path.dirname(os.path.abspath(__file__)))


def get_data(fname, folder='UCD'):
    url = f'https://www.unicode.org/Public/{folder}/latest/{fname}'
    bn = os.path.basename(url)
    local = os.path.join('/tmp', bn)
    if os.path.exists(local):
        data = open(local, 'rb').read()
    else:
        data = urlopen(url).read()
        open(local, 'wb').write(data)
    for line in data.decode('utf-8').splitlines():
        line = line.strip()
        if line and not line.startswith('#'):
            yield line


# Map of class names to set of codepoints in class
class_maps = {}
combining_codepoints = set()
not_assigned = set(range(0, sys.maxunicode))


def parse_ucd():
    first = None
    for line in get_data('ucd/UnicodeData.txt'):
        parts = [x.strip() for x in line.split(';')]
        codepoint = int(parts[0], 16)
        category = parts[2]
        s = class_maps.setdefault(category, set())
        desc = parts[1]
        codepoints = (codepoint,)
        if first is None:
            if desc.endswith(', First>'):
                first = codepoint
                continue
        else:
            codepoints = range(first, codepoint + 1)
            first = None
        for codepoint in codepoints:
            s.add(codepoint)
            not_assigned.discard(codepoint)
            cc = parts[3]
            if cc and cc != '0':
                combining_codepoints.add(codepoint)


def split_two(line):
    spec, rest = line.split(';', 1)
    spec, rest = spec.strip(), rest.strip().split(' ', 1)[0].strip()
    if '..' in spec:
        chars = tuple(map(lambda x: int(x, 16), filter(None, spec.split('.'))))
        chars = set(range(chars[0], chars[1] + 1))
    else:
        chars = {int(spec, 16)}
    return chars, rest


all_emoji = set()
emoji_categories = {}


def parse_emoji():
    for line in get_data('emoji-data.txt', 'emoji'):
        chars, rest = split_two(line)
        if max(chars) >= FIRST_EMOJI:
            s = emoji_categories.setdefault(rest, set())
            s |= chars
            all_emoji.update(chars)


doublewidth, ambiguous = set(), set()


def parse_eaw():
    global doublewidth, ambiguous
    seen = set()
    for line in get_data('ucd/EastAsianWidth.txt'):
        chars, eaw = split_two(line)
        if eaw == 'A':
            ambiguous |= chars
            seen |= chars
        elif eaw == 'W' or eaw == 'F':
            doublewidth |= chars
            seen |= chars
    doublewidth |= set(range(0x3400, 0x4DBF + 1)) - seen
    doublewidth |= set(range(0x4E00, 0x9FFF + 1)) - seen
    doublewidth |= set(range(0xF900, 0xFAFF + 1)) - seen
    doublewidth |= set(range(0x20000, 0x2FFFD + 1)) - seen
    doublewidth |= set(range(0x30000, 0x3FFFD + 1)) - seen


def get_ranges(items):
    items.sort()
    for k, g in groupby(enumerate(items), lambda m: m[0]-m[1]):
        group = tuple(map(itemgetter(1), g))
        a, b = group[0], group[-1]
        if a == b:
            yield a
        else:
            yield a, b


def write_case(spec, p):
    if isinstance(spec, tuple):
        p('\t\tcase 0x{:x} ... 0x{:x}:'.format(*spec))
    else:
        p('\t\tcase 0x{:x}:'.format(spec))


@contextmanager
def create_header(path):
    f = open(path, 'w')
    p = partial(print, file=f)
    p('// unicode data, built from the unicode standard on:', date.today())
    p('// see gen-wcwidth.py')
    p('#pragma once')
    p('#include "data-types.h"\n')
    p('START_ALLOW_CASE_RANGE')
    p()
    yield p
    p()
    p('END_ALLOW_CASE_RANGE')
    f.close()


def gen_emoji():
    with create_header('kitty/emoji.h') as p:
        p('static inline bool\nis_emoji(char_type code) {')
        p('\tswitch(code) {')
        for spec in get_ranges(list(all_emoji)):
            write_case(spec, p)
            p('\t\t\treturn true;')
        p('\t\tdefault: return false;')
        p('\t}')
        p('\treturn false; \n}')
        p('static inline bool\nis_emoji_modifier(char_type code) {')
        p('\tswitch(code) {')
        for spec in get_ranges(list(emoji_categories['Emoji_Modifier'])):
            write_case(spec, p)
            p('\t\t\treturn true;')
        p('\t\tdefault: return false;')
        p('\t}')
        p('\treturn false; \n}')


def gen_wcwidth():
    seen = set()

    def add(p, comment, chars_, ret):
        chars = chars_ - seen
        seen.update(chars)
        p(f'\t\t// {comment} ({len(chars)} codepoints)' + ' {{{')
        for spec in get_ranges(list(chars)):
            write_case(spec, p)
            p(f'\t\t\treturn {ret};')
        p('\t\t// }}}\n')

    with create_header('kitty/wcwidth-std.h') as p:
        p('static int\nwcwidth_std(int32_t code) {')
        p('\tswitch(code) {')

        non_printing = class_maps['Cc'] | class_maps['Cf'] | class_maps['Cs']
        add(p, 'Non-printing characters', non_printing, -1)
        add(p, 'Combining characters', combining_codepoints, -1)
        add(p, 'Private use', class_maps['Co'], -3)
        add(p, 'East Asian ambiguous width', ambiguous, -2)
        add(p, 'East Asian double width', doublewidth, 2)
        add(p, 'Emoji', all_emoji, 2)

        add(p, 'Not assigned in the unicode character database', not_assigned, -1)

        p('\t\tdefault: return 1;')
        p('\t}')
        p('\treturn 1; \n}')


parse_ucd()
parse_emoji()
parse_eaw()
gen_wcwidth()
gen_emoji()
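The heart of the generator is get_ranges, which collapses a sorted codepoint list into contiguous runs, and write_case, which turns each run into a GCC case-range label. A self-contained demo, not part of the commit, using the two functions exactly as defined above; the sample codepoints (the keyboard symbol 0x2328 and the Emoji_Modifier block 0x1f3fb..0x1f3ff) also appear in the kitty/emoji.h diff below:

    import sys
    from functools import partial
    from itertools import groupby
    from operator import itemgetter


    def get_ranges(items):
        # Copied verbatim from gen-wcwidth.py above.
        items.sort()
        for k, g in groupby(enumerate(items), lambda m: m[0]-m[1]):
            group = tuple(map(itemgetter(1), g))
            a, b = group[0], group[-1]
            if a == b:
                yield a
            else:
                yield a, b


    def write_case(spec, p):
        # Copied verbatim from gen-wcwidth.py above.
        if isinstance(spec, tuple):
            p('\t\tcase 0x{:x} ... 0x{:x}:'.format(*spec))
        else:
            p('\t\tcase 0x{:x}:'.format(spec))


    p = partial(print, file=sys.stdout)
    for spec in get_ranges([0x2328, 0x1F3FB, 0x1F3FC, 0x1F3FD, 0x1F3FE, 0x1F3FF]):
        write_case(spec, p)
        p('\t\t\treturn true;')
    # Prints:
    #   case 0x2328:
    #       return true;
    #   case 0x1f3fb ... 0x1f3ff:
    #       return true;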
kitty/emoji.h (generated): 14 lines changed

@@ -1,9 +1,17 @@
+// unicode data, built from the unicode standard on: 2017-12-20
+// see gen-wcwidth.py
 #pragma once
 #include "data-types.h"
 
 START_ALLOW_CASE_RANGE
-static inline bool is_emoji(uint32_t code) {
+
+static inline bool
+is_emoji(char_type code) {
     switch(code) {
+        case 0x2194 ... 0x2199:
+            return true;
+        case 0x21a9 ... 0x21aa:
+            return true;
         case 0x231a ... 0x231b:
             return true;
         case 0x2328:
@@ -276,7 +284,8 @@ static inline bool is_emoji(uint32_t code) {
     }
     return false;
 }
-static inline bool is_emoji_modifier(uint32_t code) {
+static inline bool
+is_emoji_modifier(char_type code) {
     switch(code) {
         case 0x1f3fb ... 0x1f3ff:
             return true;
@@ -284,4 +293,5 @@ static inline bool is_emoji_modifier(uint32_t code) {
     }
     return false;
 }
+
 END_ALLOW_CASE_RANGE
(another changed C source file; path not shown)

@@ -18,7 +18,7 @@
 #include <fcntl.h>
 #include "unicode-data.h"
 #include "modes.h"
-#include "wcwidth9.h"
+#include "wcwidth-std.h"
 #include "control-codes.h"
 
 static const ScreenModes empty_modes = {0, .mDECAWM=true, .mDECTCEM=true, .mDECARM=true};
@@ -275,8 +275,8 @@ safe_wcwidth(uint32_t ch) {
 }
 
 void
-change_wcwidth(bool use9) {
-    wcwidth_impl = (use9) ? wcwidth9 : wcwidth;
+change_wcwidth(bool use_std) {
+    wcwidth_impl = use_std ? wcwidth_std : wcwidth;
 }
 
 
kitty/wcwidth-std.h (generated, new file, 2435 lines): diff suppressed because it is too large

kitty/wcwidth9.h (1321 lines): diff suppressed because it is too large