Add a script to generate wcwidth as well
The generated function is more efficient than the implementation from wcwidth9, and the generator makes it easy to update the tables when the Unicode standard changes.
parent 11ee317884
commit d1282b9f55
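Updating for a new Unicode release amounts to rerunning the generator. A minimal sketch of that workflow, not part of the commit, assuming the script sits at the repository root (like gen-emoji.py did) and using the /tmp cache path and file names from get_data in gen-wcwidth.py below:

    import os
    import subprocess

    # gen-wcwidth.py caches the downloaded data files under /tmp; removing them
    # forces a fresh download of the "latest" UCD and emoji data files.
    for cached in ('UnicodeData.txt', 'EastAsianWidth.txt', 'emoji-data.txt'):
        try:
            os.remove(os.path.join('/tmp', cached))
        except FileNotFoundError:
            pass

    # Rewrites kitty/wcwidth-std.h and kitty/emoji.h from the freshly fetched data.
    subprocess.run(['./gen-wcwidth.py'], check=True)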
.gitattributes (vendored): 2 lines changed

@@ -1,4 +1,4 @@
-kitty/wcwidth9.h linguist-generated=true
+kitty/wcwidth-std.h linguist-generated=true
 kitty/emoji.h linguist-generated=true
 kitty/keys.h linguist-generated=true
 kitty/charsets.c linguist-generated=true
(shell script, path not shown): 2 lines changed

@@ -1,2 +1,2 @@
 #!/bin/bash
-cloc --exclude-list-file <(echo -e 'kitty/wcwidth9.h\nkitty/glfw.c\nkitty/keys.h\nkitty/charsets.c\nkitty/key_encoding.py\nkitty/rgb.py\nkitty/gl.h\nkitty/gl-wrapper.h\nkitty/gl-wrapper.c\nkitty/khrplatform.h\nkitty/glfw-wrapper.h\nkitty/glfw-wrapper.c\nkitty/emoji.h') kitty
+cloc --exclude-list-file <(echo -e 'kitty/wcwidth-std.h\nkitty/glfw.c\nkitty/keys.h\nkitty/charsets.c\nkitty/key_encoding.py\nkitty/rgb.py\nkitty/gl.h\nkitty/gl-wrapper.h\nkitty/gl-wrapper.c\nkitty/khrplatform.h\nkitty/glfw-wrapper.h\nkitty/glfw-wrapper.c\nkitty/emoji.h') kitty
gen-emoji.py (deleted, 75 lines)

@@ -1,75 +0,0 @@
#!/usr/bin/env python3
# vim:fileencoding=utf-8
# License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>

import os
from collections import defaultdict
from functools import partial
from itertools import groupby
from operator import itemgetter
from urllib.request import urlopen

os.chdir(os.path.dirname(os.path.abspath(__file__)))

raw = urlopen('http://unicode.org/Public/emoji/5.0/emoji-data.txt').read().decode('utf-8')
seen = set()
cmap = defaultdict(set)
for line in raw.splitlines():
    line = line.strip()
    if not line or line.startswith('#'):
        continue
    spec, rest = line.partition(';')[::2]
    spec, rest = spec.strip(), rest.strip().split(' ', 1)[0].strip()
    if '.' in spec:
        spec = tuple(map(lambda x: int(x, 16), filter(None, spec.split('.'))))
        spec = set(range(spec[0], spec[1] + 1))
    else:
        spec = {int(spec, 16)}
    cmap[rest] |= spec
    seen |= spec
items = list(seen)


def get_ranges(items):
    items.sort()
    for k, g in groupby(enumerate(items), lambda m: m[0]-m[1]):
        group = tuple(map(itemgetter(1), g))
        a, b = group[0], group[-1]
        if a == b:
            yield a
        else:
            yield a, b


def write_case(spec, p):
    if isinstance(spec, tuple):
        p('\t\tcase 0x{:x} ... 0x{:x}:'.format(*spec))
    else:
        p('\t\tcase 0x{:x}:'.format(spec))


with open('kitty/emoji.h', 'w') as f:
    p = partial(print, file=f)
    p('#pragma once')
    p('#include "data-types.h"\n')
    p('START_ALLOW_CASE_RANGE')
    p('static inline bool is_emoji(uint32_t code) {')
    p('\tswitch(code) {')
    for spec in get_ranges(items):
        last = spec[1] if isinstance(spec, tuple) else spec
        if last < 0x231a:
            continue
        write_case(spec, p)
        p('\t\t\treturn true;')
    p('\t\tdefault: return false;')
    p('\t}')
    p('\treturn false; \n}')
    p('static inline bool is_emoji_modifier(uint32_t code) {')
    p('\tswitch(code) {')
    for spec in get_ranges(list(cmap['Emoji_Modifier'])):
        write_case(spec, p)
        p('\t\t\treturn true;')
    p('\t\tdefault: return false;')
    p('\t}')
    p('\treturn false; \n}')
    p('END_ALLOW_CASE_RANGE')
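Both the old script above and split_two in the new gen-wcwidth.py below parse the data files the same way. A self-contained sketch of that parsing, not part of the commit; the sample line only mimics the emoji-data.txt format:

    # How a codepoint range entry becomes a set of integers.
    line = '231A..231B    ; Emoji                #  1.1  [2] (⌚..⌛)'

    spec, rest = line.partition(';')[::2]
    spec, rest = spec.strip(), rest.strip().split(' ', 1)[0].strip()
    if '.' in spec:
        # '231A..231B'.split('.') -> ['231A', '', '231B']; filter(None, ...) drops the ''
        lo, hi = (int(x, 16) for x in filter(None, spec.split('.')))
        chars = set(range(lo, hi + 1))
    else:
        chars = {int(spec, 16)}

    print(rest, sorted(map(hex, chars)))  # Emoji ['0x231a', '0x231b']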
gen-wcwidth.py (new executable file, 199 lines)

@@ -0,0 +1,199 @@
#!/usr/bin/env python3
# vim:fileencoding=utf-8
# License: GPL v3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>

import os
import sys
from contextlib import contextmanager
from datetime import date
from functools import partial
from itertools import groupby
from operator import itemgetter
from urllib.request import urlopen

# We ignore the first few emojis as they are widely assumed to be single width
# in legacy applications
FIRST_EMOJI = 0x2194
os.chdir(os.path.dirname(os.path.abspath(__file__)))


def get_data(fname, folder='UCD'):
    url = f'https://www.unicode.org/Public/{folder}/latest/{fname}'
    bn = os.path.basename(url)
    local = os.path.join('/tmp', bn)
    if os.path.exists(local):
        data = open(local, 'rb').read()
    else:
        data = urlopen(url).read()
        open(local, 'wb').write(data)
    for line in data.decode('utf-8').splitlines():
        line = line.strip()
        if line and not line.startswith('#'):
            yield line


# Map of class names to set of codepoints in class
class_maps = {}
combining_codepoints = set()
not_assigned = set(range(0, sys.maxunicode))


def parse_ucd():
    first = None
    for line in get_data('ucd/UnicodeData.txt'):
        parts = [x.strip() for x in line.split(';')]
        codepoint = int(parts[0], 16)
        category = parts[2]
        s = class_maps.setdefault(category, set())
        desc = parts[1]
        codepoints = (codepoint,)
        if first is None:
            if desc.endswith(', First>'):
                first = codepoint
                continue
        else:
            codepoints = range(first, codepoint + 1)
            first = None
        for codepoint in codepoints:
            s.add(codepoint)
            not_assigned.discard(codepoint)
            cc = parts[3]
            if cc and cc != '0':
                combining_codepoints.add(codepoint)


def split_two(line):
    spec, rest = line.split(';', 1)
    spec, rest = spec.strip(), rest.strip().split(' ', 1)[0].strip()
    if '..' in spec:
        chars = tuple(map(lambda x: int(x, 16), filter(None, spec.split('.'))))
        chars = set(range(chars[0], chars[1] + 1))
    else:
        chars = {int(spec, 16)}
    return chars, rest


all_emoji = set()
emoji_categories = {}


def parse_emoji():
    for line in get_data('emoji-data.txt', 'emoji'):
        chars, rest = split_two(line)
        if max(chars) >= FIRST_EMOJI:
            s = emoji_categories.setdefault(rest, set())
            s |= chars
            all_emoji.update(chars)


doublewidth, ambiguous = set(), set()


def parse_eaw():
    global doublewidth, ambiguous
    seen = set()
    for line in get_data('ucd/EastAsianWidth.txt'):
        chars, eaw = split_two(line)
        if eaw == 'A':
            ambiguous |= chars
            seen |= chars
        elif eaw == 'W' or eaw == 'F':
            doublewidth |= chars
            seen |= chars
    doublewidth |= set(range(0x3400, 0x4DBF + 1)) - seen
    doublewidth |= set(range(0x4E00, 0x9FFF + 1)) - seen
    doublewidth |= set(range(0xF900, 0xFAFF + 1)) - seen
    doublewidth |= set(range(0x20000, 0x2FFFD + 1)) - seen
    doublewidth |= set(range(0x30000, 0x3FFFD + 1)) - seen


def get_ranges(items):
    items.sort()
    for k, g in groupby(enumerate(items), lambda m: m[0]-m[1]):
        group = tuple(map(itemgetter(1), g))
        a, b = group[0], group[-1]
        if a == b:
            yield a
        else:
            yield a, b


def write_case(spec, p):
    if isinstance(spec, tuple):
        p('\t\tcase 0x{:x} ... 0x{:x}:'.format(*spec))
    else:
        p('\t\tcase 0x{:x}:'.format(spec))


@contextmanager
def create_header(path):
    f = open(path, 'w')
    p = partial(print, file=f)
    p('// unicode data, built from the unicode standard on:', date.today())
    p('// see gen-wcwidth.py')
    p('#pragma once')
    p('#include "data-types.h"\n')
    p('START_ALLOW_CASE_RANGE')
    p()
    yield p
    p()
    p('END_ALLOW_CASE_RANGE')
    f.close()


def gen_emoji():
    with create_header('kitty/emoji.h') as p:
        p('static inline bool\nis_emoji(char_type code) {')
        p('\tswitch(code) {')
        for spec in get_ranges(list(all_emoji)):
            write_case(spec, p)
            p('\t\t\treturn true;')
        p('\t\tdefault: return false;')
        p('\t}')
        p('\treturn false; \n}')
        p('static inline bool\nis_emoji_modifier(char_type code) {')
        p('\tswitch(code) {')
        for spec in get_ranges(list(emoji_categories['Emoji_Modifier'])):
            write_case(spec, p)
            p('\t\t\treturn true;')
        p('\t\tdefault: return false;')
        p('\t}')
        p('\treturn false; \n}')


def gen_wcwidth():
    seen = set()

    def add(p, comment, chars_, ret):
        chars = chars_ - seen
        seen.update(chars)
        p(f'\t\t// {comment} ({len(chars)} codepoints)' + ' {{{')
        for spec in get_ranges(list(chars)):
            write_case(spec, p)
            p(f'\t\t\treturn {ret};')
        p('\t\t// }}}\n')

    with create_header('kitty/wcwidth-std.h') as p:
        p('static int\nwcwidth_std(int32_t code) {')
        p('\tswitch(code) {')

        non_printing = class_maps['Cc'] | class_maps['Cf'] | class_maps['Cs']
        add(p, 'Non-printing characters', non_printing, -1)
        add(p, 'Combining characters', combining_codepoints, -1)
        add(p, 'Private use', class_maps['Co'], -3)
        add(p, 'East Asian ambiguous width', ambiguous, -2)
        add(p, 'East Asian double width', doublewidth, 2)
        add(p, 'Emoji', all_emoji, 2)

        add(p, 'Not assigned in the unicode character database', not_assigned, -1)

        p('\t\tdefault: return 1;')
        p('\t}')
        p('\treturn 1; \n}')


parse_ucd()
parse_emoji()
parse_eaw()
gen_wcwidth()
gen_emoji()
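The heart of the generator is get_ranges, which collapses a sorted codepoint list into contiguous runs, and write_case, which turns each run into a GCC case-range label. A self-contained demo, not part of the commit, using the two functions exactly as defined above; the sample codepoints (the keyboard symbol 0x2328 and the Emoji_Modifier block 0x1f3fb..0x1f3ff) also appear in the kitty/emoji.h diff below:

    import sys
    from functools import partial
    from itertools import groupby
    from operator import itemgetter


    def get_ranges(items):
        # Copied verbatim from gen-wcwidth.py above.
        items.sort()
        for k, g in groupby(enumerate(items), lambda m: m[0]-m[1]):
            group = tuple(map(itemgetter(1), g))
            a, b = group[0], group[-1]
            if a == b:
                yield a
            else:
                yield a, b


    def write_case(spec, p):
        # Copied verbatim from gen-wcwidth.py above.
        if isinstance(spec, tuple):
            p('\t\tcase 0x{:x} ... 0x{:x}:'.format(*spec))
        else:
            p('\t\tcase 0x{:x}:'.format(spec))


    p = partial(print, file=sys.stdout)
    for spec in get_ranges([0x2328, 0x1F3FB, 0x1F3FC, 0x1F3FD, 0x1F3FE, 0x1F3FF]):
        write_case(spec, p)
        p('\t\t\treturn true;')
    # Prints:
    #   case 0x2328:
    #       return true;
    #   case 0x1f3fb ... 0x1f3ff:
    #       return true;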
kitty/emoji.h (generated): 14 lines changed

@@ -1,9 +1,17 @@
+// unicode data, built from the unicode standard on: 2017-12-20
+// see gen-wcwidth.py
 #pragma once
 #include "data-types.h"
 
 START_ALLOW_CASE_RANGE
-static inline bool is_emoji(uint32_t code) {
+
+static inline bool
+is_emoji(char_type code) {
     switch(code) {
+        case 0x2194 ... 0x2199:
+            return true;
+        case 0x21a9 ... 0x21aa:
+            return true;
         case 0x231a ... 0x231b:
             return true;
         case 0x2328:
@@ -276,7 +284,8 @@ static inline bool is_emoji(uint32_t code) {
     }
     return false;
 }
-static inline bool is_emoji_modifier(uint32_t code) {
+static inline bool
+is_emoji_modifier(char_type code) {
     switch(code) {
         case 0x1f3fb ... 0x1f3ff:
             return true;
@@ -284,4 +293,5 @@ static inline bool is_emoji_modifier(uint32_t code) {
     }
     return false;
 }
+
 END_ALLOW_CASE_RANGE
(another changed C source file; path not shown)

@@ -18,7 +18,7 @@
 #include <fcntl.h>
 #include "unicode-data.h"
 #include "modes.h"
-#include "wcwidth9.h"
+#include "wcwidth-std.h"
 #include "control-codes.h"
 
 static const ScreenModes empty_modes = {0, .mDECAWM=true, .mDECTCEM=true, .mDECARM=true};
@@ -275,8 +275,8 @@ safe_wcwidth(uint32_t ch) {
 }
 
 void
-change_wcwidth(bool use9) {
-    wcwidth_impl = (use9) ? wcwidth9 : wcwidth;
+change_wcwidth(bool use_std) {
+    wcwidth_impl = use_std ? wcwidth_std : wcwidth;
 }
 
 
kitty/wcwidth-std.h (generated, new file, 2435 lines): diff suppressed because it is too large

kitty/wcwidth9.h (1321 lines): diff suppressed because it is too large