Module with all the data for unicode entry by character name

2018-02-09 19:56:25 +05:30 · 2018-02-09 19:56:25 +05:30 · 8c18486836
commit 8c18486836
parent b6ed3951bc
8 changed files with 63316 additions and 192 deletions
--- a/gen-wcwidth.py
+++ b/gen-wcwidth.py
@ -4,9 +4,11 @@
 import os
 import sys
 from collections import defaultdict
 from contextlib import contextmanager
 from datetime import date
 from functools import partial
 from html.entities import html5
 from itertools import groupby
 from operator import itemgetter
 from urllib.request import urlopen
@ -31,15 +33,32 @@ def get_data(fname, folder='UCD'):
 # Map of class names to set of codepoints in class
 class_maps = {}
 name_map = {}
 word_search_map = defaultdict(set)
 marks = set()
 not_assigned = set(range(0, sys.maxunicode))
 def parse_ucd():
    def add_word(w, c):
        if c <= 32 or c == 127 or 128 <= c <= 159:
            return
        word_search_map[w.lower()].add(c)
    first = None
    for word, c in html5.items():
        if len(c) == 1:
            add_word(word.rstrip(';'), ord(c))
    word_search_map['nnbsp'].add(0x202f)
    for line in get_data('ucd/UnicodeData.txt'):
        parts = [x.strip() for x in line.split(';')]
        codepoint = int(parts[0], 16)
        name = parts[1]
        if name:
            name_map[codepoint] = name
            for word in name.lower().split():
                add_word(word, codepoint)
        category = parts[2]
        s = class_maps.setdefault(category, set())
        desc = parts[1]
@ -129,18 +148,20 @@ def write_case(spec, p):
@contextmanager
-def create_header(path):
+def create_header(path, include_data_types=True):
    # Context manager used to write a generated C source/header file.
    #
    # Opens `path` for writing, emits the "built from the unicode standard"
    # banner (plus `#pragma once` when the path ends in '.h'), then yields `p`,
    # a print() bound to the open file, so the caller can emit the body.
    # When `include_data_types` is true the body is additionally wrapped in the
    # data-types.h include and the START/END_ALLOW_CASE_RANGE markers.
    #
    # NOTE(review): there is no try/finally around the yield, so if the
    # with-body raises, the trailer is not written and f.close() is not called.
    f = open(path, 'w')
    p = partial(print, file=f)
    p('// unicode data, built from the unicode standard on:', date.today())
    p('// see gen-wcwidth.py')
    if path.endswith('.h'):
        p('#pragma once')
    if include_data_types:
        p('#include "data-types.h"\n')
        p('START_ALLOW_CASE_RANGE')
    p()
    # Hand the bound print function to the caller; execution resumes below
    # once the with-block finishes normally.
    yield p
    p()
    if include_data_types:
        p('END_ALLOW_CASE_RANGE')
    f.close()
@ -180,6 +201,20 @@ def category_test(name, p, classes, comment, static=False):
    p('\treturn false;\n}\n')
 def codepoint_to_mark_map(p, mark_map):
    """Emit the body of a C ``switch(c)`` mapping a codepoint to its index.

    ``p`` is a print-like callable that receives each generated line.
    ``mark_map`` is a sequence of codepoints; the generated switch returns,
    for a codepoint ``c``, the position of ``c`` inside ``mark_map`` and
    falls back to ``0`` for every codepoint that is not listed.

    Returns the ``{codepoint: index}`` dictionary used to build the switch.
    """
    p('\tswitch(c) { // {{{')
    index_of = dict((codepoint, position) for position, codepoint in enumerate(mark_map))
    for spec in get_ranges(mark_map):
        if not isinstance(spec, tuple):
            # A lone codepoint gets its own case label.
            p(f'\t\tcase {spec}: return {index_of[spec]};')
            continue
        # A contiguous run shares one case range; the index is computed as an
        # offset from the index of the first codepoint in the run.
        start = spec[0]
        base = index_of[start]
        stop = spec[1]
        p(f'\t\tcase {start} ... {stop}: return {base} + c - {start};')
    p('default: return 0;')
    p('\t} // }}}')
    return index_of
 def gen_ucd():
    with create_header('kitty/unicode-data.c') as p:
        p('#include "unicode-data.h"')
@ -195,16 +230,7 @@ def gen_ucd():
        p('\treturn 0;')
        p('}\n')
        p('combining_type mark_for_codepoint(char_type c) {')
-        p('\tswitch(c) { // {{{')
+        rmap = codepoint_to_mark_map(p, mark_map)
        rmap = {c: m for m, c in enumerate(mark_map)}
        for spec in get_ranges(mark_map):
            if isinstance(spec, tuple):
                s = rmap[spec[0]]
                p(f'\t\tcase {spec[0]} ... {spec[1]}: return {s} + c - {spec[0]};')
            else:
                p(f'\t\tcase {spec}: return {rmap[spec]};')
        p('default: return 0;')
        p('\t} // }}}')
        p('}\n')
        if rmap[0xfe0e] != 1275:
            raise ValueError('The mark for 0xfe0e has changed, you have to update VS15 to {} and VS16 to {} in unicode-data.h'.format(
@ -212,6 +238,50 @@ def gen_ucd():
            ))
 def gen_names():
    """Generate kittens/unicode_input/names.h from the parsed name data.

    The header contains, in order:

    * ``name_map``: character names, ordered by codepoint
    * ``idx_to_word``: every searchable word, sorted
    * ``words_for_first_letter``: for each possible first byte (0-255) either
      NULL or a length-prefixed array of indices into ``idx_to_word``
    * ``codepoints_for_word_idx``: for each word a length-prefixed array of
      the codepoints whose name contains that word
    * ``mark_for_codepoint()``: codepoint -> index into ``name_map``
    * ``name_for_codepoint()``: codepoint -> name, or NULL when unnamed

    Raises ValueError if there are more words than fit in the
    ``unsigned short`` indices used by the generated tables.
    """
    words = tuple(sorted(word_search_map))
    # Word indices and per-letter counts are emitted as `unsigned short`;
    # fail loudly instead of generating silently truncated tables.
    if len(words) > 0xffff:
        raise ValueError(
            f'Too many words ({len(words)}) for the unsigned short indices used in names.h')
    with create_header('kittens/unicode_input/names.h') as p:
        cp_map = list(sorted(name_map))
        p(f'static const char* name_map[{len(cp_map)}] = {{' ' // {{{')
        for cp in cp_map:
            w = name_map[cp].replace('"', '\\"')
            p(f'\t"{w}",')
        p("}; // }}}\n")

        p(f'static const char* idx_to_word[{len(words)}] = ' '{ // {{{')
        for s in words:
            s = s.replace('"', '\\"')
            p(f'\t"{s}",')
        p("}; // }}}\n")

        # Group the word indices by first letter in a single pass instead of
        # rescanning every word for each of the 256 table slots. `words` is
        # sorted, so every group is in ascending index order.
        indices_by_first_letter = {}
        for i, w in enumerate(words):
            fl = ord(w[0])
            if fl < 256:
                indices_by_first_letter.setdefault(fl, []).append(str(i))
        p('static const unsigned short* words_for_first_letter[256] = { // {{{')
        for fl in range(0, 256):
            winds = indices_by_first_letter.get(fl)
            if winds:
                p(f'\t(const unsigned short[{len(winds) + 1}]){{{len(winds)}, ', ', '.join(winds), '},')
            else:
                p('NULL,')
        p("}; // }}}\n")

        p(f'static const char_type* codepoints_for_word_idx[{len(words)}] = ' '{ // {{{')
        for s in words:
            # Sorted so the generated file does not depend on set iteration order
            cps = sorted(word_search_map[s])
            a = ', '.join(map(str, cps))
            p(f'\t(const char_type[{len(cps) + 1}]){{{len(cps)}, ', a, '},')
        p("}; // }}}\n")

        p('static char_type mark_for_codepoint(char_type c) {')
        codepoint_to_mark_map(p, cp_map)
        p('}\n')

        # mark_for_codepoint() returns 0 both for "not found" and for the
        # first entry of cp_map (whose index really is 0), so that one
        # codepoint has to be excluded from the not-found test or it would
        # never get its name.
        not_found = 'm == 0'
        if cp_map:
            not_found += f' && cp != {cp_map[0]}'
        p('static inline const char* name_for_codepoint(char_type cp) {')
        p(f'\tchar_type m = mark_for_codepoint(cp); if ({not_found}) return NULL;')
        p('\treturn name_map[m];')
        p('}\n')
 def gen_wcwidth():
    seen = set()
@ -259,3 +329,4 @@ parse_eaw()
 gen_ucd()
 gen_wcwidth()
 gen_emoji()
 gen_names()
--- a/kittens/unicode_input/main.py
+++ b/kittens/unicode_input/main.py
@ -3,9 +3,10 @@
 # License: GPL v3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net>
 import sys
 from gettext import gettext as _
 from kitty.fast_data_types import wcswidth
-from kitty.key_encoding import backspace_key, enter_key, ESCAPE
+from kitty.key_encoding import ESCAPE, backspace_key, enter_key
 from ..tui.handler import Handler
 from ..tui.loop import Loop
@ -45,12 +46,12 @@ class UnicodeInput(Handler):
    def initialize(self, *args):
        # Run the base Handler initialization, turn line wrapping off, set the
        # window title (passed through gettext's `_` so it can be translated)
        # and draw the initial screen.
        Handler.initialize(self, *args)
        self.write(set_line_wrapping(False))
-        self.write(set_window_title('Unicode input'))
+        self.write(set_window_title(_('Unicode input')))
        self.draw_screen()
    def draw_screen(self):
        self.write(clear_screen())
-        self.print('Enter the hex code for the unicode character')
+        self.print(_('Enter the hex code for the unicode character'))
        self.write(self.prompt)
        self.write(self.current_input)
--- a/kittens/unicode_input/names.h
+++ b/kittens/unicode_input/names.h
--- a/kittens/unicode_input/unicode_names.c
+++ b/kittens/unicode_input/unicode_names.c
@ -0,0 +1,81 @@
 /*
 * unicode_names.c
 * Copyright (C) 2018 Kovid Goyal <kovid at kovidgoyal.net>
 *
 * Distributed under terms of the GPL3 license.
 */
 #include "names.h"
 /*
  * Python-callable: return a tuple holding every entry of idx_to_word as a
  * str, in table order. Returns NULL (with the Python error set by the
  * failing call) if the tuple or any of the strings cannot be created.
  */
 static PyObject*
 all_words(PyObject *self UNUSED) {
    const size_t count = arraysz(idx_to_word);
    PyObject *result = PyTuple_New(count);
    if (result == NULL) return NULL;
    size_t pos = 0;
    while (pos < count) {
        PyObject *word = PyUnicode_FromString(idx_to_word[pos]);
        if (!word) { Py_DECREF(result); return NULL; }
        /* The tuple takes over the reference to word */
        PyTuple_SET_ITEM(result, pos, word);
        pos++;
    }
    return result;
 }
 /*
  * Return a new frozenset with the codepoints of the first table word that
  * matches `word` over its first `len` bytes (i.e. `word` is a prefix of the
  * table word). Candidate words are found via the first byte of `word`.
  * Returns an empty frozenset when nothing matches and NULL, with a Python
  * error set, on allocation failure.
  */
 static inline PyObject*
 codepoints_for_word(const char *word, size_t len) {
    PyObject *ans = PyFrozenSet_New(NULL); if (ans == NULL) return NULL;
    /* plain char may be signed: convert via unsigned char so that bytes >= 0x80
     * index inside the 256 entry table instead of sign-extending to a huge
     * out of bounds index. */
    const unsigned short *words = words_for_first_letter[(unsigned char)*word];
    if (words == NULL) return ans;
    /* words[0] is the number of indices that follow. The counter is wider than
     * unsigned short so it cannot wrap around when words[0] is 65535. */
    for (unsigned int i = 1; i <= words[0]; i++) {
        unsigned short word_idx = words[i];
        const char *w = idx_to_word[word_idx];
        if(strncmp(word, w, len) == 0) {
            /* codepoints[0] is the number of codepoints that follow */
            const char_type* codepoints = codepoints_for_word_idx[word_idx];
            for (char_type j = 1; j <= codepoints[0]; j++) {
                PyObject *t = PyLong_FromUnsignedLong(codepoints[j]); if (t == NULL) { Py_DECREF(ans); return NULL; }
                int ret = PySet_Add(ans, t); Py_DECREF(t); if (ret != 0) { Py_DECREF(ans); return NULL; }
            }
            /* only the first matching word is used */
            break;
        }
    }
    return ans;
 }
 /*
  * Python-callable wrapper around codepoints_for_word(): takes one str
  * argument and returns the frozenset built for it.
  */
 static PyObject*
 cfw(PyObject *self UNUSED, PyObject *args) {
    const char *query;
    int parsed = PyArg_ParseTuple(args, "s", &query);
    if (parsed == 0) return NULL;
    size_t query_len = strlen(query);
    return codepoints_for_word(query, query_len);
 }
 /*
  * Python-callable wrapper around name_for_codepoint(): takes one unsigned
  * integer codepoint and returns its name as a str, or None when
  * name_for_codepoint() has no name for it.
  */
 static PyObject*
 nfc(PyObject *self UNUSED, PyObject *args) {
    unsigned int codepoint;
    if (PyArg_ParseTuple(args, "I", &codepoint) == 0) return NULL;
    const char *name = name_for_codepoint(codepoint);
    if (name != NULL) return PyUnicode_FromString(name);
    Py_RETURN_NONE;
 }
 /*
  * Method table of the unicode_names module. cfw and nfc are exposed to
  * Python as codepoints_for_word and name_for_codepoint, both with empty
  * docstrings.
  */
 static PyMethodDef module_methods[] = {
    /* METHODB is a project macro; presumably it expands to the table entry
     * for all_words -- confirm against its definition. */
    METHODB(all_words, METH_NOARGS),
    {"codepoints_for_word", (PyCFunction)cfw, METH_VARARGS, ""},
    {"name_for_codepoint", (PyCFunction)nfc, METH_VARARGS, ""},
    {NULL, NULL, 0, NULL}        /* Sentinel */
 };
 /* Module definition passed to PyModule_Create() in PyInit_unicode_names() */
 static struct PyModuleDef module = {
   .m_base = PyModuleDef_HEAD_INIT,
   .m_name = "unicode_names",   /* name of module */
   .m_doc = NULL,               /* no module docstring */
   .m_size = -1,
   .m_methods = module_methods  /* the method table defined in this file */
 };
 /*
  * Module initialization entry point: create the unicode_names module from
  * its definition. Returns the new module, or NULL if creation failed.
  */
 EXPORTED PyMODINIT_FUNC
 PyInit_unicode_names(void) {
    /* Nothing is added to the module after creation, so the result of
     * PyModule_Create() (module or NULL) is returned unchanged. */
    return PyModule_Create(&module);
 }
--- a/kitty/emoji.h
+++ b/kitty/emoji.h
@ -1,4 +1,4 @@
-// unicode data, built from the unicode standard on: 2018-02-06
+// unicode data, built from the unicode standard on: 2018-02-09
 // see gen-wcwidth.py
 #pragma once
 #include "data-types.h"
@ -18,10 +18,14 @@ is_emoji(char_type code) {
 			return true;
 		case 0xae:
 			return true;
 		case 0x200d:
 			return true;
 		case 0x203c:
 			return true;
 		case 0x2049:
 			return true;
 		case 0x20e3:
 			return true;
 		case 0x2122:
 			return true;
 		case 0x2139:
@ -34,6 +38,8 @@ is_emoji(char_type code) {
 			return true;
 		case 0x2328:
 			return true;
 		case 0x2388:
 			return true;
 		case 0x23cf:
 			return true;
 		case 0x23e9 ... 0x23f3:
@ -50,89 +56,15 @@ is_emoji(char_type code) {
 			return true;
 		case 0x25fb ... 0x25fe:
 			return true;
-		case 0x2600 ... 0x2604:
+		case 0x2600 ... 0x2605:
 			return true;
-		case 0x260e:
+		case 0x2607 ... 0x2612:
 			return true;
-		case 0x2611:
+		case 0x2614 ... 0x2685:
 			return true;
-		case 0x2614 ... 0x2615:
+		case 0x2690 ... 0x2705:
 			return true;
-		case 0x2618:
+		case 0x2708 ... 0x2712:
 			return true;
 		case 0x261d:
 			return true;
 		case 0x2620:
 			return true;
 		case 0x2622 ... 0x2623:
 			return true;
 		case 0x2626:
 			return true;
 		case 0x262a:
 			return true;
 		case 0x262e ... 0x262f:
 			return true;
 		case 0x2638 ... 0x263a:
 			return true;
 		case 0x2640:
 			return true;
 		case 0x2642:
 			return true;
 		case 0x2648 ... 0x2653:
 			return true;
 		case 0x2660:
 			return true;
 		case 0x2663:
 			return true;
 		case 0x2665 ... 0x2666:
 			return true;
 		case 0x2668:
 			return true;
 		case 0x267b:
 			return true;
 		case 0x267f:
 			return true;
 		case 0x2692 ... 0x2697:
 			return true;
 		case 0x2699:
 			return true;
 		case 0x269b ... 0x269c:
 			return true;
 		case 0x26a0 ... 0x26a1:
 			return true;
 		case 0x26aa ... 0x26ab:
 			return true;
 		case 0x26b0 ... 0x26b1:
 			return true;
 		case 0x26bd ... 0x26be:
 			return true;
 		case 0x26c4 ... 0x26c5:
 			return true;
 		case 0x26c8:
 			return true;
 		case 0x26ce ... 0x26cf:
 			return true;
 		case 0x26d1:
 			return true;
 		case 0x26d3 ... 0x26d4:
 			return true;
 		case 0x26e9 ... 0x26ea:
 			return true;
 		case 0x26f0 ... 0x26f5:
 			return true;
 		case 0x26f7 ... 0x26fa:
 			return true;
 		case 0x26fd:
 			return true;
 		case 0x2702:
 			return true;
 		case 0x2705:
 			return true;
 		case 0x2708 ... 0x270d:
 			return true;
 		case 0x270f:
 			return true;
 		case 0x2712:
 			return true;
 		case 0x2714:
 			return true;
@ -158,7 +90,7 @@ is_emoji(char_type code) {
 			return true;
 		case 0x2757:
 			return true;
-		case 0x2763 ... 0x2764:
+		case 0x2763 ... 0x2767:
 			return true;
 		case 0x2795 ... 0x2797:
 			return true;
@ -186,11 +118,15 @@ is_emoji(char_type code) {
 			return true;
 		case 0x3299:
 			return true;
-		case 0x1f004:
+		case 0xfe0f:
 			return true;
-		case 0x1f0cf:
+		case 0x1f000 ... 0x1f0ff:
 			return true;
-		case 0x1f170 ... 0x1f171:
+		case 0x1f10d ... 0x1f10f:
 			return true;
 		case 0x1f12f:
 			return true;
 		case 0x1f16c ... 0x1f171:
 			return true;
 		case 0x1f17e ... 0x1f17f:
 			return true;
@ -198,9 +134,9 @@ is_emoji(char_type code) {
 			return true;
 		case 0x1f191 ... 0x1f19a:
 			return true;
-		case 0x1f1e6 ... 0x1f1ff:
+		case 0x1f1ad ... 0x1f1ff:
 			return true;
-		case 0x1f201 ... 0x1f202:
+		case 0x1f201 ... 0x1f20f:
 			return true;
 		case 0x1f21a:
 			return true;
@ -208,95 +144,35 @@ is_emoji(char_type code) {
 			return true;
 		case 0x1f232 ... 0x1f23a:
 			return true;
-		case 0x1f250 ... 0x1f251:
+		case 0x1f23c ... 0x1f23f:
 			return true;
-		case 0x1f300 ... 0x1f321:
+		case 0x1f249 ... 0x1f53d:
 			return true;
-		case 0x1f324 ... 0x1f393:
+		case 0x1f546 ... 0x1f64f:
 			return true;
-		case 0x1f396 ... 0x1f397:
+		case 0x1f680 ... 0x1f6ff:
 			return true;
-		case 0x1f399 ... 0x1f39b:
+		case 0x1f774 ... 0x1f77f:
 			return true;
-		case 0x1f39e ... 0x1f3f0:
+		case 0x1f7d5 ... 0x1f7ff:
 			return true;
-		case 0x1f3f3 ... 0x1f3f5:
+		case 0x1f80c ... 0x1f80f:
 			return true;
-		case 0x1f3f7 ... 0x1f4fd:
+		case 0x1f848 ... 0x1f84f:
 			return true;
-		case 0x1f4ff ... 0x1f53d:
+		case 0x1f85a ... 0x1f85f:
 			return true;
-		case 0x1f549 ... 0x1f54e:
+		case 0x1f888 ... 0x1f88f:
 			return true;
-		case 0x1f550 ... 0x1f567:
+		case 0x1f8ae ... 0x1f8ff:
 			return true;
-		case 0x1f56f ... 0x1f570:
+		case 0x1f90c ... 0x1f93a:
 			return true;
-		case 0x1f573 ... 0x1f57a:
+		case 0x1f93c ... 0x1f945:
 			return true;
-		case 0x1f587:
+		case 0x1f947 ... 0x1fffd:
 			return true;
-		case 0x1f58a ... 0x1f58d:
+		case 0xe0020 ... 0xe007f:
 			return true;
 		case 0x1f590:
 			return true;
 		case 0x1f595 ... 0x1f596:
 			return true;
 		case 0x1f5a4 ... 0x1f5a5:
 			return true;
 		case 0x1f5a8:
 			return true;
 		case 0x1f5b1 ... 0x1f5b2:
 			return true;
 		case 0x1f5bc:
 			return true;
 		case 0x1f5c2 ... 0x1f5c4:
 			return true;
 		case 0x1f5d1 ... 0x1f5d3:
 			return true;
 		case 0x1f5dc ... 0x1f5de:
 			return true;
 		case 0x1f5e1:
 			return true;
 		case 0x1f5e3:
 			return true;
 		case 0x1f5e8:
 			return true;
 		case 0x1f5ef:
 			return true;
 		case 0x1f5f3:
 			return true;
 		case 0x1f5fa ... 0x1f64f:
 			return true;
 		case 0x1f680 ... 0x1f6c5:
 			return true;
 		case 0x1f6cb ... 0x1f6d2:
 			return true;
 		case 0x1f6e0 ... 0x1f6e5:
 			return true;
 		case 0x1f6e9:
 			return true;
 		case 0x1f6eb ... 0x1f6ec:
 			return true;
 		case 0x1f6f0:
 			return true;
 		case 0x1f6f3 ... 0x1f6f8:
 			return true;
 		case 0x1f910 ... 0x1f93a:
 			return true;
 		case 0x1f93c ... 0x1f93e:
 			return true;
 		case 0x1f940 ... 0x1f945:
 			return true;
 		case 0x1f947 ... 0x1f94c:
 			return true;
 		case 0x1f950 ... 0x1f96b:
 			return true;
 		case 0x1f980 ... 0x1f997:
 			return true;
 		case 0x1f9c0:
 			return true;
 		case 0x1f9d0 ... 0x1f9e6:
 			return true;
 		default: return false;
 	}
--- a/kitty/unicode-data.c
+++ b/kitty/unicode-data.c
@ -1,4 +1,4 @@
-// unicode data, built from the unicode standard on: 2018-02-06
+// unicode data, built from the unicode standard on: 2018-02-09
 // see gen-wcwidth.py
 #include "data-types.h"
--- a/kitty/wcwidth-std.h
+++ b/kitty/wcwidth-std.h
@ -1,4 +1,4 @@
-// unicode data, built from the unicode standard on: 2018-02-06
+// unicode data, built from the unicode standard on: 2018-02-09
 // see gen-wcwidth.py
 #pragma once
 #include "data-types.h"
@ -593,7 +593,7 @@ wcwidth_std(int32_t code) {
 			return -3;
 		// }}}
-		// Text Presentation (216 codepoints) {{{
+		// Text Presentation (218 codepoints) {{{
 		case 0x23:
 			return 1;
 		case 0x2a:
@ -662,7 +662,7 @@ wcwidth_std(int32_t code) {
 			return 1;
 		case 0x2642:
 			return 1;
-		case 0x2660:
+		case 0x265f ... 0x2660:
 			return 1;
 		case 0x2663:
 			return 1;
@ -672,6 +672,8 @@ wcwidth_std(int32_t code) {
 			return 1;
 		case 0x267b:
 			return 1;
 		case 0x267e:
 			return 1;
 		case 0x2692:
 			return 1;
 		case 0x2694 ... 0x2697:
@ -1406,12 +1408,32 @@ wcwidth_std(int32_t code) {
 			return 2;
 		// }}}
-		// Emoji Presentation (26 codepoints) {{{
+		// Emoji Presentation (92 codepoints) {{{
 		case 0x1f1e6 ... 0x1f1ff:
 			return 2;
 		case 0x1f6f9:
 			return 2;
 		case 0x1f94d ... 0x1f94f:
 			return 2;
 		case 0x1f96c ... 0x1f970:
 			return 2;
 		case 0x1f973 ... 0x1f976:
 			return 2;
 		case 0x1f97a:
 			return 2;
 		case 0x1f97c ... 0x1f97f:
 			return 2;
 		case 0x1f998 ... 0x1f9a2:
 			return 2;
 		case 0x1f9b0 ... 0x1f9b9:
 			return 2;
 		case 0x1f9c1 ... 0x1f9c2:
 			return 2;
 		case 0x1f9e7 ... 0x1f9ff:
 			return 2;
 		// }}}
-		// Not assigned in the unicode character database (767560 codepoints) {{{
+		// Not assigned in the unicode character database (767494 codepoints) {{{
 		case 0x378 ... 0x379:
 			return -1;
 		case 0x380 ... 0x383:
@ -2650,7 +2672,7 @@ wcwidth_std(int32_t code) {
 			return -1;
 		case 0x1f6ed ... 0x1f6ef:
 			return -1;
-		case 0x1f6f9 ... 0x1f6ff:
+		case 0x1f6fa ... 0x1f6ff:
 			return -1;
 		case 0x1f774 ... 0x1f77f:
 			return -1;
@ -2670,15 +2692,19 @@ wcwidth_std(int32_t code) {
 			return -1;
 		case 0x1f93f:
 			return -1;
-		case 0x1f94d ... 0x1f94f:
+		case 0x1f971 ... 0x1f972:
 			return -1;
-		case 0x1f96c ... 0x1f97f:
+		case 0x1f977 ... 0x1f979:
 			return -1;
-		case 0x1f998 ... 0x1f9bf:
+		case 0x1f97b:
 			return -1;
-		case 0x1f9c1 ... 0x1f9cf:
+		case 0x1f9a3 ... 0x1f9af:
 			return -1;
-		case 0x1f9e7 ... 0x1ffff:
+		case 0x1f9ba ... 0x1f9bf:
 			return -1;
 		case 0x1f9c3 ... 0x1f9cf:
 			return -1;
 		case 0x1fa00 ... 0x1ffff:
 			return -1;
 		case 0x2fffe ... 0x2ffff:
 			return -1;
@ -2779,7 +2805,7 @@ is_emoji_presentation_base(uint32_t code) {
 			return true;
 		case 0x2648 ... 0x2653:
 			return true;
-		case 0x2660:
+		case 0x265f ... 0x2660:
 			return true;
 		case 0x2663:
 			return true;
@ -2789,7 +2815,7 @@ is_emoji_presentation_base(uint32_t code) {
 			return true;
 		case 0x267b:
 			return true;
-		case 0x267f:
+		case 0x267e ... 0x267f:
 			return true;
 		case 0x2692 ... 0x2697:
 			return true;
--- a/setup.py
+++ b/setup.py
@ -409,6 +409,22 @@ def compile_glfw(incremental, compilation_database, all_keys):
        compile_c_extension(genv, 'kitty/glfw-' + module, incremental, compilation_database, all_keys, sources, all_headers)
 def kittens_env():
    """Return the build environment used to compile the kittens extensions.

    Starts from a copy of the global ``env``, appends ``-pthread`` and
    ``-Ikitty`` to its cflags and extends its ldpaths with whatever
    ``get_python_flags()`` returns for those cflags.
    """
    kenv = env.copy()
    flags = kenv.cflags
    for extra in ('-pthread', '-Ikitty'):
        flags.append(extra)
    python_lib_flags = get_python_flags(flags)
    kenv.ldpaths += python_lib_flags
    return kenv
 def compile_kittens(incremental, compilation_database, all_keys):
    """Compile the unicode_input kitten's ``unicode_names`` C extension.

    All three arguments are forwarded unchanged to ``compile_c_extension()``
    together with the kittens build environment, the single C source file and
    the headers it depends on.
    """
    module = 'kittens/unicode_input/unicode_names'
    headers = ['kittens/unicode_input/names.h', 'kitty/data-types.h']
    compile_c_extension(
        kittens_env(), module, incremental, compilation_database, all_keys,
        [module + '.c'], headers
    )
 def build(args, native_optimizations=True):
    global env
    try:
@ -426,6 +442,7 @@ def build(args, native_optimizations=True):
            kitty_env(), 'kitty/fast_data_types', args.incremental, compilation_database, all_keys, *find_c_files()
        )
        compile_glfw(args.incremental, compilation_database, all_keys)
        compile_kittens(args.incremental, compilation_database, all_keys)
        for key in set(compilation_database) - all_keys:
            del compilation_database[key]
    finally: