Unicode input: When searching by name, search for prefix matches as well as whole-word matches

So now "hori" matches both "hori" and "horizontal". Switched to a prefix trie internally.
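To illustrate the idea, here is a minimal sketch of a prefix trie in Python (toy words and made-up codepoint values, not the generated C tables): every trie node reached while walking the query string is by construction a prefix, so collecting everything at or below that node yields whole-word and prefix matches alike.

words = {'hori': {1}, 'horizontal': {2, 3}}  # toy word -> codepoint sets

def build(word_map):
    root = {}
    for w, cps in word_map.items():
        node = root
        for ch in w:
            node = node.setdefault(ch, {})
        node.setdefault(None, set()).update(cps)  # None key marks a word end
    return root

def search(root, prefix):
    node = root
    for ch in prefix:  # walk down one letter at a time
        node = node.get(ch)
        if node is None:
            return set()
    found, stack = set(), [node]
    while stack:  # every word at or below this node starts with the prefix
        n = stack.pop()
        for key, val in n.items():
            if key is None:
                found |= val       # codepoints recorded at a word end
            else:
                stack.append(val)  # descend into the child node
    return found

assert search(build(words), 'hori') == {1, 2, 3}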
parent 9d67198ff9
commit 0b99bb534f

gen-wcwidth.py (119 changed lines)
@@ -44,7 +44,8 @@ def parse_ucd():
     def add_word(w, c):
         if c <= 32 or c == 127 or 128 <= c <= 159:
             return
-        word_search_map[w.lower()].add(c)
+        if len(w) > 1:
+            word_search_map[w.lower()].add(c)
 
     first = None
     for word, c in html5.items():
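A side effect of the new guard in add_word() above: single-letter words are no longer indexed at all, which makes sense once prefixes match, since a lone letter would otherwise pull in a huge slice of the name table. A self-contained illustration (word_search_map here stands in for the generator's real defaultdict):

from collections import defaultdict

word_search_map = defaultdict(set)

def add_word(w, c):
    if c <= 32 or c == 127 or 128 <= c <= 159:
        return  # never index control characters
    if len(w) > 1:
        word_search_map[w.lower()].add(c)

add_word('A', 0x41)      # skipped: single-letter word
add_word('LATIN', 0x41)  # indexed under 'latin'
assert 'a' not in word_search_map
assert 0x41 in word_search_map['latin']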
@@ -261,48 +262,106 @@ def gen_ucd():
 
 
 def gen_names():
-    words = tuple(sorted(word_search_map))
     with create_header('kittens/unicode_input/names.h') as p:
-        cp_map = list(sorted(name_map))
-        p(f'static const char* name_map[{len(cp_map)}] = {{' ' // {{{')
-        for cp in cp_map:
+        mark_to_cp = list(sorted(name_map))
+        cp_to_mark = {cp: m for m, cp in enumerate(mark_to_cp)}
+        # Mapping of mark to codepoint name
+        p(f'static const char* name_map[{len(mark_to_cp)}] = {{' ' // {{{')
+        for cp in mark_to_cp:
             w = name_map[cp].replace('"', '\\"')
             p(f'\t"{w}",')
         p("}; // }}}\n")
 
-        p(f'static const char* idx_to_word[{len(words)}] = ' '{ // {{{')
-        for s in words:
-            s = s.replace('"', '\\"')
-            p(f'\t"{s}",')
-        p("}; // }}}\n")
-
-        first_letters = {ord(w[0]) for w in words if ord(w[0]) < 256}
-        wmap = {w: i for i, w in enumerate(words)}
-        p(f'static const unsigned short* words_for_first_letter[256] = ' '{ // {{{')
-        for fl in range(0, 256):
-            if fl in first_letters:
-                winds = [str(wmap[w]) for w in words if w.startswith(chr(fl))]
-                p(f'\t(const unsigned short[{len(winds) + 1}]){{{len(winds)}, ', ', '.join(winds), '},')
-            else:
-                p('NULL,')
-        p("}; // }}}\n")
-
-        p(f'static const char_type* codepoints_for_word_idx[{len(words)}] = ' '{ // {{{')
-        for s in words:
-            cps = word_search_map[s]
-            a = ', '.join(map(str, cps))
-            p(f'\t(const char_type[{len(cps) + 1}]){{{len(cps)}, ', a, '},')
-        p("}; // }}}\n")
-
+        # Mapping of mark to codepoint
+        p(f'static const char_type mark_to_cp[{len(mark_to_cp)}] = {{' ' // {{{')
+        p(', '.join(map(str, mark_to_cp)))
+        p('}; // }}}\n')
+
+        # Function to get mark number for codepoint
         p('static char_type mark_for_codepoint(char_type c) {')
-        codepoint_to_mark_map(p, cp_map)
+        codepoint_to_mark_map(p, mark_to_cp)
         p('}\n')
         p('static inline const char* name_for_codepoint(char_type cp) {')
         p('\tchar_type m = mark_for_codepoint(cp); if (m == 0) return NULL;')
         p('\treturn name_map[m];')
         p('}\n')
 
+        # Array of all words
+        word_map = tuple(sorted(word_search_map))
+        word_rmap = {w: i for i, w in enumerate(word_map)}
+        p(f'static const char* all_words_map[{len(word_map)}] = {{' ' // {{{')
+        cwords = (w.replace('"', '\\"') for w in word_map)
+        p(', '.join(f'"{w}"' for w in cwords))
+        p('}; // }}}\n')
+
+        # Array of sets of marks for each word
+        word_to_marks = {word_rmap[w]: frozenset(map(cp_to_mark.__getitem__, cps)) for w, cps in word_search_map.items()}
+        all_mark_groups = frozenset(word_to_marks.values())
+        array = [0]
+        mg_to_offset = {}
+        for mg in all_mark_groups:
+            mg_to_offset[mg] = len(array)
+            array.append(len(mg))
+            array.extend(sorted(mg))
+        p(f'static const char_type mark_groups[{len(array)}] = {{' ' // {{{')
+        p(', '.join(map(str, array)))
+        p('}; // }}}\n')
+        offsets_array = []
+        for wi, w in enumerate(word_map):
+            mg = word_to_marks[wi]
+            offsets_array.append(mg_to_offset[mg])
+        p(f'static const char_type mark_to_offset[{len(offsets_array)}] = {{' ' // {{{')
+        p(', '.join(map(str, offsets_array)))
+        p('}; // }}}\n')
+
+        # The trie
+        p(f'typedef struct {{ uint32_t children_offset; uint32_t match_offset; }} word_trie;\n')
+        all_trie_nodes = []
+
+        class TrieNode:
+
+            def __init__(self):
+                self.match_offset = 0
+                self.children_offset = 0
+                self.children = {}
+
+            def add_letter(self, letter):
+                if letter not in self.children:
+                    self.children[letter] = len(all_trie_nodes)
+                    all_trie_nodes.append(TrieNode())
+                return self.children[letter]
+
+            def __str__(self):
+                return f'{{ .children_offset={self.children_offset}, .match_offset={self.match_offset} }}'
+
+        root = TrieNode()
+        all_trie_nodes.append(root)
+
+        def add_word(word_idx):
+            word = word_map[word_idx]
+            parent = root
+            for letter in map(ord, word):
+                idx = parent.add_letter(letter)
+                parent = all_trie_nodes[idx]
+            parent.match_offset = offsets_array[word_idx]
+
+        for i in range(len(word_map)):
+            add_word(i)
+        children_array = [0]
+        for node in all_trie_nodes:
+            if node.children:
+                node.children_offset = len(children_array)
+                children_array.append(len(node.children))
+                for letter, child_offset in node.children.items():
+                    children_array.append((child_offset << 8) | (letter & 0xff))
+
+        p(f'static const word_trie all_trie_nodes[{len(all_trie_nodes)}] = {{' ' // {{{')
+        p(',\n'.join(map(str, all_trie_nodes)))
+        p('\n}; // }}}\n')
+        p(f'static const uint32_t children_array[{len(children_array)}] = {{' ' // {{{')
+        p(', '.join(map(str, children_array)))
+        p('}; // }}}\n')
 
 
 def gen_wcwidth():
     seen = set()
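The serialized trie is just two flat arrays: all_trie_nodes holds {children_offset, match_offset} structs, and children_array stores, per node, a child count followed by one entry per child packing (child_node_index << 8) | letter_byte. A sketch of how such tables can be walked, on a hand-built two-node trie for the single word "a" (toy data in the same layout; a real match_offset points into mark_groups):

def walk(all_trie_nodes, children_array, word):
    node = all_trie_nodes[0]  # node 0 is the root
    for ch in word.encode():
        off = node['children_offset']
        num = children_array[off]  # child count; leaf nodes point at the 0 sentinel
        for packed in children_array[off + 1:off + 1 + num]:
            if packed & 0xff == ch:  # low byte is the letter
                node = all_trie_nodes[packed >> 8]  # high bits index the child node
                break
        else:
            return None  # no child for this letter: not a prefix of any word
    return node

# Hand-built trie for the single word "a": root (node 0) has one child
# (node 1); children_array[0] is the shared "no children" sentinel.
nodes = [{'children_offset': 1, 'match_offset': 0},
         {'children_offset': 0, 'match_offset': 1}]
children = [0, 1, (1 << 8) | ord('a')]
assert walk(nodes, children, 'a') is nodes[1]
assert walk(nodes, children, 'b') is None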
kittens/unicode_input/main.py

@@ -56,7 +56,7 @@ def name(cp):
 @lru_cache(maxsize=256)
 def codepoints_matching_search(parts):
     ans = []
-    if parts and parts[0]:
+    if parts and parts[0] and len(parts[0]) > 1:
         codepoints = points_for_word(parts[0])
         for word in parts[1:]:
             pts = points_for_word(word)
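The extra len(parts[0]) > 1 check mirrors the generator change: now that points_for_word() prefix-matches, a one-letter first word would expand to an enormous candidate set, so it is rejected up front. In outline, the search intersects the candidate sets of all query words; a standalone restatement of that logic (not the kitten's full function):

def search(parts, points_for_word):
    # points_for_word(w) -> frozenset of codepoints whose names contain
    # a word starting with the prefix w
    if not parts or not parts[0] or len(parts[0]) <= 1:
        return set()
    ans = set(points_for_word(parts[0]))
    for word in parts[1:]:
        ans &= points_for_word(word)  # every query word must match
    return ans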
kittens/unicode_input/names.h (generated, 63099 changed lines)

File diff suppressed because one or more lines are too long
kittens/unicode_input/unicode_names.c

@@ -9,32 +9,61 @@
 
 static PyObject*
 all_words(PYNOARG) {
-    PyObject *ans = PyTuple_New(arraysz(idx_to_word));
+    PyObject *ans = PyTuple_New(arraysz(all_words_map));
     if (!ans) return NULL;
-    for (size_t i = 0; i < arraysz(idx_to_word); i++) {
-        PyObject *w = PyUnicode_FromString(idx_to_word[i]);
+    for (size_t i = 0; i < arraysz(all_words_map); i++) {
+        PyObject *w = PyUnicode_FromString(all_words_map[i]);
         if (w == NULL) { Py_DECREF(ans); return NULL; }
         PyTuple_SET_ITEM(ans, i, w);
     }
     return ans;
 }
 
+static inline void
+add_matches(const word_trie *wt, char_type *codepoints, size_t *pos, const size_t sz) {
+    size_t num = mark_groups[wt->match_offset];
+    for (size_t i = wt->match_offset + 1; i < wt->match_offset + 1 + num && *pos < sz; i++, (*pos)++) {
+        codepoints[*pos] = mark_to_cp[mark_groups[i]];
+    }
+}
+
+static void
+process_trie_node(const word_trie *wt, char_type *codepoints, size_t *pos, const size_t sz) {
+    if (wt->match_offset) add_matches(wt, codepoints, pos, sz);
+    size_t num_children = children_array[wt->children_offset];
+    if (!num_children) return;
+    for (size_t c = wt->children_offset + 1; c < wt->children_offset + 1 + num_children; c++) {
+        if (*pos > sz) return;
+        uint32_t x = children_array[c];
+        process_trie_node(&all_trie_nodes[x >> 8], codepoints, pos, sz);
+    }
+}
+
 static inline PyObject*
 codepoints_for_word(const char *word, size_t len) {
-    PyObject *ans = PyFrozenSet_New(NULL); if (ans == NULL) return NULL;
-    const unsigned short *words = words_for_first_letter[(unsigned)*word];
-    if (words == NULL) return ans;
-    for (unsigned short i = 1; i <= words[0]; i++) {
-        unsigned short word_idx = words[i];
-        const char *w = idx_to_word[word_idx];
-        if (strncmp(word, w, len) == 0 && strlen(w) == len) {
-            const char_type* codepoints = codepoints_for_word_idx[word_idx];
-            for (char_type i = 1; i <= codepoints[0]; i++) {
-                PyObject *t = PyLong_FromUnsignedLong(codepoints[i]); if (t == NULL) { Py_DECREF(ans); return NULL; }
-                int ret = PySet_Add(ans, t); Py_DECREF(t); if (ret != 0) { Py_DECREF(ans); return NULL; }
-            }
-            break;
-        }
-    }
+    const word_trie *wt = all_trie_nodes;
+    for (size_t i = 0; i < len; i++) {
+        unsigned char ch = word[i];
+        size_t num_children = children_array[wt->children_offset];
+        if (!num_children) return PyFrozenSet_New(NULL);
+        bool found = false;
+        for (size_t c = wt->children_offset + 1; c < wt->children_offset + 1 + num_children; c++) {
+            uint32_t x = children_array[c];
+            if ((x & 0xff) == ch) {
+                found = true;
+                wt = &all_trie_nodes[x >> 8];
+                break;
+            }
+        }
+        if (!found) return PyFrozenSet_New(NULL);
+    }
+    static char_type codepoints[1024];
+    size_t cpos = 0;
+    process_trie_node(wt, codepoints, &cpos, arraysz(codepoints));
+    PyObject *ans = PyFrozenSet_New(NULL); if (ans == NULL) return NULL;
+    for (size_t i = 0; i < cpos; i++) {
+        PyObject *t = PyLong_FromUnsignedLong(codepoints[i]); if (t == NULL) { Py_DECREF(ans); return NULL; }
+        int ret = PySet_Add(ans, t); Py_DECREF(t); if (ret != 0) { Py_DECREF(ans); return NULL; }
+    }
     return ans;
 }
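How match_offset resolves to codepoints in add_matches() above: it indexes mark_groups, which stores length-prefixed groups of marks (offset 0 is a sentinel meaning "no match", which is why the generator seeds the array with [0]); each mark then indexes mark_to_cp. Words with identical codepoint sets share one group, which is what the frozenset deduplication in the generator buys. A toy model with made-up values:

mark_to_cp = [0x2026, 0x22ef, 0xfe19]  # mark -> codepoint
mark_groups = [0, 2, 0, 2, 1, 1]       # [sentinel, len, mark, mark, len, mark]

def matches_at(match_offset):
    num = mark_groups[match_offset]                               # group length
    marks = mark_groups[match_offset + 1:match_offset + 1 + num]  # the marks
    return [mark_to_cp[m] for m in marks]

assert matches_at(1) == [0x2026, 0xfe19]  # two-mark group at offset 1
assert matches_at(4) == [0x22ef]          # one-mark group at offset 4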
kitty/emoji.h (generated, 2 changed lines)

@@ -1,4 +1,4 @@
-// unicode data, built from the unicode standard on: 2018-02-09
+// unicode data, built from the unicode standard on: 2018-04-24
 // see gen-wcwidth.py
 #pragma once
 #include "data-types.h"
kitty/unicode-data.c (generated)

@@ -1,4 +1,4 @@
-// unicode data, built from the unicode standard on: 2018-02-09
+// unicode data, built from the unicode standard on: 2018-04-24
 // see gen-wcwidth.py
 #include "data-types.h"
 
kitty/wcwidth-std.h (generated, 2 changed lines)

@@ -1,4 +1,4 @@
-// unicode data, built from the unicode standard on: 2018-02-09
+// unicode data, built from the unicode standard on: 2018-04-24
 // see gen-wcwidth.py
 #pragma once
 #include "data-types.h"
kitty_tests/unicode_input.py (new file, 22 lines)

@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
+# License: GPL v3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net>
+
+
+from . import BaseTest
+
+
+class TestUnicodeInput(BaseTest):
+
+    def test_word_trie(self):
+        from kittens.unicode_input.unicode_names import codepoints_for_word
+
+        def matches(a, *words):
+            ans = codepoints_for_word(a)
+            for w in words:
+                ans &= codepoints_for_word(w)
+            return set(ans)
+
+        self.ae(matches('horiz', 'ell'), {0x2026, 0x22ef, 0x2b2c, 0x2b2d, 0xfe19})
+        self.ae(matches('horizontal', 'ell'), {0x2026, 0x22ef, 0x2b2c, 0x2b2d, 0xfe19})
+        self.assertFalse(matches('sfgsfgsfgfgsdg'))
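For reference, the expected values in the test are the codepoints whose Unicode names prefix-match both "horiz" and "ell" ("ell" catches ELLIPSIS as well as ELLIPSE names). The name/codepoint pairs can be checked with the standard library:

import unicodedata

expected = {
    0x2026: 'HORIZONTAL ELLIPSIS',
    0x22ef: 'MIDLINE HORIZONTAL ELLIPSIS',
    0x2b2c: 'BLACK HORIZONTAL ELLIPSE',
    0x2b2d: 'WHITE HORIZONTAL ELLIPSE',
    0xfe19: 'PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS',
}
for cp, name in expected.items():
    assert unicodedata.name(chr(cp)) == name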