Rewrite wcswidth as a state machine

This commit is contained in:
Kovid Goyal 2020-04-07 10:10:30 +05:30
parent 0b9a37139e
commit 0862e85577
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 47 additions and 31 deletions

View File

@@ -1867,27 +1867,39 @@ screen_wcswidth(PyObject UNUSED *self, PyObject *str) {
unsigned long ans = 0; unsigned long ans = 0;
char_type prev_ch = 0; char_type prev_ch = 0;
int prev_width = 0; int prev_width = 0;
bool in_sgr = false; typedef enum {NORMAL, IN_SGR, FLAG_PAIR_STARTED} WCSState;
WCSState state = NORMAL;
for (i = 0; i < len; i++) { for (i = 0; i < len; i++) {
char_type ch = PyUnicode_READ(kind, data, i); char_type ch = PyUnicode_READ(kind, data, i);
if (in_sgr) { switch(state) {
if (ch == 'm') in_sgr = false; case IN_SGR: {
continue; if (ch == 'm') state = NORMAL;
} } continue;
if (ch == 0x1b && i + 1 < len && PyUnicode_READ(kind, data, i + 1) == '[') { in_sgr = true; continue; }
if (ch == 0xfe0f) { case FLAG_PAIR_STARTED: {
state = NORMAL;
if (is_flag_pair(prev_ch, ch)) break;
} /* fallthrough */
case NORMAL: {
if (ch == 0x1b && i + 1 < len && PyUnicode_READ(kind, data, i + 1) == '[') { state = IN_SGR; continue; }
switch(ch) {
case 0xfe0f: {
if (is_emoji_presentation_base(prev_ch) && prev_width == 1) { if (is_emoji_presentation_base(prev_ch) && prev_width == 1) {
ans += 1; ans += 1;
prev_width = 2; prev_width = 2;
} else prev_width = 0; } else prev_width = 0;
} else if (ch == 0xfe0e) { } break;
case 0xfe0e: {
if (is_emoji_presentation_base(prev_ch) && prev_width == 2) { if (is_emoji_presentation_base(prev_ch) && prev_width == 2) {
ans -= 1; ans -= 1;
prev_width = 1; prev_width = 1;
} else prev_width = 0; } else prev_width = 0;
} else if (is_flag_pair(prev_ch, ch)) { } break;
prev_width = 2;
} else { default: {
if (is_flag_codepoint(ch)) state = FLAG_PAIR_STARTED;
int w = wcwidth_std(ch); int w = wcwidth_std(ch);
switch(w) { switch(w) {
case -1: case -1:
@@ -1899,7 +1911,10 @@ screen_wcswidth(PyObject UNUSED *self, PyObject *str) {
prev_width = 1; break; prev_width = 1; break;
} }
ans += prev_width; ans += prev_width;
} } break;
} break; // switch(ch)
} break; // case NORMAL
} // switch(state)
prev_ch = ch; prev_ch = ch;
} }
return PyLong_FromUnsignedLong(ans); return PyLong_FromUnsignedLong(ans);

View File

@@ -362,6 +362,7 @@ class TestDataTypes(BaseTest):
self.ae(wcswidth('\U0001f1e6a'), 3) self.ae(wcswidth('\U0001f1e6a'), 3)
self.ae(wcswidth('\U0001F1E6a\U0001F1E8a'), 6) self.ae(wcswidth('\U0001F1E6a\U0001F1E8a'), 6)
self.ae(wcswidth('\U0001F1E6\U0001F1E8a'), 3) self.ae(wcswidth('\U0001F1E6\U0001F1E8a'), 3)
self.ae(wcswidth('\U0001F1E6\U0001F1E8\U0001F1E6'), 4)
# Regional indicator symbols (unicode flags) are defined as having # Regional indicator symbols (unicode flags) are defined as having
# Emoji_Presentation so must have width 2 # Emoji_Presentation so must have width 2
self.ae(tuple(map(w, '\U0001f1ee\U0001f1f3')), (2, 2)) self.ae(tuple(map(w, '\U0001f1ee\U0001f1f3')), (2, 2))