From 88d896e7458710eec3e038ca7af4faa11a9777ec Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 13 Sep 2017 23:07:38 +0530
Subject: [PATCH] Move function to detect URLs into C code

---
 kitty/line.c             | 92 ++++++++++++++++++++++++++++++++++++++++
 kitty/lineops.h          |  1 +
 kitty_tests/datatypes.py | 21 +++++++++
 3 files changed, 114 insertions(+)

diff --git a/kitty/line.c b/kitty/line.c
index 1d396c7ed..5369f6f78 100644
--- a/kitty/line.c
+++ b/kitty/line.c
@@ -6,6 +6,7 @@
  */
 
 #include "data-types.h"
+#include "unicode-data.h"
 #include "lineops.h"
 
 static PyObject *
@@ -51,6 +52,96 @@ line_text_at(char_type ch, combining_type cc) {
     return ans;
 }
 
+static const char* url_prefixes[4] = {"https", "http", "file", "ftp"};
+static size_t url_prefix_lengths[sizeof(url_prefixes)/sizeof(url_prefixes[0])] = {0};
+typedef enum URL_PARSER_STATES {ANY, FIRST_SLASH, SECOND_SLASH} URL_PARSER_STATE;
+
+static inline index_type
+find_colon_slash(Line *self, index_type x, index_type limit) {
+    // Find :// at or before x, scanning backwards and never looking below
+    // MAX(2, limit). Returns the index of the ':' cell (always >= 2), or 0 if
+    // the scan hits the limit or a non-URL character before finding it.
+    index_type pos = x;
+    URL_PARSER_STATE state = ANY;
+    limit = MAX(2, limit);
+    if (pos < limit) return 0;
+    do {
+        char_type ch = self->cells[pos].ch & CHAR_MASK;
+        if (!is_url_char(ch)) return 0;
+        switch(state) {
+            case ANY:
+                if (ch == '/') state = FIRST_SLASH;
+                break;
+            case FIRST_SLASH:
+                state = ch == '/' ? SECOND_SLASH : ANY;
+                break;
+            case SECOND_SLASH:
+                if (ch == ':') return pos;
+                state = ANY;
+                break;
+        }
+        pos--;
+    } while(pos >= limit);
+    return 0;
+}
+
+static inline bool
+prefix_matches(Line *self, index_type at, const char* prefix, index_type prefix_len) {
+    // True if the prefix_len cells immediately before `at` spell out prefix
+    if (prefix_len > at) return false;
+    index_type p, i;
+    for (p = at - prefix_len, i = 0; i < prefix_len && p < self->xnum; i++, p++) {
+        if ((self->cells[p].ch & CHAR_MASK) != (unsigned char)prefix[i]) return false;
+    }
+    return i == prefix_len;
+}
+
+static inline bool
+has_url_prefix_at(Line *self, index_type at, index_type min_prefix_len, index_type *ans) {
+    // Check whether a known scheme of at least min_prefix_len characters ends
+    // just before `at`; on success store the scheme's first cell in *ans.
+    if (UNLIKELY(!url_prefix_lengths[0])) {
+        // Lazily compute the scheme lengths on first use
+        for (index_type i = 0; i < sizeof(url_prefixes)/sizeof(url_prefixes[0]); i++) url_prefix_lengths[i] = strlen(url_prefixes[i]);
+    }
+    for (index_type i = 0; i < sizeof(url_prefixes)/sizeof(url_prefixes[0]); i++) {
+        index_type prefix_len = url_prefix_lengths[i];
+        if (at < prefix_len || prefix_len < min_prefix_len) continue;
+        if (prefix_matches(self, at, url_prefixes[i], prefix_len)) { *ans = at - prefix_len; return true; }
+    }
+    return false;
+}
+
+#define MAX_URL_SCHEME_LEN 5
+#define MIN_URL_LEN 5
+index_type
+line_url_start_at(Line *self, index_type x) {
+    // Find the starting cell for a URL that contains the position x. A URL is defined as
+    // known-prefix://url-chars. If no URL is found self->xnum is returned.
+    if (x >= self->xnum || self->xnum <= MIN_URL_LEN + 3) return self->xnum;
+    index_type ds_pos = 0, t;
+    // First look for :// ahead of x
+    if (self->xnum - x > MAX_URL_SCHEME_LEN + 3) ds_pos = find_colon_slash(self, x + MAX_URL_SCHEME_LEN + 3, x < 2 ? 0 : x - 2);
+    if (ds_pos != 0) {
+        if (has_url_prefix_at(self, ds_pos, ds_pos > x ? ds_pos - x: 0, &t)) return t;
+    }
+    ds_pos = find_colon_slash(self, x, 0);
+    if (ds_pos == 0 || self->xnum < ds_pos + MIN_URL_LEN + 3) return self->xnum;
+    if (has_url_prefix_at(self, ds_pos, 0, &t)) return t;
+    return self->xnum;
+}
+
+static PyObject*
+url_start_at(Line *self, PyObject *x) {
+#define url_start_at_doc "url_start_at(x) -> Return the start cell number for a URL containing x or self->xnum if not found"
+    unsigned long pos = PyLong_AsUnsignedLong(x);
+    // Propagate the conversion error instead of returning a value with an exception set
+    if (pos == (unsigned long)-1 && PyErr_Occurred()) return NULL;
+    // Clamp before narrowing to index_type; any x >= xnum yields "not found" anyway
+    if (pos > self->xnum) pos = self->xnum;
+    return PyLong_FromUnsignedLong((unsigned long)line_url_start_at(self, (index_type)pos));
+}
+
 static PyObject*
 text_at(Line* self, Py_ssize_t xval) {
 #define text_at_doc "[x] -> Return the text in the specified cell"
@@ -474,6 +565,7 @@ static PyMethodDef methods[] = {
     METHOD(as_ansi, METH_NOARGS)
     METHOD(is_continued, METH_NOARGS)
     METHOD(width, METH_O)
+    METHOD(url_start_at, METH_O)
 
     {NULL} /* Sentinel */
 };
diff --git a/kitty/lineops.h b/kitty/lineops.h
index e7e277179..02c53ae1a 100644
--- a/kitty/lineops.h
+++ b/kitty/lineops.h
@@ -61,6 +61,7 @@ void line_apply_cursor(Line *self, Cursor *cursor, unsigned int at, unsigned int
 void line_set_char(Line *, unsigned int , uint32_t , unsigned int , Cursor *);
 void line_right_shift(Line *, unsigned int , unsigned int );
 void line_add_combining_char(Line *, uint32_t , unsigned int );
+index_type line_url_start_at(Line *self, index_type x);
 index_type line_as_ansi(Line *self, Py_UCS4 *buf, index_type buflen);
 unsigned int line_length(Line *self);
 PyObject* unicode_in_range(Line *self, index_type start, index_type limit, bool include_cc, char leading_char);
diff --git a/kitty_tests/datatypes.py b/kitty_tests/datatypes.py
index ea532a6ad..b2e7ee5fb 100644
--- a/kitty_tests/datatypes.py
+++ b/kitty_tests/datatypes.py
@@ -215,6 +215,27 @@ class TestDataTypes(BaseTest):
         l.set_char(0, 'x', 1, q)
         self.assertEqualAttributes(l.cursor_from(0), q)
 
+    def test_url_at(self):
+        def create(t):
+            lb = create.lb = LineBuf(1, len(t))
+            l = lb.line(0)
+            l.set_text(t, 0, len(t), C())
+            return l
+
+        def lspace_test(n):
+            l = create(' ' * n + 'http://acme.com')
+            for i in range(0, n):
+                self.ae(l.url_start_at(i), len(l))
+            for i in range(n, len(l)):
+                self.ae(l.url_start_at(i), n)
+        for i in range(5):
+            lspace_test(i)
+        l = create('b https://testing.me a')
+        for s in (0, 1, len(l) - 1, len(l) - 2):
+            self.ae(l.url_start_at(s), len(l), 'failed with start at: %d' % s)
+        for s in range(2, len(l) - 2):
+            self.ae(l.url_start_at(s), 2, 'failed with start at: %d (%s)' % (s, str(l)[s:]))
+
     def rewrap(self, lb, lb2):
         hb = HistoryBuf(lb2.ynum, lb2.xnum)
         cy = lb.rewrap(lb2, hb)