Port the subseq matcher to Go
This commit is contained in:
parent
b088ab91cf
commit
29dd2438c9
@ -1,86 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (C) 2017 Kovid Goyal <kovid at kovidgoyal.net>
|
|
||||||
*
|
|
||||||
* Distributed under terms of the GPL3 license.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
#include "data-types.h"
|
|
||||||
/*
 * Platform shims. Non-MSVC builds only need an empty STDCALL. MSVC builds
 * define ISWINDOWS, use __stdcall for thread entry points and supply the
 * ssize_t / SSIZE_MAX names when they are not already defined as macros.
 */
#if !defined(_MSC_VER)
#define STDCALL
#else
#define ISWINDOWS
#define STDCALL __stdcall
#ifndef ssize_t
#include <BaseTsd.h>
typedef SSIZE_T ssize_t;
#ifndef SSIZE_MAX
#if defined(_WIN64)
#define SSIZE_MAX _I64_MAX
#else
#define SSIZE_MAX LONG_MAX
#endif
#endif
#endif
#endif
|
|
||||||
|
|
||||||
#include "vector.h"
|
|
||||||
|
|
||||||
/* Length/index type for needles and the scored part of a haystack line. */
typedef uint8_t len_t;
/* One decoded character (code point) of input or output text. */
typedef uint32_t text_t;

/* Largest value representable in len_t. */
#define LEN_MAX UINT8_MAX

/*
 * ASCII-only case helpers. Each expansion is fully parenthesized so the
 * macros compose correctly with !, ?: and other operators. The argument is
 * evaluated more than once, so do not pass expressions with side effects.
 */
#define IS_LOWERCASE(x) ((x) >= 'a' && (x) <= 'z')
#define IS_UPPERCASE(x) ((x) >= 'A' && (x) <= 'Z')
#define LOWERCASE(x) (IS_UPPERCASE(x) ? (x) + 32 : (x))

/* Element count of a true array (not a pointer). */
#define arraysz(x) (sizeof(x)/sizeof((x)[0]))
|
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
text_t* src;
|
|
||||||
ssize_t src_sz;
|
|
||||||
len_t haystack_len;
|
|
||||||
len_t *positions;
|
|
||||||
double score;
|
|
||||||
ssize_t idx;
|
|
||||||
} Candidate;
|
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
Candidate *haystack;
|
|
||||||
size_t haystack_count;
|
|
||||||
text_t level1[LEN_MAX], level2[LEN_MAX], level3[LEN_MAX], needle[LEN_MAX];
|
|
||||||
len_t level1_len, level2_len, level3_len, needle_len;
|
|
||||||
size_t haystack_size;
|
|
||||||
text_t *output;
|
|
||||||
size_t output_sz, output_pos;
|
|
||||||
int oom;
|
|
||||||
} GlobalData;
|
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
bool output_positions;
|
|
||||||
size_t limit;
|
|
||||||
int num_threads;
|
|
||||||
text_t mark_before[128], mark_after[128], delimiter[128];
|
|
||||||
size_t mark_before_sz, mark_after_sz, delimiter_sz;
|
|
||||||
} Options;
|
|
||||||
|
|
||||||
VECTOR_OF(len_t, Positions)
|
|
||||||
VECTOR_OF(text_t, Chars)
|
|
||||||
VECTOR_OF(Candidate, Candidates)
|
|
||||||
|
|
||||||
|
|
||||||
void output_results(GlobalData *, Candidate *haystack, size_t count, Options *opts, len_t needle_len);
|
|
||||||
void* alloc_workspace(len_t max_haystack_len, GlobalData*);
|
|
||||||
void* free_workspace(void *v);
|
|
||||||
double score_item(void *v, text_t *haystack, len_t haystack_len, len_t *match_positions);
|
|
||||||
unsigned int encode_codepoint(text_t ch, char* dest);
|
|
||||||
size_t unescape(const char *src, char *dest, size_t destlen);
|
|
||||||
int cpu_count(void);
|
|
||||||
void* alloc_threads(size_t num_threads);
|
|
||||||
#ifdef ISWINDOWS
|
|
||||||
bool start_thread(void* threads, size_t i, unsigned int (STDCALL *start_routine) (void *), void *arg);
|
|
||||||
ssize_t getdelim(char **lineptr, size_t *n, int delim, FILE *stream);
|
|
||||||
#else
|
|
||||||
bool start_thread(void* threads, size_t i, void *(*start_routine) (void *), void *arg);
|
|
||||||
#endif
|
|
||||||
void wait_for_thread(void *threads, size_t i);
|
|
||||||
void free_threads(void *threads);
|
|
||||||
@ -1,244 +0,0 @@
|
|||||||
/*
|
|
||||||
* main.c
|
|
||||||
* Copyright (C) 2017 Kovid Goyal <kovid at kovidgoyal.net>
|
|
||||||
*
|
|
||||||
* Distributed under terms of the GPL3 license.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "choose-data-types.h"
|
|
||||||
#include "charsets.h"
|
|
||||||
|
|
||||||
#include <errno.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <ctype.h>
|
|
||||||
#include <fcntl.h>
|
|
||||||
#ifndef ISWINDOWS
|
|
||||||
#include <unistd.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
size_t start, count;
|
|
||||||
void *workspace;
|
|
||||||
len_t max_haystack_len;
|
|
||||||
bool started;
|
|
||||||
GlobalData *global;
|
|
||||||
} JobData;
|
|
||||||
|
|
||||||
|
|
||||||
static unsigned int STDCALL
|
|
||||||
run_scoring(JobData *job_data) {
|
|
||||||
GlobalData *global = job_data->global;
|
|
||||||
for (size_t i = job_data->start; i < job_data->start + job_data->count; i++) {
|
|
||||||
global->haystack[i].score = score_item(job_data->workspace, global->haystack[i].src, global->haystack[i].haystack_len, global->haystack[i].positions);
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void*
|
|
||||||
run_scoring_pthreads(void *job_data) {
|
|
||||||
run_scoring((JobData*)job_data);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
/* Thread entry point handed to start_thread() on the current platform. */
#if !defined(ISWINDOWS)
#define START_FUNC run_scoring_pthreads
#else
#define START_FUNC run_scoring
#endif
|
|
||||||
|
|
||||||
static JobData*
|
|
||||||
create_job(size_t i, size_t blocksz, GlobalData *global) {
|
|
||||||
JobData *ans = (JobData*)calloc(1, sizeof(JobData));
|
|
||||||
if (ans == NULL) return NULL;
|
|
||||||
ans->start = i * blocksz;
|
|
||||||
if (ans->start >= global->haystack_count) ans->count = 0;
|
|
||||||
else ans->count = global->haystack_count - ans->start;
|
|
||||||
ans->max_haystack_len = 0;
|
|
||||||
for (size_t j = ans->start; j < ans->start + ans->count; j++) ans->max_haystack_len = MAX(ans->max_haystack_len, global->haystack[j].haystack_len);
|
|
||||||
if (ans->count > 0) {
|
|
||||||
ans->workspace = alloc_workspace(ans->max_haystack_len, global);
|
|
||||||
if (!ans->workspace) { free(ans); return NULL; }
|
|
||||||
}
|
|
||||||
ans->global = global;
|
|
||||||
return ans;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Release a job and its workspace. Accepts NULL. Always returns NULL so the
 * caller can clear its pointer in the same statement.
 */
static JobData*
free_job(JobData *job) {
    if (job == NULL) return NULL;
    if (job->workspace) free_workspace(job->workspace);
    free(job);
    return NULL;
}
|
|
||||||
|
|
||||||
|
|
||||||
static int
|
|
||||||
run_threaded(int num_threads_asked, GlobalData *global) {
|
|
||||||
int ret = 0;
|
|
||||||
size_t i, blocksz;
|
|
||||||
size_t num_threads = MAX(1, num_threads_asked > 0 ? num_threads_asked : cpu_count());
|
|
||||||
if (global->haystack_size < 10000) num_threads = 1;
|
|
||||||
/* printf("num_threads: %lu asked: %d sysconf: %ld\n", num_threads, num_threads_asked, sysconf(_SC_NPROCESSORS_ONLN)); */
|
|
||||||
|
|
||||||
void *threads = alloc_threads(num_threads);
|
|
||||||
JobData **job_data = calloc(num_threads, sizeof(JobData*));
|
|
||||||
if (threads == NULL || job_data == NULL) { ret = 1; goto end; }
|
|
||||||
|
|
||||||
blocksz = global->haystack_count / num_threads + global->haystack_count % num_threads;
|
|
||||||
|
|
||||||
for (i = 0; i < num_threads; i++) {
|
|
||||||
job_data[i] = create_job(i, blocksz, global);
|
|
||||||
if (job_data[i] == NULL) { ret = 1; goto end; }
|
|
||||||
}
|
|
||||||
|
|
||||||
if (num_threads == 1) {
|
|
||||||
run_scoring(job_data[0]);
|
|
||||||
} else {
|
|
||||||
for (i = 0; i < num_threads; i++) {
|
|
||||||
job_data[i]->started = false;
|
|
||||||
if (job_data[i]->count > 0) {
|
|
||||||
if (!start_thread(threads, i, START_FUNC, job_data[i])) ret = 1;
|
|
||||||
else job_data[i]->started = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
end:
|
|
||||||
if (num_threads > 1 && job_data) {
|
|
||||||
for (i = 0; i < num_threads; i++) {
|
|
||||||
if (job_data[i] && job_data[i]->started) wait_for_thread(threads, i);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (job_data) { for (i = 0; i < num_threads; i++) job_data[i] = free_job(job_data[i]); }
|
|
||||||
free(job_data);
|
|
||||||
free_threads(threads);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static int
|
|
||||||
run_search(Options *opts, GlobalData *global, const char * const *lines, const size_t* sizes, size_t num_lines) {
|
|
||||||
const char *linebuf = NULL;
|
|
||||||
size_t idx = 0;
|
|
||||||
ssize_t sz = 0;
|
|
||||||
int ret = 0;
|
|
||||||
Candidates candidates = {0};
|
|
||||||
Chars chars = {0};
|
|
||||||
|
|
||||||
ALLOC_VEC(text_t, chars, 8192 * 20);
|
|
||||||
if (chars.data == NULL) return 1;
|
|
||||||
ALLOC_VEC(Candidate, candidates, 8192);
|
|
||||||
if (candidates.data == NULL) { FREE_VEC(chars); return 1; }
|
|
||||||
|
|
||||||
for (size_t i = 0; i < num_lines; i++) {
|
|
||||||
sz = sizes[i];
|
|
||||||
linebuf = lines[i];
|
|
||||||
if (sz > 0) {
|
|
||||||
ENSURE_SPACE(text_t, chars, sz);
|
|
||||||
ENSURE_SPACE(Candidate, candidates, 1);
|
|
||||||
sz = decode_utf8_string(linebuf, sz, &(NEXT(chars)));
|
|
||||||
NEXT(candidates).src_sz = sz;
|
|
||||||
NEXT(candidates).haystack_len = (len_t)(MIN(LEN_MAX, sz));
|
|
||||||
global->haystack_size += NEXT(candidates).haystack_len;
|
|
||||||
NEXT(candidates).idx = idx++;
|
|
||||||
INC(candidates, 1); INC(chars, sz);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Prepare the haystack allocating space for positions arrays and settings
|
|
||||||
// up the src pointers to point to the correct locations
|
|
||||||
Candidate *haystack = &ITEM(candidates, 0);
|
|
||||||
len_t *positions = (len_t*)calloc(SIZE(candidates), sizeof(len_t) * global->needle_len);
|
|
||||||
if (positions) {
|
|
||||||
text_t *cdata = &ITEM(chars, 0);
|
|
||||||
for (size_t i = 0, off = 0; i < SIZE(candidates); i++) {
|
|
||||||
haystack[i].positions = positions + (i * global->needle_len);
|
|
||||||
haystack[i].src = cdata + off;
|
|
||||||
off += haystack[i].src_sz;
|
|
||||||
}
|
|
||||||
global->haystack = haystack;
|
|
||||||
global->haystack_count = SIZE(candidates);
|
|
||||||
ret = run_threaded(opts->num_threads, global);
|
|
||||||
if (ret == 0) output_results(global, haystack, SIZE(candidates), opts, global->needle_len);
|
|
||||||
else { REPORT_OOM; }
|
|
||||||
} else { ret = 1; REPORT_OOM; }
|
|
||||||
|
|
||||||
FREE_VEC(chars); free(positions); FREE_VEC(candidates);
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
static size_t
|
|
||||||
copy_unicode_object(PyObject *src, text_t *dest, size_t dest_sz) {
|
|
||||||
PyUnicode_READY(src);
|
|
||||||
int kind = PyUnicode_KIND(src);
|
|
||||||
void *data = PyUnicode_DATA(src);
|
|
||||||
size_t len = PyUnicode_GetLength(src);
|
|
||||||
for (size_t i = 0; i < len && i < dest_sz; i++) {
|
|
||||||
dest[i] = PyUnicode_READ(kind, data, i);
|
|
||||||
}
|
|
||||||
return len;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Python entry point: match(lines, levels, needle, output_positions, limit,
 * num_threads, mark_before, mark_after, delimiter).
 *
 * lines is a list of bytes objects, levels a tuple of three str. Returns
 * the results as one str, or None when no output was produced. Raises
 * TypeError for malformed lines/levels and MemoryError on allocation
 * failure.
 */
static PyObject*
match(PyObject *self, PyObject *args) {
    (void)(self);
    int output_positions;
    unsigned long limit;
    PyObject *lines, *levels, *needle, *mark_before, *mark_after, *delimiter;
    Options opts = {0};
    GlobalData global = {0};
    if (!PyArg_ParseTuple(args, "O!O!UpkiUUU",
                &PyList_Type, &lines, &PyTuple_Type, &levels, &needle,
                &output_positions, &limit, &opts.num_threads,
                &mark_before, &mark_after, &delimiter
    )) return NULL;

    // The unchecked GET_ITEM/AS_STRING macros used below require these
    // shapes, so verify them up front instead of reading invalid memory.
    if (PyTuple_GET_SIZE(levels) != 3) {
        PyErr_SetString(PyExc_TypeError, "levels must be a tuple of three strings");
        return NULL;
    }
    for (Py_ssize_t i = 0; i < 3; i++) {
        if (!PyUnicode_Check(PyTuple_GET_ITEM(levels, i))) {
            PyErr_SetString(PyExc_TypeError, "levels must be a tuple of three strings");
            return NULL;
        }
    }
    size_t num_lines = PyList_GET_SIZE(lines);
    for (size_t i = 0; i < num_lines; i++) {
        if (!PyBytes_Check(PyList_GET_ITEM(lines, i))) {
            PyErr_SetString(PyExc_TypeError, "lines must be a list of bytes objects");
            return NULL;
        }
    }

    opts.output_positions = output_positions ? true : false;
    opts.limit = limit;
    global.level1_len = copy_unicode_object(PyTuple_GET_ITEM(levels, 0), global.level1, arraysz(global.level1));
    global.level2_len = copy_unicode_object(PyTuple_GET_ITEM(levels, 1), global.level2, arraysz(global.level2));
    global.level3_len = copy_unicode_object(PyTuple_GET_ITEM(levels, 2), global.level3, arraysz(global.level3));
    global.needle_len = copy_unicode_object(needle, global.needle, arraysz(global.needle));
    opts.mark_before_sz = copy_unicode_object(mark_before, opts.mark_before, arraysz(opts.mark_before));
    opts.mark_after_sz = copy_unicode_object(mark_after, opts.mark_after, arraysz(opts.mark_after));
    opts.delimiter_sz = copy_unicode_object(delimiter, opts.delimiter, arraysz(opts.delimiter));

    // malloc(0) may return NULL without being out of memory, so always
    // request room for at least one entry.
    const size_t alloc_count = num_lines ? num_lines : 1;
    char **clines = malloc(sizeof(char*) * alloc_count);
    if (!clines) { return PyErr_NoMemory(); }
    size_t *sizes = malloc(sizeof(size_t) * alloc_count);
    if (!sizes) { free(clines); clines = NULL; return PyErr_NoMemory(); }
    for (size_t i = 0; i < num_lines; i++) {
        clines[i] = PyBytes_AS_STRING(PyList_GET_ITEM(lines, i));
        sizes[i] = PyBytes_GET_SIZE(PyList_GET_ITEM(lines, i));
    }

    // The search itself touches no Python objects, so the GIL is released.
    Py_BEGIN_ALLOW_THREADS;
    run_search(&opts, &global, (const char* const *)clines, sizes, num_lines);
    Py_END_ALLOW_THREADS;
    free(clines); free(sizes);

    if (global.oom) { free(global.output); return PyErr_NoMemory(); }
    if (global.output) {
        PyObject *ans = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, global.output, global.output_pos);
        free(global.output);
        return ans;
    }
    Py_RETURN_NONE;
}
|
|
||||||
|
|
||||||
static PyMethodDef module_methods[] = {
|
|
||||||
{"match", match, METH_VARARGS, ""},
|
|
||||||
{NULL, NULL, 0, NULL} /* Sentinel */
|
|
||||||
};
|
|
||||||
|
|
||||||
static struct PyModuleDef module = {
|
|
||||||
.m_base = PyModuleDef_HEAD_INIT,
|
|
||||||
.m_name = "subseq_matcher", /* name of module */
|
|
||||||
.m_doc = NULL,
|
|
||||||
.m_size = -1,
|
|
||||||
.m_methods = module_methods
|
|
||||||
};
|
|
||||||
|
|
||||||
EXPORTED PyMODINIT_FUNC
|
|
||||||
PyInit_subseq_matcher(void) {
|
|
||||||
return PyModule_Create(&module);
|
|
||||||
}
|
|
||||||
@ -1,39 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# License: GPL v3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net>
|
|
||||||
|
|
||||||
import sys
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
from kitty.key_encoding import KeyEvent
|
|
||||||
|
|
||||||
from ..tui.handler import Handler
|
|
||||||
from ..tui.loop import Loop
|
|
||||||
|
|
||||||
|
|
||||||
class ChooseHandler(Handler):
    '''Handler that ignores text and key input and quits the loop with code 1 on interrupt or EOT.'''

    def initialize(self) -> None:
        '''No setup is performed.'''
        return None

    def on_text(self, text: str, in_bracketed_paste: bool = False) -> None:
        '''Text input is ignored.'''
        return None

    def on_key(self, key_event: KeyEvent) -> None:
        '''Key events are ignored.'''
        return None

    def on_interrupt(self) -> None:
        '''Leave the loop with return code 1.'''
        self.quit_loop(1)

    def on_eot(self) -> None:
        '''Leave the loop with return code 1.'''
        self.quit_loop(1)
|
|
||||||
|
|
||||||
|
|
||||||
def main(args: List[str]) -> None:
    '''Run the TUI loop with a ChooseHandler and exit with the loop's return code.'''
    event_loop = Loop()
    chooser = ChooseHandler()
    event_loop.loop(chooser)
    raise SystemExit(event_loop.return_code)


if __name__ == '__main__':
    main(sys.argv)
|
|
||||||
@ -1,38 +0,0 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
# License: GPLv3 Copyright: 2021, Kovid Goyal <kovid at kovidgoyal.net>
|
|
||||||
|
|
||||||
from typing import Iterable, List, Union
|
|
||||||
|
|
||||||
from . import subseq_matcher
|
|
||||||
|
|
||||||
|
|
||||||
def match(
    input_data: Union[str, bytes, Iterable[Union[str, bytes]]],
    query: str,
    threads: int = 0,
    positions: bool = False,
    level1: str = '/',
    level2: str = '-_0123456789',
    level3: str = '.',
    limit: int = 0,
    mark_before: str = '',
    mark_after: str = '',
    delimiter: str = '\n'
) -> List[str]:
    '''
    Match query against input_data using the subseq_matcher extension.

    input_data may be a single str or bytes (split on delimiter) or an
    iterable of str/bytes items. The query and the three level strings are
    lower-cased before being passed on. Returns the non-empty results, or an
    empty list when the extension returns None.
    '''
    def as_bytes(item: Union[str, bytes]) -> bytes:
        return item.encode('utf-8') if isinstance(item, str) else item

    if isinstance(input_data, str):
        chunks = [piece.encode('utf-8') for piece in input_data.split(delimiter)]
    elif isinstance(input_data, bytes):
        chunks = input_data.split(delimiter.encode('utf-8'))
    else:
        chunks = [as_bytes(piece) for piece in input_data]

    levels = (level1.lower(), level2.lower(), level3.lower())
    result = subseq_matcher.match(
        chunks, levels, query.lower(),
        positions, limit, threads,
        mark_before, mark_after, delimiter)
    if result is None:
        return []
    return [part for part in result.split(delimiter or '\n') if part]
|
|
||||||
@ -1,101 +0,0 @@
|
|||||||
/*
|
|
||||||
* output.c
|
|
||||||
* Copyright (C) 2017 Kovid Goyal <kovid at kovidgoyal.net>
|
|
||||||
*
|
|
||||||
* Distributed under terms of the GPL3 license.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "choose-data-types.h"
|
|
||||||
#include "../../kitty/iqsort.h"
|
|
||||||
#include <string.h>
|
|
||||||
#include <ctype.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
/* On Windows, route write() to _write() and supply STDOUT_FILENO. */
#ifndef ISWINDOWS
#include <unistd.h>
#else
#include <io.h>
#define STDOUT_FILENO 1
static ssize_t
ms_write(int fd, const void* buf, size_t count) {
    return _write(fd, buf, (unsigned int)count);
}
#define write ms_write
#endif
#include <errno.h>

/* Read member `which` of the Candidate that x points to. */
#define FIELD(x, which) (((Candidate*)(x))->which)
|
|
||||||
|
|
||||||
static bool
|
|
||||||
ensure_space(GlobalData *global, size_t sz) {
|
|
||||||
if (global->output_sz < sz + global->output_pos || !global->output) {
|
|
||||||
size_t before = global->output_sz;
|
|
||||||
global->output_sz += MAX(sz, (64u * 1024u));
|
|
||||||
global->output = realloc(global->output, sizeof(text_t) * global->output_sz);
|
|
||||||
if (!global->output) {
|
|
||||||
global->output_sz = before;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
output_text(GlobalData *global, const text_t *data, size_t sz) {
|
|
||||||
if (ensure_space(global, sz)) {
|
|
||||||
memcpy(global->output + global->output_pos, data, sizeof(text_t) * sz);
|
|
||||||
global->output_pos += sz;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
output_with_marks(GlobalData *global, Options *opts, text_t *src, size_t src_sz, len_t *positions, len_t poslen) {
|
|
||||||
size_t pos, i = 0;
|
|
||||||
for (pos = 0; pos < poslen; pos++, i++) {
|
|
||||||
output_text(global, src + i, MIN(src_sz, positions[pos]) - i);
|
|
||||||
i = positions[pos];
|
|
||||||
if (i < src_sz) {
|
|
||||||
if (opts->mark_before_sz > 0) output_text(global, opts->mark_before, opts->mark_before_sz);
|
|
||||||
output_text(global, src + i, 1);
|
|
||||||
if (opts->mark_after_sz > 0) output_text(global, opts->mark_after, opts->mark_after_sz);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
i = positions[poslen - 1];
|
|
||||||
if (i + 1 < src_sz) output_text(global, src + i + 1, src_sz - i - 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
output_positions(GlobalData *global, len_t *positions, len_t num) {
|
|
||||||
wchar_t buf[128];
|
|
||||||
for (len_t i = 0; i < num; i++) {
|
|
||||||
int pnum = swprintf(buf, arraysz(buf), L"%u", positions[i]);
|
|
||||||
if (pnum > 0 && ensure_space(global, pnum + 1)) {
|
|
||||||
for (int k = 0; k < pnum; k++) global->output[global->output_pos++] = buf[k];
|
|
||||||
global->output[global->output_pos++] = (i == num - 1) ? ':' : ',';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static void
|
|
||||||
output_result(GlobalData *global, Candidate *c, Options *opts, len_t needle_len) {
|
|
||||||
if (opts->output_positions) output_positions(global, c->positions, needle_len);
|
|
||||||
if (opts->mark_before_sz > 0 || opts->mark_after_sz > 0) {
|
|
||||||
output_with_marks(global, opts, c->src, c->src_sz, c->positions, needle_len);
|
|
||||||
} else {
|
|
||||||
output_text(global, c->src, c->src_sz);
|
|
||||||
}
|
|
||||||
output_text(global, opts->delimiter, opts->delimiter_sz);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void
|
|
||||||
output_results(GlobalData *global, Candidate *haystack, size_t count, Options *opts, len_t needle_len) {
|
|
||||||
Candidate *c;
|
|
||||||
#define lt(b, a) ( (a)->score < (b)->score || ((a)->score == (b)->score && (a->idx < b->idx)) )
|
|
||||||
QSORT(Candidate, haystack, count, lt);
|
|
||||||
#undef lt
|
|
||||||
size_t left = opts->limit > 0 ? opts->limit : count;
|
|
||||||
for (size_t i = 0; i < left; i++) {
|
|
||||||
c = haystack + i;
|
|
||||||
if (c->score > 0) output_result(global, c, opts, needle_len);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@ -1,182 +0,0 @@
|
|||||||
/*
|
|
||||||
* score.c
|
|
||||||
* Copyright (C) 2017 Kovid Goyal <kovid at kovidgoyal.net>
|
|
||||||
*
|
|
||||||
* Distributed under terms of the GPL3 license.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "choose-data-types.h"
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <float.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
len_t *positions_buf; // buffer to store positions for every char in needle
|
|
||||||
len_t **positions; // Array of pointers into positions_buf
|
|
||||||
len_t *positions_count; // Array of counts for positions
|
|
||||||
len_t needle_len; // Length of the needle
|
|
||||||
len_t max_haystack_len; // Max length of a string in the haystack
|
|
||||||
len_t haystack_len; // Length of the current string in the haystack
|
|
||||||
len_t *address; // Array of offsets into the positions array
|
|
||||||
double max_score_per_char;
|
|
||||||
uint8_t *level_factors; // Array of score factors for every character in the current haystack that matches a character in the needle
|
|
||||||
text_t *level1, *level2, *level3; // The characters in the levels
|
|
||||||
len_t level1_len, level2_len, level3_len;
|
|
||||||
text_t *needle; // The current needle
|
|
||||||
text_t *haystack; //The current haystack
|
|
||||||
} WorkSpace;
|
|
||||||
|
|
||||||
void*
|
|
||||||
alloc_workspace(len_t max_haystack_len, GlobalData *global) {
|
|
||||||
WorkSpace *ans = calloc(1, sizeof(WorkSpace));
|
|
||||||
if (ans == NULL) return NULL;
|
|
||||||
ans->positions_buf = (len_t*) calloc(global->needle_len, sizeof(len_t) * max_haystack_len);
|
|
||||||
ans->positions = (len_t**)calloc(global->needle_len, sizeof(len_t*));
|
|
||||||
ans->positions_count = (len_t*)calloc(2*global->needle_len, sizeof(len_t));
|
|
||||||
ans->level_factors = (uint8_t*)calloc(max_haystack_len, sizeof(uint8_t));
|
|
||||||
if (ans->positions == NULL || ans->positions_buf == NULL || ans->positions_count == NULL || ans->level_factors == NULL) { free_workspace(ans); return NULL; }
|
|
||||||
ans->needle = global->needle;
|
|
||||||
ans->needle_len = global->needle_len;
|
|
||||||
ans->max_haystack_len = max_haystack_len;
|
|
||||||
ans->level1 = global->level1; ans->level2 = global->level2; ans->level3 = global->level3;
|
|
||||||
ans->level1_len = global->level1_len; ans->level2_len = global->level2_len; ans->level3_len = global->level3_len;
|
|
||||||
ans->address = ans->positions_count + sizeof(len_t) * global->needle_len;
|
|
||||||
for (len_t i = 0; i < global->needle_len; i++) ans->positions[i] = ans->positions_buf + i * max_haystack_len;
|
|
||||||
return ans;
|
|
||||||
}
|
|
||||||
|
|
||||||
#define NUKE(x) free(x); x = NULL;
|
|
||||||
|
|
||||||
void*
|
|
||||||
free_workspace(void *v) {
|
|
||||||
WorkSpace *w = (WorkSpace*)v;
|
|
||||||
NUKE(w->positions_buf);
|
|
||||||
NUKE(w->positions);
|
|
||||||
NUKE(w->positions_count);
|
|
||||||
NUKE(w->level_factors);
|
|
||||||
free(w);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
has_char(text_t *text, len_t sz, text_t ch) {
|
|
||||||
for(len_t i = 0; i < sz; i++) {
|
|
||||||
if(text[i] == ch) return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
static uint8_t
|
|
||||||
level_factor_for(text_t current, text_t last, WorkSpace *w) {
|
|
||||||
text_t lch = LOWERCASE(last);
|
|
||||||
if (has_char(w->level1, w->level1_len, lch)) return 90;
|
|
||||||
if (has_char(w->level2, w->level2_len, lch)) return 80;
|
|
||||||
if (IS_LOWERCASE(last) && IS_UPPERCASE(current)) return 80; // CamelCase
|
|
||||||
if (has_char(w->level3, w->level3_len, lch)) return 70;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
init_workspace(WorkSpace *w, text_t *haystack, len_t haystack_len) {
|
|
||||||
// Calculate the positions and level_factors arrays for the specified haystack
|
|
||||||
bool level_factor_calculated = false;
|
|
||||||
memset(w->positions_count, 0, sizeof(*(w->positions_count)) * 2 * w->needle_len);
|
|
||||||
memset(w->level_factors, 0, sizeof(*(w->level_factors)) * w->max_haystack_len);
|
|
||||||
for (len_t i = 0; i < haystack_len; i++) {
|
|
||||||
level_factor_calculated = false;
|
|
||||||
for (len_t j = 0; j < w->needle_len; j++) {
|
|
||||||
if (w->needle[j] == LOWERCASE(haystack[i])) {
|
|
||||||
if (!level_factor_calculated) {
|
|
||||||
level_factor_calculated = true;
|
|
||||||
w->level_factors[i] = i > 0 ? level_factor_for(haystack[i], haystack[i-1], w) : 0;
|
|
||||||
}
|
|
||||||
w->positions[j][w->positions_count[j]++] = i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
w->haystack = haystack;
|
|
||||||
w->haystack_len = haystack_len;
|
|
||||||
w->max_score_per_char = (1.0 / haystack_len + 1.0 / w->needle_len) / 2.0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static bool
|
|
||||||
has_atleast_one_match(WorkSpace *w) {
|
|
||||||
int p = -1;
|
|
||||||
bool found;
|
|
||||||
for (len_t i = 0; i < w->needle_len; i++) {
|
|
||||||
if (w->positions_count[i] == 0) return false; // All characters of the needle are not present in the haystack
|
|
||||||
found = false;
|
|
||||||
for (len_t j = 0; j < w->positions_count[i]; j++) {
|
|
||||||
if (w->positions[i][j] > p) { p = w->positions[i][j]; found = true; break; }
|
|
||||||
}
|
|
||||||
if (!found) return false; // Characters of needle not present in sequence in haystack
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Haystack position currently selected for needle character x (uses the local `w`). */
#define POSITION(x) (w->positions[(x)][w->address[(x)]])
|
|
||||||
|
|
||||||
static bool
|
|
||||||
increment_address(WorkSpace *w) {
|
|
||||||
len_t pos = w->needle_len - 1;
|
|
||||||
while(true) {
|
|
||||||
w->address[pos]++;
|
|
||||||
if (w->address[pos] < w->positions_count[pos]) return true;
|
|
||||||
if (pos == 0) break;
|
|
||||||
w->address[pos--] = 0;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
address_is_monotonic(WorkSpace *w) {
|
|
||||||
// Check if the character positions pointed to by the current address are monotonic
|
|
||||||
for (len_t i = 1; i < w->needle_len; i++) {
|
|
||||||
if (POSITION(i) <= POSITION(i-1)) return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
static double
|
|
||||||
calc_score(WorkSpace *w) {
|
|
||||||
double ans = 0;
|
|
||||||
len_t distance, pos;
|
|
||||||
for (len_t i = 0; i < w->needle_len; i++) {
|
|
||||||
pos = POSITION(i);
|
|
||||||
if (i == 0) distance = pos < LEN_MAX ? pos + 1 : LEN_MAX;
|
|
||||||
else {
|
|
||||||
distance = pos - POSITION(i-1);
|
|
||||||
if (distance < 2) {
|
|
||||||
ans += w->max_score_per_char; // consecutive characters
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (w->level_factors[pos]) ans += (100 * w->max_score_per_char) / w->level_factors[pos]; // at a special location
|
|
||||||
else ans += (0.75 * w->max_score_per_char) / distance;
|
|
||||||
}
|
|
||||||
return ans;
|
|
||||||
}
|
|
||||||
|
|
||||||
static double
|
|
||||||
process_item(WorkSpace *w, len_t *match_positions) {
|
|
||||||
double highscore = 0, score;
|
|
||||||
do {
|
|
||||||
if (!address_is_monotonic(w)) continue;
|
|
||||||
score = calc_score(w);
|
|
||||||
if (score > highscore) {
|
|
||||||
highscore = score;
|
|
||||||
for (len_t i = 0; i < w->needle_len; i++) match_positions[i] = POSITION(i);
|
|
||||||
}
|
|
||||||
} while(increment_address(w));
|
|
||||||
return highscore;
|
|
||||||
}
|
|
||||||
|
|
||||||
double
|
|
||||||
score_item(void *v, text_t *haystack, len_t haystack_len, len_t *match_positions) {
|
|
||||||
WorkSpace *w = (WorkSpace*)v;
|
|
||||||
init_workspace(w, haystack, haystack_len);
|
|
||||||
if (!has_atleast_one_match(w)) return 0;
|
|
||||||
return process_item(w, match_positions);
|
|
||||||
}
|
|
||||||
@ -1,8 +0,0 @@
|
|||||||
from typing import List, Optional, Tuple
|
|
||||||
|
|
||||||
def match(
    lines: List[bytes],
    levels: Tuple[str, str, str],
    needle: str,
    output_positions: bool,
    limit: int,
    num_threads: int,
    mark_before: str,
    mark_after: str,
    delimiter: str
) -> Optional[str]:
    ...
|
|
||||||
@ -1,50 +0,0 @@
|
|||||||
/*
|
|
||||||
* unix_compat.c
|
|
||||||
* Copyright (C) 2017 Kovid Goyal <kovid at kovidgoyal.net>
|
|
||||||
*
|
|
||||||
* Distributed under terms of the GPL3 license.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "choose-data-types.h"
|
|
||||||
#include <unistd.h>
|
|
||||||
#include <pthread.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
|
|
||||||
/* Fallback for Apple builds whose headers do not define _SC_NPROCESSORS_ONLN. */
#if defined(__APPLE__) && !defined(_SC_NPROCESSORS_ONLN)
#define _SC_NPROCESSORS_ONLN 58
#endif
|
|
||||||
|
|
||||||
/* Value of sysconf(_SC_NPROCESSORS_ONLN), narrowed to int. */
int
cpu_count(void) {
    long online = sysconf(_SC_NPROCESSORS_ONLN);
    return online;
}
|
|
||||||
|
|
||||||
|
|
||||||
/* Allocate zeroed storage for num_threads pthread_t handles; NULL on failure. */
void*
alloc_threads(size_t num_threads) {
    pthread_t *handles = calloc(num_threads, sizeof(pthread_t));
    return handles;
}
|
|
||||||
|
|
||||||
/*
 * Start thread number i running start_routine(arg) and store its handle in
 * the array from alloc_threads(). Reports the error on stderr and returns
 * false when the thread could not be created.
 */
bool
start_thread(void* threads, size_t i, void *(*start_routine) (void *), void *arg) {
    pthread_t *slot = ((pthread_t*)threads) + i;
    const int rc = pthread_create(slot, NULL, start_routine, arg);
    if (rc == 0) return true;
    fprintf(stderr, "Failed to create thread, with error: %s\n", strerror(rc));
    return false;
}
|
|
||||||
|
|
||||||
/* Block until thread number i has finished; its return value is discarded. */
void
wait_for_thread(void *threads, size_t i) {
    pthread_t *handles = (pthread_t*)(threads);
    pthread_join(handles[i], NULL);
}
|
|
||||||
|
|
||||||
/* Release the handle array obtained from alloc_threads(). */
void
free_threads(void *threads) {
    void *handles = threads;
    free(handles);
}
|
|
||||||
@ -1,42 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (C) 2017 Kovid Goyal <kovid at kovidgoyal.net>
|
|
||||||
*
|
|
||||||
* Distributed under terms of the GPL3 license.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
|
|
||||||
#include "data-types.h"
|
|
||||||
|
|
||||||
/* Record an allocation failure. Requires a variable named `global` in scope. */
#define REPORT_OOM global->oom = 1;

/* Declare a growable vector type NAME holding elements of type TYPE. */
#define VECTOR_OF(TYPE, NAME) typedef struct { \
    TYPE *data; \
    size_t size; \
    size_t capacity; \
} NAME;

/*
 * Give vec an initial capacity of cap elements. On failure vec.data is NULL
 * and the failure is recorded through REPORT_OOM.
 */
#define ALLOC_VEC(TYPE, vec, cap) \
    (vec).size = 0; (vec).capacity = cap; \
    (vec).data = (TYPE*)malloc((vec).capacity * sizeof(TYPE)); \
    if ((vec).data == NULL) { REPORT_OOM; }

/* Release the storage of vec and reset it to the empty state. */
#define FREE_VEC(vec) \
    if ((vec).data) { free((vec).data); (vec).data = NULL; } \
    (vec).size = 0; (vec).capacity = 0;

/*
 * Make room for amt more elements in vec, at least doubling the capacity.
 * Must be used inside a loop in a scope that has an int `ret`: on failure
 * the vector is freed and emptied, REPORT_OOM is executed, ret is set to 1
 * and `break` leaves the enclosing loop. Because of that `break`, this
 * macro cannot be wrapped in do { } while (0).
 */
#define ENSURE_SPACE(TYPE, vec, amt) \
    if ((vec).size + (amt) >= (vec).capacity) { \
        (vec).capacity = MAX((vec).capacity * 2, (vec).size + (amt)); \
        void *temp = realloc((vec).data, sizeof(TYPE) * (vec).capacity); \
        if (temp == NULL) { REPORT_OOM; ret = 1; free((vec).data); (vec).data = NULL; (vec).size = 0; (vec).capacity = 0; break; } \
        else (vec).data = temp; \
    }

/* The slot just past the last used element. */
#define NEXT(vec) ((vec).data[(vec).size])

/* Mark amt more elements as used. */
#define INC(vec, amt) (vec).size += amt;

/* Number of used elements. */
#define SIZE(vec) ((vec).size)

/* Element number n. */
#define ITEM(vec, n) ((vec).data[n])
|
|
||||||
@ -1,107 +0,0 @@
|
|||||||
/*
|
|
||||||
* windows_compat.c
|
|
||||||
* Copyright (C) 2017 Kovid Goyal <kovid at kovidgoyal.net>
|
|
||||||
*
|
|
||||||
* Distributed under terms of the GPL3 license.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "choose-data-types.h"
|
|
||||||
|
|
||||||
#include <windows.h>
|
|
||||||
#include <process.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <errno.h>
|
|
||||||
|
|
||||||
int
|
|
||||||
cpu_count() {
|
|
||||||
SYSTEM_INFO sysinfo;
|
|
||||||
GetSystemInfo(&sysinfo);
|
|
||||||
return sysinfo.dwNumberOfProcessors;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Allocate a zero-filled array with one uintptr_t slot per thread.
 * Returns whatever calloc() returns, i.e. NULL when the allocation fails.
 * Slots are filled in by start_thread() and the array is released with
 * free_threads().
 */
void*
alloc_threads(size_t num_threads) {
    uintptr_t *handles = calloc(num_threads, sizeof *handles);
    return handles;
}
|
|
||||||
|
|
||||||
bool
|
|
||||||
start_thread(void* vt, size_t i, unsigned int (STDCALL *start_routine) (void *), void *arg) {
|
|
||||||
uintptr_t *threads = (uintptr_t*)vt;
|
|
||||||
errno = 0;
|
|
||||||
threads[i] = _beginthreadex(NULL, 0, start_routine, arg, 0, NULL);
|
|
||||||
if (threads[i] == 0) {
|
|
||||||
perror("Failed to create thread, with error");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
|
||||||
wait_for_thread(void *vt, size_t i) {
|
|
||||||
uintptr_t *threads = vt;
|
|
||||||
WaitForSingleObject((HANDLE)threads[i], INFINITE);
|
|
||||||
CloseHandle((HANDLE)threads[i]);
|
|
||||||
threads[i] = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Release a thread-handle array such as the one returned by
 * alloc_threads(). Only the array memory is freed here; the handles
 * themselves are closed by wait_for_thread().
 */
void
free_threads(void *threads) {
    free(threads);
}
|
|
||||||
|
|
||||||
/*
 * Replacement for POSIX getdelim() on platforms that lack it.
 *
 * Reads bytes from stream up to and including the first occurrence of delim
 * (or up to end of file) into *lineptr, NUL-terminating the result.
 *
 *   lineptr  in/out: buffer; when *lineptr is NULL a buffer is malloc()ed.
 *            The buffer is grown with realloc() as needed, so *lineptr may
 *            change. The caller frees it.
 *   n        in/out: capacity of *lineptr in bytes.
 *   delim    delimiter byte.
 *   stream   input stream.
 *
 * Returns the number of bytes stored (excluding the terminating NUL), or -1
 * on a stream error, on end of file with nothing read, or on failure with
 * errno set to EINVAL (NULL argument), ENOMEM (allocation failure) or
 * EOVERFLOW/ERANGE (buffer would exceed SSIZE_MAX).
 */
ssize_t
getdelim(char **lineptr, size_t *n, int delim, FILE *stream) {
    enum { INITIAL_LEN = 8192 };
    int c;  /* int, not char: getc() must be able to return EOF distinctly from byte 0xFF */
    char *cur_pos, *new_lineptr;
    size_t new_lineptr_len;

    if (lineptr == NULL || n == NULL || stream == NULL) {
        errno = EINVAL;
        return -1;
    }

    if (*lineptr == NULL) {
        *n = INITIAL_LEN;
        if ((*lineptr = (char *)malloc(*n)) == NULL) {
            errno = ENOMEM;
            return -1;
        }
    }

    cur_pos = *lineptr;
    for (;;) {
        c = getc(stream);

        if (ferror(stream) || (c == EOF && cur_pos == *lineptr))
            return -1;

        if (c == EOF)
            break;

        /* need room for this byte plus the terminating NUL */
        if ((*lineptr + *n - cur_pos) < 2) {
            if (SSIZE_MAX / 2 < *n) {
#ifdef EOVERFLOW
                errno = EOVERFLOW;
#else
                errno = ERANGE; /* no EOVERFLOW defined */
#endif
                return -1;
            }
            /* a caller-supplied buffer with *n == 0 cannot grow by doubling */
            new_lineptr_len = *n ? *n * 2 : (size_t)INITIAL_LEN;
            /* remember the write offset: realloc() may move the buffer */
            size_t used = (size_t)(cur_pos - *lineptr);

            if ((new_lineptr = (char *)realloc(*lineptr, new_lineptr_len)) == NULL) {
                errno = ENOMEM;
                return -1;
            }
            *lineptr = new_lineptr;
            *n = new_lineptr_len;
            cur_pos = new_lineptr + used;
        }

        *cur_pos++ = (char)c;

        if (c == delim)
            break;
    }

    *cur_pos = '\0';
    return (ssize_t)(cur_pos - *lineptr);
}
|
|
||||||
@ -1,100 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# License: GPLv3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net>
|
|
||||||
|
|
||||||
import random
|
|
||||||
import string
|
|
||||||
|
|
||||||
from . import BaseTest
|
|
||||||
|
|
||||||
|
|
||||||
def run(input_data, query, **kw):
    """Call ``kittens.choose.match.match`` with test-friendly keyword handling.

    ``threads`` defaults to 1 when not supplied. The ``mark`` keyword is
    consumed here and translated into ``mark_before``/``mark_after``:
    ``True`` selects the escape sequences ``'\\033[32m'``/``'\\033[39m'``,
    any other truthy value is used for both markers, and a falsy value
    leaves both markers empty.
    """
    from kittens.choose.match import match

    kw.setdefault('threads', 1)
    mark = kw.pop('mark', False)
    if not mark:
        before = after = ''
    elif mark is True:
        before, after = '\033[32m', '\033[39m'
    else:
        before = after = mark
    kw.update(mark_before=before, mark_after=after)
    return match(input_data, query, **kw)
|
|
||||||
|
|
||||||
|
|
||||||
class TestMatcher(BaseTest):
    """Behavioural tests for the matcher exposed by kittens.choose.match."""

    def run_matcher(self, *args, **kwargs):
        # Thin indirection over the module-level run() helper.
        return run(*args, **kwargs)

    def basic_test(self, inp, query, out, **k):
        """Run the matcher and, unless ``out`` is None, compare its result.

        A string ``out`` is split on the configured delimiter (newline by
        default) with empty pieces dropped; any other iterable is compared
        as a list. Returns the (possibly split) expectation.
        """
        result = self.run_matcher(inp, query, **k)
        if out is None:
            return out
        if hasattr(out, 'splitlines'):
            delimiter = k.get('delimiter', '\n')
            out = [piece for piece in out.split(delimiter) if piece]
        self.assertEqual(list(out), result)
        return out

    def test_filtering(self):
        ' Non matching entries must be removed '
        cases = (
            ('test\nxyz', 'te', 'test'),
            ('abc\nxyz', 'ba', ''),
            ('abc\n123', 'abc', 'abc'),
        )
        for inp, query, expected in cases:
            self.basic_test(inp, query, expected)

    def test_case_insensitive(self):
        cases = (
            ('test\nxyz', 'Te', 'test'),
            ('test\nxyz', 'XY', 'xyz'),
            ('test\nXYZ', 'xy', 'XYZ'),
            ('test\nXYZ', 'mn', ''),
        )
        for inp, query, expected in cases:
            self.basic_test(inp, query, expected)

    def test_marking(self):
        ' Marking of matched characters '
        marked = '\x1b[32mt\x1b[39me\x1b[32ms\x1b[39mt'
        self.basic_test('test\nxyz', 'ts', marked, mark=True)

    def test_positions(self):
        ' Output of positions '
        self.basic_test('abc\nac', 'ac', '0,1:ac\n0,2:abc', positions=True)
        self.basic_test('abc\nv', 'a', '0:abc', positions=True)

    def test_delimiter(self):
        ' Test using a custom line delimiter '
        self.basic_test('abc\n21ac', 'ac', 'ac1abc\n2', delimiter='1')

    def test_scoring(self):
        ' Scoring algorithm '
        ordered = (
            ('archer\nelementary', 'e', 'elementary\narcher'),  # Match at start
            ('xxxy\nxx/y', 'y', 'xx/y\nxxxy'),  # Match at level factor
            ('xxxy\nxxxY', 'y', 'xxxY\nxxxy'),  # CamelCase
            ('xxxya\nxxxy', 'y', 'xxxy\nxxxya'),  # Total length
            ('abbc\nabc', 'ac', 'abc\nabbc'),  # Distance
            ('xxa\naxx', 'a', 'axx\nxxa'),  # Extreme chars
        )
        for inp, query, expected in ordered:
            self.basic_test(inp, query, expected)
        # Highest score
        self.basic_test('xa/a', 'a', 'xa/|a|', mark='|')

    def test_threading(self):
        ' Test matching on a large data set with different number of threads '
        alphabet = string.ascii_lowercase + string.ascii_uppercase + string.digits

        def random_word():
            length = random.randint(2, 10)
            return ''.join([random.choice(alphabet) for _ in range(length)])

        words = [random_word() for _ in range(400)]

        def random_item():
            parts = random.randint(2, 7)
            return '/'.join([random.choice(words) for _ in range(parts)])

        data = '\n'.join([random_item() for _ in range(25123)])

        # Only checks that matching completes for each thread count; the
        # result is not compared (out=None).
        for threads in range(4):
            self.basic_test(data, 'foo', None, threads=threads)
|
|
||||||
@ -172,11 +172,13 @@ def run_python_tests(args: Any, go_proc: 'Optional[subprocess.Popen[bytes]]' = N
|
|||||||
|
|
||||||
def print_go() -> None:
|
def print_go() -> None:
|
||||||
try:
|
try:
|
||||||
print(go_proc.stdout.read().decode('utf-8', 'replace'), end='', flush=True)
|
go_proc.wait()
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
go_proc.terminate()
|
go_proc.terminate()
|
||||||
if go_proc.wait(0.1) is None:
|
if go_proc.wait(0.1) is None:
|
||||||
go_proc.kill()
|
go_proc.kill()
|
||||||
|
|
||||||
|
print(go_proc.stdout.read().decode('utf-8', 'replace'), end='', flush=True)
|
||||||
go_proc.stdout.close()
|
go_proc.stdout.close()
|
||||||
go_proc.wait()
|
go_proc.wait()
|
||||||
|
|
||||||
|
|||||||
234
tools/tui/subseq/score.go
Normal file
234
tools/tui/subseq/score.go
Normal file
@ -0,0 +1,234 @@
|
|||||||
|
// License: GPLv3 Copyright: 2023, Kovid Goyal, <kovid at kovidgoyal.net>
|
||||||
|
|
||||||
|
package subseq
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"kitty/tools/utils"
|
||||||
|
"kitty/tools/utils/images"
|
||||||
|
|
||||||
|
"golang.org/x/exp/slices"
|
||||||
|
)
|
||||||
|
|
||||||
|
var _ = fmt.Print
|
||||||
|
|
||||||
|
// Default character sets for the three levels. ScoreItems substitutes these
// for any Options.LevelN field that is left empty.
const LEVEL1 = "/"
const LEVEL2 = "-_0123456789"
const LEVEL3 = "."
|
||||||
|
|
||||||
|
// resolved_options_type is the internal form of Options: the three level
// strings converted to rune slices (with defaults applied by ScoreItems) so
// that level_factor_for can search them one rune at a time.
type resolved_options_type struct {
	level1 []rune
	level2 []rune
	level3 []rune
}
|
||||||
|
|
||||||
|
// Options configures ScoreItems.
type Options struct {
	// Level1, Level2 and Level3 are sets of characters. A matched character
	// whose preceding haystack character belongs to one of the sets is
	// given that level's factor by level_factor_for. An empty string
	// selects the corresponding LEVEL1/LEVEL2/LEVEL3 default.
	Level1 string
	Level2 string
	Level3 string
	// NumberOfThreads is passed unchanged to images.Context.SetNumberOfThreads.
	// NOTE(review): the meaning of zero is decided there; confirm before relying on it.
	NumberOfThreads int
}
|
||||||
|
|
||||||
|
// Match is the result of scoring one item against the query.
type Match struct {
	// Positions has one entry per query character. When Score > 0 the
	// entries are byte offsets into Text of the best-scoring match;
	// otherwise they are left at zero.
	Positions []int
	// Score is the best score found; it stays 0 when the item does not match.
	Score float64
	// idx is the index of the item in the slice passed to ScoreItems.
	idx int
	// Text is the original, unmodified item.
	Text string
}
|
||||||
|
|
||||||
|
func level_factor_for(current_lcase, last_lcase, current_cased, last_cased rune, opts *resolved_options_type) int {
|
||||||
|
switch {
|
||||||
|
case slices.Contains(opts.level1, last_lcase):
|
||||||
|
return 90
|
||||||
|
case slices.Contains(opts.level2, last_lcase):
|
||||||
|
return 80
|
||||||
|
case last_lcase == last_cased && current_lcase != current_cased: // camelCase
|
||||||
|
return 80
|
||||||
|
case slices.Contains(opts.level3, last_lcase):
|
||||||
|
return 70
|
||||||
|
default:
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// workspace_type is the scratch space used while scoring one item. A single
// instance is re-initialized for every item so its slices can be recycled.
type workspace_type struct {
	// positions[j] lists the haystack positions at which needle char j occurs.
	positions [][]int
	// level_factors is indexed by haystack position; score_item fills in
	// the entries for positions that match a needle char.
	level_factors []int
	// address[j] is an index into positions[j]; taken together the entries
	// select one candidate placement of the needle in the haystack.
	address []int
	// max_score_per_char is the basis for every per-character score in
	// calc_score; score_item sets it for each item.
	max_score_per_char float64
}
|
||||||
|
|
||||||
|
func (w *workspace_type) initialize(haystack_sz, needle_sz int) {
|
||||||
|
if cap(w.positions) < needle_sz {
|
||||||
|
w.positions = make([][]int, needle_sz)
|
||||||
|
} else {
|
||||||
|
w.positions = w.positions[:needle_sz]
|
||||||
|
}
|
||||||
|
if cap(w.level_factors) < haystack_sz {
|
||||||
|
w.level_factors = make([]int, 2*haystack_sz)
|
||||||
|
} else {
|
||||||
|
w.level_factors = w.level_factors[:haystack_sz]
|
||||||
|
}
|
||||||
|
for i, s := range w.positions {
|
||||||
|
if cap(s) < haystack_sz {
|
||||||
|
w.positions[i] = make([]int, 0, 2*haystack_sz)
|
||||||
|
} else {
|
||||||
|
w.positions[i] = w.positions[i][:0]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if cap(w.address) < needle_sz {
|
||||||
|
w.address = make([]int, needle_sz)
|
||||||
|
}
|
||||||
|
w.address = utils.Memset(w.address)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *workspace_type) position(x int) int { // the position of xth needle char in the haystack for the current address
|
||||||
|
return w.positions[x][w.address[x]]
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *workspace_type) increment_address() bool {
|
||||||
|
pos := len(w.positions) - 1 // the last needle char
|
||||||
|
for {
|
||||||
|
w.address[pos]++
|
||||||
|
if w.address[pos] < len(w.positions[pos]) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
if pos == 0 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
w.address[pos] = 0
|
||||||
|
pos--
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *workspace_type) address_is_monotonic() bool {
|
||||||
|
// Check if the character positions pointed to by the current address are monotonic
|
||||||
|
for i := 1; i < len(w.positions); i++ {
|
||||||
|
if w.position(i) <= w.position(i-1) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
func (w *workspace_type) calc_score() (ans float64) {
|
||||||
|
distance, pos := 0, 0
|
||||||
|
for i := 0; i < len(w.positions); i++ {
|
||||||
|
pos = w.position(i)
|
||||||
|
if i == 0 {
|
||||||
|
distance = pos + 1
|
||||||
|
} else {
|
||||||
|
distance = pos - w.position(i-1)
|
||||||
|
if distance < 2 {
|
||||||
|
ans += w.max_score_per_char // consecutive chars
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if w.level_factors[pos] > 0 {
|
||||||
|
ans += (100.0 * w.max_score_per_char) / float64(w.level_factors[pos]) // at a special location
|
||||||
|
} else {
|
||||||
|
ans += (0.75 * w.max_score_per_char) / float64(distance)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func has_atleast_one_match(w *workspace_type) (found bool) {
|
||||||
|
p := -1
|
||||||
|
for i := 0; i < len(w.positions); i++ {
|
||||||
|
if len(w.positions[i]) == 0 { // all chars of needle not in haystack
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
found = false
|
||||||
|
for _, pos := range w.positions[i] {
|
||||||
|
if pos > p {
|
||||||
|
p = pos
|
||||||
|
found = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !found { // chars of needle not present in sequence in haystack
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
func score_item(item string, idx int, needle []rune, opts *resolved_options_type, w *workspace_type) *Match {
|
||||||
|
ans := &Match{idx: idx, Text: item, Positions: make([]int, len(needle))}
|
||||||
|
haystack := []rune(strings.ToLower(item))
|
||||||
|
orig_haystack := []rune(item)
|
||||||
|
w.initialize(len(orig_haystack), len(needle))
|
||||||
|
for i := 0; i < len(haystack); i++ {
|
||||||
|
level_factor_calculated := false
|
||||||
|
for j := 0; j < len(needle); j++ {
|
||||||
|
if needle[j] == haystack[i] {
|
||||||
|
if !level_factor_calculated {
|
||||||
|
level_factor_calculated = true
|
||||||
|
if i > 0 {
|
||||||
|
w.level_factors[i] = level_factor_for(haystack[i], haystack[i-1], orig_haystack[i], orig_haystack[i-1], opts)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
w.positions[j] = append(w.positions[j], i)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
w.max_score_per_char = (1.0/float64(len(orig_haystack)) + 1.0/float64(len(needle))) / 2.0
|
||||||
|
if !has_atleast_one_match(w) {
|
||||||
|
return ans
|
||||||
|
}
|
||||||
|
var score float64
|
||||||
|
for {
|
||||||
|
if w.address_is_monotonic() {
|
||||||
|
score = w.calc_score()
|
||||||
|
if score > ans.Score {
|
||||||
|
ans.Score = score
|
||||||
|
for i := range ans.Positions {
|
||||||
|
ans.Positions[i] = w.position(i)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !w.increment_address() {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ans.Score > 0 {
|
||||||
|
adjust := utils.RuneOffsetsToByteOffsets(item)
|
||||||
|
for i := range ans.Positions {
|
||||||
|
ans.Positions[i] = adjust(ans.Positions[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ans
|
||||||
|
}
|
||||||
|
|
||||||
|
func ScoreItems(query string, items []string, opts Options) []*Match {
|
||||||
|
ctx := images.Context{}
|
||||||
|
ctx.SetNumberOfThreads(opts.NumberOfThreads)
|
||||||
|
ans := make([]*Match, len(items))
|
||||||
|
results := make(chan *Match, len(items))
|
||||||
|
nr := []rune(strings.ToLower(query))
|
||||||
|
if opts.Level1 == "" {
|
||||||
|
opts.Level1 = LEVEL1
|
||||||
|
}
|
||||||
|
if opts.Level2 == "" {
|
||||||
|
opts.Level2 = LEVEL2
|
||||||
|
}
|
||||||
|
if opts.Level3 == "" {
|
||||||
|
opts.Level3 = LEVEL3
|
||||||
|
}
|
||||||
|
ropts := resolved_options_type{
|
||||||
|
level1: []rune(opts.Level1), level2: []rune(opts.Level2), level3: []rune(opts.Level3),
|
||||||
|
}
|
||||||
|
ctx.Parallel(0, len(items), func(nums <-chan int) {
|
||||||
|
w := workspace_type{}
|
||||||
|
for i := range nums {
|
||||||
|
results <- score_item(items[i], i, nr, &ropts, &w)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
close(results)
|
||||||
|
for x := range results {
|
||||||
|
ans[x.idx] = x
|
||||||
|
}
|
||||||
|
return ans
|
||||||
|
}
|
||||||
91
tools/tui/subseq/score_test.go
Normal file
91
tools/tui/subseq/score_test.go
Normal file
@ -0,0 +1,91 @@
|
|||||||
|
// License: GPLv3 Copyright: 2023, Kovid Goyal, <kovid at kovidgoyal.net>
|
||||||
|
|
||||||
|
package subseq
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"kitty/tools/utils"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/google/go-cmp/cmp"
|
||||||
|
)
|
||||||
|
|
||||||
|
var _ = fmt.Print
|
||||||
|
|
||||||
|
func TestSubseq(t *testing.T) {
|
||||||
|
var positions [][]int
|
||||||
|
sort_by_score := false
|
||||||
|
|
||||||
|
simple := func(items, query string, expected ...string) {
|
||||||
|
matches := ScoreItems(query, utils.Splitlines(items), Options{})
|
||||||
|
if sort_by_score {
|
||||||
|
matches = utils.StableSort(matches, func(a, b *Match) bool { return a.Score > b.Score })
|
||||||
|
}
|
||||||
|
actual := make([]string, 0, len(matches))
|
||||||
|
actual_positions := make([][]int, 0, len(matches))
|
||||||
|
for _, m := range matches {
|
||||||
|
if m.Score > 0 {
|
||||||
|
actual = append(actual, m.Text)
|
||||||
|
actual_positions = append(actual_positions, m.Positions)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if expected == nil {
|
||||||
|
expected = []string{}
|
||||||
|
}
|
||||||
|
if diff := cmp.Diff(expected, actual); diff != "" {
|
||||||
|
t.Fatalf("Failed for items: %v\nMatches: %#v\n%s", utils.Splitlines(items), matches, diff)
|
||||||
|
}
|
||||||
|
if positions != nil {
|
||||||
|
if diff := cmp.Diff(positions, actual_positions); diff != "" {
|
||||||
|
t.Fatalf("Failed positions for items: %v\n%s", utils.Splitlines(items), diff)
|
||||||
|
}
|
||||||
|
positions = nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
simple("test\nxyz", "te", "test")
|
||||||
|
simple("abc\nxyz", "ba")
|
||||||
|
simple("abc\n123", "abc", "abc")
|
||||||
|
simple("test\nxyz", "Te", "test")
|
||||||
|
simple("test\nxyz", "XY", "xyz")
|
||||||
|
simple("test\nXYZ", "xy", "XYZ")
|
||||||
|
simple("test\nXYZ", "mn")
|
||||||
|
|
||||||
|
positions = [][]int{{0, 2}, {0, 1}}
|
||||||
|
simple("abc\nac", "ac", "abc", "ac")
|
||||||
|
positions = [][]int{{0}}
|
||||||
|
simple("abc\nv", "a", "abc")
|
||||||
|
positions = [][]int{{len("汉"), 7}}
|
||||||
|
simple("汉a字b\nxyz", "ab", "汉a字b")
|
||||||
|
|
||||||
|
sort_by_score = true
|
||||||
|
// Match at start
|
||||||
|
simple("archer\nelementary", "e", "elementary", "archer")
|
||||||
|
// Match at level factor
|
||||||
|
simple("xxxy\nxx/y", "y", "xx/y", "xxxy")
|
||||||
|
// CamelCase
|
||||||
|
simple("xxxy\nxxxY", "y", "xxxY", "xxxy")
|
||||||
|
// Total length
|
||||||
|
simple("xxxya\nxxxy", "y", "xxxy", "xxxya")
|
||||||
|
// Distance
|
||||||
|
simple("abbc\nabc", "ac", "abc", "abbc")
|
||||||
|
// Extreme chars
|
||||||
|
simple("xxa\naxx", "a", "axx", "xxa")
|
||||||
|
// Highest score
|
||||||
|
positions = [][]int{{3}}
|
||||||
|
simple("xa/a", "a", "xa/a")
|
||||||
|
|
||||||
|
sort_by_score = false
|
||||||
|
items := make([]string, 256)
|
||||||
|
for i := range items {
|
||||||
|
items[i] = strconv.Itoa(i)
|
||||||
|
}
|
||||||
|
expected := make([]string, 0, len(items))
|
||||||
|
for _, x := range items {
|
||||||
|
if strings.ContainsRune(x, rune('2')) {
|
||||||
|
expected = append(expected, x)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
simple(strings.Join(items, "\n"), "2", expected...)
|
||||||
|
}
|
||||||
@ -139,6 +139,9 @@ func Splitlines(x string, expected_number_of_lines ...int) (ans []string) {
|
|||||||
return NewLineScanner("").Split(x, expected_number_of_lines...)
|
return NewLineScanner("").Split(x, expected_number_of_lines...)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Return a function that can be called sequentially with rune based offsets
|
||||||
|
// converting them to byte based offsets. The rune offsets must be monotonic,
|
||||||
|
// otherwise the function returns -1
|
||||||
func RuneOffsetsToByteOffsets(text string) func(int) int {
|
func RuneOffsetsToByteOffsets(text string) func(int) int {
|
||||||
self := struct {
|
self := struct {
|
||||||
char_offset, byte_offset, last int
|
char_offset, byte_offset, last int
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user