Implement a dedicated function for word matching rather than relying on a regex and being at the mercy of the vagaries of regex implementations

2023-05-12 15:43:56 +05:30 · 2023-05-12 15:43:56 +05:30 · c101a6acb0
commit c101a6acb0
parent 65f8bb7397
2 changed files with 69 additions and 11 deletions
--- a/kittens/hints/marks.go
+++ b/kittens/hints/marks.go
@ -15,6 +15,7 @@ import (
 	"regexp"
 	"strconv"
 	"strings"
 	"unicode"
 	"unicode/utf8"
 	"github.com/dlclark/regexp2"
@ -254,15 +255,6 @@ func functions_for(opts *Options) (pattern string, post_processors []PostProcess
 			// IPv6 with no validation
 			`(?:[a-fA-F0-9]{0,4}:){2,7}[a-fA-F0-9]{1,4})`)
 		post_processors = append(post_processors, PostProcessorMap()["ip"])
 	case "word":
 		chars := opts.WordCharacters
 		if chars == "" {
 			chars = RelevantKittyOpts().Select_by_word_characters
 		}
 		chars = regexp2.Escape(chars)
 		chars = strings.ReplaceAll(chars, "-", "\\-")
 		pattern = fmt.Sprintf(`(?u)[%s\w\d]{%d,}`, chars, opts.MinimumMatchLength)
 		post_processors = append(post_processors, PostProcessorMap()["brackets"], PostProcessorMap()["quotes"])
 	default:
 		pattern = opts.Regex
 		if opts.Type == "linenum" {
@ -435,6 +427,71 @@ func mark(r *regexp2.Regexp, post_processors []PostProcessorFunc, group_processo
 // ErrNoMatches records a hint type name in its Type field.
 // NOTE(review): presumably returned as an error when no matches of that type
 // are found; its methods and callers are not visible here, so confirm.
 type ErrNoMatches struct{ Type string }
 // is_word_char reports whether ch may be part of a word, given the runes
 // already collected for the word in progress (current_chars).
 //
 // Letters and numbers always qualify. A combining mark qualifies only when it
 // attaches to something: the previous rune must be a letter, or another mark
 // (so stacked diacritics such as "a\u0310\u0301" stay in one word). A mark
 // at the start of a run, or one following a non-letter, is rejected.
 func is_word_char(ch rune, current_chars []rune) bool {
 	if unicode.IsLetter(ch) || unicode.IsNumber(ch) {
 		return true
 	}
 	if !unicode.IsMark(ch) || len(current_chars) == 0 {
 		return false
 	}
 	prev := current_chars[len(current_chars)-1]
 	return unicode.IsLetter(prev) || unicode.IsMark(prev)
 }
 // mark_words scans text for runs of word characters and returns one Mark per
 // run that is long enough.
 //
 // A rune belongs to a run when it is listed in opts.WordCharacters (falling
 // back to RelevantKittyOpts().Select_by_word_characters when that is empty) or
 // when is_word_char accepts it. Each finished run of at least
 // opts.MinimumMatchLength runes is passed through the "brackets" and "quotes"
 // post processors; the adjusted span is kept only if it is non-empty and still
 // has at least opts.MinimumMatchLength runes. Mark.Start and Mark.End are byte
 // offsets into text and Mark.Index is the position of the mark in the result.
 func mark_words(text string, opts *Options) (ans []Mark) {
 	left := text
 	var current_run struct {
 		chars       []rune
 		start, size int
 	}
 	chars := opts.WordCharacters
 	if chars == "" {
 		chars = RelevantKittyOpts().Select_by_word_characters
 	}
 	allowed_chars := make(map[rune]bool, len(chars))
 	for _, ch := range chars {
 		allowed_chars[ch] = true
 	}
 	pos := 0
 	post_processors := []PostProcessorFunc{PostProcessorMap()["brackets"], PostProcessorMap()["quotes"]}
 	// commit_run emits a Mark for the current run (if it qualifies) and then
 	// resets the run so the next word starts from a clean state.
 	commit_run := func() {
 		if len(current_run.chars) >= opts.MinimumMatchLength {
 			match_start, match_end := current_run.start, current_run.start+current_run.size
 			for _, f := range post_processors {
 				match_start, match_end = f(text, match_start, match_end)
 				// A negative start from a post processor discards the run.
 				if match_start < 0 {
 					break
 				}
 			}
 			if match_start > -1 && match_end > match_start {
 				full_match := text[match_start:match_end]
 				// Post processing may have shrunk the span, so re-check the
 				// length in runes (not bytes).
 				if utf8.RuneCountInString(full_match) >= opts.MinimumMatchLength {
 					ans = append(ans, Mark{
 						Index: len(ans), Start: match_start, End: match_end, Text: full_match,
 					})
 				}
 			}
 		}
 		// Keep the backing array; only the length matters to is_word_char.
 		current_run.chars = current_run.chars[:0]
 		current_run.start = 0
 		current_run.size = 0
 	}
 	// Stop only when the input is exhausted. DecodeRuneInString also yields
 	// utf8.RuneError for an invalid byte (size 1) and for a literal U+FFFD
 	// (size 3); those must be stepped over like any other character, otherwise
 	// every word after them would be lost.
 	for len(left) > 0 {
 		ch, size := utf8.DecodeRuneInString(left)
 		if allowed_chars[ch] || is_word_char(ch, current_run.chars) {
 			if len(current_run.chars) == 0 {
 				current_run.start = pos
 			}
 			current_run.chars = append(current_run.chars, ch)
 			current_run.size += size
 		} else {
 			commit_run()
 		}
 		left = left[size:]
 		pos += size
 	}
 	// Flush a run that extends to the end of the text.
 	commit_run()
 	return
 }
 func adjust_python_offsets(text string, marks []Mark) error {
 	// python returns rune based offsets (unicode chars not utf-8 bytes)
 	adjust := utils.RuneOffsetsToByteOffsets(text)
@ -505,6 +562,8 @@ func find_marks(text string, opts *Options, cli_args ...string) (sanitized_text
 		}
 	} else if opts.Type == "hyperlink" {
 		ans = hyperlinks
 	} else if opts.Type == "word" {
 		ans = mark_words(text, opts)
 	} else {
 		err = run_basic_matching()
 		if err != nil {
--- a/kittens/hints/marks_test.go
+++ b/kittens/hints/marks_test.go
@ -117,8 +117,7 @@ func TestHintMarking(t *testing.T) {
 	reset()
 	opts.Type = "word"
 	r(`#one (two) 😍 a-1b `, `#one`, `two`, `a-1b`)
-	// non-ascii words dont match because of https://github.com/dlclark/regexp2/issues/65
+	r("fōtiz час a\u0310b ", `fōtiz`, `час`, "a\u0310b")
 	// r(`fōtiz час`, `fōtiz`, `час`)
 	reset()
 	tdir := t.TempDir()