Implement a dedicated function for word matching rather than relying on a regex and being at the mercy of the vagaries of regex implementations
This commit is contained in:
parent
65f8bb7397
commit
c101a6acb0
@ -15,6 +15,7 @@ import (
|
|||||||
"regexp"
|
"regexp"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
"unicode"
|
||||||
"unicode/utf8"
|
"unicode/utf8"
|
||||||
|
|
||||||
"github.com/dlclark/regexp2"
|
"github.com/dlclark/regexp2"
|
||||||
@ -254,15 +255,6 @@ func functions_for(opts *Options) (pattern string, post_processors []PostProcess
|
|||||||
// IPv6 with no validation
|
// IPv6 with no validation
|
||||||
`(?:[a-fA-F0-9]{0,4}:){2,7}[a-fA-F0-9]{1,4})`)
|
`(?:[a-fA-F0-9]{0,4}:){2,7}[a-fA-F0-9]{1,4})`)
|
||||||
post_processors = append(post_processors, PostProcessorMap()["ip"])
|
post_processors = append(post_processors, PostProcessorMap()["ip"])
|
||||||
case "word":
|
|
||||||
chars := opts.WordCharacters
|
|
||||||
if chars == "" {
|
|
||||||
chars = RelevantKittyOpts().Select_by_word_characters
|
|
||||||
}
|
|
||||||
chars = regexp2.Escape(chars)
|
|
||||||
chars = strings.ReplaceAll(chars, "-", "\\-")
|
|
||||||
pattern = fmt.Sprintf(`(?u)[%s\w\d]{%d,}`, chars, opts.MinimumMatchLength)
|
|
||||||
post_processors = append(post_processors, PostProcessorMap()["brackets"], PostProcessorMap()["quotes"])
|
|
||||||
default:
|
default:
|
||||||
pattern = opts.Regex
|
pattern = opts.Regex
|
||||||
if opts.Type == "linenum" {
|
if opts.Type == "linenum" {
|
||||||
@ -435,6 +427,71 @@ func mark(r *regexp2.Regexp, post_processors []PostProcessorFunc, group_processo
|
|||||||
|
|
||||||
type ErrNoMatches struct{ Type string }
|
type ErrNoMatches struct{ Type string }
|
||||||
|
|
||||||
|
// is_word_char reports whether ch should be treated as part of a word, given
// the characters already accumulated in the current run (current_chars).
//
// Letters and numbers always qualify. A combining mark qualifies only when it
// attaches to a letter: the nearest preceding non-mark character in
// current_chars must exist and be a letter. Skipping back over earlier marks
// lets a letter carry several stacked combining marks (e.g. "a\u0310\u0311")
// without the word being split after the first one.
func is_word_char(ch rune, current_chars []rune) bool {
	if unicode.IsLetter(ch) || unicode.IsNumber(ch) {
		return true
	}
	if !unicode.IsMark(ch) {
		return false
	}
	// Find the base character this mark would attach to.
	for i := len(current_chars) - 1; i >= 0; i-- {
		if prev := current_chars[i]; !unicode.IsMark(prev) {
			return unicode.IsLetter(prev)
		}
	}
	// Empty run, or a run made only of marks: nothing to attach to.
	return false
}
|
||||||
|
|
||||||
|
func mark_words(text string, opts *Options) (ans []Mark) {
|
||||||
|
left := text
|
||||||
|
var current_run struct {
|
||||||
|
chars []rune
|
||||||
|
start, size int
|
||||||
|
}
|
||||||
|
chars := opts.WordCharacters
|
||||||
|
if chars == "" {
|
||||||
|
chars = RelevantKittyOpts().Select_by_word_characters
|
||||||
|
}
|
||||||
|
allowed_chars := make(map[rune]bool, len(chars))
|
||||||
|
for _, ch := range chars {
|
||||||
|
allowed_chars[ch] = true
|
||||||
|
}
|
||||||
|
pos := 0
|
||||||
|
post_processors := []PostProcessorFunc{PostProcessorMap()["brackets"], PostProcessorMap()["quotes"]}
|
||||||
|
|
||||||
|
commit_run := func() {
|
||||||
|
if len(current_run.chars) >= opts.MinimumMatchLength {
|
||||||
|
match_start, match_end := current_run.start, current_run.start+current_run.size
|
||||||
|
for _, f := range post_processors {
|
||||||
|
match_start, match_end = f(text, match_start, match_end)
|
||||||
|
if match_start < 0 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if match_start > -1 && match_end > match_start {
|
||||||
|
full_match := text[match_start:match_end]
|
||||||
|
if len([]rune(full_match)) >= opts.MinimumMatchLength {
|
||||||
|
ans = append(ans, Mark{
|
||||||
|
Index: len(ans), Start: match_start, End: match_end, Text: full_match,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
current_run.chars = nil
|
||||||
|
current_run.start = 0
|
||||||
|
current_run.size = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
for {
|
||||||
|
ch, size := utf8.DecodeRuneInString(left)
|
||||||
|
if ch == utf8.RuneError {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
if allowed_chars[ch] || is_word_char(ch, current_run.chars) {
|
||||||
|
if len(current_run.chars) == 0 {
|
||||||
|
current_run.start = pos
|
||||||
|
}
|
||||||
|
current_run.chars = append(current_run.chars, ch)
|
||||||
|
current_run.size += size
|
||||||
|
} else {
|
||||||
|
commit_run()
|
||||||
|
}
|
||||||
|
left = left[size:]
|
||||||
|
pos += size
|
||||||
|
}
|
||||||
|
commit_run()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
func adjust_python_offsets(text string, marks []Mark) error {
|
func adjust_python_offsets(text string, marks []Mark) error {
|
||||||
// python returns rune based offsets (unicode chars not utf-8 bytes)
|
// python returns rune based offsets (unicode chars not utf-8 bytes)
|
||||||
adjust := utils.RuneOffsetsToByteOffsets(text)
|
adjust := utils.RuneOffsetsToByteOffsets(text)
|
||||||
@ -505,6 +562,8 @@ func find_marks(text string, opts *Options, cli_args ...string) (sanitized_text
|
|||||||
}
|
}
|
||||||
} else if opts.Type == "hyperlink" {
|
} else if opts.Type == "hyperlink" {
|
||||||
ans = hyperlinks
|
ans = hyperlinks
|
||||||
|
} else if opts.Type == "word" {
|
||||||
|
ans = mark_words(text, opts)
|
||||||
} else {
|
} else {
|
||||||
err = run_basic_matching()
|
err = run_basic_matching()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@ -117,8 +117,7 @@ func TestHintMarking(t *testing.T) {
|
|||||||
reset()
|
reset()
|
||||||
opts.Type = "word"
|
opts.Type = "word"
|
||||||
r(`#one (two) 😍 a-1b `, `#one`, `two`, `a-1b`)
|
r(`#one (two) 😍 a-1b `, `#one`, `two`, `a-1b`)
|
||||||
// non-ascii words dont match because of https://github.com/dlclark/regexp2/issues/65
|
r("fōtiz час a\u0310b ", `fōtiz`, `час`, "a\u0310b")
|
||||||
// r(`fōtiz час`, `fōtiz`, `час`)
|
|
||||||
|
|
||||||
reset()
|
reset()
|
||||||
tdir := t.TempDir()
|
tdir := t.TempDir()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user