Fix incorrect offsets for non-ASCII chars when using custom processing
Python gives us offsets in Unicode characters, whereas Go uses offsets in UTF-8 bytes, so translate between the two.
This commit is contained in:
parent
b76b0c61ed
commit
e78c398243
@ -322,6 +322,30 @@ func mark(r *regexp.Regexp, post_processors []PostProcessorFunc, group_processor
|
||||
|
||||
type ErrNoMatches struct{ Type string }
|
||||
|
||||
func adjust_python_offsets(text string, marks []Mark) {
|
||||
// python returns rune based offsets (unicode chars not utf-8 bytes)
|
||||
// this adjustment function assumes the marks are non overlapping
|
||||
bytes := utils.UnsafeStringToBytes(text)
|
||||
char_offset, byte_offset := 0, 0
|
||||
|
||||
adjust := func(x int) (sz int) {
|
||||
x -= char_offset
|
||||
for x > 0 {
|
||||
_, d := utf8.DecodeRune(bytes)
|
||||
sz += d
|
||||
bytes = bytes[d:]
|
||||
x--
|
||||
char_offset++
|
||||
}
|
||||
byte_offset += sz
|
||||
return byte_offset
|
||||
}
|
||||
for i := range marks {
|
||||
mark := &marks[i]
|
||||
mark.Start = adjust(mark.Start)
|
||||
mark.End = adjust(mark.End)
|
||||
}
|
||||
}
|
||||
func (self *ErrNoMatches) Error() string {
|
||||
none_of := "matches"
|
||||
switch self.Type {
|
||||
@ -369,6 +393,7 @@ func find_marks(text string, opts *Options, cli_args ...string) (sanitized_text
|
||||
if err != nil {
|
||||
return "", nil, nil, fmt.Errorf("Failed to load output from custom processor %#v with error: %w", opts.CustomizeProcessing, err)
|
||||
}
|
||||
adjust_python_offsets(sanitized_text, ans)
|
||||
} else if opts.Type == "hyperlink" {
|
||||
ans = hyperlinks
|
||||
} else {
|
||||
|
||||
@ -10,6 +10,7 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/google/go-cmp/cmp"
|
||||
@ -31,7 +32,7 @@ func TestHintMarking(t *testing.T) {
|
||||
|
||||
r := func(text string, url ...string) (marks []Mark) {
|
||||
ptext := convert_text(text, cols)
|
||||
_, marks, _, err := find_marks(ptext, opts, cli_args...)
|
||||
ptext, marks, _, err := find_marks(ptext, opts, cli_args...)
|
||||
if err != nil {
|
||||
var e *ErrNoMatches
|
||||
if len(url) != 0 || !errors.As(err, &e) {
|
||||
@ -43,6 +44,12 @@ func TestHintMarking(t *testing.T) {
|
||||
if diff := cmp.Diff(url, actual); diff != "" {
|
||||
t.Fatalf("%#v failed:\n%s", text, diff)
|
||||
}
|
||||
for _, m := range marks {
|
||||
q := strings.NewReplacer("\n", "", "\r", "", "\x00", "").Replace(ptext[m.Start:m.End])
|
||||
if diff := cmp.Diff(m.Text, q); diff != "" {
|
||||
t.Fatalf("Mark start and end dont point to correct offset in text for %#v\n%s", text, diff)
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
@ -114,7 +121,7 @@ def mark(text, args, Mark, extra_cli_args, *a):
|
||||
`), 0o600)
|
||||
opts.Type = "regex"
|
||||
opts.CustomizeProcessing = simple
|
||||
marks := r("a b", `a`, `b`)
|
||||
marks := r("漢字 b", `漢字`, `b`)
|
||||
if diff := cmp.Diff(marks[0].Groupdict, map[string]any{"idx": float64(0), "args": []any{"extra1"}}); diff != "" {
|
||||
t.Fatalf("Did not get expected groupdict from custom processor:\n%s", diff)
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user