Fix incorrect offsets for non-ASCII chars when using custom processing

Python gives us offsets in Unicode characters, whereas Go uses offsets in
UTF-8 bytes. Translate the former into the latter.
This commit is contained in:
Kovid Goyal 2023-03-10 12:41:56 +05:30
parent b76b0c61ed
commit e78c398243
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 34 additions and 2 deletions

View File

@ -322,6 +322,30 @@ func mark(r *regexp.Regexp, post_processors []PostProcessorFunc, group_processor
type ErrNoMatches struct{ Type string } type ErrNoMatches struct{ Type string }
// adjust_python_offsets rewrites, in place, the Start and End of every mark
// from an offset counted in unicode characters (runes) into the equivalent
// offset counted in UTF-8 bytes of text.
//
// python returns rune based offsets (unicode chars not utf-8 bytes), whereas
// slicing a Go string needs byte offsets.
//
// The translation keeps a cursor into text so that marks whose offsets only
// ever increase are handled in a single pass. Marks are not required to be
// sorted or non-overlapping: an offset that lies before the cursor causes a
// rewind to the start of text and a fresh scan. Offsets past the end of text
// are clamped to len(text); negative offsets map to 0.
func adjust_python_offsets(text string, marks []Mark) {
	// Cursor position, expressed both in runes and in bytes.
	char_offset, byte_offset := 0, 0
	adjust := func(x int) int {
		if x < char_offset {
			// The requested offset is behind the cursor (unsorted or
			// overlapping marks), so start over from the beginning.
			char_offset, byte_offset = 0, 0
		}
		for char_offset < x && byte_offset < len(text) {
			// Invalid UTF-8 decodes as a one byte wide RuneError, so the
			// cursor always advances while there is text left.
			_, d := utf8.DecodeRuneInString(text[byte_offset:])
			byte_offset += d
			char_offset++
		}
		return byte_offset
	}
	for i := range marks {
		mark := &marks[i]
		mark.Start = adjust(mark.Start)
		mark.End = adjust(mark.End)
	}
}
func (self *ErrNoMatches) Error() string { func (self *ErrNoMatches) Error() string {
none_of := "matches" none_of := "matches"
switch self.Type { switch self.Type {
@ -369,6 +393,7 @@ func find_marks(text string, opts *Options, cli_args ...string) (sanitized_text
if err != nil { if err != nil {
return "", nil, nil, fmt.Errorf("Failed to load output from custom processor %#v with error: %w", opts.CustomizeProcessing, err) return "", nil, nil, fmt.Errorf("Failed to load output from custom processor %#v with error: %w", opts.CustomizeProcessing, err)
} }
adjust_python_offsets(sanitized_text, ans)
} else if opts.Type == "hyperlink" { } else if opts.Type == "hyperlink" {
ans = hyperlinks ans = hyperlinks
} else { } else {

View File

@ -10,6 +10,7 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"strconv" "strconv"
"strings"
"testing" "testing"
"github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp"
@ -31,7 +32,7 @@ func TestHintMarking(t *testing.T) {
r := func(text string, url ...string) (marks []Mark) { r := func(text string, url ...string) (marks []Mark) {
ptext := convert_text(text, cols) ptext := convert_text(text, cols)
_, marks, _, err := find_marks(ptext, opts, cli_args...) ptext, marks, _, err := find_marks(ptext, opts, cli_args...)
if err != nil { if err != nil {
var e *ErrNoMatches var e *ErrNoMatches
if len(url) != 0 || !errors.As(err, &e) { if len(url) != 0 || !errors.As(err, &e) {
@ -43,6 +44,12 @@ func TestHintMarking(t *testing.T) {
if diff := cmp.Diff(url, actual); diff != "" { if diff := cmp.Diff(url, actual); diff != "" {
t.Fatalf("%#v failed:\n%s", text, diff) t.Fatalf("%#v failed:\n%s", text, diff)
} }
for _, m := range marks {
q := strings.NewReplacer("\n", "", "\r", "", "\x00", "").Replace(ptext[m.Start:m.End])
if diff := cmp.Diff(m.Text, q); diff != "" {
t.Fatalf("Mark start and end dont point to correct offset in text for %#v\n%s", text, diff)
}
}
return return
} }
@ -114,7 +121,7 @@ def mark(text, args, Mark, extra_cli_args, *a):
`), 0o600) `), 0o600)
opts.Type = "regex" opts.Type = "regex"
opts.CustomizeProcessing = simple opts.CustomizeProcessing = simple
marks := r("a b", `a`, `b`) marks := r("漢字 b", `漢字`, `b`)
if diff := cmp.Diff(marks[0].Groupdict, map[string]any{"idx": float64(0), "args": []any{"extra1"}}); diff != "" { if diff := cmp.Diff(marks[0].Groupdict, map[string]any{"idx": float64(0), "args": []any{"extra1"}}); diff != "" {
t.Fatalf("Did not get expected groupdict from custom processor:\n%s", diff) t.Fatalf("Did not get expected groupdict from custom processor:\n%s", diff)
} }