Fix incorrect offsets for non-ASCII chars when using custom processing
Python gives us offsets in Unicode characters (code points), whereas Go uses offsets in UTF-8 bytes. Translate the former into the latter.
This commit is contained in:
parent
b76b0c61ed
commit
e78c398243
@ -322,6 +322,30 @@ func mark(r *regexp.Regexp, post_processors []PostProcessorFunc, group_processor
|
|||||||
|
|
||||||
type ErrNoMatches struct{ Type string }
|
type ErrNoMatches struct{ Type string }
|
||||||
|
|
||||||
|
func adjust_python_offsets(text string, marks []Mark) {
|
||||||
|
// python returns rune based offsets (unicode chars not utf-8 bytes)
|
||||||
|
// this adjustment function assumes the marks are non overlapping
|
||||||
|
bytes := utils.UnsafeStringToBytes(text)
|
||||||
|
char_offset, byte_offset := 0, 0
|
||||||
|
|
||||||
|
adjust := func(x int) (sz int) {
|
||||||
|
x -= char_offset
|
||||||
|
for x > 0 {
|
||||||
|
_, d := utf8.DecodeRune(bytes)
|
||||||
|
sz += d
|
||||||
|
bytes = bytes[d:]
|
||||||
|
x--
|
||||||
|
char_offset++
|
||||||
|
}
|
||||||
|
byte_offset += sz
|
||||||
|
return byte_offset
|
||||||
|
}
|
||||||
|
for i := range marks {
|
||||||
|
mark := &marks[i]
|
||||||
|
mark.Start = adjust(mark.Start)
|
||||||
|
mark.End = adjust(mark.End)
|
||||||
|
}
|
||||||
|
}
|
||||||
func (self *ErrNoMatches) Error() string {
|
func (self *ErrNoMatches) Error() string {
|
||||||
none_of := "matches"
|
none_of := "matches"
|
||||||
switch self.Type {
|
switch self.Type {
|
||||||
@ -369,6 +393,7 @@ func find_marks(text string, opts *Options, cli_args ...string) (sanitized_text
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return "", nil, nil, fmt.Errorf("Failed to load output from custom processor %#v with error: %w", opts.CustomizeProcessing, err)
|
return "", nil, nil, fmt.Errorf("Failed to load output from custom processor %#v with error: %w", opts.CustomizeProcessing, err)
|
||||||
}
|
}
|
||||||
|
adjust_python_offsets(sanitized_text, ans)
|
||||||
} else if opts.Type == "hyperlink" {
|
} else if opts.Type == "hyperlink" {
|
||||||
ans = hyperlinks
|
ans = hyperlinks
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@ -10,6 +10,7 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"github.com/google/go-cmp/cmp"
|
"github.com/google/go-cmp/cmp"
|
||||||
@ -31,7 +32,7 @@ func TestHintMarking(t *testing.T) {
|
|||||||
|
|
||||||
r := func(text string, url ...string) (marks []Mark) {
|
r := func(text string, url ...string) (marks []Mark) {
|
||||||
ptext := convert_text(text, cols)
|
ptext := convert_text(text, cols)
|
||||||
_, marks, _, err := find_marks(ptext, opts, cli_args...)
|
ptext, marks, _, err := find_marks(ptext, opts, cli_args...)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
var e *ErrNoMatches
|
var e *ErrNoMatches
|
||||||
if len(url) != 0 || !errors.As(err, &e) {
|
if len(url) != 0 || !errors.As(err, &e) {
|
||||||
@ -43,6 +44,12 @@ func TestHintMarking(t *testing.T) {
|
|||||||
if diff := cmp.Diff(url, actual); diff != "" {
|
if diff := cmp.Diff(url, actual); diff != "" {
|
||||||
t.Fatalf("%#v failed:\n%s", text, diff)
|
t.Fatalf("%#v failed:\n%s", text, diff)
|
||||||
}
|
}
|
||||||
|
for _, m := range marks {
|
||||||
|
q := strings.NewReplacer("\n", "", "\r", "", "\x00", "").Replace(ptext[m.Start:m.End])
|
||||||
|
if diff := cmp.Diff(m.Text, q); diff != "" {
|
||||||
|
t.Fatalf("Mark start and end dont point to correct offset in text for %#v\n%s", text, diff)
|
||||||
|
}
|
||||||
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -114,7 +121,7 @@ def mark(text, args, Mark, extra_cli_args, *a):
|
|||||||
`), 0o600)
|
`), 0o600)
|
||||||
opts.Type = "regex"
|
opts.Type = "regex"
|
||||||
opts.CustomizeProcessing = simple
|
opts.CustomizeProcessing = simple
|
||||||
marks := r("a b", `a`, `b`)
|
marks := r("漢字 b", `漢字`, `b`)
|
||||||
if diff := cmp.Diff(marks[0].Groupdict, map[string]any{"idx": float64(0), "args": []any{"extra1"}}); diff != "" {
|
if diff := cmp.Diff(marks[0].Groupdict, map[string]any{"idx": float64(0), "args": []any{"extra1"}}); diff != "" {
|
||||||
t.Fatalf("Did not get expected groupdict from custom processor:\n%s", diff)
|
t.Fatalf("Did not get expected groupdict from custom processor:\n%s", diff)
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user