Make code to convert rune offsets to byte offsets reusable

This commit is contained in:
Kovid Goyal 2023-03-12 13:12:17 +05:30
parent dd783c842f
commit b088ab91cf
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 30 additions and 21 deletions

View File

@ -325,34 +325,17 @@ type ErrNoMatches struct{ Type string }
// adjust_python_offsets rewrites the Start and End of every mark in place,
// converting them from rune offsets into byte offsets within text.
// It returns an error when a mark ends before it starts, or when the marks
// overlap.
// NOTE(review): this listing appears to interleave the removed and the added
// lines of a diff (adjust is defined twice and the overlap check occurs in two
// forms) — confirm against the committed file before relying on the exact
// statement order shown here.
func adjust_python_offsets(text string, marks []Mark) error {
// python returns rune based offsets (unicode chars not utf-8 bytes)
// this adjustment function assumes the marks are non overlapping
bytes := utils.UnsafeStringToBytes(text)
char_offset, byte_offset := 0, 0
// adjust walks forward through bytes one rune at a time until rune offset x
// is reached and returns the corresponding cumulative byte offset. It only
// ever moves forward, which is why marks must be processed in order.
adjust := func(x int) (sz int) {
x -= char_offset
for x > 0 {
_, d := utf8.DecodeRune(bytes)
sz += d
bytes = bytes[d:]
x--
char_offset++
}
byte_offset += sz
return byte_offset
}
last := 0
// Shared converter from utils; it returns -1 when asked for an offset smaller
// than the one from the previous call.
adjust := utils.RuneOffsetsToByteOffsets(text)
for i := range marks {
mark := &marks[i]
if mark.End < mark.Start {
return fmt.Errorf("The end of a mark must not be before its start")
}
if mark.Start < last {
s, e := adjust(mark.Start), adjust(mark.End)
// A negative result means the offset went backwards relative to the
// previous conversion, i.e. this mark overlaps an earlier one.
if s < 0 || e < 0 {
return fmt.Errorf("Overlapping marks are not supported")
}
last = mark.Start
mark.Start = adjust(mark.Start)
mark.End = adjust(mark.End)
mark.Start, mark.End = s, e
}
return nil
}

View File

@ -138,3 +138,29 @@ func NewSeparatorScanner(text, separator string) *StringScanner {
func Splitlines(x string, expected_number_of_lines ...int) (ans []string) {
return NewLineScanner("").Split(x, expected_number_of_lines...)
}
// RuneOffsetsToByteOffsets returns a converter that maps rune (unicode
// character) offsets into text to the equivalent UTF-8 byte offsets.
//
// The converter keeps state between calls and only ever scans forward, so it
// must be called with non-decreasing offsets:
//   - repeating the previous offset returns the previous result,
//   - an offset smaller than the previous one returns -1 and leaves the state
//     untouched,
//   - an offset past the end of text returns len(text).
//
// Invalid UTF-8 bytes are counted as one rune of width one byte, matching
// utf8.DecodeRuneInString.
func RuneOffsetsToByteOffsets(text string) func(int) int {
	// char_offset and byte_offset are the rune and byte positions of the start
	// of remaining, the not yet consumed tail of text. last is the most
	// recently accepted rune offset.
	var char_offset, byte_offset, last int
	remaining := text
	return func(x int) int {
		switch {
		case x == last:
			return byte_offset
		case x < last:
			return -1
		}
		last = x
		// Stop as soon as the text is exhausted: decoding an empty string
		// yields a width of zero, so continuing would only burn one iteration
		// per excess rune without changing the result.
		for char_offset < x && len(remaining) > 0 {
			_, d := utf8.DecodeRuneInString(remaining)
			remaining = remaining[d:]
			byte_offset += d
			char_offset++
		}
		return byte_offset
	}
}