Make the code that converts rune offsets to byte offsets reusable

This commit is contained in:
Kovid Goyal 2023-03-12 13:12:17 +05:30
parent dd783c842f
commit b088ab91cf
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 30 additions and 21 deletions

View File

@ -325,34 +325,17 @@ type ErrNoMatches struct{ Type string }
func adjust_python_offsets(text string, marks []Mark) error { func adjust_python_offsets(text string, marks []Mark) error {
// python returns rune based offsets (unicode chars not utf-8 bytes) // python returns rune based offsets (unicode chars not utf-8 bytes)
// this adjustment function assumes the marks are non overlapping adjust := utils.RuneOffsetsToByteOffsets(text)
bytes := utils.UnsafeStringToBytes(text)
char_offset, byte_offset := 0, 0
adjust := func(x int) (sz int) {
x -= char_offset
for x > 0 {
_, d := utf8.DecodeRune(bytes)
sz += d
bytes = bytes[d:]
x--
char_offset++
}
byte_offset += sz
return byte_offset
}
last := 0
for i := range marks { for i := range marks {
mark := &marks[i] mark := &marks[i]
if mark.End < mark.Start { if mark.End < mark.Start {
return fmt.Errorf("The end of a mark must not be before its start") return fmt.Errorf("The end of a mark must not be before its start")
} }
if mark.Start < last { s, e := adjust(mark.Start), adjust(mark.End)
if s < 0 || e < 0 {
return fmt.Errorf("Overlapping marks are not supported") return fmt.Errorf("Overlapping marks are not supported")
} }
last = mark.Start mark.Start, mark.End = s, e
mark.Start = adjust(mark.Start)
mark.End = adjust(mark.End)
} }
return nil return nil
} }

View File

@ -138,3 +138,29 @@ func NewSeparatorScanner(text, separator string) *StringScanner {
func Splitlines(x string, expected_number_of_lines ...int) (ans []string) { func Splitlines(x string, expected_number_of_lines ...int) (ans []string) {
return NewLineScanner("").Split(x, expected_number_of_lines...) return NewLineScanner("").Split(x, expected_number_of_lines...)
} }
// RuneOffsetsToByteOffsets returns a stateful converter that maps rune
// (Unicode code point) offsets into text to the corresponding byte offsets.
//
// The converter only ever scans text forward, so offsets must be supplied in
// non-decreasing order. Passing the same offset as the previous call returns
// the same byte offset again; passing an offset smaller than the previous one
// (or a negative offset) returns -1 and leaves the converter state unchanged.
// Offsets beyond the end of text are clamped to len(text). Each invalid UTF-8
// byte is counted as one rune of width one, as decoded by the utf8 package.
//
// The returned function mutates captured state without synchronization, so it
// must not be called from multiple goroutines at once.
func RuneOffsetsToByteOffsets(text string) func(int) int {
	// char_offset/byte_offset are the rune and byte positions consumed so far;
	// last is the most recent offset accepted from the caller.
	char_offset, byte_offset, last := 0, 0, 0
	return func(x int) int {
		switch {
		case x == last:
			return byte_offset
		case x < last:
			return -1
		}
		last = x
		// Stop as soon as text is exhausted: decoding an empty remainder has
		// width zero, so iterating further could never advance byte_offset and
		// would only burn time on a very large out-of-range offset.
		for char_offset < x && byte_offset < len(text) {
			_, width := utf8.DecodeRuneInString(text[byte_offset:])
			byte_offset += width
			char_offset++
		}
		return byte_offset
	}
}