Make code to convert rune offsets to byte offsets reusable
This commit is contained in:
parent
dd783c842f
commit
b088ab91cf
@ -325,34 +325,17 @@ type ErrNoMatches struct{ Type string }
|
|||||||
|
|
||||||
func adjust_python_offsets(text string, marks []Mark) error {
|
func adjust_python_offsets(text string, marks []Mark) error {
|
||||||
// python returns rune based offsets (unicode chars not utf-8 bytes)
|
// python returns rune based offsets (unicode chars not utf-8 bytes)
|
||||||
// this adjustment function assumes the marks are non overlapping
|
adjust := utils.RuneOffsetsToByteOffsets(text)
|
||||||
bytes := utils.UnsafeStringToBytes(text)
|
|
||||||
char_offset, byte_offset := 0, 0
|
|
||||||
|
|
||||||
adjust := func(x int) (sz int) {
|
|
||||||
x -= char_offset
|
|
||||||
for x > 0 {
|
|
||||||
_, d := utf8.DecodeRune(bytes)
|
|
||||||
sz += d
|
|
||||||
bytes = bytes[d:]
|
|
||||||
x--
|
|
||||||
char_offset++
|
|
||||||
}
|
|
||||||
byte_offset += sz
|
|
||||||
return byte_offset
|
|
||||||
}
|
|
||||||
last := 0
|
|
||||||
for i := range marks {
|
for i := range marks {
|
||||||
mark := &marks[i]
|
mark := &marks[i]
|
||||||
if mark.End < mark.Start {
|
if mark.End < mark.Start {
|
||||||
return fmt.Errorf("The end of a mark must not be before its start")
|
return fmt.Errorf("The end of a mark must not be before its start")
|
||||||
}
|
}
|
||||||
if mark.Start < last {
|
s, e := adjust(mark.Start), adjust(mark.End)
|
||||||
|
if s < 0 || e < 0 {
|
||||||
return fmt.Errorf("Overlapping marks are not supported")
|
return fmt.Errorf("Overlapping marks are not supported")
|
||||||
}
|
}
|
||||||
last = mark.Start
|
mark.Start, mark.End = s, e
|
||||||
mark.Start = adjust(mark.Start)
|
|
||||||
mark.End = adjust(mark.End)
|
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|||||||
@ -138,3 +138,29 @@ func NewSeparatorScanner(text, separator string) *StringScanner {
|
|||||||
func Splitlines(x string, expected_number_of_lines ...int) (ans []string) {
|
func Splitlines(x string, expected_number_of_lines ...int) (ans []string) {
|
||||||
return NewLineScanner("").Split(x, expected_number_of_lines...)
|
return NewLineScanner("").Split(x, expected_number_of_lines...)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func RuneOffsetsToByteOffsets(text string) func(int) int {
|
||||||
|
self := struct {
|
||||||
|
char_offset, byte_offset, last int
|
||||||
|
bytes []byte
|
||||||
|
}{bytes: UnsafeStringToBytes(text)}
|
||||||
|
return func(x int) (sz int) {
|
||||||
|
switch {
|
||||||
|
case x == self.last:
|
||||||
|
return self.byte_offset
|
||||||
|
case x < self.last:
|
||||||
|
return -1
|
||||||
|
}
|
||||||
|
self.last = x
|
||||||
|
x -= self.char_offset
|
||||||
|
for x > 0 {
|
||||||
|
_, d := utf8.DecodeRune(self.bytes)
|
||||||
|
sz += d
|
||||||
|
self.bytes = self.bytes[d:]
|
||||||
|
x--
|
||||||
|
self.char_offset++
|
||||||
|
}
|
||||||
|
self.byte_offset += sz
|
||||||
|
return self.byte_offset
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user