A new string scanner that's faster than bufio.Scanner and is zero-allocation
This commit is contained in:
parent
ebc1a0f0aa
commit
b8ce441453
@ -3,7 +3,6 @@
|
|||||||
package utils
|
package utils
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"strings"
|
"strings"
|
||||||
"unicode/utf8"
|
"unicode/utf8"
|
||||||
@ -23,7 +22,7 @@ func Capitalize(x string) string {
|
|||||||
type ScanLines struct {
|
type ScanLines struct {
|
||||||
entries []string
|
entries []string
|
||||||
|
|
||||||
scanner *bufio.Scanner
|
scanner *StringScanner
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewScanLines(entries ...string) *ScanLines {
|
func NewScanLines(entries ...string) *ScanLines {
|
||||||
@ -35,7 +34,7 @@ func (self *ScanLines) Scan() bool {
|
|||||||
if len(self.entries) == 0 {
|
if len(self.entries) == 0 {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
self.scanner = bufio.NewScanner(strings.NewReader(self.entries[0]))
|
self.scanner = NewLineScanner(self.entries[0])
|
||||||
self.entries = self.entries[1:]
|
self.entries = self.entries[1:]
|
||||||
return self.Scan()
|
return self.Scan()
|
||||||
} else {
|
} else {
|
||||||
@ -54,15 +53,86 @@ func (self *ScanLines) Text() string {
|
|||||||
return self.scanner.Text()
|
return self.scanner.Text()
|
||||||
}
|
}
|
||||||
|
|
||||||
// StringScannerScanFunc cuts the next token off the front of data. It returns
// the text that still has to be scanned and the token that was found.
type StringScannerScanFunc = func(data string) (remaining_data, token string)

// StringScannerPostprocessFunc transforms a token before it is handed to the
// caller.
type StringScannerPostprocessFunc = func(token string) string

// ScanFuncForSeparator returns a scan function that splits its input at the
// first occurrence of sep. The returned function yields the text before sep as
// the token and the text after sep as the remaining data. When sep does not
// occur, the whole input is the token and the remaining data is empty.
//
// An empty sep never matches, so the whole input is returned as one token.
func ScanFuncForSeparator(sep string) StringScannerScanFunc {
	switch len(sep) {
	case 0:
		// strings.Index reports a match at offset 0 for an empty separator.
		// That would hand back the input unchanged with an empty token, so a
		// loop that scans until the data is exhausted would never finish.
		return func(data string) (remaining_data, token string) {
			return "", data
		}
	case 1:
		// Single byte separators can use the cheaper byte search.
		sb := sep[0]
		return func(data string) (remaining_data, token string) {
			idx := strings.IndexByte(data, sb)
			if idx < 0 {
				return "", data
			}
			return data[idx+1:], data[:idx]
		}
	}
	return func(data string) (remaining_data, token string) {
		idx := strings.Index(data, sep)
		if idx < 0 {
			return "", data
		}
		return data[idx+len(sep):], data[:idx]
	}
}
|
||||||
|
|
||||||
|
// Faster, better designed, zero-allocation version of bufio.Scanner for strings
|
||||||
|
type StringScanner struct {
|
||||||
|
ScanFunc StringScannerScanFunc
|
||||||
|
PostProcessTokenFunc StringScannerPostprocessFunc
|
||||||
|
|
||||||
|
data string
|
||||||
|
token string
|
||||||
|
}
|
||||||
|
|
||||||
|
func (self *StringScanner) Scan() bool {
|
||||||
|
if self.data == "" {
|
||||||
|
self.token = ""
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
self.data, self.token = self.ScanFunc(self.data)
|
||||||
|
if self.PostProcessTokenFunc != nil {
|
||||||
|
self.token = self.PostProcessTokenFunc(self.token)
|
||||||
|
}
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
func (self *StringScanner) Text() string {
|
||||||
|
return self.token
|
||||||
|
}
|
||||||
|
|
||||||
|
func (self *StringScanner) Split(data string, expected_number ...int) (ans []string) {
|
||||||
|
if len(expected_number) != 0 {
|
||||||
|
ans = make([]string, 0, expected_number[0])
|
||||||
|
} else {
|
||||||
|
ans = []string{}
|
||||||
|
}
|
||||||
|
self.data = data
|
||||||
|
for self.Scan() {
|
||||||
|
ans = append(ans, self.Text())
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewLineScanner(text string) *StringScanner {
|
||||||
|
return &StringScanner{
|
||||||
|
data: text, ScanFunc: ScanFuncForSeparator("\n"),
|
||||||
|
PostProcessTokenFunc: func(s string) string {
|
||||||
|
if len(s) > 0 && s[len(s)-1] == '\r' {
|
||||||
|
s = s[:len(s)-1]
|
||||||
|
}
|
||||||
|
return s
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewSeparatorScanner(text, separator string) *StringScanner {
|
||||||
|
return &StringScanner{
|
||||||
|
data: text, ScanFunc: ScanFuncForSeparator(separator),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func Splitlines(x string, expected_number_of_lines ...int) (ans []string) {
|
||||||
|
return NewLineScanner("").Split(x, expected_number_of_lines...)
|
||||||
}
|
}
|
||||||
|
|||||||
35
tools/utils/strings_test.go
Normal file
35
tools/utils/strings_test.go
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
// License: GPLv3 Copyright: 2023, Kovid Goyal, <kovid at kovidgoyal.net>
|
||||||
|
|
||||||
|
package utils
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/google/go-cmp/cmp"
|
||||||
|
)
|
||||||
|
|
||||||
|
var _ = fmt.Print
|
||||||
|
|
||||||
|
func TestStringScanner(t *testing.T) {
|
||||||
|
for _, text := range []string{
|
||||||
|
"a\nb\nc",
|
||||||
|
"a\nb\nc\r",
|
||||||
|
"a\n\n\nb\nc",
|
||||||
|
"a\r\r\nb\r\nc\n",
|
||||||
|
"\n1",
|
||||||
|
"",
|
||||||
|
} {
|
||||||
|
actual := Splitlines(text)
|
||||||
|
expected := make([]string, 0, len(actual))
|
||||||
|
s := bufio.NewScanner(strings.NewReader(text))
|
||||||
|
for s.Scan() {
|
||||||
|
expected = append(expected, s.Text())
|
||||||
|
}
|
||||||
|
if diff := cmp.Diff(expected, actual); diff != "" {
|
||||||
|
t.Fatalf("Failed for: %#v\n%s", text, diff)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user