A new string scanner thats faster than bufio.Scanner and has zero-allocation

This commit is contained in:
Kovid Goyal 2023-03-08 13:19:37 +05:30
parent ebc1a0f0aa
commit b8ce441453
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 117 additions and 12 deletions

View File

@ -3,7 +3,6 @@
package utils package utils
import ( import (
"bufio"
"fmt" "fmt"
"strings" "strings"
"unicode/utf8" "unicode/utf8"
@ -23,7 +22,7 @@ func Capitalize(x string) string {
type ScanLines struct { type ScanLines struct {
entries []string entries []string
scanner *bufio.Scanner scanner *StringScanner
} }
func NewScanLines(entries ...string) *ScanLines { func NewScanLines(entries ...string) *ScanLines {
@ -35,7 +34,7 @@ func (self *ScanLines) Scan() bool {
if len(self.entries) == 0 { if len(self.entries) == 0 {
return false return false
} }
self.scanner = bufio.NewScanner(strings.NewReader(self.entries[0])) self.scanner = NewLineScanner(self.entries[0])
self.entries = self.entries[1:] self.entries = self.entries[1:]
return self.Scan() return self.Scan()
} else { } else {
@ -54,15 +53,86 @@ func (self *ScanLines) Text() string {
return self.scanner.Text() return self.scanner.Text()
} }
func Splitlines(x string, expected_number_of_lines ...int) (ans []string) { type StringScannerScanFunc = func(data string) (remaining_data, token string)
if len(expected_number_of_lines) > 0 { type StringScannerPostprocessFunc = func(token string) string
ans = make([]string, 0, expected_number_of_lines[0])
func ScanFuncForSeparator(sep string) StringScannerScanFunc {
if len(sep) == 1 {
sb := sep[0]
return func(data string) (remaining_data, token string) {
idx := strings.IndexByte(data, sb)
if idx < 0 {
return "", data
}
return data[idx+len(sep):], data[:idx]
}
}
return func(data string) (remaining_data, token string) {
idx := strings.Index(data, sep)
if idx < 0 {
return "", data
}
return data[idx+len(sep):], data[:idx]
}
}
// Faster, better designed, zero-allocation version of bufio.Scanner for strings
type StringScanner struct {
ScanFunc StringScannerScanFunc
PostProcessTokenFunc StringScannerPostprocessFunc
data string
token string
}
func (self *StringScanner) Scan() bool {
if self.data == "" {
self.token = ""
return false
}
self.data, self.token = self.ScanFunc(self.data)
if self.PostProcessTokenFunc != nil {
self.token = self.PostProcessTokenFunc(self.token)
}
return true
}
func (self *StringScanner) Text() string {
return self.token
}
func (self *StringScanner) Split(data string, expected_number ...int) (ans []string) {
if len(expected_number) != 0 {
ans = make([]string, 0, expected_number[0])
} else { } else {
ans = make([]string, 0, 8) ans = []string{}
} }
scanner := bufio.NewScanner(strings.NewReader(x)) self.data = data
for scanner.Scan() { for self.Scan() {
ans = append(ans, scanner.Text()) ans = append(ans, self.Text())
} }
return ans return
}
func NewLineScanner(text string) *StringScanner {
return &StringScanner{
data: text, ScanFunc: ScanFuncForSeparator("\n"),
PostProcessTokenFunc: func(s string) string {
if len(s) > 0 && s[len(s)-1] == '\r' {
s = s[:len(s)-1]
}
return s
},
}
}
func NewSeparatorScanner(text, separator string) *StringScanner {
return &StringScanner{
data: text, ScanFunc: ScanFuncForSeparator(separator),
}
}
func Splitlines(x string, expected_number_of_lines ...int) (ans []string) {
return NewLineScanner("").Split(x, expected_number_of_lines...)
} }

View File

@ -0,0 +1,35 @@
// License: GPLv3 Copyright: 2023, Kovid Goyal, <kovid at kovidgoyal.net>
package utils
import (
"bufio"
"fmt"
"strings"
"testing"
"github.com/google/go-cmp/cmp"
)
var _ = fmt.Print
func TestStringScanner(t *testing.T) {
for _, text := range []string{
"a\nb\nc",
"a\nb\nc\r",
"a\n\n\nb\nc",
"a\r\r\nb\r\nc\n",
"\n1",
"",
} {
actual := Splitlines(text)
expected := make([]string, 0, len(actual))
s := bufio.NewScanner(strings.NewReader(text))
for s.Scan() {
expected = append(expected, s.Text())
}
if diff := cmp.Diff(expected, actual); diff != "" {
t.Fatalf("Failed for: %#v\n%s", text, diff)
}
}
}