diff --git a/tools/utils/strings.go b/tools/utils/strings.go index 39932c63a..83c41e17f 100644 --- a/tools/utils/strings.go +++ b/tools/utils/strings.go @@ -3,7 +3,6 @@ package utils import ( - "bufio" "fmt" "strings" "unicode/utf8" @@ -23,7 +22,7 @@ func Capitalize(x string) string { type ScanLines struct { entries []string - scanner *bufio.Scanner + scanner *StringScanner } func NewScanLines(entries ...string) *ScanLines { @@ -35,7 +34,7 @@ func (self *ScanLines) Scan() bool { if len(self.entries) == 0 { return false } - self.scanner = bufio.NewScanner(strings.NewReader(self.entries[0])) + self.scanner = NewLineScanner(self.entries[0]) self.entries = self.entries[1:] return self.Scan() } else { @@ -54,15 +53,86 @@ func (self *ScanLines) Text() string { return self.scanner.Text() } -func Splitlines(x string, expected_number_of_lines ...int) (ans []string) { - if len(expected_number_of_lines) > 0 { - ans = make([]string, 0, expected_number_of_lines[0]) - } else { - ans = make([]string, 0, 8) +type StringScannerScanFunc = func(data string) (remaining_data, token string) +type StringScannerPostprocessFunc = func(token string) string + +func ScanFuncForSeparator(sep string) StringScannerScanFunc { + if len(sep) == 1 { + sb := sep[0] + return func(data string) (remaining_data, token string) { + idx := strings.IndexByte(data, sb) + if idx < 0 { + return "", data + } + return data[idx+len(sep):], data[:idx] + } + } - scanner := bufio.NewScanner(strings.NewReader(x)) - for scanner.Scan() { - ans = append(ans, scanner.Text()) + return func(data string) (remaining_data, token string) { + idx := strings.Index(data, sep) + if idx < 0 { + return "", data + } + return data[idx+len(sep):], data[:idx] } - return ans +} + +// Faster, better designed, zero-allocation version of bufio.Scanner for strings +type StringScanner struct { + ScanFunc StringScannerScanFunc + PostProcessTokenFunc StringScannerPostprocessFunc + + data string + token string +} + +func (self *StringScanner) Scan() bool { + if self.data == "" { + self.token = "" + return false + } + self.data, self.token = self.ScanFunc(self.data) + if self.PostProcessTokenFunc != nil { + self.token = self.PostProcessTokenFunc(self.token) + } + return true +} + +func (self *StringScanner) Text() string { + return self.token +} + +func (self *StringScanner) Split(data string, expected_number ...int) (ans []string) { + if len(expected_number) != 0 { + ans = make([]string, 0, expected_number[0]) + } else { + ans = []string{} + } + self.data = data + for self.Scan() { + ans = append(ans, self.Text()) + } + return +} + +func NewLineScanner(text string) *StringScanner { + return &StringScanner{ + data: text, ScanFunc: ScanFuncForSeparator("\n"), + PostProcessTokenFunc: func(s string) string { + if len(s) > 0 && s[len(s)-1] == '\r' { + s = s[:len(s)-1] + } + return s + }, + } +} + +func NewSeparatorScanner(text, separator string) *StringScanner { + return &StringScanner{ + data: text, ScanFunc: ScanFuncForSeparator(separator), + } +} + +func Splitlines(x string, expected_number_of_lines ...int) (ans []string) { + return NewLineScanner("").Split(x, expected_number_of_lines...) } diff --git a/tools/utils/strings_test.go b/tools/utils/strings_test.go new file mode 100644 index 000000000..d6694fc4d --- /dev/null +++ b/tools/utils/strings_test.go @@ -0,0 +1,35 @@ +// License: GPLv3 Copyright: 2023, Kovid Goyal, + +package utils + +import ( + "bufio" + "fmt" + "strings" + "testing" + + "github.com/google/go-cmp/cmp" +) + +var _ = fmt.Print + +func TestStringScanner(t *testing.T) { + for _, text := range []string{ + "a\nb\nc", + "a\nb\nc\r", + "a\n\n\nb\nc", + "a\r\r\nb\r\nc\n", + "\n1", + "", + } { + actual := Splitlines(text) + expected := make([]string, 0, len(actual)) + s := bufio.NewScanner(strings.NewReader(text)) + for s.Scan() { + expected = append(expected, s.Text()) + } + if diff := cmp.Diff(expected, actual); diff != "" { + t.Fatalf("Failed for: %#v\n%s", text, diff) + } + } +}