diff --git a/tools/utils/shlex/shlex.go b/tools/utils/shlex/shlex.go
index d98308bce..70f908ceb 100644
--- a/tools/utils/shlex/shlex.go
+++ b/tools/utils/shlex/shlex.go
@@ -20,27 +20,26 @@ shell-style rules for quoting and commenting.
 
 The basic use case uses the default ASCII lexer to split a string into sub-strings:
 
-  shlex.Split("one \"two three\" four") -> []string{"one", "two three", "four"}
+	shlex.Split("one \"two three\" four") -> []string{"one", "two three", "four"}
 
 To process a stream of strings:
 
-  l := NewLexer(os.Stdin)
-  for ; token, err := l.Next(); err != nil {
-  	// process token
-  }
+	l := NewLexer(bufio.NewReader(os.Stdin))
+	for token, err := l.Next(); err == nil; token, err = l.Next() {
+		// process token
+	}
 
 To access the raw token stream (which includes tokens for comments):
 
-  t := NewTokenizer(os.Stdin)
-  for ; token, err := t.Next(); err != nil {
-  	// process token
-  }
-
+	t := NewTokenizer(bufio.NewReader(os.Stdin))
+	for token, err := t.Next(); err == nil; token, err = t.Next() {
+		// process token
+	}
 */
 package shlex
 
 import (
-	"bufio"
+	"errors"
 	"fmt"
 	"io"
 	"strings"
@@ -142,9 +141,9 @@ func (t tokenClassifier) ClassifyRune(runeVal rune) runeTokenClass {
 
 type Lexer Tokenizer
 
 // NewLexer creates a new lexer from an input stream.
-func NewLexer(r io.Reader) *Lexer {
-	return (*Lexer)(NewTokenizer(r))
+func NewLexer(x io.RuneReader) *Lexer {
+	return (*Lexer)(NewTokenizer(x))
 }
 
 // Next returns the next word, or an error. If there are no more words,
@@ -168,19 +167,24 @@ func (l *Lexer) Next() (string, error) {
 
 // Tokenizer turns an input stream into a sequence of typed tokens
 type Tokenizer struct {
-	input      bufio.Reader
+	input      io.RuneReader
 	classifier tokenClassifier
+	pos        int64
}
 
 // NewTokenizer creates a new tokenizer from an input stream.
-func NewTokenizer(r io.Reader) *Tokenizer {
-	input := bufio.NewReader(r)
+func NewTokenizer(input io.RuneReader) *Tokenizer {
 	classifier := newDefaultClassifier()
 	return &Tokenizer{
-		input:      *input,
+		input:      input,
 		classifier: classifier}
 }
 
+var ErrTrailingEscape = errors.New("EOF found after escape character")
+var ErrTrailingQuoteEscape = errors.New("EOF found after escape character for double quote")
+var ErrUnclosedDoubleQuote = errors.New("EOF found when expecting closing double quote")
+var ErrUnclosedSingleQuote = errors.New("EOF found when expecting closing single quote")
+
 // scanStream scans the stream for the next token using the internal state machine.
 // It will panic if it encounters a rune which it does not know how to handle.
 func (t *Tokenizer) scanStream() (*Token, error) {
@@ -190,9 +194,10 @@ func (t *Tokenizer) scanStream() (*Token, error) {
 	var nextRune rune
 	var nextRuneType runeTokenClass
 	var err error
+	var sz int
 
 	for {
-		nextRune, _, err = t.input.ReadRune()
+		nextRune, sz, err = t.input.ReadRune()
 		nextRuneType = t.classifier.ClassifyRune(nextRune)
 
 		if err == io.EOF {
@@ -201,6 +206,7 @@ func (t *Tokenizer) scanStream() (*Token, error) {
 		} else if err != nil {
 			return nil, err
 		}
+		t.pos += int64(sz)
 
 		switch state {
 		case startState: // no runes read yet
@@ -281,7 +287,7 @@ func (t *Tokenizer) scanStream() (*Token, error) {
 				switch nextRuneType {
 				case eofRuneClass:
 					{
-						err = fmt.Errorf("EOF found after escape character")
+						err = ErrTrailingEscape
 						token := &Token{
 							tokenType: tokenType,
 							value:     string(value)}
@@ -299,7 +305,7 @@ func (t *Tokenizer) scanStream() (*Token, error) {
 				switch nextRuneType {
 				case eofRuneClass:
 					{
-						err = fmt.Errorf("EOF found after escape character")
+						err = ErrTrailingQuoteEscape
 						token := &Token{
 							tokenType: tokenType,
 							value:     string(value)}
@@ -317,7 +323,7 @@ func (t *Tokenizer) scanStream() (*Token, error) {
 				switch nextRuneType {
 				case eofRuneClass:
 					{
-						err = fmt.Errorf("EOF found when expecting closing quote")
+						err = ErrUnclosedDoubleQuote
 						token := &Token{
 							tokenType: tokenType,
 							value:     string(value)}
@@ -342,7 +348,7 @@ func (t *Tokenizer) scanStream() (*Token, error) {
 				switch nextRuneType {
 				case eofRuneClass:
 					{
-						err = fmt.Errorf("EOF found when expecting closing quote")
+						err = ErrUnclosedSingleQuote
 						token := &Token{
 							tokenType: tokenType,
 							value:     string(value)}
@@ -399,6 +405,11 @@ func (t *Tokenizer) Next() (*Token, error) {
 	return t.scanStream()
 }
 
+// Pos returns the current position in the input stream as a byte offset.
+func (t *Tokenizer) Pos() int64 {
+	return t.pos
+}
+
 // Split partitions a string into a slice of strings.
 func Split(s string) ([]string, error) {
 	l := NewLexer(strings.NewReader(s))
diff --git a/tools/utils/shlex/shlex_test.go b/tools/utils/shlex/shlex_test.go
index f9f9e0c79..c53cff01a 100644
--- a/tools/utils/shlex/shlex_test.go
+++ b/tools/utils/shlex/shlex_test.go
@@ -43,20 +43,20 @@ func TestClassifier(t *testing.T) {
 }
 
 func TestTokenizer(t *testing.T) {
-	testInput := strings.NewReader(testString)
+	testInput := testString
 	expectedTokens := []*Token{
-		&Token{WordToken, "one"},
-		&Token{WordToken, "two"},
-		&Token{WordToken, "three four"},
-		&Token{WordToken, "five \"six\""},
-		&Token{WordToken, "seven#eight"},
-		&Token{CommentToken, " nine # ten"},
-		&Token{WordToken, "eleven"},
-		&Token{WordToken, "twelve\\"},
-		&Token{WordToken, "thirteen=13"},
-		&Token{WordToken, "fourteen/14"}}
+		{WordToken, "one"},
+		{WordToken, "two"},
+		{WordToken, "three four"},
+		{WordToken, "five \"six\""},
+		{WordToken, "seven#eight"},
+		{CommentToken, " nine # ten"},
+		{WordToken, "eleven"},
+		{WordToken, "twelve\\"},
+		{WordToken, "thirteen=13"},
+		{WordToken, "fourteen/14"}}
 
-	tokenizer := NewTokenizer(testInput)
+	tokenizer := NewTokenizer(strings.NewReader(testInput))
 	for i, want := range expectedTokens {
 		got, err := tokenizer.Next()
 		if err != nil {
@@ -69,10 +69,10 @@ func TestTokenizer(t *testing.T) {
 }
 
 func TestLexer(t *testing.T) {
-	testInput := strings.NewReader(testString)
+	testInput := testString
 	expectedStrings := []string{"one", "two", "three four", "five \"six\"",
 		"seven#eight", "eleven", "twelve\\", "thirteen=13", "fourteen/14"}
-	lexer := NewLexer(testInput)
+	lexer := NewLexer(strings.NewReader(testInput))
 	for i, want := range expectedStrings {
 		got, err := lexer.Next()
 		if err != nil {
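
For reviewers, a minimal sketch of how the new surface area composes from calling code. The import path is hypothetical; `Pos` and the sentinel errors are taken from the diff above. On an unterminated quote, `Next` returns the partial token alongside the matching sentinel, so `errors.Is` can distinguish failure modes that previously differed only in message text:

```go
package main

import (
	"errors"
	"fmt"
	"io"
	"strings"

	// Hypothetical import path; adjust to wherever this package lives.
	"example.com/tools/utils/shlex"
)

func main() {
	// Input ending in an unterminated double quote, to exercise both the
	// byte-offset tracking and the new sentinel errors.
	t := shlex.NewTokenizer(strings.NewReader(`one "two`))
	for {
		tok, err := t.Next()
		if errors.Is(err, io.EOF) {
			break // clean end of input
		}
		if err != nil {
			// Sentinel values allow errors.Is checks, which the old
			// fmt.Errorf results did not. Note Next also returns the
			// partial token alongside the error; it is ignored here.
			if errors.Is(err, shlex.ErrUnclosedDoubleQuote) {
				fmt.Printf("unclosed double quote after %d bytes\n", t.Pos())
			}
			return
		}
		fmt.Printf("token %v, %d bytes consumed so far\n", tok, t.Pos())
	}
}
```

One design note on the offset tracking: because `t.pos += int64(sz)` runs after the EOF check, a failed read is never counted, so `Pos()` always reports exactly the bytes of successfully consumed runes.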