Have the shlex tokenizer track position in stream and return defined error types

Kovid Goyal 2022-11-10 19:36:45 +05:30
parent 8d76cf8d32
commit 1485981b11
2 changed files with 47 additions and 36 deletions
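
A minimal caller-side sketch (not part of this commit) of what the change enables, assuming the exported identifiers from the diff below are in scope and that errors, fmt, io and strings are imported: tokenization failures can now be matched with errors.Is against exported sentinel values instead of comparing fmt.Errorf message strings, and Tokenizer.Pos() reports the byte offset reached in the input.

// Hypothetical usage sketch, written as if inside the shlex package.
t := NewTokenizer(strings.NewReader(`say "never closed`))
for {
    _, err := t.Next()
    if err == io.EOF {
        break // input exhausted normally
    }
    if errors.Is(err, ErrUnclosedDoubleQuote) {
        fmt.Printf("unclosed double quote after %d bytes\n", t.Pos())
        break
    }
    if err != nil {
        break // some other tokenization error
    }
}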


@@ -20,27 +20,26 @@ shell-style rules for quoting and commenting.
The basic use case uses the default ASCII lexer to split a string into sub-strings:
shlex.Split("one \"two three\" four") -> []string{"one", "two three", "four"}
shlex.Split("one \"two three\" four") -> []string{"one", "two three", "four"}
To process a stream of strings:
l := NewLexer(os.Stdin)
for ; token, err := l.Next(); err != nil {
// process token
}
To access the raw token stream (which includes tokens for comments):
t := NewTokenizer(os.Stdin)
for ; token, err := t.Next(); err != nil {
// process token
}
*/
package shlex
import (
"bufio"
"errors"
"fmt"
"io"
"strings"
@@ -142,9 +141,9 @@ func (t tokenClassifier) ClassifyRune(runeVal rune) runeTokenClass {
type Lexer Tokenizer
// NewLexer creates a new lexer from an input stream.
func NewLexer(r io.Reader) *Lexer {
func NewLexer(x io.RuneReader) *Lexer {
return (*Lexer)(NewTokenizer(r))
return (*Lexer)(NewTokenizer(x))
}
// Next returns the next word, or an error. If there are no more words,
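
Since NewLexer (and NewTokenizer below) now accept an io.RuneReader rather than an io.Reader, call sites that hold a plain io.Reader, such as the os.Stdin example in the package comment, would need to wrap it themselves; a sketch of the likely adaptations:

stdinLexer := NewLexer(bufio.NewReader(os.Stdin))       // *bufio.Reader implements io.RuneReader
stringLexer := NewLexer(strings.NewReader("a 'b c' d")) // *strings.Reader satisfies it directly
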
@@ -168,19 +167,24 @@ func (l *Lexer) Next() (string, error) {
// Tokenizer turns an input stream into a sequence of typed tokens
type Tokenizer struct {
input bufio.Reader
input io.RuneReader
classifier tokenClassifier
pos int64
}
// NewTokenizer creates a new tokenizer from an input stream.
func NewTokenizer(r io.Reader) *Tokenizer {
input := bufio.NewReader(r)
func NewTokenizer(input io.RuneReader) *Tokenizer {
classifier := newDefaultClassifier()
return &Tokenizer{
input: *input,
input: input,
classifier: classifier}
}
var ErrTrailingEscape error = errors.New("EOF found after escape character")
var ErrTrailingQuoteEscape error = errors.New("EOF found after escape character for double quote")
var ErrUnclosedDoubleQuote error = errors.New("EOF found when expecting closing double quote")
var ErrUnclosedSingleQuote error = errors.New("EOF found when expecting closing single quote")
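
Because these are package-level error values rather than messages built ad hoc with fmt.Errorf, a later layer can add context, such as the position, while callers keep matching on the sentinel; a hypothetical sketch, assuming t is a *Tokenizer that just failed:

wrapped := fmt.Errorf("shlex: at byte offset %d: %w", t.Pos(), ErrUnclosedDoubleQuote)
fmt.Println(errors.Is(wrapped, ErrUnclosedDoubleQuote)) // true: %w keeps the sentinel matchable
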
// scanStream scans the stream for the next token using the internal state machine.
// It will panic if it encounters a rune which it does not know how to handle.
func (t *Tokenizer) scanStream() (*Token, error) {
@@ -190,9 +194,10 @@ func (t *Tokenizer) scanStream() (*Token, error) {
var nextRune rune
var nextRuneType runeTokenClass
var err error
var sz int
for {
nextRune, _, err = t.input.ReadRune()
nextRune, sz, err = t.input.ReadRune()
nextRuneType = t.classifier.ClassifyRune(nextRune)
if err == io.EOF {
@@ -201,6 +206,7 @@ func (t *Tokenizer) scanStream() (*Token, error) {
} else if err != nil {
return nil, err
}
t.pos += int64(sz)
switch state {
case startState: // no runes read yet
@@ -281,7 +287,7 @@ func (t *Tokenizer) scanStream() (*Token, error) {
switch nextRuneType {
case eofRuneClass:
{
err = fmt.Errorf("EOF found after escape character")
err = ErrTrailingEscape
token := &Token{
tokenType: tokenType,
value: string(value)}
@@ -299,7 +305,7 @@ func (t *Tokenizer) scanStream() (*Token, error) {
switch nextRuneType {
case eofRuneClass:
{
err = fmt.Errorf("EOF found after escape character")
err = ErrTrailingQuoteEscape
token := &Token{
tokenType: tokenType,
value: string(value)}
@@ -317,7 +323,7 @@ func (t *Tokenizer) scanStream() (*Token, error) {
switch nextRuneType {
case eofRuneClass:
{
err = fmt.Errorf("EOF found when expecting closing quote")
err = ErrUnclosedDoubleQuote
token := &Token{
tokenType: tokenType,
value: string(value)}
@@ -342,7 +348,7 @@ func (t *Tokenizer) scanStream() (*Token, error) {
switch nextRuneType {
case eofRuneClass:
{
err = fmt.Errorf("EOF found when expecting closing quote")
err = ErrUnclosedSingleQuote
token := &Token{
tokenType: tokenType,
value: string(value)}
@@ -399,6 +405,11 @@ func (t *Tokenizer) Next() (*Token, error) {
return t.scanStream()
}
// Pos returns the current position in the string as a byte offset
func (t *Tokenizer) Pos() int64 {
return t.pos
}
// Split partitions a string into a slice of strings.
func Split(s string) ([]string, error) {
l := NewLexer(strings.NewReader(s))
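
Pos() is a byte offset: the tokenizer adds int64(sz) from each ReadRune call, so multi-byte UTF-8 runes advance it by their encoded length, not by one; a small sketch (same in-scope assumptions as before):

t := NewTokenizer(strings.NewReader("ab αβ")) // 7 bytes: α and β are 2 bytes each in UTF-8
for {
    if _, err := t.Next(); err != nil {
        break // io.EOF once the input is exhausted
    }
}
fmt.Println(t.Pos()) // 7: bytes consumed, not the 5 runes read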


@@ -43,20 +43,20 @@ func TestClassifier(t *testing.T) {
}
func TestTokenizer(t *testing.T) {
testInput := strings.NewReader(testString)
testInput := testString
expectedTokens := []*Token{
&Token{WordToken, "one"},
&Token{WordToken, "two"},
&Token{WordToken, "three four"},
&Token{WordToken, "five \"six\""},
&Token{WordToken, "seven#eight"},
&Token{CommentToken, " nine # ten"},
&Token{WordToken, "eleven"},
&Token{WordToken, "twelve\\"},
&Token{WordToken, "thirteen=13"},
&Token{WordToken, "fourteen/14"}}
{WordToken, "one"},
{WordToken, "two"},
{WordToken, "three four"},
{WordToken, "five \"six\""},
{WordToken, "seven#eight"},
{CommentToken, " nine # ten"},
{WordToken, "eleven"},
{WordToken, "twelve\\"},
{WordToken, "thirteen=13"},
{WordToken, "fourteen/14"}}
tokenizer := NewTokenizer(testInput)
tokenizer := NewTokenizer(strings.NewReader(testInput))
for i, want := range expectedTokens {
got, err := tokenizer.Next()
if err != nil {
@@ -69,10 +69,10 @@ func TestTokenizer(t *testing.T) {
}
func TestLexer(t *testing.T) {
testInput := strings.NewReader(testString)
testInput := testString
expectedStrings := []string{"one", "two", "three four", "five \"six\"", "seven#eight", "eleven", "twelve\\", "thirteen=13", "fourteen/14"}
lexer := NewLexer(testInput)
lexer := NewLexer(strings.NewReader(testInput))
for i, want := range expectedStrings {
got, err := lexer.Next()
if err != nil {
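
A test along these lines (not part of this commit, and assuming the test file imports "errors") could pin down both new behaviors, the sentinel error value and the byte-offset accounting:

func TestUnclosedQuoteSentinelAndPos(t *testing.T) {
    input := `echo "unterminated`
    tok := NewTokenizer(strings.NewReader(input))
    tok.Next() // consumes the word "echo"
    _, err := tok.Next()
    if !errors.Is(err, ErrUnclosedDoubleQuote) {
        t.Errorf("expected ErrUnclosedDoubleQuote, got: %v", err)
    }
    if got := tok.Pos(); got != int64(len(input)) {
        t.Errorf("expected Pos() == %d, got: %d", len(input), got)
    }
}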