kitty/tools/utils/shlex/shlex.go

/*
Package shlex implements a simple lexer which splits input into tokens using
shell-style rules for quoting.

The basic use case uses the default ASCII lexer to split a string into sub-strings:

	shlex.Split("one \"two three\" four") -> []string{"one", "two three", "four"}

To process a stream of strings:

	l := NewLexer(os.Stdin)
	for {
		token, err := l.Next()
		if err != nil {
			break // err is io.EOF once the input is exhausted
		}
		// process token
	}

To access the raw token stream (which includes tokens for spaces):

	t := NewTokenizer(os.Stdin)
	for {
		token, err := t.Next()
		if err != nil {
			break
		}
		// process token
	}
*/
package shlex
// Based on https://pkg.go.dev/github.com/google/shlex with many improvements.
// Relicensed to GPLv3 since all my additions/changes are GPLv3, which makes
// the original work, which was Apache-2.0 licensed, also GPLv3.
import (
"errors"
"fmt"
"io"
"strings"
)
// TokenType is a top-level token classification: a word, space, or unknown.
type TokenType int

// runeTokenClass is the type of a UTF-8 character classification: a quote, space, or escape.
type runeTokenClass int

// lexerState is the internal state used by the lexer state machine.
type lexerState int
// Token is a (type, value) pair representing a lexical token.
type Token struct {
	Type  TokenType
	Value string
	Pos   int64 // byte offset in the input at which the token starts
}
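
// For example (an illustrative note, not from the original source), tokenizing
// the input `ls  "a b"` (two spaces) yields three tokens:
//
//	{Type: WordToken, Value: "ls", Pos: 0}
//	{Type: SpaceToken, Value: "  ", Pos: 2}
//	{Type: WordToken, Value: "a b", Pos: 4}
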
// Named classes of UTF-8 runes
const (
spaceRunes = " \t\r\n"
escapingQuoteRunes = `"`
nonEscapingQuoteRunes = "'"
escapeRunes = `\`
)
// Classes of rune token
const (
unknownRuneClass runeTokenClass = iota
spaceRuneClass
escapingQuoteRuneClass
nonEscapingQuoteRuneClass
escapeRuneClass
eofRuneClass
)
// Classes of lexical token
const (
	UnknownToken TokenType = iota
	WordToken
	SpaceToken
)
func (t TokenType) String() string {
switch t {
default:
return "UnknownToken"
case WordToken:
return "WordToken"
case SpaceToken:
return "SpaceToken"
}
}
// Lexer state machine states
const (
startState lexerState = iota // no runes have been seen
inWordState // processing regular runes in a word
inSpaceState // processing runes in a space
escapingState // we have just consumed an escape rune; the next rune is literal
escapingQuotedState // we have just consumed an escape rune within a quoted string
quotingEscapingState // we are within a quoted string that supports escaping ("...")
quotingState // we are within a string that does not support escaping ('...')
)
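
// As an illustrative sketch (not from the original source), lexing the input
// `a\ b` walks the machine through these states:
//
//	startState -'a'-> inWordState -'\'-> escapingState -' '-> inWordState
//	inWordState -'b'-> inWordState -EOF-> emit WordToken{Value: "a b"}
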
// tokenClassifier is used for classifying rune characters.
type tokenClassifier map[rune]runeTokenClass
func (typeMap tokenClassifier) addRuneClass(runes string, tokenType runeTokenClass) {
for _, runeChar := range runes {
typeMap[runeChar] = tokenType
}
}
// newDefaultClassifier creates a new classifier for ASCII characters.
func newDefaultClassifier() tokenClassifier {
t := tokenClassifier{}
t.addRuneClass(spaceRunes, spaceRuneClass)
t.addRuneClass(escapingQuoteRunes, escapingQuoteRuneClass)
t.addRuneClass(nonEscapingQuoteRunes, nonEscapingQuoteRuneClass)
t.addRuneClass(escapeRunes, escapeRuneClass)
return t
}
// ClassifyRune classifies a rune. Runes with no assigned class map to the
// zero value, unknownRuneClass.
func (t tokenClassifier) ClassifyRune(runeVal rune) runeTokenClass {
	return t[runeVal]
}
// Lexer turns an input stream into a sequence of tokens. Whitespace is skipped.
type Lexer Tokenizer
// NewLexer creates a new lexer from an input stream.
func NewLexer(x io.RuneReader) *Lexer {
return (*Lexer)(NewTokenizer(x))
}
// Next returns the next word, or an error. If there are no more words,
// the error will be io.EOF.
func (l *Lexer) Next() (string, error) {
for {
token, err := (*Tokenizer)(l).Next()
if err != nil {
return "", err
}
switch token.Type {
case WordToken:
return token.Value, nil
case SpaceToken:
// skip spaces
default:
return "", fmt.Errorf("Unknown token type: %s", token.Type)
}
}
}
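
// exampleLexerUsage is a minimal illustrative sketch (an addition for
// exposition, not part of the original API) showing how to drain a Lexer
// until io.EOF.
func exampleLexerUsage() {
	l := NewLexer(strings.NewReader(`one "two three" four`))
	for {
		word, err := l.Next()
		if err != nil {
			break // io.EOF once the input is exhausted
		}
		fmt.Println(word) // prints: one, two three, four (one per line)
	}
}
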
// Tokenizer turns an input stream into a sequence of typed tokens.
type Tokenizer struct {
	input      io.RuneReader
	classifier tokenClassifier
	pos        int64 // byte offset of the next rune to be read
	// redo_rune holds one rune of pushback: when sz > 0 the stored rune is
	// re-delivered on the next read instead of consuming new input.
	redo_rune struct {
		char      rune
		sz        int
		rune_type runeTokenClass
	}
}
// NewTokenizer creates a new tokenizer from an input stream.
func NewTokenizer(input io.RuneReader) *Tokenizer {
classifier := newDefaultClassifier()
return &Tokenizer{
input: input,
classifier: classifier}
}
var ErrTrailingEscape error = errors.New("EOF found after escape character")
var ErrTrailingQuoteEscape error = errors.New("EOF found after escape character for double quote")
var ErrUnclosedDoubleQuote error = errors.New("EOF found when expecting closing double quote")
var ErrUnclosedSingleQuote error = errors.New("EOF found when expecting closing single quote")
// scanStream scans the stream for the next token using the internal state
// machine. It returns an error, rather than panicking, if it reaches a state
// it does not know how to handle.
func (t *Tokenizer) scanStream() (*Token, error) {
state := startState
var tokenType TokenType
var nextRune rune
var nextRuneType runeTokenClass
var err error
var sz int
value := strings.Builder{}
pos_at_start := t.pos
	unread_rune := func() {
		// Push the current rune back so the next iteration re-processes it;
		// used when a rune terminates the current token.
		t.redo_rune.sz = sz
		t.redo_rune.char = nextRune
		t.redo_rune.rune_type = nextRuneType
		t.pos -= int64(sz)
	}
	token := func() *Token {
		// Package the accumulated value as a Token starting at pos_at_start.
		return &Token{tokenType, value.String(), pos_at_start}
	}
for {
if t.redo_rune.sz > 0 {
nextRune, sz = t.redo_rune.char, t.redo_rune.sz
nextRuneType = t.redo_rune.rune_type
t.redo_rune.sz = 0
} else {
nextRune, sz, err = t.input.ReadRune()
nextRuneType = t.classifier.ClassifyRune(nextRune)
}
if err == io.EOF {
nextRuneType = eofRuneClass
err = nil
} else if err != nil {
return nil, err
}
t.pos += int64(sz)
switch state {
case startState: // no runes read yet
{
switch nextRuneType {
case eofRuneClass:
{
return nil, io.EOF
}
case spaceRuneClass:
{
tokenType = SpaceToken
value.WriteRune(nextRune)
state = inSpaceState
}
case escapingQuoteRuneClass:
{
tokenType = WordToken
state = quotingEscapingState
}
case nonEscapingQuoteRuneClass:
{
tokenType = WordToken
state = quotingState
}
case escapeRuneClass:
{
tokenType = WordToken
state = escapingState
}
default:
{
tokenType = WordToken
value.WriteRune(nextRune)
state = inWordState
}
}
}
case inSpaceState: // in a sequence of spaces separating words
{
switch nextRuneType {
case spaceRuneClass:
{
value.WriteRune(nextRune)
}
default:
{
unread_rune()
return token(), err
}
}
}
case inWordState: // in a regular word
{
switch nextRuneType {
case eofRuneClass:
{
return token(), err
}
case spaceRuneClass:
{
unread_rune()
return token(), err
}
case escapingQuoteRuneClass:
{
state = quotingEscapingState
}
case nonEscapingQuoteRuneClass:
{
state = quotingState
}
case escapeRuneClass:
{
state = escapingState
}
default:
{
value.WriteRune(nextRune)
}
}
}
case escapingState: // the rune after an escape character
{
switch nextRuneType {
case eofRuneClass:
{
err = ErrTrailingEscape
return token(), err
}
default:
{
state = inWordState
value.WriteRune(nextRune)
}
}
}
case escapingQuotedState: // the next rune after an escape character, in double quotes
{
switch nextRuneType {
case eofRuneClass:
{
err = ErrTrailingQuoteEscape
return token(), err
}
default:
{
state = quotingEscapingState
value.WriteRune(nextRune)
}
}
}
case quotingEscapingState: // in escaping double quotes
{
switch nextRuneType {
case eofRuneClass:
{
err = ErrUnclosedDoubleQuote
return token(), err
}
case escapingQuoteRuneClass:
{
state = inWordState
}
case escapeRuneClass:
{
state = escapingQuotedState
}
default:
{
value.WriteRune(nextRune)
}
}
}
case quotingState: // in non-escaping single quotes
{
switch nextRuneType {
case eofRuneClass:
{
err = ErrUnclosedSingleQuote
return token(), err
}
case nonEscapingQuoteRuneClass:
{
state = inWordState
}
default:
{
value.WriteRune(nextRune)
}
}
}
default:
{
return nil, fmt.Errorf("Unexpected state: %v", state)
}
}
}
}
// Next returns the next token in the stream.
func (t *Tokenizer) Next() (*Token, error) {
return t.scanStream()
}
// Pos returns the current position in the input stream as a byte offset.
func (t *Tokenizer) Pos() int64 {
return t.pos
}
// Split partitions a string into a slice of strings.
func Split(s string) ([]string, error) {
l := NewLexer(strings.NewReader(s))
subStrings := make([]string, 0)
for {
word, err := l.Next()
if err != nil {
if err == io.EOF {
return subStrings, nil
}
return subStrings, err
}
subStrings = append(subStrings, word)
}
}
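
// exampleSplitUsage is a minimal illustrative sketch (an addition for
// exposition, not part of the original API): single quotes preserve
// backslashes literally, while double quotes support escaping.
func exampleSplitUsage() {
	words, err := Split(`cp '\tmp' "a b"`)
	if err != nil {
		panic(err)
	}
	fmt.Println(words) // [cp \tmp a b]
}
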
// SplitForCompletion partitions a string into a slice of strings. It differs
// from Split in being more relaxed about errors, in appending an empty string
// to the result if s ends with a SpaceToken (a new word is about to start),
// and in also returning the byte offset in s at which the last argument begins.
func SplitForCompletion(s string) (argv []string, position_of_last_arg int) {
t := NewTokenizer(strings.NewReader(s))
argv = make([]string, 0, len(s)/4)
token := &Token{}
for {
ntoken, err := t.Next()
if err == io.EOF {
if token.Type == SpaceToken {
argv = append(argv, "")
token.Pos += int64(len(token.Value))
}
return argv, int(token.Pos)
}
if ntoken == nil {
return []string{}, -1
}
switch ntoken.Type {
case WordToken:
argv = append(argv, ntoken.Value)
case SpaceToken:
// skip spaces
default:
return []string{}, -1
}
token = ntoken
}
}
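
// exampleSplitForCompletionUsage is a minimal illustrative sketch (an addition
// for exposition, not part of the original API) of the trailing-space behavior
// used by shell completion: a trailing space starts a new, empty argument.
func exampleSplitForCompletionUsage() {
	argv, pos := SplitForCompletion("git checko")
	fmt.Println(argv, pos) // [git checko] 4
	argv, pos = SplitForCompletion("git ")
	fmt.Println(argv, pos) // [git ] 4  (argv is ["git", ""])
}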