Fix the escape code parser not preserving the complete UTF-8 decoder state between calls to Parse()

Also allow it to be driven byte-by-byte via the new ParseByte() method.
This commit is contained in:
Kovid Goyal 2022-10-23 13:00:19 +05:30
parent d260d2f480
commit 5436408463
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 59 additions and 30 deletions

View File

@ -4,10 +4,12 @@ package wcswidth
import ( import (
"bytes" "bytes"
"errors" "fmt"
"kitty/tools/utils" "kitty/tools/utils"
) )
var _ = fmt.Print
type parser_state uint8 type parser_state uint8
type csi_state uint8 type csi_state uint8
type csi_char_type uint8 type csi_char_type uint8
@ -39,12 +41,14 @@ const (
type EscapeCodeParser struct { type EscapeCodeParser struct {
state parser_state state parser_state
utf8_state utils.UTF8State utf8_state, utf8_codep utils.UTF8State
csi_state csi_state csi_state csi_state
current_buffer []byte current_buffer []byte
bracketed_paste_buffer []utils.UTF8State bracketed_paste_buffer []utils.UTF8State
current_callback func([]byte) error current_callback func([]byte) error
ReplaceInvalidUtf8Bytes bool
// Callbacks // Callbacks
HandleRune func(rune) error HandleRune func(rune) error
HandleEndOfBracketedPaste func() HandleEndOfBracketedPaste func()
@ -58,42 +62,50 @@ type EscapeCodeParser struct {
func (self *EscapeCodeParser) InBracketedPaste() bool { return self.state == bracketed_paste } func (self *EscapeCodeParser) InBracketedPaste() bool { return self.state == bracketed_paste }
var reparse_byte = errors.New("")
func (self *EscapeCodeParser) ParseString(s string) error { func (self *EscapeCodeParser) ParseString(s string) error {
return self.Parse(utils.UnsafeStringToBytes(s)) return self.Parse(utils.UnsafeStringToBytes(s))
} }
func (self *EscapeCodeParser) Parse(data []byte) error { func (self *EscapeCodeParser) ParseByte(b byte) error {
prev := utils.UTF8_ACCEPT switch self.state {
codep := utils.UTF8_ACCEPT case normal, bracketed_paste:
for i := 0; i < len(data); i++ { prev_utf8_state := self.utf8_state
switch self.state { switch utils.DecodeUtf8(&self.utf8_state, &self.utf8_codep, b) {
case normal, bracketed_paste: case utils.UTF8_ACCEPT:
switch utils.DecodeUtf8(&self.utf8_state, &codep, data[i]) { err := self.dispatch_char(self.utf8_codep)
case utils.UTF8_ACCEPT:
err := self.dispatch_char(codep)
if err != nil {
self.Reset()
return err
}
case utils.UTF8_REJECT:
self.utf8_state = utils.UTF8_ACCEPT
if prev != utils.UTF8_ACCEPT && i > 0 {
i--
}
}
prev = self.utf8_state
default:
err := self.dispatch_byte(data[i])
if err != nil { if err != nil {
self.reset_state() self.reset_state()
if err != reparse_byte { return err
}
case utils.UTF8_REJECT:
self.utf8_state = utils.UTF8_ACCEPT
if prev_utf8_state != utils.UTF8_ACCEPT {
// reparse this byte with state set to UTF8_ACCEPT
return self.ParseByte(b)
}
if self.ReplaceInvalidUtf8Bytes {
err := self.dispatch_char(utils.UTF8State(0xfffd))
if err != nil {
return err return err
} }
i--
} }
} }
default:
err := self.dispatch_byte(b)
if err != nil {
self.reset_state()
return err
}
}
return nil
}
func (self *EscapeCodeParser) Parse(data []byte) error {
for _, b := range data {
err := self.ParseByte(b)
if err != nil {
return err
}
} }
return nil return nil
} }
@ -124,6 +136,7 @@ func (self *EscapeCodeParser) reset_state() {
self.bracketed_paste_buffer = self.bracketed_paste_buffer[:0] self.bracketed_paste_buffer = self.bracketed_paste_buffer[:0]
self.state = normal self.state = normal
self.utf8_state = utils.UTF8_ACCEPT self.utf8_state = utils.UTF8_ACCEPT
self.utf8_codep = utils.UTF8_ACCEPT
self.current_callback = nil self.current_callback = nil
self.csi_state = parameter self.csi_state = parameter
} }
@ -260,7 +273,9 @@ func (self *EscapeCodeParser) dispatch_byte(ch byte) error {
self.current_callback = self.HandleAPC self.current_callback = self.HandleAPC
case 'D', 'E', 'H', 'M', 'N', 'O', 'Z', '6', '7', '8', '9', '=', '>', 'F', 'c', 'l', 'm', 'n', 'o', '|', '}', '~': case 'D', 'E', 'H', 'M', 'N', 'O', 'Z', '6', '7', '8', '9', '=', '>', 'F', 'c', 'l', 'm', 'n', 'o', '|', '}', '~':
default: default:
return reparse_byte // we drop this dangling Esc and reparse the byte after the esc
self.reset_state()
return self.ParseByte(ch)
} }
case csi: case csi:
self.write_ch(ch) self.write_ch(ch)

View File

@ -2,7 +2,13 @@
package wcswidth package wcswidth
import "kitty/tools/utils" import (
"fmt"
"kitty/tools/utils"
)
var _ = fmt.Print
func IsFlagCodepoint(ch rune) bool { func IsFlagCodepoint(ch rune) bool {
return 0x1F1E6 <= ch && ch <= 0x1F1FF return 0x1F1E6 <= ch && ch <= 0x1F1FF
@ -19,6 +25,7 @@ type WCWidthIterator struct {
prev_width, current_width int prev_width, current_width int
parser EscapeCodeParser parser EscapeCodeParser
state ecparser_state state ecparser_state
rune_count uint
} }
func CreateWCWidthIterator() *WCWidthIterator { func CreateWCWidthIterator() *WCWidthIterator {
@ -31,10 +38,12 @@ func (self *WCWidthIterator) Reset() {
self.prev_ch = 0 self.prev_ch = 0
self.prev_width = 0 self.prev_width = 0
self.current_width = 0 self.current_width = 0
self.rune_count = 0
self.parser.Reset() self.parser.Reset()
} }
func (self *WCWidthIterator) handle_rune(ch rune) error { func (self *WCWidthIterator) handle_rune(ch rune) error {
self.rune_count += 1
const ( const (
normal ecparser_state = 0 normal ecparser_state = 0
flag_pair_started ecparser_state = 3 flag_pair_started ecparser_state = 3
@ -83,6 +92,11 @@ func (self *WCWidthIterator) handle_rune(ch rune) error {
return nil return nil
} }
func (self *WCWidthIterator) ParseByte(b byte) (ans int) {
self.parser.ParseByte(b)
return self.current_width
}
func (self *WCWidthIterator) Parse(b []byte) (ans int) { func (self *WCWidthIterator) Parse(b []byte) (ans int) {
self.current_width = 0 self.current_width = 0
self.parser.Parse(b) self.parser.Parse(b)