Fix the escape code parser not preserving complete UTF-8 decoder state between calls to Parse()

Also allow it to be driven byte-by-byte via the new ParseByte() method.
This commit is contained in:
Kovid Goyal 2022-10-23 13:00:19 +05:30
parent d260d2f480
commit 5436408463
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 59 additions and 30 deletions

View File

@ -4,10 +4,12 @@ package wcswidth
import (
"bytes"
"errors"
"fmt"
"kitty/tools/utils"
)
// Reference fmt so that its import counts as used even when nothing else in
// this file calls into the package.
var _ = fmt.Print
// parser_state is the type of EscapeCodeParser.state; the values normal and
// bracketed_paste are among those it is compared with or set to.
type parser_state uint8
// csi_state is the type of EscapeCodeParser.csi_state; reset_state sets that
// field to parameter.
type csi_state uint8
// csi_char_type is a small unsigned enumeration type.
// NOTE(review): its values and users are not visible in this view.
type csi_char_type uint8
@ -39,12 +41,14 @@ const (
type EscapeCodeParser struct {
state parser_state
utf8_state utils.UTF8State
utf8_state, utf8_codep utils.UTF8State
csi_state csi_state
current_buffer []byte
bracketed_paste_buffer []utils.UTF8State
current_callback func([]byte) error
ReplaceInvalidUtf8Bytes bool
// Callbacks
HandleRune func(rune) error
HandleEndOfBracketedPaste func()
@ -58,42 +62,50 @@ type EscapeCodeParser struct {
// InBracketedPaste reports whether the parser's current state is
// bracketed_paste.
func (self *EscapeCodeParser) InBracketedPaste() bool {
	in_paste := self.state == bracketed_paste
	return in_paste
}
// reparse_byte is a sentinel error value with an empty message. Code that
// uses it compares errors against this value directly rather than by text.
// NOTE(review): confirm it still has users now that bytes are re-fed through
// ParseByte.
var reparse_byte = errors.New("")
// ParseString parses the contents of s by converting it to a byte slice with
// utils.UnsafeStringToBytes and handing that slice to Parse. It returns
// whatever error Parse returns.
func (self *EscapeCodeParser) ParseString(s string) error {
	data := utils.UnsafeStringToBytes(s)
	return self.Parse(data)
}
func (self *EscapeCodeParser) Parse(data []byte) error {
prev := utils.UTF8_ACCEPT
codep := utils.UTF8_ACCEPT
for i := 0; i < len(data); i++ {
switch self.state {
case normal, bracketed_paste:
switch utils.DecodeUtf8(&self.utf8_state, &codep, data[i]) {
case utils.UTF8_ACCEPT:
err := self.dispatch_char(codep)
if err != nil {
self.Reset()
return err
}
case utils.UTF8_REJECT:
self.utf8_state = utils.UTF8_ACCEPT
if prev != utils.UTF8_ACCEPT && i > 0 {
i--
}
}
prev = self.utf8_state
default:
err := self.dispatch_byte(data[i])
func (self *EscapeCodeParser) ParseByte(b byte) error {
switch self.state {
case normal, bracketed_paste:
prev_utf8_state := self.utf8_state
switch utils.DecodeUtf8(&self.utf8_state, &self.utf8_codep, b) {
case utils.UTF8_ACCEPT:
err := self.dispatch_char(self.utf8_codep)
if err != nil {
self.reset_state()
if err != reparse_byte {
return err
}
case utils.UTF8_REJECT:
self.utf8_state = utils.UTF8_ACCEPT
if prev_utf8_state != utils.UTF8_ACCEPT {
// reparse this byte with state set to UTF8_ACCEPT
return self.ParseByte(b)
}
if self.ReplaceInvalidUtf8Bytes {
err := self.dispatch_char(utils.UTF8State(0xfffd))
if err != nil {
return err
}
i--
}
}
default:
err := self.dispatch_byte(b)
if err != nil {
self.reset_state()
return err
}
}
return nil
}
// Parse feeds every byte of data, in order, to ParseByte. It stops at the
// first byte for which ParseByte reports an error and returns that error;
// bytes after it are not consumed. It returns nil once all bytes have been
// processed successfully, including when data is empty.
func (self *EscapeCodeParser) Parse(data []byte) error {
	for i := 0; i < len(data); i++ {
		if err := self.ParseByte(data[i]); err != nil {
			return err
		}
	}
	return nil
}
@ -124,6 +136,7 @@ func (self *EscapeCodeParser) reset_state() {
self.bracketed_paste_buffer = self.bracketed_paste_buffer[:0]
self.state = normal
self.utf8_state = utils.UTF8_ACCEPT
self.utf8_codep = utils.UTF8_ACCEPT
self.current_callback = nil
self.csi_state = parameter
}
@ -260,7 +273,9 @@ func (self *EscapeCodeParser) dispatch_byte(ch byte) error {
self.current_callback = self.HandleAPC
case 'D', 'E', 'H', 'M', 'N', 'O', 'Z', '6', '7', '8', '9', '=', '>', 'F', 'c', 'l', 'm', 'n', 'o', '|', '}', '~':
default:
return reparse_byte
// we drop this dangling Esc and reparse the byte after the esc
self.reset_state()
return self.ParseByte(ch)
}
case csi:
self.write_ch(ch)

View File

@ -2,7 +2,13 @@
package wcswidth
import "kitty/tools/utils"
import (
"fmt"
"kitty/tools/utils"
)
// Reference fmt so that its import counts as used even when nothing else in
// this file calls into the package.
var _ = fmt.Print
func IsFlagCodepoint(ch rune) bool {
return 0x1F1E6 <= ch && ch <= 0x1F1FF
@ -19,6 +25,7 @@ type WCWidthIterator struct {
prev_width, current_width int
parser EscapeCodeParser
state ecparser_state
rune_count uint
}
func CreateWCWidthIterator() *WCWidthIterator {
@ -31,10 +38,12 @@ func (self *WCWidthIterator) Reset() {
self.prev_ch = 0
self.prev_width = 0
self.current_width = 0
self.rune_count = 0
self.parser.Reset()
}
func (self *WCWidthIterator) handle_rune(ch rune) error {
self.rune_count += 1
const (
normal ecparser_state = 0
flag_pair_started ecparser_state = 3
@ -83,6 +92,11 @@ func (self *WCWidthIterator) handle_rune(ch rune) error {
return nil
}
// ParseByte feeds the single byte b to the embedded escape code parser and
// returns current_width as it stands afterwards. current_width is not zeroed
// before the byte is parsed.
func (self *WCWidthIterator) ParseByte(b byte) (ans int) {
	// The parser's error result is discarded; only the width is reported.
	_ = self.parser.ParseByte(b)
	ans = self.current_width
	return ans
}
func (self *WCWidthIterator) Parse(b []byte) (ans int) {
self.current_width = 0
self.parser.Parse(b)