From 5436408463298ecdccce2a13ee68ed476fc8d486 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 23 Oct 2022 13:00:19 +0530 Subject: [PATCH] Fix the escape code parser not preserving complete utf-8 state between calls to ParseBytes() Also allow it to be driven byte-by-byte --- tools/wcswidth/escape-code-parser.go | 73 +++++++++++++++++----------- tools/wcswidth/wcswidth.go | 16 +++++- 2 files changed, 59 insertions(+), 30 deletions(-) diff --git a/tools/wcswidth/escape-code-parser.go b/tools/wcswidth/escape-code-parser.go index 816ac854c..6786d6d9e 100644 --- a/tools/wcswidth/escape-code-parser.go +++ b/tools/wcswidth/escape-code-parser.go @@ -4,10 +4,12 @@ package wcswidth import ( "bytes" - "errors" + "fmt" "kitty/tools/utils" ) +var _ = fmt.Print + type parser_state uint8 type csi_state uint8 type csi_char_type uint8 @@ -39,12 +41,14 @@ const ( type EscapeCodeParser struct { state parser_state - utf8_state utils.UTF8State + utf8_state, utf8_codep utils.UTF8State csi_state csi_state current_buffer []byte bracketed_paste_buffer []utils.UTF8State current_callback func([]byte) error + ReplaceInvalidUtf8Bytes bool + // Callbacks HandleRune func(rune) error HandleEndOfBracketedPaste func() @@ -58,42 +62,50 @@ type EscapeCodeParser struct { func (self *EscapeCodeParser) InBracketedPaste() bool { return self.state == bracketed_paste } -var reparse_byte = errors.New("") - func (self *EscapeCodeParser) ParseString(s string) error { return self.Parse(utils.UnsafeStringToBytes(s)) } -func (self *EscapeCodeParser) Parse(data []byte) error { - prev := utils.UTF8_ACCEPT - codep := utils.UTF8_ACCEPT - for i := 0; i < len(data); i++ { - switch self.state { - case normal, bracketed_paste: - switch utils.DecodeUtf8(&self.utf8_state, &codep, data[i]) { - case utils.UTF8_ACCEPT: - err := self.dispatch_char(codep) - if err != nil { - self.Reset() - return err - } - case utils.UTF8_REJECT: - self.utf8_state = utils.UTF8_ACCEPT - if prev != utils.UTF8_ACCEPT && i > 0 { - i-- - } - } - prev = self.utf8_state - default: - err := self.dispatch_byte(data[i]) +func (self *EscapeCodeParser) ParseByte(b byte) error { + switch self.state { + case normal, bracketed_paste: + prev_utf8_state := self.utf8_state + switch utils.DecodeUtf8(&self.utf8_state, &self.utf8_codep, b) { + case utils.UTF8_ACCEPT: + err := self.dispatch_char(self.utf8_codep) if err != nil { self.reset_state() - if err != reparse_byte { + return err + } + case utils.UTF8_REJECT: + self.utf8_state = utils.UTF8_ACCEPT + if prev_utf8_state != utils.UTF8_ACCEPT { + // reparse this byte with state set to UTF8_ACCEPT + return self.ParseByte(b) + } + if self.ReplaceInvalidUtf8Bytes { + err := self.dispatch_char(utils.UTF8State(0xfffd)) + if err != nil { return err } - i-- } } + default: + err := self.dispatch_byte(b) + if err != nil { + self.reset_state() + return err + } + } + return nil +} + +func (self *EscapeCodeParser) Parse(data []byte) error { + for _, b := range data { + err := self.ParseByte(b) + if err != nil { + return err + } } return nil } @@ -124,6 +136,7 @@ func (self *EscapeCodeParser) reset_state() { self.bracketed_paste_buffer = self.bracketed_paste_buffer[:0] self.state = normal self.utf8_state = utils.UTF8_ACCEPT + self.utf8_codep = utils.UTF8_ACCEPT self.current_callback = nil self.csi_state = parameter } @@ -260,7 +273,9 @@ func (self *EscapeCodeParser) dispatch_byte(ch byte) error { self.current_callback = self.HandleAPC case 'D', 'E', 'H', 'M', 'N', 'O', 'Z', '6', '7', '8', '9', '=', '>', 'F', 'c', 'l', 'm', 'n', 'o', '|', '}', '~': default: - return reparse_byte + // we drop this dangling Esc and reparse the byte after the esc + self.reset_state() + return self.ParseByte(ch) } case csi: self.write_ch(ch) diff --git a/tools/wcswidth/wcswidth.go b/tools/wcswidth/wcswidth.go index 1d3f40707..a424c8398 100644 --- a/tools/wcswidth/wcswidth.go +++ b/tools/wcswidth/wcswidth.go @@ -2,7 +2,13 @@ package wcswidth -import "kitty/tools/utils" +import ( + "fmt" + + "kitty/tools/utils" +) + +var _ = fmt.Print func IsFlagCodepoint(ch rune) bool { return 0x1F1E6 <= ch && ch <= 0x1F1FF @@ -19,6 +25,7 @@ type WCWidthIterator struct { prev_width, current_width int parser EscapeCodeParser state ecparser_state + rune_count uint } func CreateWCWidthIterator() *WCWidthIterator { @@ -31,10 +38,12 @@ func (self *WCWidthIterator) Reset() { self.prev_ch = 0 self.prev_width = 0 self.current_width = 0 + self.rune_count = 0 self.parser.Reset() } func (self *WCWidthIterator) handle_rune(ch rune) error { + self.rune_count += 1 const ( normal ecparser_state = 0 flag_pair_started ecparser_state = 3 @@ -83,6 +92,11 @@ func (self *WCWidthIterator) handle_rune(ch rune) error { return nil } +func (self *WCWidthIterator) ParseByte(b byte) (ans int) { + self.parser.ParseByte(b) + return self.current_width +} + func (self *WCWidthIterator) Parse(b []byte) (ans int) { self.current_width = 0 self.parser.Parse(b)