Allow using the anchored diff from the Go stdlib as the diff implementation

This commit is contained in:
Kovid Goyal 2023-03-23 12:30:56 +05:30
parent 9c188096d0
commit 2ac170c1b1
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 321 additions and 31 deletions

View File

@ -124,7 +124,7 @@ The diff kitten makes use of various features that are :doc:`kitty only
</graphics-protocol>`, the :doc:`extended keyboard protocol </graphics-protocol>`, the :doc:`extended keyboard protocol
</keyboard-protocol>`, etc. It also leverages terminal program infrastructure </keyboard-protocol>`, etc. It also leverages terminal program infrastructure
I created for all of kitty's other kittens to reduce the amount of code needed I created for all of kitty's other kittens to reduce the amount of code needed
(the entire implementation is under 2000 lines of code). (the entire implementation is under 3000 lines of code).
And fundamentally, it's kitty only because I wrote it for myself, and I am And fundamentally, it's kitty only because I wrote it for myself, and I am
highly unlikely to use any other terminals :) highly unlikely to use any other terminals :)

View File

@ -37,9 +37,11 @@ opt('num_context_lines', '3',
opt('diff_cmd', 'auto', opt('diff_cmd', 'auto',
long_text=''' long_text='''
The diff command to use. Must contain the placeholder :code:`_CONTEXT_` which The diff command to use. Must contain the placeholder :code:`_CONTEXT_` which
will be replaced by the number of lines of context. The default special value will be replaced by the number of lines of context. A few special values are allowed:
:code:`auto` is to search the system for either :program:`git` or :code:`auto` will automatically pick an available diff implementation. :code:`builtin`
:program:`diff` and use that, if found. will use the anchored diff algorithm from the Go standard library. :code:`git` will
use the git command to do the diffing. :code:`diff` will use the diff command to
do the diffing.
''' '''
) )

264
tools/cmd/diff/diff.go Normal file
View File

@ -0,0 +1,264 @@
// Copied from the Go stdlib, with modifications.
// https://github.com/golang/go/raw/master/src/internal/diff/diff.go
// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package diff
import (
"bytes"
"fmt"
"sort"
"strings"
)
// pair holds one value for the x (old) side of a diff and one for the
// y (new) side. In this file the two values are usually line indexes or
// line counts.
type pair struct {
	x int
	y int
}
// Diff returns an anchored diff of the two texts old and new
// in the “unified diff” format. If old and new are identical,
// Diff returns a nil slice (no output).
//
// oldName and newName are written verbatim into the diff header lines.
// num_of_context_lines is the maximum number of unchanged lines emitted
// before and after each change; changes separated by fewer than twice that
// many common lines are merged into one chunk. Negative values are treated
// as zero.
//
// Unix diff implementations typically look for a diff with
// the smallest number of lines inserted and removed,
// which can in the worst case take time quadratic in the
// number of lines in the texts. As a result, many implementations
// either can be made to run for a long time or cut off the search
// after a predetermined amount of work.
//
// In contrast, this implementation looks for a diff with the
// smallest number of “unique” lines inserted and removed,
// where unique means a line that appears just once in both old and new.
// We call this an “anchored diff” because the unique lines anchor
// the chosen matching regions. An anchored diff is usually clearer
// than a standard diff, because the algorithm does not try to
// reuse unrelated blank lines or closing braces.
// The algorithm also guarantees to run in O(n log n) time
// instead of the standard O(n²) time.
//
// Some systems call this approach a “patience diff,” named for
// the “patience sorting” algorithm, itself named for a solitaire card game.
// We avoid that name for two reasons. First, the name has been used
// for a few different variants of the algorithm, so it is imprecise.
// Second, the name is frequently interpreted as meaning that you have
// to wait longer (to be patient) for the diff, meaning that it is a slower algorithm,
// when in fact the algorithm is faster than the standard one.
func Diff(oldName, old, newName, new string, num_of_context_lines int) []byte {
	if old == new {
		return nil
	}
	x := lines(old)
	y := lines(new)

	// C is the number of context lines. A negative value would produce
	// inverted slice bounds below (for example x[end.x-C:end.x]) and panic,
	// so it is clamped to "no context".
	C := num_of_context_lines
	if C < 0 {
		C = 0
	}

	// Print diff header.
	var out bytes.Buffer
	fmt.Fprintf(&out, "diff %s %s\n", oldName, newName)
	fmt.Fprintf(&out, "--- %s\n", oldName)
	fmt.Fprintf(&out, "+++ %s\n", newName)

	// Loop over matches to consider,
	// expanding each match to include surrounding lines,
	// and then printing diff chunks.
	// To avoid setup/teardown cases outside the loop,
	// tgs returns a leading {0,0} and trailing {len(x), len(y)} pair
	// in the sequence of matches.
	var (
		done  pair     // printed up to x[:done.x] and y[:done.y]
		chunk pair     // start lines of current chunk
		count pair     // number of lines from each side in current chunk
		ctext []string // lines for current chunk
	)
	for _, m := range tgs(x, y) {
		if m.x < done.x {
			// Already handled scanning forward from earlier match.
			continue
		}

		// Expand matching lines as far as possible,
		// establishing that x[start.x:end.x] == y[start.y:end.y].
		// Note that on the first (or last) iteration we may (or definitely do)
		// have an empty match: start.x==end.x and start.y==end.y.
		start := m
		for start.x > done.x && start.y > done.y && x[start.x-1] == y[start.y-1] {
			start.x--
			start.y--
		}
		end := m
		for end.x < len(x) && end.y < len(y) && x[end.x] == y[end.y] {
			end.x++
			end.y++
		}

		// Emit the mismatched lines before start into this chunk.
		// (No effect on first sentinel iteration, when start = {0,0}.)
		for _, s := range x[done.x:start.x] {
			ctext = append(ctext, "-"+s)
			count.x++
		}
		for _, s := range y[done.y:start.y] {
			ctext = append(ctext, "+"+s)
			count.y++
		}

		// If we're not at EOF and have too few common lines,
		// the chunk includes all the common lines and continues.
		if (end.x < len(x) || end.y < len(y)) &&
			(end.x-start.x < C || (len(ctext) > 0 && end.x-start.x < 2*C)) {
			for _, s := range x[start.x:end.x] {
				ctext = append(ctext, " "+s)
				count.x++
				count.y++
			}
			done = end
			continue
		}

		// End chunk with common lines for context.
		if len(ctext) > 0 {
			n := end.x - start.x
			if n > C {
				n = C
			}
			for _, s := range x[start.x : start.x+n] {
				ctext = append(ctext, " "+s)
				count.x++
				count.y++
			}
			done = pair{start.x + n, start.y + n}

			// Format and emit chunk.
			// Convert line numbers to 1-indexed.
			// Special case: empty file shows up as 0,0 not 1,0.
			if count.x > 0 {
				chunk.x++
			}
			if count.y > 0 {
				chunk.y++
			}
			fmt.Fprintf(&out, "@@ -%d,%d +%d,%d @@\n", chunk.x, count.x, chunk.y, count.y)
			for _, s := range ctext {
				out.WriteString(s)
			}
			count.x = 0
			count.y = 0
			ctext = ctext[:0]
		}

		// If we reached EOF, we're done.
		if end.x >= len(x) && end.y >= len(y) {
			break
		}

		// Otherwise start a new chunk, seeded with up to C lines of leading
		// context. Reaching this point without being at EOF implies
		// end.x-start.x >= C, so end.x-C and end.y-C are valid indexes.
		chunk = pair{end.x - C, end.y - C}
		for _, s := range x[chunk.x:end.x] {
			ctext = append(ctext, " "+s)
			count.x++
			count.y++
		}
		done = end
	}
	return out.Bytes()
}
// lines splits the text x into its lines, each keeping its trailing newline.
// When x does not end in a newline, the final line gets a newline plus the
// same "\ No newline at end of file" marker line that BSD/GNU diff print
// (leading backslash included). An empty x yields no lines.
func lines(x string) []string {
	parts := strings.SplitAfter(x, "\n")
	last := len(parts) - 1
	if parts[last] == "" {
		// x was empty or ended in "\n": drop the empty trailing piece.
		return parts[:last]
	}
	parts[last] += "\n\\ No newline at end of file\n"
	return parts
}
// tgs computes the longest common subsequence of the unique lines of x and y
// and returns it as pairs of (index in x, index in y). A line is unique when
// it occurs exactly once in x and exactly once in y. The result is always
// bracketed by the sentinel pairs {0,0} and {len(x),len(y)}.
//
// The longest common subsequence algorithm is as described in
// Thomas G. Szymanski, “A Special Case of the Maximal Common
// Subsequence Problem,” Princeton TR #170 (January 1975),
// available at https://research.swtch.com/tgs170.pdf.
func tgs(x, y []string) []pair {
	// Tally how often each line occurs in x and in y. Only 0, 1 and "many"
	// matter, encoded as 0, -1, -2 for the x side and 0, -4, -8 for the
	// y side. Negative tallies leave the non-negative range free for the
	// line numbers stored in the same map further down.
	tally := make(map[string]int)
	for _, line := range x {
		if seen := tally[line]; seen > -2 {
			tally[line] = seen - 1
		}
	}
	for _, line := range y {
		if seen := tally[line]; seen > -8 {
			tally[line] = seen - 4
		}
	}

	// A line that is unique on both sides now has tally -1 + -4.
	//
	// Collect the positions of those lines:
	//	xi[i]  = increasing indexes of unique lines in x.
	//	yi[i]  = increasing indexes of unique lines in y.
	//	inv[i] = index j such that x[xi[i]] = y[yi[j]].
	const uniqueInBoth = -1 + -4
	var xi, yi, inv []int
	for idx, line := range y {
		if tally[line] == uniqueInBoth {
			// Replace the tally with this line's position in yi.
			tally[line] = len(yi)
			yi = append(yi, idx)
		}
	}
	for idx, line := range x {
		if j, found := tally[line]; found && j >= 0 {
			xi = append(xi, idx)
			inv = append(inv, j)
		}
	}

	// Apply Algorithm A from Szymanski's paper.
	// In those terms, A = J = inv and B = [0, n).
	n := len(xi)
	tails := make([]int, n)  // T in the paper
	length := make([]int, n) // L in the paper
	for i := range tails {
		tails[i] = n + 1
	}
	for i, j := range inv {
		pos := sort.Search(n, func(p int) bool {
			return tails[p] >= j
		})
		tails[pos] = j
		length[i] = pos + 1
	}

	// The largest recorded length is the number of matched pairs.
	longest := 0
	for _, v := range length {
		if v > longest {
			longest = v
		}
	}

	// Fill the result from the back, with a sentinel pair at each end to
	// simplify the caller's processing loop.
	seq := make([]pair, longest+2)
	seq[0] = pair{0, 0}                   // sentinel at start
	seq[longest+1] = pair{len(x), len(y)} // sentinel at end
	upper := n                            // never lowered; every inv value is an index into yi
	for i, want := n-1, longest; i >= 0; i-- {
		if length[i] == want && inv[i] < upper {
			seq[want] = pair{xi[i], yi[inv[i]]}
			want--
		}
	}
	return seq
}

View File

@ -30,27 +30,34 @@ var DiffExe = (&utils.Once[string]{Run: func() string {
return utils.FindExe("diff") return utils.FindExe("diff")
}}).Get }}).Get
func find_differ() error { func find_differ() {
if GitExe() != "git" && exec.Command(GitExe(), "--help").Run() == nil { if GitExe() != "git" && exec.Command(GitExe(), "--help").Run() == nil {
diff_cmd, _ = shlex.Split(GIT_DIFF) diff_cmd, _ = shlex.Split(GIT_DIFF)
return nil } else if DiffExe() != "diff" && exec.Command(DiffExe(), "--help").Run() == nil {
}
if DiffExe() != "diff" && exec.Command(DiffExe(), "--help").Run() == nil {
diff_cmd, _ = shlex.Split(DIFF_DIFF) diff_cmd, _ = shlex.Split(DIFF_DIFF)
return nil } else {
diff_cmd = []string{}
} }
return fmt.Errorf("Neither the git nor the diff programs were found in PATH")
} }
func set_diff_command(q string) error { func set_diff_command(q string) error {
if q == "auto" { switch q {
return find_differ() case "auto":
} find_differ()
case "builtin", "":
diff_cmd = []string{}
case "diff":
diff_cmd, _ = shlex.Split(DIFF_DIFF)
case "git":
diff_cmd, _ = shlex.Split(GIT_DIFF)
default:
c, err := shlex.Split(q) c, err := shlex.Split(q)
if err == nil { if err != nil {
return err
}
diff_cmd = c diff_cmd = c
} }
return err return nil
} }
type Center struct{ prefix_count, suffix_count int } type Center struct{ prefix_count, suffix_count int }
@ -273,10 +280,6 @@ func parse_patch(raw string, left_lines, right_lines []string) (ans *Patch, err
} }
func run_diff(file1, file2 string, num_of_context_lines int) (ok, is_different bool, patch string, err error) { func run_diff(file1, file2 string, num_of_context_lines int) (ok, is_different bool, patch string, err error) {
context := strconv.Itoa(num_of_context_lines)
cmd := utils.Map(func(x string) string {
return strings.ReplaceAll(x, "_CONTEXT_", context)
}, diff_cmd)
// we resolve symlinks because git diff does not follow symlinks, while diff // we resolve symlinks because git diff does not follow symlinks, while diff
// does. We want consistent behavior, also for integration with git difftool // does. We want consistent behavior, also for integration with git difftool
// we always want symlinks to be followed. // we always want symlinks to be followed.
@ -288,6 +291,26 @@ func run_diff(file1, file2 string, num_of_context_lines int) (ok, is_different b
if err != nil { if err != nil {
return return
} }
if len(diff_cmd) == 0 {
data1, err := data_for_path(path1)
if err != nil {
return false, false, "", err
}
data2, err := data_for_path(path2)
if err != nil {
return false, false, "", err
}
patchb := Diff(path1, data1, path2, data2, num_of_context_lines)
if patchb == nil {
return true, false, "", nil
}
return true, len(patchb) > 0, utils.UnsafeBytesToString(patchb), nil
} else {
context := strconv.Itoa(num_of_context_lines)
cmd := utils.Map(func(x string) string {
return strings.ReplaceAll(x, "_CONTEXT_", context)
}, diff_cmd)
cmd = append(cmd, path1, path2) cmd = append(cmd, path1, path2)
c := exec.Command(cmd[0], cmd[1:]...) c := exec.Command(cmd[0], cmd[1:]...)
stdout, stderr := bytes.Buffer{}, bytes.Buffer{} stdout, stderr := bytes.Buffer{}, bytes.Buffer{}
@ -301,6 +324,7 @@ func run_diff(file1, file2 string, num_of_context_lines int) (ok, is_different b
return false, false, stderr.String(), err return false, false, stderr.String(), err
} }
return true, false, stdout.String(), nil return true, false, stdout.String(), nil
}
} }
func do_diff(file1, file2 string, context_count int) (ans *Patch, err error) { func do_diff(file1, file2 string, context_count int) (ans *Patch, err error) {