kitty/kittens/diff/collect.py
Kovid Goyal 75a94bcd96
Improving MIME type detection for some common file types when they are missing from the system MIME database
Also allow the user to specify their own database via mime.types in the
kitty config directory. See #3056
2020-10-25 13:42:11 +05:30

204 lines
6.3 KiB
Python

#!/usr/bin/env python3
# vim:fileencoding=utf-8
# License: GPL v3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net>
import os
import re
from contextlib import suppress
from functools import lru_cache
from hashlib import md5
from kitty.guess_mime_type import guess_type
from typing import TYPE_CHECKING, Dict, List, Set, Optional, Iterator, Tuple, Union
if TYPE_CHECKING:
from .highlight import DiffHighlight # noqa
path_name_map: Dict[str, str] = {}
class Segment:
__slots__ = ('start', 'end', 'start_code', 'end_code')
def __init__(self, start: int, start_code: str):
self.start = start
self.start_code = start_code
self.end: Optional[int] = None
self.end_code: Optional[str] = None
def __repr__(self) -> str:
return 'Segment(start={!r}, start_code={!r}, end={!r}, end_code={!r})'.format(
self.start, self.start_code, self.end, self.end_code)
class Collection:
def __init__(self) -> None:
self.changes: Dict[str, str] = {}
self.renames: Dict[str, str] = {}
self.adds: Set[str] = set()
self.removes: Set[str] = set()
self.all_paths: List[str] = []
self.type_map: Dict[str, str] = {}
self.added_count = self.removed_count = 0
def add_change(self, left_path: str, right_path: str) -> None:
self.changes[left_path] = right_path
self.all_paths.append(left_path)
self.type_map[left_path] = 'diff'
def add_rename(self, left_path: str, right_path: str) -> None:
self.renames[left_path] = right_path
self.all_paths.append(left_path)
self.type_map[left_path] = 'rename'
def add_add(self, right_path: str) -> None:
self.adds.add(right_path)
self.all_paths.append(right_path)
self.type_map[right_path] = 'add'
if isinstance(data_for_path(right_path), str):
self.added_count += len(lines_for_path(right_path))
def add_removal(self, left_path: str) -> None:
self.removes.add(left_path)
self.all_paths.append(left_path)
self.type_map[left_path] = 'removal'
if isinstance(data_for_path(left_path), str):
self.removed_count += len(lines_for_path(left_path))
def finalize(self) -> None:
def key(x: str) -> str:
return path_name_map.get(x, '')
self.all_paths.sort(key=key)
def __iter__(self) -> Iterator[Tuple[str, str, Optional[str]]]:
for path in self.all_paths:
typ = self.type_map[path]
if typ == 'diff':
data: Optional[str] = self.changes[path]
elif typ == 'rename':
data = self.renames[path]
else:
data = None
yield path, typ, data
def __len__(self) -> int:
return len(self.all_paths)
def collect_files(collection: Collection, left: str, right: str) -> None:
left_names: Set[str] = set()
right_names: Set[str] = set()
left_path_map: Dict[str, str] = {}
right_path_map: Dict[str, str] = {}
def walk(base: str, names: Set[str], pmap: Dict[str, str]) -> None:
for dirpath, dirnames, filenames in os.walk(base):
for filename in filenames:
path = os.path.abspath(os.path.join(dirpath, filename))
path_name_map[path] = name = os.path.relpath(path, base)
names.add(name)
pmap[name] = path
walk(left, left_names, left_path_map)
walk(right, right_names, right_path_map)
common_names = left_names & right_names
changed_names = {n for n in common_names if data_for_path(left_path_map[n]) != data_for_path(right_path_map[n])}
for n in changed_names:
collection.add_change(left_path_map[n], right_path_map[n])
removed = left_names - common_names
added = right_names - common_names
ahash = {a: hash_for_path(right_path_map[a]) for a in added}
rhash = {r: hash_for_path(left_path_map[r]) for r in removed}
for name, rh in rhash.items():
for n, ah in ahash.items():
if ah == rh and data_for_path(left_path_map[name]) == data_for_path(right_path_map[n]):
collection.add_rename(left_path_map[name], right_path_map[n])
added.discard(n)
break
else:
collection.add_removal(left_path_map[name])
for name in added:
collection.add_add(right_path_map[name])
sanitize_pat = re.compile('[\x00-\x09\x0b-\x1f\x7f\x80-\x9f]')
def sanitize(text: str) -> str:
ntext = text.replace('\r\n', '\n')
return sanitize_pat.sub('', ntext)
@lru_cache(maxsize=1024)
def mime_type_for_path(path: str) -> str:
return guess_type(path) or 'application/octet-stream'
@lru_cache(maxsize=1024)
def raw_data_for_path(path: str) -> bytes:
with open(path, 'rb') as f:
return f.read()
def is_image(path: Optional[str]) -> bool:
return mime_type_for_path(path).startswith('image/') if path else False
@lru_cache(maxsize=1024)
def data_for_path(path: str) -> Union[str, bytes]:
raw_bytes = raw_data_for_path(path)
if not is_image(path) and not os.path.samefile(path, os.devnull):
with suppress(UnicodeDecodeError):
return raw_bytes.decode('utf-8')
return raw_bytes
class LinesForPath:
replace_tab_by = ' ' * 4
@lru_cache(maxsize=1024)
def __call__(self, path: str) -> Tuple[str, ...]:
data = data_for_path(path)
assert isinstance(data, str)
data = data.replace('\t', self.replace_tab_by)
return tuple(sanitize(data).splitlines())
lines_for_path = LinesForPath()
@lru_cache(maxsize=1024)
def hash_for_path(path: str) -> bytes:
return md5(raw_data_for_path(path)).digest()
def create_collection(left: str, right: str) -> Collection:
collection = Collection()
if os.path.isdir(left):
collect_files(collection, left, right)
else:
pl, pr = os.path.abspath(left), os.path.abspath(right)
path_name_map[pl] = left
path_name_map[pr] = right
collection.add_change(pl, pr)
collection.finalize()
return collection
highlight_data: Dict[str, 'DiffHighlight'] = {}
def set_highlight_data(data: Dict[str, 'DiffHighlight']) -> None:
global highlight_data
highlight_data = data
def highlights_for_path(path: str) -> 'DiffHighlight':
return highlight_data.get(path, [])