542 lines
20 KiB
Python
542 lines
20 KiB
Python
# diff.py
|
|
# Copyright (C) 2008, 2009 Michael Trier (mtrier@gmail.com) and contributors
|
|
#
|
|
# This module is part of GitPython and is released under
|
|
# the BSD License: http://www.opensource.org/licenses/bsd-license.php
|
|
import re
|
|
|
|
from git.cmd import handle_process_output
|
|
from git.compat import (
|
|
defenc,
|
|
PY3
|
|
)
|
|
from git.util import finalize_process, hex_to_bin
|
|
|
|
from .compat import binary_type
|
|
from .objects.blob import Blob
|
|
from .objects.util import mode_str_to_int
|
|
|
|
|
|
__all__ = (
    'Diffable',
    'DiffIndex',
    'Diff',
    'NULL_TREE',
)

# Unique sentinel: pass it as ``other`` to ``Diffable.diff`` to compare
# against the empty tree. It is only ever checked by identity.
NULL_TREE = object()
|
|
|
|
# Legacy pattern: a backslash followed by three digits. Still applied to
# unquoted paths so that their handling is unchanged.
_octal_byte_re = re.compile(b'\\\\([0-9]{3})')

# One escape sequence inside a C-style quoted git path: either a three digit
# octal byte value (\000 - \377) or one of the single character escapes.
_quoted_escape_re = re.compile(b'\\\\([0-3][0-7]{2}|[abfnrtv"\\\\])')

# Single character escapes git emits when it quotes a path, mapped to the
# byte they stand for.
_c_escapes = {
    b'a': b'\a',
    b'b': b'\b',
    b'f': b'\f',
    b'n': b'\n',
    b'r': b'\r',
    b't': b'\t',
    b'v': b'\v',
    b'"': b'"',
    b'\\': b'\\',
}


def _octal_repl(matchobj):
    """Return the single byte encoded by the octal digits in group 1.

    :param matchobj: match whose first group holds the octal digits (bytes)
    :return: bytes object of length one
    :raise ValueError: if the digits are not valid octal or exceed 0o377"""
    value = int(matchobj.group(1), 8)
    # bytes(bytearray(...)) yields a one byte string on Python 2 and 3 alike,
    # so no interpreter specific branch is required.
    return bytes(bytearray((value,)))


def _quoted_escape_repl(matchobj):
    """Return the byte(s) represented by one escape sequence of a quoted path."""
    replacement = _c_escapes.get(matchobj.group(1))
    if replacement is not None:
        return replacement
    return _octal_repl(matchobj)


def decode_path(path, has_ab_prefix=True):
    """Turn a path as printed in a diff header into the raw path bytes.

    Quoted paths are unescaped in a single left-to-right pass. Decoding the
    escapes one kind after the other (as chained ``replace`` calls do) is
    wrong, because the second half of an escaped backslash may be mistaken
    for the start of the next escape: ``\\\\n`` must become a backslash
    followed by ``n``, not a backslash followed by a newline.

    :param path: bytes as found in the diff output, possibly enclosed in
        double quotes
    :param has_ab_prefix: if True, the decoded path must start with ``a/``
        or ``b/``; that prefix is removed
    :return: decoded path as bytes, or None if path is ``/dev/null``
    :raise AssertionError: if has_ab_prefix is True but the prefix is missing"""
    if path == b'/dev/null':
        return None

    if path.startswith(b'"') and path.endswith(b'"'):
        path = _quoted_escape_re.sub(_quoted_escape_repl, path[1:-1])
    else:
        # unquoted paths never went through the escape table, only through
        # the octal substitution - keep exactly that
        path = _octal_byte_re.sub(_octal_repl, path)

    if has_ab_prefix:
        assert path.startswith(b'a/') or path.startswith(b'b/')
        path = path[2:]

    return path
|
|
|
|
|
|
class Diffable(object):

    """Shared diffing interface for objects that can be compared with another
    object of a compatible type.

    :note:
        Implementors have to provide a ``repo`` member, just like Object
        instances do. For practical reasons this type is not derived from Object."""
    __slots__ = ()

    # marker type: hand it to diff() to compare against the index
    class Index(object):
        pass

    def _process_diff_args(self, args):
        """Hook invoked right before the git command is executed.

        :return:
            the argument list to actually use. The default implementation
            hands back ``args`` unchanged, subclasses may override it to
            adjust the arguments."""
        return args

    def diff(self, other=Index, paths=None, create_patch=False, **kwargs):
        """Create diffs between two items being trees, trees and index or an
        index and the working tree. Renames are detected automatically.

        :param other:
            The item to compare this one with.
            If None, we will be compared to the working tree.
            If Treeish, it will be compared against the respective tree.
            If Index ( type ), it will be compared against the index.
            If git.NULL_TREE, it will be compared against the empty tree.
            The default is Index, which makes sure the method does not fail
            by default on bare repositories.

        :param paths:
            a single path or a list of paths to limit the diff to.

        :param create_patch:
            If True, the returned Diffs carry a detailed patch which, when
            applied, turns self into other. Patches are somewhat costly as
            blobs have to be read and diffed.

        :param kwargs:
            Additional arguments passed on to git-diff, such as
            R=True to swap both sides of the diff.

        :return: git.DiffIndex

        :note:
            On a bare repository, 'other' needs to be provided as Index or
            as Tree/Commit, or a git command error will occur"""
        # a lone path becomes a one-element list
        if paths is not None and not isinstance(paths, (tuple, list)):
            paths = [paths]

        # Everything that has to precede the formatting options: ourselves,
        # followed by whatever selects the other side of the comparison.
        diff_cmd = self.repo.git.diff
        leading = [self]
        if other is self.Index:
            leading.append('--cached')
        elif other is NULL_TREE:
            leading.extend(('--root', '-r'))        # recursive diff-tree
            diff_cmd = self.repo.git.diff_tree
        elif other is not None:
            leading.extend((other, '-r'))           # recursive diff-tree
            diff_cmd = self.repo.git.diff_tree

        output_format = "-p" if create_patch else "--raw"
        args = leading + [
            "--abbrev=40",      # we need full shas
            "--full-index",     # get full index paths, not only filenames
            "-M",               # check for renames, in both formats
            output_format,
            # never produce colored output, see
            # https://github.com/gitpython-developers/GitPython/issues/172
            '--no-color',
        ]

        # paths is either a list/tuple or None at this point
        if paths:
            args.append("--")
            args.extend(paths)

        kwargs['as_process'] = True
        proc = diff_cmd(*self._process_diff_args(args), **kwargs)

        if create_patch:
            parse = Diff._index_from_patch_format
        else:
            parse = Diff._index_from_raw_format
        index = parse(self.repo, proc)

        proc.wait()
        return index
|
|
|
|
|
|
class DiffIndex(list):

    """Implements an Index for diffs, allowing a list of Diffs to be queried by
    the diff properties.

    The class improves the diff handling convenience"""
    # change type invariant identifying possible ways a blob can have changed
    # A = Added
    # C = Copied
    # D = Deleted
    # R = Renamed
    # M = Modified
    # T = Changed in the type
    change_type = ("A", "C", "D", "R", "M", "T")

    def iter_change_type(self, change_type):
        """
        :return:
            iterator yielding Diff instances that match the given change_type

        :param change_type:
            Member of DiffIndex.change_type, namely:

            * 'A' for added paths
            * 'C' for copied paths
            * 'D' for deleted paths
            * 'R' for renamed paths
            * 'M' for paths with modified data
            * 'T' for changed in the type paths

        :raise ValueError:
            if change_type is not a member of DiffIndex.change_type. As this
            is a generator, the error surfaces once iteration starts.
        """
        if change_type not in self.change_type:
            # wrap in a tuple so that a tuple argument cannot break formatting
            raise ValueError("Invalid change type: %s" % (change_type,))

        for diff in self:
            if diff.change_type == change_type:
                # raw format diffs carry the status letter explicitly
                yield diff
            # Diffs without an explicit change_type are classified by their
            # flags instead.
            elif change_type == "A" and diff.new_file:
                yield diff
            elif change_type == "D" and diff.deleted_file:
                yield diff
            elif change_type == "C" and diff.copied_file:
                yield diff
            elif change_type == "R" and diff.renamed_file:
                yield diff
            elif change_type == "M" and diff.a_blob and diff.b_blob and diff.a_blob != diff.b_blob:
                yield diff
        # END for each diff
|
|
|
|
|
|
class Diff(object):

    """A Diff contains diff information between two Trees.

    It contains two sides a and b of the diff, members are prefixed with
    "a" and "b" respectively to indicate that.

    Diffs keep information about the changed blob objects, the file mode, renames,
    deletions and new files.

    There are a few cases where None has to be expected as member variable value:

    ``New File``::

        a_mode is None
        a_blob is None
        a_path is None

    ``Deleted File``::

        b_mode is None
        b_blob is None
        b_path is None

    ``Working Tree Blobs``

        When comparing to working trees, the working tree blob will have a null hexsha
        as a corresponding object does not yet exist. The mode will be null as well.
        But the path will be available though.
        If it is listed in a diff the working tree version of the file must
        be different to the version in the index or tree, and hence has been modified."""

    # precompiled regex matching one complete header of a patch format diff
    re_header = re.compile(br"""
                                ^diff[ ]--git
                                    [ ](?P<a_path_fallback>"?[ab]/.+?"?)[ ](?P<b_path_fallback>"?[ab]/.+?"?)\n
                                (?:^old[ ]mode[ ](?P<old_mode>\d+)\n
                                   ^new[ ]mode[ ](?P<new_mode>\d+)(?:\n|$))?
                                (?:^similarity[ ]index[ ]\d+%\n
                                   ^rename[ ]from[ ](?P<rename_from>.*)\n
                                   ^rename[ ]to[ ](?P<rename_to>.*)(?:\n|$))?
                                (?:^new[ ]file[ ]mode[ ](?P<new_file_mode>.+)(?:\n|$))?
                                (?:^deleted[ ]file[ ]mode[ ](?P<deleted_file_mode>.+)(?:\n|$))?
                                (?:^similarity[ ]index[ ]\d+%\n
                                   ^copy[ ]from[ ].*\n
                                   ^copy[ ]to[ ](?P<copied_file_name>.*)(?:\n|$))?
                                (?:^index[ ](?P<a_blob_id>[0-9A-Fa-f]+)
                                    \.\.(?P<b_blob_id>[0-9A-Fa-f]+)[ ]?(?P<b_mode>.+)?(?:\n|$))?
                                (?:^---[ ](?P<a_path>[^\t\n\r\f\v]*)[\t\r\f\v]*(?:\n|$))?
                                (?:^\+\+\+[ ](?P<b_path>[^\t\n\r\f\v]*)[\t\r\f\v]*(?:\n|$))?
                            """, re.VERBOSE | re.MULTILINE)
    # can be used for comparisons
    NULL_HEX_SHA = "0" * 40
    NULL_BIN_SHA = b"\0" * 20

    __slots__ = ("a_blob", "b_blob", "a_mode", "b_mode", "a_rawpath", "b_rawpath",
                 "new_file", "deleted_file", "copied_file", "raw_rename_from",
                 "raw_rename_to", "diff", "change_type", "score")

    def __init__(self, repo, a_rawpath, b_rawpath, a_blob_id, b_blob_id, a_mode,
                 b_mode, new_file, deleted_file, copied_file, raw_rename_from,
                 raw_rename_to, diff, change_type, score):
        """Initialize a diff from the values parsed out of git's output.

        :param repo: repository the blobs are looked up in, may be falsy
        :param a_rawpath: path of side a as bytes, or None
        :param b_rawpath: path of side b as bytes, or None
        :param a_blob_id: hex sha of blob a; None or the null sha mean 'no blob'
        :param b_blob_id: hex sha of blob b; None or the null sha mean 'no blob'
        :param a_mode: mode string of side a, or a false value if unknown
        :param b_mode: mode string of side b, or a false value if unknown
        :param new_file: True if the path was added
        :param deleted_file: True if the path was deleted
        :param copied_file: True if the path was copied
        :param raw_rename_from: bytes of the rename source, or None
        :param raw_rename_to: bytes of the rename destination, or None
        :param diff: patch data, if any
        :param change_type: status letter of raw format diffs, or None
        :param score: similarity score of renames and copies, or None"""

        self.a_mode = a_mode
        self.b_mode = b_mode

        assert a_rawpath is None or isinstance(a_rawpath, binary_type)
        assert b_rawpath is None or isinstance(b_rawpath, binary_type)
        self.a_rawpath = a_rawpath
        self.b_rawpath = b_rawpath

        if self.a_mode:
            self.a_mode = mode_str_to_int(self.a_mode)
        if self.b_mode:
            self.b_mode = mode_str_to_int(self.b_mode)

        # Determine whether this diff references a submodule, if it does then
        # we need to overwrite "repo" to the corresponding submodule's repo instead
        if repo and a_rawpath:
            # Decode once and leniently: a strict utf-8 decode raises
            # UnicodeDecodeError for paths in any other encoding, whereas
            # a_path uses the same decoding as the blob paths below.
            a_path = self.a_path
            for submodule in repo.submodules:
                if submodule.path == a_path:
                    if submodule.module_exists():
                        repo = submodule.module()
                    break

        if a_blob_id is None or a_blob_id == self.NULL_HEX_SHA:
            self.a_blob = None
        else:
            self.a_blob = Blob(repo, hex_to_bin(a_blob_id), mode=self.a_mode, path=self.a_path)

        if b_blob_id is None or b_blob_id == self.NULL_HEX_SHA:
            self.b_blob = None
        else:
            self.b_blob = Blob(repo, hex_to_bin(b_blob_id), mode=self.b_mode, path=self.b_path)

        self.new_file = new_file
        self.deleted_file = deleted_file
        self.copied_file = copied_file

        # be clear and use None instead of empty strings
        assert raw_rename_from is None or isinstance(raw_rename_from, binary_type)
        assert raw_rename_to is None or isinstance(raw_rename_to, binary_type)
        self.raw_rename_from = raw_rename_from or None
        self.raw_rename_to = raw_rename_to or None

        self.diff = diff
        self.change_type = change_type
        self.score = score

    def __eq__(self, other):
        """Two diffs are equal if all of their slots compare equal.

        Anything that is not a Diff is left to Python's default handling,
        which makes the comparison evaluate to False instead of failing with
        an AttributeError."""
        if not isinstance(other, Diff):
            return NotImplemented
        for name in self.__slots__:
            if getattr(self, name) != getattr(other, name):
                return False
        # END for each name
        return True

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        return hash(tuple(getattr(self, n) for n in self.__slots__))

    def __str__(self):
        """:return: human readable multi-line summary of this diff"""
        if self.a_blob:
            h = "%s" % self.a_blob.path
        elif self.b_blob:
            h = "%s" % self.b_blob.path
        else:
            # Without any blob, fall back to the paths we know instead of
            # emitting an unformatted '%s' as headline.
            h = self.a_path or self.b_path or ''

        msg = ''
        line_length = 0  # length of the longest blob line, used for the underline
        for b, n in zip((self.a_blob, self.b_blob), ('lhs', 'rhs')):
            if b:
                line = "\n%s: %o | %s" % (n, b.mode, b.hexsha)
            else:
                line = "\n%s: None" % n
            # END if blob is not None
            line_length = max(len(line), line_length)
            msg += line
        # END for each blob

        # add headline
        h += '\n' + '=' * line_length

        if self.deleted_file:
            msg += '\nfile deleted in rhs'
        if self.new_file:
            msg += '\nfile added in rhs'
        if self.copied_file:
            msg += '\nfile %r copied from %r' % (self.b_path, self.a_path)
        if self.rename_from:
            msg += '\nfile renamed from %r' % self.rename_from
        if self.rename_to:
            msg += '\nfile renamed to %r' % self.rename_to
        if self.diff:
            msg += '\n---'
            try:
                msg += self.diff.decode(defenc)
            except UnicodeDecodeError:
                msg += 'OMITTED BINARY DATA'
            # end handle encoding
            msg += '\n---'
        # END diff info

        # Python2 silliness: have to assure we convert our likely to be unicode object to a string with the
        # right encoding. Otherwise it tries to convert it using ascii, which may fail ungracefully
        res = h + msg
        if not PY3:
            res = res.encode(defenc)
        # end
        return res

    @property
    def a_path(self):
        """:return: a_rawpath decoded leniently to text, or None if it is unset"""
        return self.a_rawpath.decode(defenc, 'replace') if self.a_rawpath else None

    @property
    def b_path(self):
        """:return: b_rawpath decoded leniently to text, or None if it is unset"""
        return self.b_rawpath.decode(defenc, 'replace') if self.b_rawpath else None

    @property
    def rename_from(self):
        """:return: raw_rename_from decoded leniently to text, or None if it is unset"""
        return self.raw_rename_from.decode(defenc, 'replace') if self.raw_rename_from else None

    @property
    def rename_to(self):
        """:return: raw_rename_to decoded leniently to text, or None if it is unset"""
        return self.raw_rename_to.decode(defenc, 'replace') if self.raw_rename_to else None

    @property
    def renamed(self):
        """:returns: True if the blob of our diff has been renamed
        :note: This property is deprecated, please use ``renamed_file`` instead.
        """
        return self.renamed_file

    @property
    def renamed_file(self):
        """:returns: True if the blob of our diff has been renamed
        """
        return self.rename_from != self.rename_to

    @classmethod
    def _pick_best_path(cls, path_match, rename_match, path_fallback_match):
        """Decode the most reliable of the path candidates found in a header.

        :param path_match: path of the ``---`` / ``+++`` line, if present
        :param rename_match: path of the ``rename from`` / ``rename to`` line, if present
        :param path_fallback_match: path taken from the ``diff --git`` line
        :return: decoded path as returned by decode_path, or None if no
            candidate is available"""
        if path_match:
            return decode_path(path_match)

        if rename_match:
            # rename lines carry no a/ or b/ prefix
            return decode_path(rename_match, has_ab_prefix=False)

        if path_fallback_match:
            return decode_path(path_fallback_match)

        return None

    @classmethod
    def _index_from_patch_format(cls, repo, proc):
        """Create a new DiffIndex from the given text which must be in patch format
        :param repo: is the repository we are operating on - it is required
        :param proc: git process whose output is the result of 'git diff' in patch format
        :return: git.DiffIndex """

        ## FIXME: Here SLURPING raw, need to re-phrase header-regexes linewise.
        text = []
        handle_process_output(proc, text.append, None, finalize_process, decode_streams=False)

        # for now, we have to bake the stream
        text = b''.join(text)
        index = DiffIndex()
        previous_header = None
        for _header in cls.re_header.finditer(text):
            a_path_fallback, b_path_fallback, \
                old_mode, new_mode, \
                rename_from, rename_to, \
                new_file_mode, deleted_file_mode, copied_file_name, \
                a_blob_id, b_blob_id, b_mode, \
                a_path, b_path = _header.groups()

            new_file, deleted_file, copied_file = \
                bool(new_file_mode), bool(deleted_file_mode), bool(copied_file_name)

            a_path = cls._pick_best_path(a_path, rename_from, a_path_fallback)
            b_path = cls._pick_best_path(b_path, rename_to, b_path_fallback)

            # Our only means to find the actual text is to see what has not been matched by our regex,
            # and then retro-actively assign it to our index
            if previous_header is not None:
                index[-1].diff = text[previous_header.end():_header.start()]
            # end assign actual diff

            # Make sure the mode is set if the path is set. Otherwise the resulting blob is invalid
            # We just use the one mode we should have parsed
            a_mode = old_mode or deleted_file_mode or (a_path and (b_mode or new_mode or new_file_mode))
            b_mode = b_mode or new_mode or new_file_mode or (b_path and a_mode)
            index.append(Diff(repo,
                              a_path,
                              b_path,
                              a_blob_id and a_blob_id.decode(defenc),
                              b_blob_id and b_blob_id.decode(defenc),
                              a_mode and a_mode.decode(defenc),
                              b_mode and b_mode.decode(defenc),
                              new_file, deleted_file, copied_file,
                              rename_from,
                              rename_to,
                              None, None, None))

            previous_header = _header
        # end for each header we parse
        if index:
            # whatever follows the last header belongs to the last diff
            index[-1].diff = text[previous_header.end():]
        # end assign last diff

        return index

    @classmethod
    def _index_from_raw_format(cls, repo, proc):
        """Create a new DiffIndex from the given stream which must be in raw format.
        :param repo: is the repository we are operating on
        :param proc: git process whose output is the result of 'git diff' in raw format
        :return: git.DiffIndex"""
        # handles
        # :100644 100644 687099101... 37c5e30c8... M    .gitignore

        index = DiffIndex()

        def handle_diff_line(line):
            line = line.decode(defenc)
            if not line.startswith(":"):
                return

            meta, _, path = line[1:].partition('\t')
            old_mode, new_mode, a_blob_id, b_blob_id, _change_type = meta.split(None, 4)
            # Change type can be R100
            # R: status letter
            # 100: score (in case of copy and rename)
            change_type = _change_type[0]
            score_str = _change_type[1:]
            score = int(score_str) if score_str.isdigit() else None
            # Only drop the line terminator: leading or trailing blanks are
            # a legitimate part of a file name and must be kept.
            path = path.rstrip('\r\n')
            a_path = path.encode(defenc)
            b_path = path.encode(defenc)
            deleted_file = False
            new_file = False
            copied_file = False
            rename_from = None
            rename_to = None

            # NOTE: We cannot conclude from the existence of a blob to change type
            # as diffs with the working do not have blobs yet
            if change_type == 'D':
                b_blob_id = None
                deleted_file = True
            elif change_type == 'A':
                a_blob_id = None
                new_file = True
            elif change_type == 'C':
                copied_file = True
                a_path, b_path = path.split('\t', 1)
                a_path = a_path.encode(defenc)
                b_path = b_path.encode(defenc)
            elif change_type == 'R':
                a_path, b_path = path.split('\t', 1)
                a_path = a_path.encode(defenc)
                b_path = b_path.encode(defenc)
                rename_from, rename_to = a_path, b_path
            elif change_type == 'T':
                # Nothing to do
                pass
            # END add/remove handling

            diff = Diff(repo, a_path, b_path, a_blob_id, b_blob_id, old_mode, new_mode,
                        new_file, deleted_file, copied_file, rename_from, rename_to,
                        '', change_type, score)
            index.append(diff)

        handle_process_output(proc, handle_diff_line, None, finalize_process, decode_streams=False)

        return index
|