dulwich-users team mailing list archive
-
dulwich-users team
-
Mailing list archive
-
Message #00306
[PATCH 28/28] diff: Add find_copies_harder to consider unmodified files.
From: Dave Borowitz <dborowitz@xxxxxxxxxx>
Change-Id: Ic2ed78f174c9f25e184a45c5ee9991fca330ba84
---
dulwich/diff.py | 43 +++++++++++++++++++++++++++++-----
dulwich/tests/test_diff.py | 53 ++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 89 insertions(+), 7 deletions(-)
diff --git a/dulwich/diff.py b/dulwich/diff.py
index f88f06a..3e0f713 100644
--- a/dulwich/diff.py
+++ b/dulwich/diff.py
@@ -287,7 +287,8 @@ class RenameDetector(object):
def __init__(self, store, tree1_id, tree2_id,
rename_threshold=_RENAME_THRESHOLD, max_files=_MAX_FILES,
- rewrite_threshold=_REWRITE_THRESHOLD):
+ rewrite_threshold=_REWRITE_THRESHOLD,
+ find_copies_harder=False):
"""Initialize the rename detector.
:param store: An ObjectStore for looking up objects.
@@ -303,6 +304,8 @@ class RenameDetector(object):
:param rewrite_threshold: The threshold similarity score below which a
modify should be considered a delete/add, or None to not break
modifies; see _similarity_score.
+ :param find_copies_harder: If True, consider unmodified files when
+ detecting copies.
"""
self._tree1_id = tree1_id
self._tree2_id = tree2_id
@@ -310,6 +313,7 @@ class RenameDetector(object):
self._rename_threshold = rename_threshold
self._rewrite_threshold = rewrite_threshold
self._max_files = max_files
+ self._find_copies_harder = find_copies_harder
self._adds = []
self._deletes = []
@@ -324,7 +328,8 @@ class RenameDetector(object):
return _similarity_score(old_obj, new_obj) < self._rewrite_threshold
def _collect_changes(self):
- for change in tree_changes(self._store, self._tree1_id, self._tree2_id):
+ for change in tree_changes(self._store, self._tree1_id, self._tree2_id,
+ want_unchanged=self._find_copies_harder):
if change.type == CHANGE_ADD:
self._adds.append(change)
elif change.type == CHANGE_DELETE:
@@ -332,6 +337,11 @@ class RenameDetector(object):
elif self._should_split(change):
self._deletes.append(TreeChange.delete(change.old))
self._adds.append(TreeChange.add(change.new))
+ elif (self._find_copies_harder and (
+ change.type == CHANGE_MODIFY or change.type == CHANGE_UNCHANGED)):
+ # Treat modified/unchanged as deleted rather than splitting it,
+ # to avoid spurious renames.
+ self._deletes.append(change)
else:
self._changes.append(change)
@@ -346,22 +356,27 @@ class RenameDetector(object):
add_map[add.new.sha].append(add.new)
delete_map = defaultdict(list)
for delete in self._deletes:
- delete_map[delete.old.sha].append(delete.old)
+ # Keep track of whether the delete was actually marked as a delete.
+ # If not, it must have been added due to find_copies_harder, and
+ # needs to be marked as a copy.
+ is_delete = delete.type == CHANGE_DELETE
+ delete_map[delete.old.sha].append((delete.old, is_delete))
add_paths = set()
delete_paths = set()
for sha, sha_deletes in delete_map.iteritems():
sha_adds = add_map[sha]
- for old, new in itertools.izip(sha_deletes, sha_adds):
+ for (old, is_delete), new in itertools.izip(sha_deletes, sha_adds):
if stat.S_IFMT(old.mode) != stat.S_IFMT(new.mode):
continue
delete_paths.add(old.path)
add_paths.add(new.path)
- self._changes.append(TreeChange(CHANGE_RENAME, old, new))
+ new_type = is_delete and CHANGE_RENAME or CHANGE_COPY
+ self._changes.append(TreeChange(new_type, old, new))
num_extra_adds = len(sha_adds) - len(sha_deletes)
# TODO(dborowitz): Less arbitrary way of dealing with extra copies.
- old = sha_deletes[0]
+ old = sha_deletes[0][0]
if num_extra_adds:
for new in sha_adds[-num_extra_adds:]:
add_paths.add(new.path)
@@ -397,6 +412,11 @@ class RenameDetector(object):
# If the paths match, this must be a split modify, so
# make sure it comes out as a modify.
new_type = CHANGE_MODIFY
+ elif delete.type != CHANGE_DELETE:
+ # If it's in deletes but not marked as a delete, it must
+ # have been added due to find_copies_harder, and needs
+ # to be marked as a copy.
+ new_type = CHANGE_COPY
else:
new_type = CHANGE_RENAME
rename = TreeChange(new_type, delete.old, add.new)
@@ -412,9 +432,14 @@ class RenameDetector(object):
if new_path in add_paths:
continue
old_path = change.old.path
+ orig_type = change.type
if old_path in delete_paths:
change = TreeChange(CHANGE_COPY, change.old, change.new)
- delete_paths.add(old_path)
+
+ # If the candidate was originally a copy, that means it came from a
+ # modified or unchanged path, so we don't want to prune it.
+ if orig_type != CHANGE_COPY:
+ delete_paths.add(old_path)
add_paths.add(new_path)
self._changes.append(change)
self._prune(add_paths, delete_paths)
@@ -444,12 +469,16 @@ class RenameDetector(object):
result.sort(key=_tree_change_key)
return result
+ def _prune_unchanged(self):
+ self._deletes = [d for d in self._deletes if d.type != CHANGE_UNCHANGED]
+
def changes_with_renames(self):
"""Iterate TreeChanges between the two trees, with rename detection."""
self._collect_changes()
self._find_exact_renames()
self._find_content_renames()
self._join_modifies()
+ self._prune_unchanged()
return self._sorted_changes()
diff --git a/dulwich/tests/test_diff.py b/dulwich/tests/test_diff.py
index aab2831..7bfb61f 100644
--- a/dulwich/tests/test_diff.py
+++ b/dulwich/tests/test_diff.py
@@ -615,3 +615,56 @@ class RenameDetectionTest(DiffTestCase):
[TreeChange.add(('a', F, blob3.id)),
TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob2.id))],
self.detect_renames(tree1, tree2, rewrite_threshold=80))
+
+ def test_find_copies_harder_exact(self):
+ blob = make_object(Blob, data='blob')
+ tree1 = self.commit_tree([('a', blob)])
+ tree2 = self.commit_tree([('a', blob), ('b', blob)])
+ self.assertEqual([TreeChange.add(('b', F, blob.id))],
+ self.detect_renames(tree1, tree2))
+ self.assertEqual(
+ [TreeChange(CHANGE_COPY, ('a', F, blob.id), ('b', F, blob.id))],
+ self.detect_renames(tree1, tree2, find_copies_harder=True))
+
+ def test_find_copies_harder_content(self):
+ blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+ blob2 = make_object(Blob, data='a\nb\nc\ne\n')
+ tree1 = self.commit_tree([('a', blob1)])
+ tree2 = self.commit_tree([('a', blob1), ('b', blob2)])
+ self.assertEqual([TreeChange.add(('b', F, blob2.id))],
+ self.detect_renames(tree1, tree2))
+ self.assertEqual(
+ [TreeChange(CHANGE_COPY, ('a', F, blob1.id), ('b', F, blob2.id))],
+ self.detect_renames(tree1, tree2, find_copies_harder=True))
+
+ def test_find_copies_harder_modify(self):
+ blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+ blob2 = make_object(Blob, data='a\nb\nc\ne\n')
+ tree1 = self.commit_tree([('a', blob1)])
+ tree2 = self.commit_tree([('a', blob2), ('b', blob2)])
+ self.assertEqual(
+ [TreeChange(CHANGE_MODIFY, ('a', F, blob1.id), ('a', F, blob2.id)),
+ TreeChange.add(('b', F, blob2.id))],
+ self.detect_renames(tree1, tree2))
+ self.assertEqual(
+ [TreeChange(CHANGE_MODIFY, ('a', F, blob1.id), ('a', F, blob2.id)),
+ TreeChange(CHANGE_COPY, ('a', F, blob1.id), ('b', F, blob2.id))],
+ self.detect_renames(tree1, tree2, find_copies_harder=True))
+
+ def test_find_copies_harder_with_rewrites(self):
+ blob_a1 = make_object(Blob, data='a\nb\nc\nd\n')
+ blob_a2 = make_object(Blob, data='f\ng\nh\ni\n')
+ blob_b2 = make_object(Blob, data='a\nb\nc\ne\n')
+ tree1 = self.commit_tree([('a', blob_a1)])
+ tree2 = self.commit_tree([('a', blob_a2), ('b', blob_b2)])
+ self.assertEqual(
+ [TreeChange(CHANGE_MODIFY, ('a', F, blob_a1.id),
+ ('a', F, blob_a2.id)),
+ TreeChange(CHANGE_COPY, ('a', F, blob_a1.id), ('b', F, blob_b2.id))],
+ self.detect_renames(tree1, tree2, find_copies_harder=True))
+ self.assertEqual(
+ [TreeChange.add(('a', F, blob_a2.id)),
+ TreeChange(CHANGE_RENAME, ('a', F, blob_a1.id),
+ ('b', F, blob_b2.id))],
+ self.detect_renames(tree1, tree2, rewrite_threshold=50,
+ find_copies_harder=True))
--
1.7.3.2.168.gd6b63
References