dulwich-users team mailing list archive
-
dulwich-users team
-
Mailing list archive
-
Message #00342
[PATCH 18/24] diff_tree: Consider dissimilar modifies to be delete/add pairs.
From: Dave Borowitz <dborowitz@xxxxxxxxxx>
Modifies are split into delete/add pairs if their similarity is below
a given threshold, and are rejoined at the end of the rename detection
process.
Change-Id: I0f11549cc1117be7b8bb67cb584e4101e72a4913
---
dulwich/diff_tree.py | 46 +++++++++++++++++++++++++++++-
dulwich/tests/test_diff_tree.py | 58 ++++++++++++++++++++++++++++++++++++++-
2 files changed, 101 insertions(+), 3 deletions(-)
diff --git a/dulwich/diff_tree.py b/dulwich/diff_tree.py
index 4381b64..e53513b 100644
--- a/dulwich/diff_tree.py
+++ b/dulwich/diff_tree.py
@@ -44,6 +44,7 @@ _NULL_ENTRY = TreeEntry(None, None, None)
_MAX_SCORE = 100
_RENAME_THRESHOLD = 60
_MAX_FILES = 200
+_REWRITE_THRESHOLD = None
class TreeChange(TreeChangeTuple):
@@ -268,7 +269,8 @@ class RenameDetector(object):
"""Object for handling rename detection between two trees."""
def __init__(self, store, tree1_id, tree2_id,
- rename_threshold=_RENAME_THRESHOLD, max_files=_MAX_FILES):
+ rename_threshold=_RENAME_THRESHOLD, max_files=_MAX_FILES,
+ rewrite_threshold=_REWRITE_THRESHOLD):
"""Initialize the rename detector.
:param store: An ObjectStore for looking up objects.
@@ -281,23 +283,38 @@ class RenameDetector(object):
than max_files ** 2 add/delete pairs. This limit is provided because
rename detection can be quadratic in the project size. If the limit
is exceeded, no content rename detection is attempted.
+ :param rewrite_threshold: The threshold similarity score below which a
+ modify should be considered a delete/add, or None to not break
+ modifies; see _similarity_score.
"""
self._tree1_id = tree1_id
self._tree2_id = tree2_id
self._store = store
self._rename_threshold = rename_threshold
+ self._rewrite_threshold = rewrite_threshold
self._max_files = max_files
self._adds = []
self._deletes = []
self._changes = []
+ def _should_split(self, change):
+ if (self._rewrite_threshold is None or change.type != CHANGE_MODIFY or
+ change.old.sha == change.new.sha):
+ return False
+ old_obj = self._store[change.old.sha]
+ new_obj = self._store[change.new.sha]
+ return _similarity_score(old_obj, new_obj) < self._rewrite_threshold
+
def _collect_changes(self):
for change in tree_changes(self._store, self._tree1_id, self._tree2_id):
if change.type == CHANGE_ADD:
self._adds.append(change)
elif change.type == CHANGE_DELETE:
self._deletes.append(change)
+ elif self._should_split(change):
+ self._deletes.append(TreeChange.delete(change.old))
+ self._adds.append(TreeChange.add(change.new))
else:
self._changes.append(change)
@@ -344,6 +361,7 @@ class RenameDetector(object):
if len(self._adds) * len(self._deletes) > self._max_files ** 2:
return
+ check_paths = self._rename_threshold is not None
candidates = []
for delete in self._deletes:
if S_ISGITLINK(delete.old.mode):
@@ -358,7 +376,13 @@ class RenameDetector(object):
score = _similarity_score(old_obj, new_obj,
block_cache={old_sha: old_blocks})
if score > self._rename_threshold:
- rename = TreeChange(CHANGE_RENAME, delete.old, add.new)
+ if check_paths and delete.old.path == add.new.path:
+ # If the paths match, this must be a split modify, so
+ # make sure it comes out as a modify.
+ new_type = CHANGE_MODIFY
+ else:
+ new_type = CHANGE_RENAME
+ rename = TreeChange(new_type, delete.old, add.new)
candidates.append((-score, rename))
# Sort scores from highest to lowest, but keep names in ascending order.
@@ -378,6 +402,23 @@ class RenameDetector(object):
self._changes.append(change)
self._prune(add_paths, delete_paths)
+ def _join_modifies(self):
+ if self._rewrite_threshold is None:
+ return
+
+ modifies = {}
+ delete_map = dict((d.old.path, d) for d in self._deletes)
+ for add in self._adds:
+ path = add.new.path
+ delete = delete_map.get(path)
+ if (delete is not None and
+ stat.S_IFMT(delete.old.mode) == stat.S_IFMT(add.new.mode)):
+ modifies[path] = TreeChange(CHANGE_MODIFY, delete.old, add.new)
+
+ self._adds = [a for a in self._adds if a.new.path not in modifies]
+ self._deletes = [a for a in self._deletes if a.new.path not in modifies]
+ self._changes += modifies.values()
+
def _sorted_changes(self):
result = []
result.extend(self._adds)
@@ -391,6 +432,7 @@ class RenameDetector(object):
self._collect_changes()
self._find_exact_renames()
self._find_content_renames()
+ self._join_modifies()
return self._sorted_changes()
diff --git a/dulwich/tests/test_diff_tree.py b/dulwich/tests/test_diff_tree.py
index 1a9bd5c..c03367b 100644
--- a/dulwich/tests/test_diff_tree.py
+++ b/dulwich/tests/test_diff_tree.py
@@ -375,6 +375,16 @@ class RenameDetectionTest(DiffTestCase):
detector = RenameDetector(self.store, tree1.id, tree2.id, **kwargs)
return detector.changes_with_renames()
+ def test_no_renames(self):
+ blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+ blob2 = make_object(Blob, data='a\nb\ne\nf\n')
+ blob3 = make_object(Blob, data='a\nb\ng\nh\n')
+ tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+ tree2 = self.commit_tree([('a', blob1), ('b', blob3)])
+ self.assertEqual(
+ [TreeChange(CHANGE_MODIFY, ('b', F, blob2.id), ('b', F, blob3.id))],
+ self.detect_renames(tree1, tree2))
+
def test_exact_rename_one_to_one(self):
blob1 = make_object(Blob, data='1')
blob2 = make_object(Blob, data='2')
@@ -432,7 +442,7 @@ class RenameDetectionTest(DiffTestCase):
TreeChange(CHANGE_RENAME, ('b', F, blob.id), ('d', F, blob.id))],
self.detect_renames(tree1, tree2))
- def test_content_rename_threshold(self):
+ def test_rename_threshold(self):
blob1 = make_object(Blob, data='a\nb\nc\n')
blob2 = make_object(Blob, data='a\nb\nd\n')
tree1 = self.commit_tree([('a', blob1)])
@@ -544,3 +554,49 @@ class RenameDetectionTest(DiffTestCase):
TreeChange.add(('c', 0100644, blob2.id)),
TreeChange.add(('d', 0160000, link2))],
self.detect_renames(tree1, tree2))
+
+ def test_exact_rename_swap(self):
+ blob1 = make_object(Blob, data='1')
+ blob2 = make_object(Blob, data='2')
+ tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+ tree2 = self.commit_tree([('a', blob2), ('b', blob1)])
+ self.assertEqual(
+ [TreeChange(CHANGE_MODIFY, ('a', F, blob1.id), ('a', F, blob2.id)),
+ TreeChange(CHANGE_MODIFY, ('b', F, blob2.id), ('b', F, blob1.id))],
+ self.detect_renames(tree1, tree2))
+ self.assertEqual(
+ [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob1.id)),
+ TreeChange(CHANGE_RENAME, ('b', F, blob2.id), ('a', F, blob2.id))],
+ self.detect_renames(tree1, tree2, rewrite_threshold=50))
+
+ def test_content_rename_swap(self):
+ blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+ blob2 = make_object(Blob, data='e\nf\ng\nh\n')
+ blob3 = make_object(Blob, data='a\nb\nc\ne\n')
+ blob4 = make_object(Blob, data='e\nf\ng\ni\n')
+ tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+ tree2 = self.commit_tree([('a', blob4), ('b', blob3)])
+ self.assertEqual(
+ [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob3.id)),
+ TreeChange(CHANGE_RENAME, ('b', F, blob2.id), ('a', F, blob4.id))],
+ self.detect_renames(tree1, tree2, rewrite_threshold=60))
+
+ def test_rewrite_threshold(self):
+ blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+ blob2 = make_object(Blob, data='a\nb\nc\ne\n')
+ blob3 = make_object(Blob, data='a\nb\nf\ng\n')
+
+ tree1 = self.commit_tree([('a', blob1)])
+ tree2 = self.commit_tree([('a', blob3), ('b', blob2)])
+
+ no_renames = [
+ TreeChange(CHANGE_MODIFY, ('a', F, blob1.id), ('a', F, blob3.id)),
+ TreeChange.add(('b', F, blob2.id))]
+ self.assertEqual(
+ no_renames, self.detect_renames(tree1, tree2))
+ self.assertEqual(
+ no_renames, self.detect_renames(tree1, tree2, rewrite_threshold=40))
+ self.assertEqual(
+ [TreeChange.add(('a', F, blob3.id)),
+ TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob2.id))],
+ self.detect_renames(tree1, tree2, rewrite_threshold=80))
--
1.7.3.2.168.gd6b63
References