dulwich-users team mailing list archive
-
dulwich-users team
-
Mailing list archive
-
Message #00340
[PATCH 13/24] diff_tree: Simple content-based rename detection.
From: Dave Borowitz <dborowitz@xxxxxxxxxx>
As in C git, this computes the similarity between each add/delete pair,
and assigns matches in order of overall score. Lower-scored adds from
the same delete are marked as copies.
Change-Id: I8d8fb34b24a5fb58f310cb72ce91d60073068710
---
dulwich/diff_tree.py | 48 +++++++++++++++++++-
dulwich/tests/test_diff_tree.py | 95 +++++++++++++++++++++++++++++++++++++++
2 files changed, 142 insertions(+), 1 deletions(-)
diff --git a/dulwich/diff_tree.py b/dulwich/diff_tree.py
index b6179bc..85f78d2 100644
--- a/dulwich/diff_tree.py
+++ b/dulwich/diff_tree.py
@@ -27,6 +27,7 @@ from dulwich.misc import (
TreeChangeTuple,
)
from dulwich.objects import (
+ S_ISGITLINK,
TreeEntry,
)
@@ -41,6 +42,7 @@ CHANGE_UNCHANGED = 'unchanged'
_NULL_ENTRY = TreeEntry(None, None, None)
_MAX_SCORE = 100
+_RENAME_THRESHOLD = 60
class TreeChange(TreeChangeTuple):
@@ -257,16 +259,20 @@ def _tree_change_key(entry):
class RenameDetector(object):
"""Object for handling rename detection between two trees."""
- def __init__(self, store, tree1_id, tree2_id):
+ def __init__(self, store, tree1_id, tree2_id,
+ rename_threshold=_RENAME_THRESHOLD):
"""Initialize the rename detector.
:param store: An ObjectStore for looking up objects.
:param tree1_id: The SHA of the first Tree.
:param tree2_id: The SHA of the second Tree.
+ :param rename_threshold: The threshold similarity score for considering
+ an add/delete pair to be a rename/copy; see _similarity_score.
"""
self._tree1_id = tree1_id
self._tree2_id = tree2_id
self._store = store
+ self._rename_threshold = rename_threshold
self._adds = []
self._deletes = []
@@ -314,6 +320,45 @@ class RenameDetector(object):
self._changes.append(TreeChange(CHANGE_COPY, old, new))
self._prune(add_paths, delete_paths)
+ def _find_content_renames(self):
+ # TODO: Optimizations:
+ # - Compare object sizes before counting blocks.
+ # - Skip if delete's S_IFMT differs from all adds.
+ # - Skip if adds or deletes is empty.
+ candidates = []
+ for delete in self._deletes:
+ if S_ISGITLINK(delete.old.mode):
+ continue # Git links don't exist in this repo.
+ old_sha = delete.old.sha
+ old_obj = self._store[old_sha]
+ old_blocks = _count_blocks(old_obj)
+ for add in self._adds:
+ if stat.S_IFMT(delete.old.mode) != stat.S_IFMT(add.new.mode):
+ continue
+ new_obj = self._store[add.new.sha]
+ score = _similarity_score(old_obj, new_obj,
+ block_cache={old_sha: old_blocks})
+ if score > self._rename_threshold:
+ rename = TreeChange(CHANGE_RENAME, delete.old, add.new)
+ candidates.append((-score, rename))
+
+ # Sort scores from highest to lowest, but keep names in ascending order.
+ candidates.sort()
+
+ delete_paths = set()
+ add_paths = set()
+ for _, change in candidates:
+ new_path = change.new.path
+ if new_path in add_paths:
+ continue
+ old_path = change.old.path
+ if old_path in delete_paths:
+ change = TreeChange(CHANGE_COPY, change.old, change.new)
+ delete_paths.add(old_path)
+ add_paths.add(new_path)
+ self._changes.append(change)
+ self._prune(add_paths, delete_paths)
+
def _sorted_changes(self):
result = []
result.extend(self._adds)
@@ -326,4 +371,5 @@ class RenameDetector(object):
"""Iterate TreeChanges between the two trees, with rename detection."""
self._collect_changes()
self._find_exact_renames()
+ self._find_content_renames()
return self._sorted_changes()
diff --git a/dulwich/tests/test_diff_tree.py b/dulwich/tests/test_diff_tree.py
index 950dcaa..b0eba50 100644
--- a/dulwich/tests/test_diff_tree.py
+++ b/dulwich/tests/test_diff_tree.py
@@ -392,3 +392,98 @@ class RenameDetectionTest(DiffTestCase):
TreeChange(CHANGE_COPY, ('a', F, blob.id), ('e', F, blob.id)),
TreeChange(CHANGE_RENAME, ('b', F, blob.id), ('d', F, blob.id))],
self.detect_renames(tree1, tree2))
+
+ def test_content_rename_threshold(self):
+ blob1 = make_object(Blob, data='a\nb\nc\n')
+ blob2 = make_object(Blob, data='a\nb\nd\n')
+ tree1 = self.commit_tree([('a', blob1)])
+ tree2 = self.commit_tree([('b', blob2)])
+ self.assertEqual(
+ [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob2.id))],
+ self.detect_renames(tree1, tree2, rename_threshold=50))
+ self.assertEqual(
+ [TreeChange.delete(('a', F, blob1.id)),
+ TreeChange.add(('b', F, blob2.id))],
+ self.detect_renames(tree1, tree2, rename_threshold=75))
+
+ def test_content_rename_one_to_one(self):
+ b11 = make_object(Blob, data='a\nb\nc\nd\n')
+ b12 = make_object(Blob, data='a\nb\nc\ne\n')
+ b21 = make_object(Blob, data='e\nf\ng\n\h')
+ b22 = make_object(Blob, data='e\nf\ng\n\i')
+ tree1 = self.commit_tree([('a', b11), ('b', b21)])
+ tree2 = self.commit_tree([('c', b12), ('d', b22)])
+ self.assertEqual(
+ [TreeChange(CHANGE_RENAME, ('a', F, b11.id), ('c', F, b12.id)),
+ TreeChange(CHANGE_RENAME, ('b', F, b21.id), ('d', F, b22.id))],
+ self.detect_renames(tree1, tree2))
+
+ def test_content_rename_one_to_one_ordering(self):
+ blob1 = make_object(Blob, data='a\nb\nc\nd\ne\nf\n')
+ blob2 = make_object(Blob, data='a\nb\nc\nd\ng\nh\n')
+ # 6/10 match to blob1, 8/10 match to blob2
+ blob3 = make_object(Blob, data='a\nb\nc\nd\ng\ni\n')
+ tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+ tree2 = self.commit_tree([('c', blob3)])
+ self.assertEqual(
+ [TreeChange.delete(('a', F, blob1.id)),
+ TreeChange(CHANGE_RENAME, ('b', F, blob2.id), ('c', F, blob3.id))],
+ self.detect_renames(tree1, tree2))
+
+ tree3 = self.commit_tree([('a', blob2), ('b', blob1)])
+ tree4 = self.commit_tree([('c', blob3)])
+ self.assertEqual(
+ [TreeChange(CHANGE_RENAME, ('a', F, blob2.id), ('c', F, blob3.id)),
+ TreeChange.delete(('b', F, blob1.id))],
+ self.detect_renames(tree3, tree4))
+
+ def test_content_rename_one_to_many(self):
+ blob1 = make_object(Blob, data='aa\nb\nc\nd\ne\n')
+ blob2 = make_object(Blob, data='ab\nb\nc\nd\ne\n') # 8/11 match
+ blob3 = make_object(Blob, data='aa\nb\nc\nd\nf\n') # 9/11 match
+ tree1 = self.commit_tree([('a', blob1)])
+ tree2 = self.commit_tree([('b', blob2), ('c', blob3)])
+ self.assertEqual(
+ [TreeChange(CHANGE_COPY, ('a', F, blob1.id), ('b', F, blob2.id)),
+ TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('c', F, blob3.id))],
+ self.detect_renames(tree1, tree2))
+
+ def test_content_rename_many_to_one(self):
+ blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+ blob2 = make_object(Blob, data='a\nb\nc\ne\n')
+ blob3 = make_object(Blob, data='a\nb\nc\nf\n')
+ tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+ tree2 = self.commit_tree([('c', blob3)])
+ self.assertEqual(
+ [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('c', F, blob3.id)),
+ TreeChange.delete(('b', F, blob2.id))],
+ self.detect_renames(tree1, tree2))
+
+ def test_content_rename_many_to_many(self):
+ blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+ blob2 = make_object(Blob, data='a\nb\nc\ne\n')
+ blob3 = make_object(Blob, data='a\nb\nc\nf\n')
+ blob4 = make_object(Blob, data='a\nb\nc\ng\n')
+ tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+ tree2 = self.commit_tree([('c', blob3), ('d', blob4)])
+ # TODO(dborowitz): Distribute renames rather than greedily choosing
+ # copies.
+ self.assertEqual(
+ [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('c', F, blob3.id)),
+ TreeChange(CHANGE_COPY, ('a', F, blob1.id), ('d', F, blob4.id)),
+ TreeChange.delete(('b', F, blob2.id))],
+ self.detect_renames(tree1, tree2))
+
+ def test_content_rename_gitlink(self):
+ blob1 = make_object(Blob, data='blob1')
+ blob2 = make_object(Blob, data='blob2')
+ link1 = '1' * 40
+ link2 = '2' * 40
+ tree1 = self.commit_tree([('a', blob1), ('b', link1, 0160000)])
+ tree2 = self.commit_tree([('c', blob2), ('d', link2, 0160000)])
+ self.assertEqual(
+ [TreeChange.delete(('a', 0100644, blob1.id)),
+ TreeChange.delete(('b', 0160000, link1)),
+ TreeChange.add(('c', 0100644, blob2.id)),
+ TreeChange.add(('d', 0160000, link2))],
+ self.detect_renames(tree1, tree2))
--
1.7.3.2.168.gd6b63
References