← Back to team overview

dulwich-users team mailing list archive

[PATCH 13/24] diff_tree: Simple content-based rename detection.

 

From: Dave Borowitz <dborowitz@xxxxxxxxxx>

As in C git, this computes the similarity between each add/delete pair,
and assigns matches in order of overall score. Lower-scored adds from
the same delete are marked as copies.

Change-Id: I8d8fb34b24a5fb58f310cb72ce91d60073068710
---
 dulwich/diff_tree.py            |   48 +++++++++++++++++++-
 dulwich/tests/test_diff_tree.py |   95 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 142 insertions(+), 1 deletions(-)

diff --git a/dulwich/diff_tree.py b/dulwich/diff_tree.py
index b6179bc..85f78d2 100644
--- a/dulwich/diff_tree.py
+++ b/dulwich/diff_tree.py
@@ -27,6 +27,7 @@ from dulwich.misc import (
     TreeChangeTuple,
     )
 from dulwich.objects import (
+    S_ISGITLINK,
     TreeEntry,
     )
 
@@ -41,6 +42,7 @@ CHANGE_UNCHANGED = 'unchanged'
 _NULL_ENTRY = TreeEntry(None, None, None)
 
 _MAX_SCORE = 100
+_RENAME_THRESHOLD = 60
 
 
 class TreeChange(TreeChangeTuple):
@@ -257,16 +259,20 @@ def _tree_change_key(entry):
 class RenameDetector(object):
     """Object for handling rename detection between two trees."""
 
-    def __init__(self, store, tree1_id, tree2_id):
+    def __init__(self, store, tree1_id, tree2_id,
+                 rename_threshold=_RENAME_THRESHOLD):
         """Initialize the rename detector.
 
         :param store: An ObjectStore for looking up objects.
         :param tree1_id: The SHA of the first Tree.
         :param tree2_id: The SHA of the second Tree.
+        :param rename_threshold: The threshold similarity score for considering
+            an add/delete pair to be a rename/copy; see _similarity_score.
         """
         self._tree1_id = tree1_id
         self._tree2_id = tree2_id
         self._store = store
+        self._rename_threshold = rename_threshold
 
         self._adds = []
         self._deletes = []
@@ -314,6 +320,45 @@ class RenameDetector(object):
                     self._changes.append(TreeChange(CHANGE_COPY, old, new))
         self._prune(add_paths, delete_paths)
 
+    def _find_content_renames(self):
+        # TODO: Optimizations:
+        #  - Compare object sizes before counting blocks.
+        #  - Skip if delete's S_IFMT differs from all adds.
+        #  - Skip if adds or deletes is empty.
+        candidates = []
+        for delete in self._deletes:
+            if S_ISGITLINK(delete.old.mode):
+                continue  # Git links don't exist in this repo.
+            old_sha = delete.old.sha
+            old_obj = self._store[old_sha]
+            old_blocks = _count_blocks(old_obj)
+            for add in self._adds:
+                if stat.S_IFMT(delete.old.mode) != stat.S_IFMT(add.new.mode):
+                    continue
+                new_obj = self._store[add.new.sha]
+                score = _similarity_score(old_obj, new_obj,
+                                          block_cache={old_sha: old_blocks})
+                if score > self._rename_threshold:
+                    rename = TreeChange(CHANGE_RENAME, delete.old, add.new)
+                    candidates.append((-score, rename))
+
+        # Sort scores from highest to lowest, but keep names in ascending order.
+        candidates.sort()
+
+        delete_paths = set()
+        add_paths = set()
+        for _, change in candidates:
+            new_path = change.new.path
+            if new_path in add_paths:
+                continue
+            old_path = change.old.path
+            if old_path in delete_paths:
+                change = TreeChange(CHANGE_COPY, change.old, change.new)
+            delete_paths.add(old_path)
+            add_paths.add(new_path)
+            self._changes.append(change)
+        self._prune(add_paths, delete_paths)
+
     def _sorted_changes(self):
         result = []
         result.extend(self._adds)
@@ -326,4 +371,5 @@ class RenameDetector(object):
         """Iterate TreeChanges between the two trees, with rename detection."""
         self._collect_changes()
         self._find_exact_renames()
+        self._find_content_renames()
         return self._sorted_changes()
diff --git a/dulwich/tests/test_diff_tree.py b/dulwich/tests/test_diff_tree.py
index 950dcaa..b0eba50 100644
--- a/dulwich/tests/test_diff_tree.py
+++ b/dulwich/tests/test_diff_tree.py
@@ -392,3 +392,98 @@ class RenameDetectionTest(DiffTestCase):
            TreeChange(CHANGE_COPY, ('a', F, blob.id), ('e', F, blob.id)),
            TreeChange(CHANGE_RENAME, ('b', F, blob.id), ('d', F, blob.id))],
           self.detect_renames(tree1, tree2))
+
+    def test_content_rename_threshold(self):
+        blob1 = make_object(Blob, data='a\nb\nc\n')
+        blob2 = make_object(Blob, data='a\nb\nd\n')
+        tree1 = self.commit_tree([('a', blob1)])
+        tree2 = self.commit_tree([('b', blob2)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2, rename_threshold=50))
+        self.assertEqual(
+          [TreeChange.delete(('a', F, blob1.id)),
+           TreeChange.add(('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2, rename_threshold=75))
+
+    def test_content_rename_one_to_one(self):
+        b11 = make_object(Blob, data='a\nb\nc\nd\n')
+        b12 = make_object(Blob, data='a\nb\nc\ne\n')
+        b21 = make_object(Blob, data='e\nf\ng\n\h')
+        b22 = make_object(Blob, data='e\nf\ng\n\i')
+        tree1 = self.commit_tree([('a', b11), ('b', b21)])
+        tree2 = self.commit_tree([('c', b12), ('d', b22)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, b11.id), ('c', F, b12.id)),
+           TreeChange(CHANGE_RENAME, ('b', F, b21.id), ('d', F, b22.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_content_rename_one_to_one_ordering(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\ne\nf\n')
+        blob2 = make_object(Blob, data='a\nb\nc\nd\ng\nh\n')
+        # 6/10 match to blob1, 8/10 match to blob2
+        blob3 = make_object(Blob, data='a\nb\nc\nd\ng\ni\n')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('c', blob3)])
+        self.assertEqual(
+          [TreeChange.delete(('a', F, blob1.id)),
+           TreeChange(CHANGE_RENAME, ('b', F, blob2.id), ('c', F, blob3.id))],
+          self.detect_renames(tree1, tree2))
+
+        tree3 = self.commit_tree([('a', blob2), ('b', blob1)])
+        tree4 = self.commit_tree([('c', blob3)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob2.id), ('c', F, blob3.id)),
+           TreeChange.delete(('b', F, blob1.id))],
+          self.detect_renames(tree3, tree4))
+
+    def test_content_rename_one_to_many(self):
+        blob1 = make_object(Blob, data='aa\nb\nc\nd\ne\n')
+        blob2 = make_object(Blob, data='ab\nb\nc\nd\ne\n')  # 8/11 match
+        blob3 = make_object(Blob, data='aa\nb\nc\nd\nf\n')  # 9/11 match
+        tree1 = self.commit_tree([('a', blob1)])
+        tree2 = self.commit_tree([('b', blob2), ('c', blob3)])
+        self.assertEqual(
+          [TreeChange(CHANGE_COPY, ('a', F, blob1.id), ('b', F, blob2.id)),
+           TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('c', F, blob3.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_content_rename_many_to_one(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='a\nb\nc\ne\n')
+        blob3 = make_object(Blob, data='a\nb\nc\nf\n')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('c', blob3)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('c', F, blob3.id)),
+           TreeChange.delete(('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_content_rename_many_to_many(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='a\nb\nc\ne\n')
+        blob3 = make_object(Blob, data='a\nb\nc\nf\n')
+        blob4 = make_object(Blob, data='a\nb\nc\ng\n')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('c', blob3), ('d', blob4)])
+        # TODO(dborowitz): Distribute renames rather than greedily choosing
+        # copies.
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('c', F, blob3.id)),
+           TreeChange(CHANGE_COPY, ('a', F, blob1.id), ('d', F, blob4.id)),
+           TreeChange.delete(('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2))
+
+    def test_content_rename_gitlink(self):
+        blob1 = make_object(Blob, data='blob1')
+        blob2 = make_object(Blob, data='blob2')
+        link1 = '1' * 40
+        link2 = '2' * 40
+        tree1 = self.commit_tree([('a', blob1), ('b', link1, 0160000)])
+        tree2 = self.commit_tree([('c', blob2), ('d', link2, 0160000)])
+        self.assertEqual(
+          [TreeChange.delete(('a', 0100644, blob1.id)),
+           TreeChange.delete(('b', 0160000, link1)),
+           TreeChange.add(('c', 0100644, blob2.id)),
+           TreeChange.add(('d', 0160000, link2))],
+          self.detect_renames(tree1, tree2))
-- 
1.7.3.2.168.gd6b63




References