← Back to team overview

dulwich-users team mailing list archive

[PATCH 24/24] diff_tree: Add find_copies_harder to consider unmodified files.

 

From: Dave Borowitz <dborowitz@xxxxxxxxxx>

Change-Id: Ic2ed78f174c9f25e184a45c5ee9991fca330ba84
---
 dulwich/diff_tree.py            |   43 ++++++++++++++++++++++++++-----
 dulwich/tests/test_diff_tree.py |   53 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+), 7 deletions(-)

diff --git a/dulwich/diff_tree.py b/dulwich/diff_tree.py
index 9a16298..af7f259 100644
--- a/dulwich/diff_tree.py
+++ b/dulwich/diff_tree.py
@@ -287,7 +287,8 @@ class RenameDetector(object):
 
     def __init__(self, store, tree1_id, tree2_id,
                  rename_threshold=_RENAME_THRESHOLD, max_files=_MAX_FILES,
-                 rewrite_threshold=_REWRITE_THRESHOLD):
+                 rewrite_threshold=_REWRITE_THRESHOLD,
+                 find_copies_harder=False):
         """Initialize the rename detector.
 
         :param store: An ObjectStore for looking up objects.
@@ -303,6 +304,8 @@ class RenameDetector(object):
         :param rewrite_threshold: The threshold similarity score below which a
             modify should be considered a delete/add, or None to not break
             modifies; see _similarity_score.
+        :param find_copies_harder: If True, consider unmodified files when
+            detecting copies.
         """
         self._tree1_id = tree1_id
         self._tree2_id = tree2_id
@@ -310,6 +313,7 @@ class RenameDetector(object):
         self._rename_threshold = rename_threshold
         self._rewrite_threshold = rewrite_threshold
         self._max_files = max_files
+        self._find_copies_harder = find_copies_harder
 
         self._adds = []
         self._deletes = []
@@ -324,7 +328,8 @@ class RenameDetector(object):
         return _similarity_score(old_obj, new_obj) < self._rewrite_threshold
 
     def _collect_changes(self):
-        for change in tree_changes(self._store, self._tree1_id, self._tree2_id):
+        for change in tree_changes(self._store, self._tree1_id, self._tree2_id,
+                                   want_unchanged=self._find_copies_harder):
             if change.type == CHANGE_ADD:
                 self._adds.append(change)
             elif change.type == CHANGE_DELETE:
@@ -332,6 +337,11 @@ class RenameDetector(object):
             elif self._should_split(change):
                 self._deletes.append(TreeChange.delete(change.old))
                 self._adds.append(TreeChange.add(change.new))
+            elif (self._find_copies_harder and (
+              change.type == CHANGE_MODIFY or change.type == CHANGE_UNCHANGED)):
+                # Treat modified/unchanged as deleted rather than splitting it,
+                # to avoid spurious renames.
+                self._deletes.append(change)
             else:
                 self._changes.append(change)
 
@@ -346,22 +356,27 @@ class RenameDetector(object):
             add_map[add.new.sha].append(add.new)
         delete_map = defaultdict(list)
         for delete in self._deletes:
-            delete_map[delete.old.sha].append(delete.old)
+            # Keep track of whether the delete was actually marked as a delete.
+            # If not, it must have been added due to find_copies_harder, and
+            # needs to be marked as a copy.
+            is_delete = delete.type == CHANGE_DELETE
+            delete_map[delete.old.sha].append((delete.old, is_delete))
 
         add_paths = set()
         delete_paths = set()
         for sha, sha_deletes in delete_map.iteritems():
             sha_adds = add_map[sha]
-            for old, new in itertools.izip(sha_deletes, sha_adds):
+            for (old, is_delete), new in itertools.izip(sha_deletes, sha_adds):
                 if stat.S_IFMT(old.mode) != stat.S_IFMT(new.mode):
                     continue
                 delete_paths.add(old.path)
                 add_paths.add(new.path)
-                self._changes.append(TreeChange(CHANGE_RENAME, old, new))
+                new_type = is_delete and CHANGE_RENAME or CHANGE_COPY
+                self._changes.append(TreeChange(new_type, old, new))
 
             num_extra_adds = len(sha_adds) - len(sha_deletes)
             # TODO(dborowitz): Less arbitrary way of dealing with extra copies.
-            old = sha_deletes[0]
+            old = sha_deletes[0][0]
             if num_extra_adds:
                 for new in sha_adds[-num_extra_adds:]:
                     add_paths.add(new.path)
@@ -397,6 +412,11 @@ class RenameDetector(object):
                         # If the paths match, this must be a split modify, so
                         # make sure it comes out as a modify.
                         new_type = CHANGE_MODIFY
+                    elif delete.type != CHANGE_DELETE:
+                        # If it's in deletes but not marked as a delete, it must
+                        # have been added due to find_copies_harder, and needs
+                        # to be marked as a copy.
+                        new_type = CHANGE_COPY
                     else:
                         new_type = CHANGE_RENAME
                     rename = TreeChange(new_type, delete.old, add.new)
@@ -412,9 +432,14 @@ class RenameDetector(object):
             if new_path in add_paths:
                 continue
             old_path = change.old.path
+            orig_type = change.type
             if old_path in delete_paths:
                 change = TreeChange(CHANGE_COPY, change.old, change.new)
-            delete_paths.add(old_path)
+
+            # If the candidate was originally a copy, that means it came from a
+            # modified or unchanged path, so we don't want to prune it.
+            if orig_type != CHANGE_COPY:
+                delete_paths.add(old_path)
             add_paths.add(new_path)
             self._changes.append(change)
         self._prune(add_paths, delete_paths)
@@ -444,12 +469,16 @@ class RenameDetector(object):
         result.sort(key=_tree_change_key)
         return result
 
+    def _prune_unchanged(self):
+        self._deletes = [d for d in self._deletes if d.type != CHANGE_UNCHANGED]
+
     def changes_with_renames(self):
         """Iterate TreeChanges between the two trees, with rename detection."""
         self._collect_changes()
         self._find_exact_renames()
         self._find_content_renames()
         self._join_modifies()
+        self._prune_unchanged()
         return self._sorted_changes()
 
 
diff --git a/dulwich/tests/test_diff_tree.py b/dulwich/tests/test_diff_tree.py
index ca0ee18..92b75c9 100644
--- a/dulwich/tests/test_diff_tree.py
+++ b/dulwich/tests/test_diff_tree.py
@@ -615,3 +615,56 @@ class RenameDetectionTest(DiffTestCase):
           [TreeChange.add(('a', F, blob3.id)),
            TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob2.id))],
           self.detect_renames(tree1, tree2, rewrite_threshold=80))
+
+    def test_find_copies_harder_exact(self):
+        blob = make_object(Blob, data='blob')
+        tree1 = self.commit_tree([('a', blob)])
+        tree2 = self.commit_tree([('a', blob), ('b', blob)])
+        self.assertEqual([TreeChange.add(('b', F, blob.id))],
+                         self.detect_renames(tree1, tree2))
+        self.assertEqual(
+          [TreeChange(CHANGE_COPY, ('a', F, blob.id), ('b', F, blob.id))],
+          self.detect_renames(tree1, tree2, find_copies_harder=True))
+
+    def test_find_copies_harder_content(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='a\nb\nc\ne\n')
+        tree1 = self.commit_tree([('a', blob1)])
+        tree2 = self.commit_tree([('a', blob1), ('b', blob2)])
+        self.assertEqual([TreeChange.add(('b', F, blob2.id))],
+                         self.detect_renames(tree1, tree2))
+        self.assertEqual(
+          [TreeChange(CHANGE_COPY, ('a', F, blob1.id), ('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2, find_copies_harder=True))
+
+    def test_find_copies_harder_modify(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='a\nb\nc\ne\n')
+        tree1 = self.commit_tree([('a', blob1)])
+        tree2 = self.commit_tree([('a', blob2), ('b', blob2)])
+        self.assertEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', F, blob1.id), ('a', F, blob2.id)),
+           TreeChange.add(('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2))
+        self.assertEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', F, blob1.id), ('a', F, blob2.id)),
+           TreeChange(CHANGE_COPY, ('a', F, blob1.id), ('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2, find_copies_harder=True))
+
+    def test_find_copies_harder_with_rewrites(self):
+        blob_a1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob_a2 = make_object(Blob, data='f\ng\nh\ni\n')
+        blob_b2 = make_object(Blob, data='a\nb\nc\ne\n')
+        tree1 = self.commit_tree([('a', blob_a1)])
+        tree2 = self.commit_tree([('a', blob_a2), ('b', blob_b2)])
+        self.assertEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', F, blob_a1.id),
+                      ('a', F, blob_a2.id)),
+           TreeChange(CHANGE_COPY, ('a', F, blob_a1.id), ('b', F, blob_b2.id))],
+          self.detect_renames(tree1, tree2, find_copies_harder=True))
+        self.assertEqual(
+          [TreeChange.add(('a', F, blob_a2.id)),
+           TreeChange(CHANGE_RENAME, ('a', F, blob_a1.id),
+                      ('b', F, blob_b2.id))],
+          self.detect_renames(tree1, tree2, rewrite_threshold=50,
+                              find_copies_harder=True))
-- 
1.7.3.2.168.gd6b63




References