← Back to team overview

dulwich-users team mailing list archive

[PATCH 22/28] diff: Consider dissimilar modifies to be delete/add pairs.

 

From: Dave Borowitz <dborowitz@xxxxxxxxxx>

Modifies are split into delete/add pairs if their similarity is below
a given threshold, and are rejoined at the end of the rename detection
process.

Change-Id: I0f11549cc1117be7b8bb67cb584e4101e72a4913
---
 dulwich/diff.py            |   46 +++++++++++++++++++++++++++++++++-
 dulwich/tests/test_diff.py |   58 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 101 insertions(+), 3 deletions(-)

diff --git a/dulwich/diff.py b/dulwich/diff.py
index 88274fd..a1055d3 100644
--- a/dulwich/diff.py
+++ b/dulwich/diff.py
@@ -44,6 +44,7 @@ _NULL_ENTRY = TreeEntry(None, None, None)
 _MAX_SCORE = 100
 _RENAME_THRESHOLD = 60
 _MAX_FILES = 200
+_REWRITE_THRESHOLD = None
 
 
 class TreeChange(TreeChangeTuple):
@@ -268,7 +269,8 @@ class RenameDetector(object):
     """Object for handling rename detection between two trees."""
 
     def __init__(self, store, tree1_id, tree2_id,
-                 rename_threshold=_RENAME_THRESHOLD, max_files=_MAX_FILES):
+                 rename_threshold=_RENAME_THRESHOLD, max_files=_MAX_FILES,
+                 rewrite_threshold=_REWRITE_THRESHOLD):
         """Initialize the rename detector.
 
         :param store: An ObjectStore for looking up objects.
@@ -281,23 +283,38 @@ class RenameDetector(object):
             than max_files ** 2 add/delete pairs. This limit is provided because
             rename detection can be quadratic in the project size. If the limit
             is exceeded, no content rename detection is attempted.
+        :param rewrite_threshold: The threshold similarity score below which a
+            modify should be considered a delete/add, or None to not break
+            modifies; see _similarity_score.
         """
         self._tree1_id = tree1_id
         self._tree2_id = tree2_id
         self._store = store
         self._rename_threshold = rename_threshold
+        self._rewrite_threshold = rewrite_threshold
         self._max_files = max_files
 
         self._adds = []
         self._deletes = []
         self._changes = []
 
+    def _should_split(self, change):
+        if (self._rewrite_threshold is None or change.type != CHANGE_MODIFY or
+            change.old.sha == change.new.sha):
+            return False
+        old_obj = self._store[change.old.sha]
+        new_obj = self._store[change.new.sha]
+        return _similarity_score(old_obj, new_obj) < self._rewrite_threshold
+
     def _collect_changes(self):
         for change in tree_changes(self._store, self._tree1_id, self._tree2_id):
             if change.type == CHANGE_ADD:
                 self._adds.append(change)
             elif change.type == CHANGE_DELETE:
                 self._deletes.append(change)
+            elif self._should_split(change):
+                self._deletes.append(TreeChange.delete(change.old))
+                self._adds.append(TreeChange.add(change.new))
             else:
                 self._changes.append(change)
 
@@ -344,6 +361,7 @@ class RenameDetector(object):
         if len(self._adds) * len(self._deletes) > self._max_files ** 2:
             return
 
+        check_paths = self._rename_threshold is not None
         candidates = []
         for delete in self._deletes:
             if S_ISGITLINK(delete.old.mode):
@@ -358,7 +376,13 @@ class RenameDetector(object):
                 score = _similarity_score(old_obj, new_obj,
                                           block_cache={old_sha: old_blocks})
                 if score > self._rename_threshold:
-                    rename = TreeChange(CHANGE_RENAME, delete.old, add.new)
+                    if check_paths and delete.old.path == add.new.path:
+                        # If the paths match, this must be a split modify, so
+                        # make sure it comes out as a modify.
+                        new_type = CHANGE_MODIFY
+                    else:
+                        new_type = CHANGE_RENAME
+                    rename = TreeChange(new_type, delete.old, add.new)
                     candidates.append((-score, rename))
 
         # Sort scores from highest to lowest, but keep names in ascending order.
@@ -378,6 +402,23 @@ class RenameDetector(object):
             self._changes.append(change)
         self._prune(add_paths, delete_paths)
 
+    def _join_modifies(self):
+        if self._rewrite_threshold is None:
+            return
+
+        modifies = {}
+        delete_map = dict((d.old.path, d) for d in self._deletes)
+        for add in self._adds:
+            path = add.new.path
+            delete = delete_map.get(path)
+            if (delete is not None and
+              stat.S_IFMT(delete.old.mode) == stat.S_IFMT(add.new.mode)):
+                modifies[path] = TreeChange(CHANGE_MODIFY, delete.old, add.new)
+
+        self._adds = [a for a in self._adds if a.new.path not in modifies]
+        self._deletes = [a for a in self._deletes if a.new.path not in modifies]
+        self._changes += modifies.values()
+
     def _sorted_changes(self):
         result = []
         result.extend(self._adds)
@@ -391,6 +432,7 @@ class RenameDetector(object):
         self._collect_changes()
         self._find_exact_renames()
         self._find_content_renames()
+        self._join_modifies()
         return self._sorted_changes()
 
 
diff --git a/dulwich/tests/test_diff.py b/dulwich/tests/test_diff.py
index 4c02659..38c1285 100644
--- a/dulwich/tests/test_diff.py
+++ b/dulwich/tests/test_diff.py
@@ -375,6 +375,16 @@ class RenameDetectionTest(DiffTestCase):
         detector = RenameDetector(self.store, tree1.id, tree2.id, **kwargs)
         return detector.changes_with_renames()
 
+    def test_no_renames(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='a\nb\ne\nf\n')
+        blob3 = make_object(Blob, data='a\nb\ng\nh\n')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('a', blob1), ('b', blob3)])
+        self.assertEqual(
+          [TreeChange(CHANGE_MODIFY, ('b', F, blob2.id), ('b', F, blob3.id))],
+          self.detect_renames(tree1, tree2))
+
     def test_exact_rename_one_to_one(self):
         blob1 = make_object(Blob, data='1')
         blob2 = make_object(Blob, data='2')
@@ -432,7 +442,7 @@ class RenameDetectionTest(DiffTestCase):
            TreeChange(CHANGE_RENAME, ('b', F, blob.id), ('d', F, blob.id))],
           self.detect_renames(tree1, tree2))
 
-    def test_content_rename_threshold(self):
+    def test_rename_threshold(self):
         blob1 = make_object(Blob, data='a\nb\nc\n')
         blob2 = make_object(Blob, data='a\nb\nd\n')
         tree1 = self.commit_tree([('a', blob1)])
@@ -544,3 +554,49 @@ class RenameDetectionTest(DiffTestCase):
            TreeChange.add(('c', 0100644, blob2.id)),
            TreeChange.add(('d', 0160000, link2))],
           self.detect_renames(tree1, tree2))
+
+    def test_exact_rename_swap(self):
+        blob1 = make_object(Blob, data='1')
+        blob2 = make_object(Blob, data='2')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('a', blob2), ('b', blob1)])
+        self.assertEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', F, blob1.id), ('a', F, blob2.id)),
+           TreeChange(CHANGE_MODIFY, ('b', F, blob2.id), ('b', F, blob1.id))],
+          self.detect_renames(tree1, tree2))
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob1.id)),
+           TreeChange(CHANGE_RENAME, ('b', F, blob2.id), ('a', F, blob2.id))],
+          self.detect_renames(tree1, tree2, rewrite_threshold=50))
+
+    def test_content_rename_swap(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='e\nf\ng\nh\n')
+        blob3 = make_object(Blob, data='a\nb\nc\ne\n')
+        blob4 = make_object(Blob, data='e\nf\ng\ni\n')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('a', blob4), ('b', blob3)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob3.id)),
+           TreeChange(CHANGE_RENAME, ('b', F, blob2.id), ('a', F, blob4.id))],
+          self.detect_renames(tree1, tree2, rewrite_threshold=60))
+
+    def test_rewrite_threshold(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd\n')
+        blob2 = make_object(Blob, data='a\nb\nc\ne\n')
+        blob3 = make_object(Blob, data='a\nb\nf\ng\n')
+
+        tree1 = self.commit_tree([('a', blob1)])
+        tree2 = self.commit_tree([('a', blob3), ('b', blob2)])
+
+        no_renames = [
+          TreeChange(CHANGE_MODIFY, ('a', F, blob1.id), ('a', F, blob3.id)),
+          TreeChange.add(('b', F, blob2.id))]
+        self.assertEqual(
+          no_renames, self.detect_renames(tree1, tree2))
+        self.assertEqual(
+          no_renames, self.detect_renames(tree1, tree2, rewrite_threshold=40))
+        self.assertEqual(
+          [TreeChange.add(('a', F, blob3.id)),
+           TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('b', F, blob2.id))],
+          self.detect_renames(tree1, tree2, rewrite_threshold=80))
-- 
1.7.3.2.168.gd6b63




References