← Back to team overview

dulwich-users team mailing list archive

[PATCH 18/28] diff: Add optional max_files limit for content rename detection.

 

From: Dave Borowitz <dborowitz@xxxxxxxxxx>

Change-Id: I61af93ba77810c685c7ed2fa78e5d80fc755cdf3
---
 dulwich/diff.py            |   14 +++++++++++++-
 dulwich/tests/test_diff.py |   18 ++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletions(-)

diff --git a/dulwich/diff.py b/dulwich/diff.py
index 78463c9..6b26528 100644
--- a/dulwich/diff.py
+++ b/dulwich/diff.py
@@ -43,6 +43,7 @@ _NULL_ENTRY = TreeEntry(None, None, None)
 
 _MAX_SCORE = 100
 _RENAME_THRESHOLD = 60
+_MAX_FILES = 200
 
 
 class TreeChange(TreeChangeTuple):
@@ -260,7 +261,7 @@ class RenameDetector(object):
     """Object for handling rename detection between two trees."""
 
     def __init__(self, store, tree1_id, tree2_id,
-                 rename_threshold=_RENAME_THRESHOLD):
+                 rename_threshold=_RENAME_THRESHOLD, max_files=_MAX_FILES):
         """Initialize the rename detector.
 
         :param store: An ObjectStore for looking up objects.
@@ -268,11 +269,17 @@ class RenameDetector(object):
         :param tree2_id: The SHA of the second Tree.
         :param rename_threshold: The threshold similarity score for considering
             an add/delete pair to be a rename/copy; see _similarity_score.
+        :param max_files: The maximum number of adds and deletes to consider, or
+            None for no limit. The detector is guaranteed to compare no more
+            than max_files ** 2 add/delete pairs. This limit is provided because
+            rename detection can be quadratic in the project size. If the limit
+            is exceeded, no content rename detection is attempted.
         """
         self._tree1_id = tree1_id
         self._tree2_id = tree2_id
         self._store = store
         self._rename_threshold = rename_threshold
+        self._max_files = max_files
 
         self._adds = []
         self._deletes = []
@@ -325,6 +332,11 @@ class RenameDetector(object):
         #  - Compare object sizes before counting blocks.
         #  - Skip if delete's S_IFMT differs from all adds.
         #  - Skip if adds or deletes is empty.
+        # Match C git's behavior of not attempting to find content renames if
+        # the matrix size exceeds the threshold.
+        if len(self._adds) * len(self._deletes) > self._max_files ** 2:
+            return
+
         candidates = []
         for delete in self._deletes:
             if S_ISGITLINK(delete.old.mode):
diff --git a/dulwich/tests/test_diff.py b/dulwich/tests/test_diff.py
index 8773654..8d8f65d 100644
--- a/dulwich/tests/test_diff.py
+++ b/dulwich/tests/test_diff.py
@@ -406,6 +406,24 @@ class RenameDetectionTest(DiffTestCase):
            TreeChange.add(('b', F, blob2.id))],
           self.detect_renames(tree1, tree2, rename_threshold=75))
 
+    def test_content_rename_max_files(self):
+        blob1 = make_object(Blob, data='a\nb\nc\nd')
+        blob4 = make_object(Blob, data='a\nb\nc\ne\n')
+        blob2 = make_object(Blob, data='e\nf\ng\nh\n')
+        blob3 = make_object(Blob, data='e\nf\ng\ni\n')
+        tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+        tree2 = self.commit_tree([('c', blob3), ('d', blob4)])
+        self.assertEqual(
+          [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('d', F, blob4.id)),
+           TreeChange(CHANGE_RENAME, ('b', F, blob2.id), ('c', F, blob3.id))],
+          self.detect_renames(tree1, tree2))
+        self.assertEqual(
+          [TreeChange.delete(('a', F, blob1.id)),
+           TreeChange.delete(('b', F, blob2.id)),
+           TreeChange.add(('c', F, blob3.id)),
+           TreeChange.add(('d', F, blob4.id))],
+          self.detect_renames(tree1, tree2, max_files=1))
+
     def test_content_rename_one_to_one(self):
         b11 = make_object(Blob, data='a\nb\nc\nd\n')
         b12 = make_object(Blob, data='a\nb\nc\ne\n')
-- 
1.7.3.2.168.gd6b63




References