dulwich-users team mailing list archive
-
dulwich-users team
-
Mailing list archive
-
Message #00338
[PATCH 14/24] diff_tree: Add optional max_files limit for content rename detection.
From: Dave Borowitz <dborowitz@xxxxxxxxxx>
Change-Id: I61af93ba77810c685c7ed2fa78e5d80fc755cdf3
---
dulwich/diff_tree.py | 14 +++++++++++++-
dulwich/tests/test_diff_tree.py | 18 ++++++++++++++++++
2 files changed, 31 insertions(+), 1 deletions(-)
diff --git a/dulwich/diff_tree.py b/dulwich/diff_tree.py
index 85f78d2..ca80586 100644
--- a/dulwich/diff_tree.py
+++ b/dulwich/diff_tree.py
@@ -43,6 +43,7 @@ _NULL_ENTRY = TreeEntry(None, None, None)
_MAX_SCORE = 100
_RENAME_THRESHOLD = 60
+_MAX_FILES = 200
class TreeChange(TreeChangeTuple):
@@ -260,7 +261,7 @@ class RenameDetector(object):
"""Object for handling rename detection between two trees."""
def __init__(self, store, tree1_id, tree2_id,
- rename_threshold=_RENAME_THRESHOLD):
+ rename_threshold=_RENAME_THRESHOLD, max_files=_MAX_FILES):
"""Initialize the rename detector.
:param store: An ObjectStore for looking up objects.
@@ -268,11 +269,17 @@ class RenameDetector(object):
:param tree2_id: The SHA of the second Tree.
:param rename_threshold: The threshold similarity score for considering
an add/delete pair to be a rename/copy; see _similarity_score.
+ :param max_files: The maximum number of adds and deletes to consider, or
+ None for no limit. The detector is guaranteed to compare no more
+ than max_files ** 2 add/delete pairs. This limit is provided because
+ rename detection can be quadratic in the project size. If the limit
+ is exceeded, no content rename detection is attempted.
"""
self._tree1_id = tree1_id
self._tree2_id = tree2_id
self._store = store
self._rename_threshold = rename_threshold
+ self._max_files = max_files
self._adds = []
self._deletes = []
@@ -325,6 +332,11 @@ class RenameDetector(object):
# - Compare object sizes before counting blocks.
# - Skip if delete's S_IFMT differs from all adds.
# - Skip if adds or deletes is empty.
+ # Match C git's behavior of not attempting to find content renames if
+ # the matrix size exceeds the threshold.
+ if len(self._adds) * len(self._deletes) > self._max_files ** 2:
+ return
+
candidates = []
for delete in self._deletes:
if S_ISGITLINK(delete.old.mode):
diff --git a/dulwich/tests/test_diff_tree.py b/dulwich/tests/test_diff_tree.py
index b0eba50..91b760f 100644
--- a/dulwich/tests/test_diff_tree.py
+++ b/dulwich/tests/test_diff_tree.py
@@ -406,6 +406,24 @@ class RenameDetectionTest(DiffTestCase):
TreeChange.add(('b', F, blob2.id))],
self.detect_renames(tree1, tree2, rename_threshold=75))
+ def test_content_rename_max_files(self):
+ blob1 = make_object(Blob, data='a\nb\nc\nd')
+ blob4 = make_object(Blob, data='a\nb\nc\ne\n')
+ blob2 = make_object(Blob, data='e\nf\ng\nh\n')
+ blob3 = make_object(Blob, data='e\nf\ng\ni\n')
+ tree1 = self.commit_tree([('a', blob1), ('b', blob2)])
+ tree2 = self.commit_tree([('c', blob3), ('d', blob4)])
+ self.assertEqual(
+ [TreeChange(CHANGE_RENAME, ('a', F, blob1.id), ('d', F, blob4.id)),
+ TreeChange(CHANGE_RENAME, ('b', F, blob2.id), ('c', F, blob3.id))],
+ self.detect_renames(tree1, tree2))
+ self.assertEqual(
+ [TreeChange.delete(('a', F, blob1.id)),
+ TreeChange.delete(('b', F, blob2.id)),
+ TreeChange.add(('c', F, blob3.id)),
+ TreeChange.add(('d', F, blob4.id))],
+ self.detect_renames(tree1, tree2, max_files=1))
+
def test_content_rename_one_to_one(self):
b11 = make_object(Blob, data='a\nb\nc\nd\n')
b12 = make_object(Blob, data='a\nb\nc\ne\n')
--
1.7.3.2.168.gd6b63
References