dulwich-users team mailing list archive
-
dulwich-users team
-
Mailing list archive
-
Message #00287
[PATCH 09/28] diff: Add a function to get the similarity score between objects.
From: Dave Borowitz <dborowitz@xxxxxxxxxx>
Change-Id: I0ba1002e50ed1fb51bda3fde63e3b724b8ca87b2
---
dulwich/diff.py | 40 ++++++++++++++++++++++++++++++++++++++++
dulwich/tests/test_diff.py | 38 ++++++++++++++++++++++++++++++++++++++
2 files changed, 78 insertions(+), 0 deletions(-)
diff --git a/dulwich/diff.py b/dulwich/diff.py
index 0d5e9a5..00ed08a 100644
--- a/dulwich/diff.py
+++ b/dulwich/diff.py
@@ -40,6 +40,8 @@ CHANGE_UNCHANGED = 'unchanged'
_NULL_ENTRY = TreeEntry(None, None, None)
+_MAX_SCORE = 100
+
class TreeChange(TreeChangeTuple):
"""Class encapsulating a single change between two trees."""
@@ -194,3 +196,41 @@ def _count_blocks(obj):
if last_block:
block_counts[last_block] += 1
return block_counts
+
+
+def _common_bytes(blocks1, blocks2):
+ """Count the number of common bytes in two block count dicts."""
+ # Iterate over the smaller of the two dicts, since this is symmetrical.
+ if len(blocks1) > len(blocks2):
+ blocks1, blocks2 = blocks2, blocks1
+ score = 0
+ for block, count1 in blocks1.iteritems():
+ count2 = blocks2.get(block)
+ if count2:
+ score += min(count1, count2) * len(block)
+ return score
+
+
+def _similarity_score(obj1, obj2, block_cache=None):
+ """Compute a similarity score for two objects.
+
+ :param obj1: The first object to score.
+ :param obj2: The second object to score.
+ :param block_cache: An optional dict of SHA to block counts to cache results
+ between calls.
+ :return: The similarity score between the two objects, defined as the number
+ of bytes in common between the two objects divided by the maximum size,
+ scaled to the range 0-100.
+ """
+ if block_cache is None:
+ block_cache = {}
+ if obj1.id not in block_cache:
+ block_cache[obj1.id] = _count_blocks(obj1)
+ if obj2.id not in block_cache:
+ block_cache[obj2.id] = _count_blocks(obj2)
+
+ common_bytes = _common_bytes(block_cache[obj1.id], block_cache[obj2.id])
+ max_size = max(obj1.raw_length(), obj2.raw_length())
+ if not max_size:
+ return _MAX_SCORE
+ return int(float(common_bytes) * _MAX_SCORE / max_size)
diff --git a/dulwich/tests/test_diff.py b/dulwich/tests/test_diff.py
index d1054e2..260301f 100644
--- a/dulwich/tests/test_diff.py
+++ b/dulwich/tests/test_diff.py
@@ -28,6 +28,7 @@ from dulwich.diff import (
_merge_entries,
tree_changes,
_count_blocks,
+ _similarity_score,
)
from dulwich.index import (
commit_tree,
@@ -271,3 +272,40 @@ class RenameDetectionTest(TestCase):
blob = make_object(Blob, data=data)
self.assertEqual({'a' * 64: 2, 'xxx\n': 1, 'y\n': 1, 'zzz\n': 1},
_count_blocks(blob))
+
+ def assertSimilar(self, expected_score, blob1, blob2):
+ self.assertEqual(expected_score, _similarity_score(blob1, blob2))
+ self.assertEqual(expected_score, _similarity_score(blob2, blob1))
+
+ def test_similarity_score(self):
+ blob0 = make_object(Blob, data='')
+ blob1 = make_object(Blob, data='ab\ncd\ncd\n')
+ blob2 = make_object(Blob, data='ab\n')
+ blob3 = make_object(Blob, data='cd\n')
+ blob4 = make_object(Blob, data='cd\ncd\n')
+
+ self.assertSimilar(100, blob0, blob0)
+ self.assertSimilar(0, blob0, blob1)
+ self.assertSimilar(33, blob1, blob2)
+ self.assertSimilar(33, blob1, blob3)
+ self.assertSimilar(66, blob1, blob4)
+ self.assertSimilar(0, blob2, blob3)
+ self.assertSimilar(50, blob3, blob4)
+
+ def test_similarity_score_cache(self):
+ blob1 = make_object(Blob, data='ab\ncd\n')
+ blob2 = make_object(Blob, data='ab\n')
+
+ block_cache = {}
+ self.assertEqual(
+ 50, _similarity_score(blob1, blob2, block_cache=block_cache))
+ self.assertEqual(set([blob1.id, blob2.id]), set(block_cache))
+
+ def fail_chunks():
+ self.fail('Unexpected call to as_raw_chunks()')
+
+ blob1.as_raw_chunks = blob2.as_raw_chunks = fail_chunks
+ blob1.raw_length = lambda: 6
+ blob2.raw_length = lambda: 3
+ self.assertEqual(
+ 50, _similarity_score(blob1, blob2, block_cache=block_cache))
--
1.7.3.2.168.gd6b63
References