dulwich-users team mailing list archive

Thread
Date
[PATCH 09/28] diff: Add a function to get the similarity score between objects.

To: dulwich-users@xxxxxxxxxxxxxxxxxxx
From: dborowitz@xxxxxxxxxx
Date: Thu, 2 Dec 2010 16:22:00 -0800
In-reply-to: <1291335739-9005-1-git-send-email-dborowitz@google.com>
From: Dave Borowitz <dborowitz@xxxxxxxxxx>

Change-Id: I0ba1002e50ed1fb51bda3fde63e3b724b8ca87b2
---
 dulwich/diff.py            |   40 ++++++++++++++++++++++++++++++++++++++++
 dulwich/tests/test_diff.py |   38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+), 0 deletions(-)

diff --git a/dulwich/diff.py b/dulwich/diff.py
index 0d5e9a5..00ed08a 100644
--- a/dulwich/diff.py
+++ b/dulwich/diff.py
@@ -40,6 +40,8 @@ CHANGE_UNCHANGED = 'unchanged'
 
 _NULL_ENTRY = TreeEntry(None, None, None)
 
+_MAX_SCORE = 100
+
 
 class TreeChange(TreeChangeTuple):
     """Class encapsulating a single change between two trees."""
@@ -194,3 +196,41 @@ def _count_blocks(obj):
     if last_block:
         block_counts[last_block] += 1
     return block_counts
+
+
+def _common_bytes(blocks1, blocks2):
+    """Count the number of common bytes in two block count dicts."""
+    # Iterate over the smaller of the two dicts, since this is symmetrical.
+    if len(blocks1) > len(blocks2):
+        blocks1, blocks2 = blocks2, blocks1
+    score = 0
+    for block, count1 in blocks1.iteritems():
+        count2 = blocks2.get(block)
+        if count2:
+            score += min(count1, count2) * len(block)
+    return score
+
+
+def _similarity_score(obj1, obj2, block_cache=None):
+    """Compute a similarity score for two objects.
+
+    :param obj1: The first object to score.
+    :param obj2: The second object to score.
+    :param block_cache: An optional dict of SHA to block counts to cache results
+        between calls.
+    :return: The similarity score between the two objects, defined as the number
+        of bytes in common between the two objects divided by the maximum size,
+        scaled to the range 0-100.
+    """
+    if block_cache is None:
+        block_cache = {}
+    if obj1.id not in block_cache:
+        block_cache[obj1.id] = _count_blocks(obj1)
+    if obj2.id not in block_cache:
+        block_cache[obj2.id] = _count_blocks(obj2)
+
+    common_bytes = _common_bytes(block_cache[obj1.id], block_cache[obj2.id])
+    max_size = max(obj1.raw_length(), obj2.raw_length())
+    if not max_size:
+        return _MAX_SCORE
+    return int(float(common_bytes) * _MAX_SCORE / max_size)
diff --git a/dulwich/tests/test_diff.py b/dulwich/tests/test_diff.py
index d1054e2..260301f 100644
--- a/dulwich/tests/test_diff.py
+++ b/dulwich/tests/test_diff.py
@@ -28,6 +28,7 @@ from dulwich.diff import (
     _merge_entries,
     tree_changes,
     _count_blocks,
+    _similarity_score,
     )
 from dulwich.index import (
     commit_tree,
@@ -271,3 +272,40 @@ class RenameDetectionTest(TestCase):
         blob = make_object(Blob, data=data)
         self.assertEqual({'a' * 64: 2, 'xxx\n': 1, 'y\n': 1, 'zzz\n': 1},
                          _count_blocks(blob))
+
+    def assertSimilar(self, expected_score, blob1, blob2):
+        self.assertEqual(expected_score, _similarity_score(blob1, blob2))
+        self.assertEqual(expected_score, _similarity_score(blob2, blob1))
+
+    def test_similarity_score(self):
+        blob0 = make_object(Blob, data='')
+        blob1 = make_object(Blob, data='ab\ncd\ncd\n')
+        blob2 = make_object(Blob, data='ab\n')
+        blob3 = make_object(Blob, data='cd\n')
+        blob4 = make_object(Blob, data='cd\ncd\n')
+
+        self.assertSimilar(100, blob0, blob0)
+        self.assertSimilar(0, blob0, blob1)
+        self.assertSimilar(33, blob1, blob2)
+        self.assertSimilar(33, blob1, blob3)
+        self.assertSimilar(66, blob1, blob4)
+        self.assertSimilar(0, blob2, blob3)
+        self.assertSimilar(50, blob3, blob4)
+
+    def test_similarity_score_cache(self):
+        blob1 = make_object(Blob, data='ab\ncd\n')
+        blob2 = make_object(Blob, data='ab\n')
+
+        block_cache = {}
+        self.assertEqual(
+          50, _similarity_score(blob1, blob2, block_cache=block_cache))
+        self.assertEqual(set([blob1.id, blob2.id]), set(block_cache))
+
+        def fail_chunks():
+            self.fail('Unexpected call to as_raw_chunks()')
+
+        blob1.as_raw_chunks = blob2.as_raw_chunks = fail_chunks
+        blob1.raw_length = lambda: 6
+        blob2.raw_length = lambda: 3
+        self.assertEqual(
+          50, _similarity_score(blob1, blob2, block_cache=block_cache))
-- 
1.7.3.2.168.gd6b63
References

[PATCH 00/28] Tree diffing and rename detection.
From: dborowitz, 2010-12-03