← Back to team overview

dulwich-users team mailing list archive

[PATCH 20/24] diff_tree: Use hashcodes as block keys instead of strings.

 

From: Dave Borowitz <dborowitz@xxxxxxxxxx>

This avoids storing essentially the entire contents of every blob in its
block dict, at the cost of some accuracy on hash conflicts.

Change-Id: I54df3f7e8ac32e7147f9b64d9944a514f7700a4f
---
 dulwich/diff_tree.py            |   18 +++++++++++++-----
 dulwich/tests/test_diff_tree.py |    9 +++++----
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/dulwich/diff_tree.py b/dulwich/diff_tree.py
index 842945f..08f7f29 100644
--- a/dulwich/diff_tree.py
+++ b/dulwich/diff_tree.py
@@ -200,7 +200,7 @@ def _count_blocks(obj):
     Splits the data into blocks either on lines or <=64-byte chunks of lines.
 
     :param obj: The object to count blocks for.
-    :return: A dict of block -> number of occurrences.
+    :return: A dict of block hashcode -> total bytes occurring.
     """
     block_counts = defaultdict(int)
     block = StringIO()
@@ -216,17 +216,25 @@ def _count_blocks(obj):
         block_write(c)
         n += 1
         if c == '\n' or n == _BLOCK_SIZE:
-            block_counts[block_getvalue()] += 1
+            value = block_getvalue()
+            block_counts[hash(value)] += len(value)
             block_seek(0)
             block_truncate()
             n = 0
     if n > 0:
-        block_counts[block_getvalue()] += 1
+        last_block = block_getvalue()
+        block_counts[hash(last_block)] += len(last_block)
     return block_counts
 
 
 def _common_bytes(blocks1, blocks2):
-    """Count the number of common bytes in two block count dicts."""
+    """Count the number of common bytes in two block count dicts.
+
+    :param block1: The first dict of block hashcode -> total bytes.
+    :param block2: The second dict of block hashcode -> total bytes.
+    :return: The number of bytes in common between blocks1 and blocks2. This is
+        only approximate due to possible hash collisions.
+    """
     # Iterate over the smaller of the two dicts, since this is symmetrical.
     if len(blocks1) > len(blocks2):
         blocks1, blocks2 = blocks2, blocks1
@@ -234,7 +242,7 @@ def _common_bytes(blocks1, blocks2):
     for block, count1 in blocks1.iteritems():
         count2 = blocks2.get(block)
         if count2:
-            score += min(count1, count2) * len(block)
+            score += min(count1, count2)
     return score
 
 
diff --git a/dulwich/tests/test_diff_tree.py b/dulwich/tests/test_diff_tree.py
index c03367b..77cf7aa 100644
--- a/dulwich/tests/test_diff_tree.py
+++ b/dulwich/tests/test_diff_tree.py
@@ -300,21 +300,22 @@ class RenameDetectionTest(DiffTestCase):
 
     def test_count_blocks(self):
         blob = make_object(Blob, data='a\nb\na\n')
-        self.assertEqual({'a\n': 2, 'b\n': 1}, _count_blocks(blob))
+        self.assertEqual({hash('a\n'): 4, hash('b\n'): 2}, _count_blocks(blob))
 
     def test_count_blocks_no_newline(self):
         blob = make_object(Blob, data='a\na')
-        self.assertEqual({'a\n': 1, 'a': 1}, _count_blocks(blob))
+        self.assertEqual({hash('a\n'): 2, hash('a'): 1}, _count_blocks(blob))
 
     def test_count_blocks_chunks(self):
         blob = ShaFile.from_raw_chunks(Blob.type_num, ['a\nb', '\na\n'])
-        self.assertEqual({'a\n': 2, 'b\n': 1}, _count_blocks(blob))
+        self.assertEqual({hash('a\n'): 4, hash('b\n'): 2}, _count_blocks(blob))
 
     def test_count_blocks_long_lines(self):
         a = 'a' * 64
         data = a + 'xxx\ny\n' + a + 'zzz\n'
         blob = make_object(Blob, data=data)
-        self.assertEqual({'a' * 64: 2, 'xxx\n': 1, 'y\n': 1, 'zzz\n': 1},
+        self.assertEqual({hash('a' * 64): 128, hash('xxx\n'): 4, hash('y\n'): 2,
+                          hash('zzz\n'): 4},
                          _count_blocks(blob))
 
     def assertSimilar(self, expected_score, blob1, blob2):
-- 
1.7.3.2.168.gd6b63




References