dulwich-users team mailing list archive
-
dulwich-users team
-
Mailing list archive
-
Message #00302
[PATCH 24/28] diff: Use hashcodes as block keys instead of strings.
From: Dave Borowitz <dborowitz@xxxxxxxxxx>
This avoids storing essentially the entire contents of every blob in its
block dict, at the cost of some accuracy on hash conflicts.
Change-Id: I54df3f7e8ac32e7147f9b64d9944a514f7700a4f
---
dulwich/diff.py | 18 +++++++++++++-----
dulwich/tests/test_diff.py | 9 +++++----
2 files changed, 18 insertions(+), 9 deletions(-)
diff --git a/dulwich/diff.py b/dulwich/diff.py
index bc250ac..c474d3a 100644
--- a/dulwich/diff.py
+++ b/dulwich/diff.py
@@ -200,7 +200,7 @@ def _count_blocks(obj):
Splits the data into blocks either on lines or <=64-byte chunks of lines.
:param obj: The object to count blocks for.
- :return: A dict of block -> number of occurrences.
+ :return: A dict of block hashcode -> total bytes occurring.
"""
block_counts = defaultdict(int)
block = StringIO()
@@ -216,17 +216,25 @@ def _count_blocks(obj):
block_write(c)
n += 1
if c == '\n' or n == _BLOCK_SIZE:
- block_counts[block_getvalue()] += 1
+ value = block_getvalue()
+ block_counts[hash(value)] += len(value)
block_seek(0)
block_truncate()
n = 0
if n > 0:
- block_counts[block_getvalue()] += 1
+ last_block = block_getvalue()
+ block_counts[hash(last_block)] += len(last_block)
return block_counts
def _common_bytes(blocks1, blocks2):
- """Count the number of common bytes in two block count dicts."""
+ """Count the number of common bytes in two block count dicts.
+
+ :param block1: The first dict of block hashcode -> total bytes.
+ :param block2: The second dict of block hashcode -> total bytes.
+ :return: The number of bytes in common between blocks1 and blocks2. This is
+ only approximate due to possible hash collisions.
+ """
# Iterate over the smaller of the two dicts, since this is symmetrical.
if len(blocks1) > len(blocks2):
blocks1, blocks2 = blocks2, blocks1
@@ -234,7 +242,7 @@ def _common_bytes(blocks1, blocks2):
for block, count1 in blocks1.iteritems():
count2 = blocks2.get(block)
if count2:
- score += min(count1, count2) * len(block)
+ score += min(count1, count2)
return score
diff --git a/dulwich/tests/test_diff.py b/dulwich/tests/test_diff.py
index 38c1285..21312b3 100644
--- a/dulwich/tests/test_diff.py
+++ b/dulwich/tests/test_diff.py
@@ -300,21 +300,22 @@ class RenameDetectionTest(DiffTestCase):
def test_count_blocks(self):
blob = make_object(Blob, data='a\nb\na\n')
- self.assertEqual({'a\n': 2, 'b\n': 1}, _count_blocks(blob))
+ self.assertEqual({hash('a\n'): 4, hash('b\n'): 2}, _count_blocks(blob))
def test_count_blocks_no_newline(self):
blob = make_object(Blob, data='a\na')
- self.assertEqual({'a\n': 1, 'a': 1}, _count_blocks(blob))
+ self.assertEqual({hash('a\n'): 2, hash('a'): 1}, _count_blocks(blob))
def test_count_blocks_chunks(self):
blob = ShaFile.from_raw_chunks(Blob.type_num, ['a\nb', '\na\n'])
- self.assertEqual({'a\n': 2, 'b\n': 1}, _count_blocks(blob))
+ self.assertEqual({hash('a\n'): 4, hash('b\n'): 2}, _count_blocks(blob))
def test_count_blocks_long_lines(self):
a = 'a' * 64
data = a + 'xxx\ny\n' + a + 'zzz\n'
blob = make_object(Blob, data=data)
- self.assertEqual({'a' * 64: 2, 'xxx\n': 1, 'y\n': 1, 'zzz\n': 1},
+ self.assertEqual({hash('a' * 64): 128, hash('xxx\n'): 4, hash('y\n'): 2,
+ hash('zzz\n'): 4},
_count_blocks(blob))
def assertSimilar(self, expected_score, blob1, blob2):
--
1.7.3.2.168.gd6b63
References