[PATCH 03/33] pack: Compute CRC32 during object unpacking.
From: Dave Borowitz <ddborowitz@xxxxxxxxx>
This avoids some seeking and rereading during object iteration. We
still have to seek backwards over the unused data, but we reread
considerably less data than before.
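
The incremental checksum relies on binascii.crc32's optional running-value
argument. A minimal standalone sketch of the pattern (the helper name and the
read callable are illustrative, not part of this patch):

    import binascii

    def rolling_crc32(read, chunk_size=4096):
        # Fold each chunk into a running CRC32 instead of buffering the
        # whole stream and checksumming it in a second pass.
        crc = 0
        while True:
            chunk = read(chunk_size)
            if not chunk:
                break
            crc = binascii.crc32(chunk, crc)
        # crc32 can return a signed value; mask to the unsigned form, as
        # the patch does for its final result.
        return crc & 0xffffffff

For instance, rolling_crc32(StringIO('abc').read) equals
zlib.crc32('abc') & 0xffffffff.
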
Change-Id: If1f92c73fcf7f66de0220406e4bf17f80c047bf7
---
dulwich/pack.py | 101 ++++++++++++++++++++++++++++++++------------
dulwich/server.py | 2 +-
dulwich/tests/test_pack.py | 13 ++++-
3 files changed, 85 insertions(+), 31 deletions(-)
diff --git a/dulwich/pack.py b/dulwich/pack.py
index be89a58..7b04065 100644
--- a/dulwich/pack.py
+++ b/dulwich/pack.py
@@ -35,6 +35,7 @@ try:
except ImportError:
from dulwich._compat import defaultdict
+import binascii
from cStringIO import (
StringIO,
)
@@ -92,18 +93,21 @@ REF_DELTA = 7
DELTA_TYPES = (OFS_DELTA, REF_DELTA)
-def take_msb_bytes(read):
+def take_msb_bytes(read, crc32=None):
"""Read bytes marked with most significant bit.
:param read: Read function
+ :param crc32: If not None, a starting CRC32 to update over the bytes read.
+ :return: Tuple of (list of byte values read, updated CRC32 or None).
"""
ret = []
while len(ret) == 0 or ret[-1] & 0x80:
- ret.append(ord(read(1)))
- return ret
+ b = read(1)
+ if crc32 is not None:
+ crc32 = binascii.crc32(b, crc32)
+ ret.append(ord(b))
+ return ret, crc32
-def read_zlib_chunks(read_some, dec_size, buffer_size=4096):
+def read_zlib_chunks(read_some, dec_size, buffer_size=4096, crc32=None):
"""Read zlib data from a buffer.
This function requires that the buffer have additional data following the
@@ -113,28 +117,44 @@ def read_zlib_chunks(read_some, dec_size, buffer_size=4096):
return less than the requested size
:param dec_size: Expected size of the decompressed buffer
:param buffer_size: Size of the read buffer
- :return: Tuple with list of chunks, length of compressed data length and
- and unused read data.
+ :param crc32: If not None, the CRC32 of the compressed bytes will be
+ computed using this starting CRC32. If None, CRC32 computations will
+ not be done, and the returned CRC32 will be None.
+ :return: Tuple of (
+ list of uncompressed chunks,
+ length of compressed data,
+ crc32 of compressed data,
+ unused read data,
+ ).
:raise zlib.error: if a decompression error occurred.
"""
if dec_size <= -1:
raise ValueError('non-negative zlib data stream size expected')
obj = zlib.decompressobj()
ret = []
- fed = 0
size = 0
- while obj.unused_data == '':
+ comp_size = 0
+ while True:
add = read_some(buffer_size)
if not add:
raise zlib.error('EOF before end of zlib stream')
- fed += len(add)
+ comp_size += len(add)
decomp = obj.decompress(add)
size += len(decomp)
ret.append(decomp)
+ unused = obj.unused_data
+ if unused:
+ left = len(unused)
+ comp_size -= left
+ if crc32 is not None:
+ crc32 = binascii.crc32(add[:-left], crc32)
+ break
+ elif crc32 is not None:
+ crc32 = binascii.crc32(add, crc32)
+
if size != dec_size:
raise zlib.error('decompressed data does not match expected size')
- comp_len = fed - len(obj.unused_data)
- return ret, comp_len, obj.unused_data
+ return ret, comp_size, crc32, unused
def iter_sha1(iter):
@@ -535,26 +555,40 @@ def chunks_length(chunks):
return sum(imap(len, chunks))
-def unpack_object(read_all, read_some=None):
+def unpack_object(read_all, read_some=None, compute_crc32=False):
"""Unpack a Git object.
:param read_all: Read function that blocks until the number of requested
bytes are read.
:param read_some: Read function that returns at least one byte, but may not
return the number of bytes requested.
- :return: A tuple of (type number, uncompressed data,
- length of compressed data, compressed data, unused read data).
+ :param compute_crc32: If True, compute the CRC32 of the compressed data. If
+ False, the returned CRC32 will be None.
+ :return: A tuple of (
+ type number,
+ uncompressed data,
+ length of compressed data,
+ CRC32 of compressed data,
+ unused read data,
+ ).
+ For delta types, the uncompressed data is a tuple of
+ (base, uncompressed chunks).
"""
if read_some is None:
read_some = read_all
- bytes = take_msb_bytes(read_all)
+ if compute_crc32:
+ crc32 = 0
+ else:
+ crc32 = None
+
+ bytes, crc32 = take_msb_bytes(read_all, crc32=crc32)
type_num = (bytes[0] >> 4) & 0x07
size = bytes[0] & 0x0f
for i, byte in enumerate(bytes[1:]):
size += (byte & 0x7f) << ((i * 7) + 4)
raw_base = len(bytes)
if type_num == OFS_DELTA:
- bytes = take_msb_bytes(read_all)
+ bytes, crc32 = take_msb_bytes(read_all, crc32=crc32)
raw_base += len(bytes)
assert not (bytes[-1] & 0x80)
delta_base_offset = bytes[0] & 0x7f
@@ -565,16 +599,21 @@ def unpack_object(read_all, read_some=None):
base = delta_base_offset
elif type_num == REF_DELTA:
base = read_all(20)
+ if compute_crc32:
+ crc32 = binascii.crc32(base, crc32)
raw_base += 20
else:
base = None
- uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
+ uncomp, comp_len, crc32, unused = read_zlib_chunks(read_some, size,
+ crc32=crc32)
+ if compute_crc32:
+ crc32 &= 0xffffffff
comp_len += raw_base
if base is None:
- return type_num, uncomp, comp_len, unused
+ return type_num, uncomp, comp_len, crc32, unused
else:
- return type_num, (base, uncomp), comp_len, unused
+ return type_num, (base, uncomp), comp_len, crc32, unused
def _compute_object_size((num, obj)):
@@ -667,9 +706,17 @@ class PackStreamReader(object):
def __len__(self):
return self._num_objects
- def read_objects(self):
+ def read_objects(self, compute_crc32=False):
"""Read the objects in this pack file.
+ :param compute_crc32: If True, compute the CRC32 of the compressed
+ data. If False, the returned CRC32 will be None.
+ :yield: Tuples of (
+ type number,
+ list of uncompressed chunks,
+ length of compressed data,
+ crc32 of compressed data,
+ ).
:raise AssertionError: if there is an error in the pack format.
:raise ChecksumMismatch: if the checksum of the pack contents does not
match the checksum in the pack trailer.
@@ -678,8 +725,9 @@ class PackStreamReader(object):
"""
pack_version, self._num_objects = read_pack_header(self.read)
for i in xrange(self._num_objects):
- type, uncomp, comp_len, unused = unpack_object(self.read, self.recv)
- yield type, uncomp, comp_len
+ type_num, uncomp, comp_len, crc32, unused = unpack_object(
+ self.read, read_some=self.recv, compute_crc32=compute_crc32)
+ yield type_num, uncomp, comp_len, crc32
# prepend any unused data to current read buffer
buf = StringIO()
@@ -712,11 +760,10 @@ class PackObjectIterator(object):
def next(self):
if self.i == self.num:
raise StopIteration
- self.map.seek(self.offset)
- (type, obj, total_size, unused) = unpack_object(self.map.read)
- self.map.seek(self.offset)
- crc32 = zlib.crc32(self.map.read(total_size)) & 0xffffffff
- ret = (self.offset, type, obj, crc32)
+ self.map.seek(self.offset) # Back up over unused data.
+ type_num, obj, total_size, crc32, unused = unpack_object(
+ self.map.read, compute_crc32=True)
+ ret = (self.offset, type_num, obj, crc32)
self.offset += total_size
if self._progress is not None:
self._progress(self.i, self.num)
diff --git a/dulwich/server.py b/dulwich/server.py
index 00da55b..780ba51 100644
--- a/dulwich/server.py
+++ b/dulwich/server.py
@@ -140,7 +140,7 @@ class PackStreamCopier(PackStreamReader):
See PackStreamReader.iterobjects for a list of exceptions this may
throw.
"""
- for _, _, _ in self.read_objects():
+ for _ in self.read_objects():
pass
diff --git a/dulwich/tests/test_pack.py b/dulwich/tests/test_pack.py
index a56655b..f1ec1fe 100644
--- a/dulwich/tests/test_pack.py
+++ b/dulwich/tests/test_pack.py
@@ -508,17 +508,24 @@ class ReadZlibTests(TestCase):
def test_decompress_empty(self):
comp = zlib.compress('')
read = StringIO(comp + self.extra).read
- decomp, comp_len, unused_data = read_zlib_chunks(read, 0)
+ decomp, comp_len, crc32, unused_data = read_zlib_chunks(read, 0,
+ crc32=0)
self.assertEqual('', ''.join(decomp))
self.assertEqual(len(comp), comp_len)
self.assertNotEquals('', unused_data)
self.assertEquals(self.extra, unused_data + read())
+ def test_decompress_no_crc32(self):
+ _, _, crc32, _ = read_zlib_chunks(
+ self.read, len(self.decomp), buffer_size=4096)
+ self.assertEquals(None, crc32)
+
def _do_decompress_test(self, buffer_size):
- decomp, comp_len, unused_data = read_zlib_chunks(
- self.read, len(self.decomp), buffer_size=buffer_size)
+ decomp, comp_len, crc32, unused_data = read_zlib_chunks(
+ self.read, len(self.decomp), buffer_size=buffer_size, crc32=0)
self.assertEquals(self.decomp, ''.join(decomp))
self.assertEquals(len(self.comp), comp_len)
+ self.assertEquals(crc32, zlib.crc32(self.comp))
self.assertNotEquals('', unused_data)
self.assertEquals(self.extra, unused_data + self.read())
--
1.7.3.1
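
A hypothetical caller of the new interface, for illustration only (it assumes
a Python 2 dulwich tree with this series applied; the packed blob entry is
hand-built rather than taken from a real pack):

    import zlib
    from cStringIO import StringIO
    from dulwich.pack import unpack_object

    data = 'hello'
    # A non-delta packed object: one varint header byte (type in bits 4-6,
    # size in the low 4 bits) followed by the zlib-compressed contents, plus
    # trailing bytes so the end of the zlib stream can be detected.
    entry = chr((3 << 4) | len(data)) + zlib.compress(data) + 'trailing'
    type_num, uncomp, comp_len, crc32, unused = unpack_object(
        StringIO(entry).read, compute_crc32=True)
    assert ''.join(uncomp) == data
    assert crc32 == zlib.crc32(entry[:comp_len]) & 0xffffffff

The second assertion holds because the returned CRC32 covers the object header
bytes as well as the compressed data, matching what PackObjectIterator
previously computed by rereading the map.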