← Back to team overview

dulwich-users team mailing list archive

[PATCH 03/33] pack: Compute CRC32 during object unpacking.

 

From: Dave Borowitz <ddborowitz@xxxxxxxxx>

This avoids some seeking and rereading during object iteration. We
still have to seek backwards over the unused data, but we reread
considerably less data than before.

Change-Id: If1f92c73fcf7f66de0220406e4bf17f80c047bf7
---
 dulwich/pack.py            |  101 ++++++++++++++++++++++++++++++++------------
 dulwich/server.py          |    2 +-
 dulwich/tests/test_pack.py |   13 ++++-
 3 files changed, 85 insertions(+), 31 deletions(-)

diff --git a/dulwich/pack.py b/dulwich/pack.py
index be89a58..7b04065 100644
--- a/dulwich/pack.py
+++ b/dulwich/pack.py
@@ -35,6 +35,7 @@ try:
 except ImportError:
     from dulwich._compat import defaultdict
 
+import binascii
 from cStringIO import (
     StringIO,
     )
@@ -92,18 +93,21 @@ REF_DELTA = 7
 DELTA_TYPES = (OFS_DELTA, REF_DELTA)
 
 
-def take_msb_bytes(read):
+def take_msb_bytes(read, crc32=None):
     """Read bytes marked with most significant bit.
 
     :param read: Read function
     """
     ret = []
     while len(ret) == 0 or ret[-1] & 0x80:
-        ret.append(ord(read(1)))
-    return ret
+        b = read(1)
+        if crc32 is not None:
+            crc32 = binascii.crc32(b, crc32)
+        ret.append(ord(b))
+    return ret, crc32
 
 
-def read_zlib_chunks(read_some, dec_size, buffer_size=4096):
+def read_zlib_chunks(read_some, dec_size, buffer_size=4096, crc32=None):
     """Read zlib data from a buffer.
 
     This function requires that the buffer have additional data following the
@@ -113,28 +117,44 @@ def read_zlib_chunks(read_some, dec_size, buffer_size=4096):
         return less than the requested size
     :param dec_size: Expected size of the decompressed buffer
     :param buffer_size: Size of the read buffer
-    :return: Tuple with list of chunks, length of compressed data length and
-        and unused read data.
+    :param crc32: If not None, the CRC32 of the compressed bytes will be
+        computed using this starting CRC32. If None, CRC32 computations will
+        not be done, and the returned CRC32 will be None.
+    :return: Tuple of (
+        list of uncompressed chunks,
+        length of compressed data,
+        crc32 of compressed data,
+        unused read data,
+        ).
     :raise zlib.error: if a decompression error occurred.
     """
     if dec_size <= -1:
         raise ValueError('non-negative zlib data stream size expected')
     obj = zlib.decompressobj()
     ret = []
-    fed = 0
     size = 0
-    while obj.unused_data == '':
+    comp_size = 0
+    while True:
         add = read_some(buffer_size)
         if not add:
             raise zlib.error('EOF before end of zlib stream')
-        fed += len(add)
+        comp_size += len(add)
         decomp = obj.decompress(add)
         size += len(decomp)
         ret.append(decomp)
+        unused = obj.unused_data
+        if unused:
+            left = len(unused)
+            comp_size -= left
+            if crc32 is not None:
+                crc32 = binascii.crc32(add[:-left], crc32)
+            break
+        elif crc32 is not None:
+            crc32 = binascii.crc32(add, crc32)
+
     if size != dec_size:
         raise zlib.error('decompressed data does not match expected size')
-    comp_len = fed - len(obj.unused_data)
-    return ret, comp_len, obj.unused_data
+    return ret, comp_size, crc32, unused
 
 
 def iter_sha1(iter):
@@ -535,26 +555,40 @@ def chunks_length(chunks):
     return sum(imap(len, chunks))
 
 
-def unpack_object(read_all, read_some=None):
+def unpack_object(read_all, read_some=None, compute_crc32=False):
     """Unpack a Git object.
 
     :param read_all: Read function that blocks until the number of requested
         bytes are read.
     :param read_some: Read function that returns at least one byte, but may not
         return the number of bytes requested.
-    :return: A tuple of (type number, uncompressed data,
-        length of compressed data, compressed data, unused read data).
+    :param compute_crc32: If True, compute the CRC32 of the compressed data. If
+        False, the returned CRC32 will be None.
+    :return: A tuple of (
+        type number,
+        uncompressed data,
+        length of compressed data,
+        CRC32 of compressed data,
+        unused read data,
+        ).
+        For delta types, the uncompressed data is a tuple of
+        (base, uncompressed chunks).
     """
     if read_some is None:
         read_some = read_all
-    bytes = take_msb_bytes(read_all)
+    if compute_crc32:
+        crc32 = 0
+    else:
+        crc32 = None
+
+    bytes, crc32 = take_msb_bytes(read_all, crc32=crc32)
     type_num = (bytes[0] >> 4) & 0x07
     size = bytes[0] & 0x0f
     for i, byte in enumerate(bytes[1:]):
         size += (byte & 0x7f) << ((i * 7) + 4)
     raw_base = len(bytes)
     if type_num == OFS_DELTA:
-        bytes = take_msb_bytes(read_all)
+        bytes, crc32 = take_msb_bytes(read_all, crc32=crc32)
         raw_base += len(bytes)
         assert not (bytes[-1] & 0x80)
         delta_base_offset = bytes[0] & 0x7f
@@ -565,16 +599,21 @@ def unpack_object(read_all, read_some=None):
         base = delta_base_offset
     elif type_num == REF_DELTA:
         base = read_all(20)
+        if compute_crc32:
+            crc32 = binascii.crc32(base, crc32)
         raw_base += 20
     else:
         base = None
 
-    uncomp, comp_len, unused = read_zlib_chunks(read_some, size)
+    uncomp, comp_len, crc32, unused = read_zlib_chunks(read_some, size,
+                                                       crc32=crc32)
+    if compute_crc32:
+        crc32 &= 0xffffffff
     comp_len += raw_base
     if base is None:
-        return type_num, uncomp, comp_len, unused
+        return type_num, uncomp, comp_len, crc32, unused
     else:
-        return type_num, (base, uncomp), comp_len, unused
+        return type_num, (base, uncomp), comp_len, crc32, unused
 
 
 def _compute_object_size((num, obj)):
@@ -667,9 +706,17 @@ class PackStreamReader(object):
     def __len__(self):
         return self._num_objects
 
-    def read_objects(self):
+    def read_objects(self, compute_crc32=False):
         """Read the objects in this pack file.
 
+        :param compute_crc32: If True, compute the CRC32 of the compressed
+            data. If False, the returned CRC32 will be None.
+        :yield: Tuples of (
+            type number,
+            list of uncompressed chunks,
+            length of compressed data,
+            crc32 of compressed data,
+            ).
         :raise AssertionError: if there is an error in the pack format.
         :raise ChecksumMismatch: if the checksum of the pack contents does not
             match the checksum in the pack trailer.
@@ -678,8 +725,9 @@ class PackStreamReader(object):
         """
         pack_version, self._num_objects = read_pack_header(self.read)
         for i in xrange(self._num_objects):
-            type, uncomp, comp_len, unused = unpack_object(self.read, self.recv)
-            yield type, uncomp, comp_len
+            type_num, uncomp, comp_len, crc32, unused = unpack_object(
+              self.read, read_some=self.recv, compute_crc32=compute_crc32)
+            yield type_num, uncomp, comp_len, crc32
 
             # prepend any unused data to current read buffer
             buf = StringIO()
@@ -712,11 +760,10 @@ class PackObjectIterator(object):
     def next(self):
         if self.i == self.num:
             raise StopIteration
-        self.map.seek(self.offset)
-        (type, obj, total_size, unused) = unpack_object(self.map.read)
-        self.map.seek(self.offset)
-        crc32 = zlib.crc32(self.map.read(total_size)) & 0xffffffff
-        ret = (self.offset, type, obj, crc32)
+        self.map.seek(self.offset)  # Back up over unused data.
+        type_num, obj, total_size, crc32, unused = unpack_object(
+          self.map.read, compute_crc32=True)
+        ret = (self.offset, type_num, obj, crc32)
         self.offset += total_size
         if self._progress is not None:
             self._progress(self.i, self.num)
diff --git a/dulwich/server.py b/dulwich/server.py
index 00da55b..780ba51 100644
--- a/dulwich/server.py
+++ b/dulwich/server.py
@@ -140,7 +140,7 @@ class PackStreamCopier(PackStreamReader):
         See PackStreamReader.iterobjects for a list of exceptions this may
         throw.
         """
-        for _, _, _ in self.read_objects():
+        for _ in self.read_objects():
             pass
 
 
diff --git a/dulwich/tests/test_pack.py b/dulwich/tests/test_pack.py
index a56655b..f1ec1fe 100644
--- a/dulwich/tests/test_pack.py
+++ b/dulwich/tests/test_pack.py
@@ -508,17 +508,24 @@ class ReadZlibTests(TestCase):
     def test_decompress_empty(self):
         comp = zlib.compress('')
         read = StringIO(comp + self.extra).read
-        decomp, comp_len, unused_data = read_zlib_chunks(read, 0)
+        decomp, comp_len, crc32, unused_data = read_zlib_chunks(read, 0,
+                                                                crc32=0)
         self.assertEqual('', ''.join(decomp))
         self.assertEqual(len(comp), comp_len)
         self.assertNotEquals('', unused_data)
         self.assertEquals(self.extra, unused_data + read())
 
+    def test_decompress_no_crc32(self):
+        _, _, crc32, _ = read_zlib_chunks(
+          self.read, len(self.decomp), buffer_size=4096)
+        self.assertEquals(None, crc32)
+
     def _do_decompress_test(self, buffer_size):
-        decomp, comp_len, unused_data = read_zlib_chunks(
-          self.read, len(self.decomp), buffer_size=buffer_size)
+        decomp, comp_len, crc32, unused_data = read_zlib_chunks(
+          self.read, len(self.decomp), buffer_size=buffer_size, crc32=0)
         self.assertEquals(self.decomp, ''.join(decomp))
         self.assertEquals(len(self.comp), comp_len)
+        self.assertEquals(crc32, zlib.crc32(self.comp))
         self.assertNotEquals('', unused_data)
         self.assertEquals(self.extra, unused_data + self.read())
 
-- 
1.7.3.1



References