dulwich team mailing list archive

Thread
Date
[PATCH 3/3] Correctly avoid parsing ShaFiles with fixed SHAs when calling sha().

To: jelmer@xxxxxxxxx
From: dborowitz@xxxxxxxxxx
Date: Mon, 12 Apr 2010 08:47:51 -0700
Cc: dulwich@xxxxxxxxxxxxxxxxxxx
In-reply-to: <1271087271-16601-1-git-send-email-dborowitz@google.com>
From: Dave Borowitz <dborowitz@xxxxxxxxxx>

This required some reworking of the _needs_* and _sha ivars. Improved
check() to force computing the SHA and verifying that it matches the
previously-set value. Added a test for this check.

Change-Id: I6782693d7d7708bc7c28f357c27419d51409b884
---
 dulwich/_objects.c                                 |    4 +-
 dulwich/objects.py                                 |   47 +++++++++++++++-----
 .../11/11111111111111111111111111111111111111      |  Bin 0 -> 62 bytes
 dulwich/tests/test_objects.py                      |   20 +++++---
 4 files changed, 50 insertions(+), 21 deletions(-)
 create mode 100644 dulwich/tests/data/blobs/11/11111111111111111111111111111111111111

diff --git a/dulwich/_objects.c b/dulwich/_objects.c
index fef82e7..d9699ec 100644
--- a/dulwich/_objects.c
+++ b/dulwich/_objects.c
@@ -60,7 +60,7 @@ static PyObject *py_parse_tree(PyObject *self, PyObject *args)
 		mode = strtol(text, &text, 8);
 
 		if (*text != ' ') {
-			PyErr_SetString(PyExc_RuntimeError, "Expected space");
+			PyErr_SetString(PyExc_ValueError, "Expected space");
 			Py_DECREF(ret);
 			return NULL;
 		}
@@ -76,7 +76,7 @@ static PyObject *py_parse_tree(PyObject *self, PyObject *args)
 		}
 
 		if (text + namelen + 20 >= end) {
-			PyErr_SetString(PyExc_RuntimeError, "SHA truncated");
+			PyErr_SetString(PyExc_ValueError, "SHA truncated");
 			Py_DECREF(ret);
 			Py_DECREF(name);
 			return NULL;
diff --git a/dulwich/objects.py b/dulwich/objects.py
index 27bbf0a..44c28a3 100644
--- a/dulwich/objects.py
+++ b/dulwich/objects.py
@@ -31,6 +31,7 @@ import stat
 import zlib
 
 from dulwich.errors import (
+    ChecksumMismatch,
     NotBlobError,
     NotCommitError,
     NotTagError,
@@ -208,7 +209,7 @@ class ShaFile(object):
     def as_raw_chunks(self):
         if self._needs_parsing:
             self._ensure_parsed()
-        else:
+        elif self._needs_serialization:
             self._chunked_text = self._serialize()
         return self._chunked_text
 
@@ -239,8 +240,9 @@ class ShaFile(object):
 
     def set_raw_chunks(self, chunks):
         self._chunked_text = chunks
+        self._deserialize(chunks)
         self._sha = None
-        self._needs_parsing = True
+        self._needs_parsing = False
         self._needs_serialization = False
 
     @staticmethod
@@ -370,15 +372,22 @@ class ShaFile(object):
         """Check this object for internal consistency.
 
         :raise ObjectFormatException: if the object is malformed in some way
+        :raise ChecksumMismatch: if the object was created with a SHA that does
+            not match its contents
         """
         # TODO: if we find that error-checking during object parsing is a
         # performance bottleneck, those checks should be moved to the class's
         # check() method during optimization so we can still check the object
         # when necessary.
+        old_sha = self.id
         try:
             self._deserialize(self.as_raw_chunks())
+            self._sha = None
+            new_sha = self.id
         except Exception, e:
             raise ObjectFormatException(e)
+        if old_sha != new_sha:
+            raise ChecksumMismatch(new_sha, old_sha)
 
     def _header(self):
         return "%s %lu\0" % (self.type_name, self.raw_length())
@@ -399,8 +408,13 @@ class ShaFile(object):
 
     def sha(self):
         """The SHA1 object that is the name of this object."""
-        if self._needs_serialization or self._sha is None:
-            self._sha = self._make_sha()
+        if self._sha is None:
+            # this is a local because as_raw_chunks() overwrites self._sha
+            new_sha = make_sha()
+            new_sha.update(self._header())
+            for chunk in self.as_raw_chunks():
+                new_sha.update(chunk)
+            self._sha = new_sha
         return self._sha
 
     @property
@@ -483,7 +497,7 @@ class Blob(ShaFile):
 
         :raise ObjectFormatException: if the object is malformed in some way
         """
-        pass  # it's impossible for raw data to be malformed
+        super(Blob, self).check()
 
 
 def _parse_tag_or_commit(text):
@@ -580,7 +594,10 @@ class Tag(ShaFile):
             if field == _OBJECT_HEADER:
                 self._object_sha = value
             elif field == _TYPE_HEADER:
-                self._object_class = object_class(value)
+                obj_class = object_class(value)
+                if not obj_class:
+                    raise ObjectFormatException("Not a known type: %s" % value)
+                self._object_class = obj_class
             elif field == _TAG_HEADER:
                 self._name = value
             elif field == _TAGGER_HEADER:
@@ -593,14 +610,17 @@ class Tag(ShaFile):
                     self._tag_timezone_neg_utc = False
                 else:
                     self._tagger = value[0:sep+1]
-                    (timetext, timezonetext) = value[sep+2:].rsplit(" ", 1)
-                    self._tag_time = int(timetext)
-                    self._tag_timezone, self._tag_timezone_neg_utc = \
-                            parse_timezone(timezonetext)
+                    try:
+                        (timetext, timezonetext) = value[sep+2:].rsplit(" ", 1)
+                        self._tag_time = int(timetext)
+                        self._tag_timezone, self._tag_timezone_neg_utc = \
+                                parse_timezone(timezonetext)
+                    except ValueError, e:
+                        raise ObjectFormatException(e)
             elif field is None:
                 self._message = value
             else:
-                raise AssertionError("Unknown field %s" % field)
+                raise ObjectFormatError("Unknown field %s" % field)
 
     def _get_object(self):
         """Get the object pointed to by this tag.
@@ -746,7 +766,10 @@ class Tree(ShaFile):
 
     def _deserialize(self, chunks):
         """Grab the entries in the tree"""
-        parsed_entries = parse_tree("".join(chunks))
+        try:
+            parsed_entries = parse_tree("".join(chunks))
+        except ValueError, e:
+            raise ObjectFormatException(e)
         # TODO: list comprehension is for efficiency in the common (small) case;
         # if memory efficiency in the large case is a concern, use a genexp.
         self._entries = dict([(n, (m, s)) for n, m, s in parsed_entries])
diff --git a/dulwich/tests/data/blobs/11/11111111111111111111111111111111111111 b/dulwich/tests/data/blobs/11/11111111111111111111111111111111111111
new file mode 100644
index 0000000000000000000000000000000000000000..1942d238fd2e742022b145383e67863d02862c08
GIT binary patch
literal 62
zcmV-E0Kxxw0ZYosPf{?kU<k>`ELJGaNK`LYNXyJg%}dNpRY=KCEmp|OFHy)%EJ@B#
U$Sf&VC`wJt0V(4G02#{@9zW|C+5i9m

literal 0
HcmV?d00001

diff --git a/dulwich/tests/test_objects.py b/dulwich/tests/test_objects.py
index 0d275b1..e568072 100644
--- a/dulwich/tests/test_objects.py
+++ b/dulwich/tests/test_objects.py
@@ -29,6 +29,7 @@ import stat
 import unittest
 
 from dulwich.errors import (
+    ChecksumMismatch,
     ObjectFormatException,
     )
 from dulwich.objects import (
@@ -210,22 +211,27 @@ class BlobReadTests(unittest.TestCase):
         self.assertEqual(c.author_timezone, 0)
         self.assertEqual(c.message, 'Merge ../b\n')
 
+    def test_check_id(self):
+        wrong_sha = '1' * 40
+        b = self.get_blob(wrong_sha)
+        self.assertEqual(wrong_sha, b.id)
+        self.assertRaises(ChecksumMismatch, b.check)
+        self.assertEqual('742b386350576589175e374a5706505cbd17680c', b.id)
+
 
 class ShaFileCheckTests(unittest.TestCase):
 
     def assertCheckFails(self, cls, data):
         obj = cls()
-        obj.set_raw_string(data)
-        self.assertRaises(ObjectFormatException, obj.check)
+        def do_check():
+            obj.set_raw_string(data)
+            obj.check()
+        self.assertRaises(ObjectFormatException, do_check)
 
     def assertCheckSucceeds(self, cls, data):
         obj = cls()
         obj.set_raw_string(data)
-        try:
-            obj.check()
-        except ObjectFormatException, e:
-            raise
-            self.fail(e)
+        self.assertEqual(None, obj.check())
 
 
 class CommitSerializationTests(unittest.TestCase):
-- 
1.7.0.3.295.gd8fa2
References

[PATCH 1/3] Lazily read the contents of ShaFiles from disk.
From: dborowitz, 2010-04-12