← Back to team overview

dulwich-users team mailing list archive

[PATCH 27/28] diff: C implementation of count_blocks.

 

From: Dave Borowitz <dborowitz@xxxxxxxxxx>

Change-Id: I54f903b90533cf821235060aa6f632ea9d4f827a
---
 dulwich/_diff.c            |  161 +++++++++++++++++++++++++++++++++++++++++--
 dulwich/diff.py            |    3 +-
 dulwich/tests/test_diff.py |   31 +++++++--
 3 files changed, 181 insertions(+), 14 deletions(-)

diff --git a/dulwich/_diff.c b/dulwich/_diff.c
index 4838a91..b5c0ac5 100644
--- a/dulwich/_diff.c
+++ b/dulwich/_diff.c
@@ -28,7 +28,9 @@ typedef int Py_ssize_t;
 #define Py_SIZE(x) Py_Size(x)
 #endif
 
-static PyObject *tree_entry_cls, *null_entry;
+/* Module-level references filled in by init_diff(): TreeEntry from
+ * dulwich.objects, _NULL_ENTRY and defaultdict from dulwich.diff, and the
+ * builtin int type. */
+static PyObject *tree_entry_cls = NULL, *null_entry = NULL,
+	*defaultdict_cls = NULL, *int_cls = NULL;
+/* dulwich.diff._BLOCK_SIZE converted to a C int by init_diff(); used by
+ * py_count_blocks() as the maximum length of a block. */
+static int block_size;
 
 /**
  * Free an array of PyObject pointers, decrementing any references.
@@ -266,34 +268,177 @@ static PyObject *py_is_tree(PyObject *self, PyObject *args)
 	return result;
 }
 
+/**
+ * Add n to the count stored under the hash of the n bytes at str.
+ *
+ * get and set are called as get(key) and set(key, new_count), where key is
+ * the Python hash of those bytes wrapped in a Python int.
+ *
+ * Returns 0 on success, or -1 if any of the Python calls failed.
+ */
+static int add_hash(PyObject *get, PyObject *set, char *str, int n) {
+	PyObject *bytes = NULL, *key = NULL, *old_count = NULL,
+		*set_result = NULL;
+	long hash_value;
+	int status = -1;
+
+	/* Hashing the raw buffer directly would avoid this copy, but the API
+	 * only exposes hashing of objects, so build a temporary PyString. */
+	bytes = PyString_FromStringAndSize(str, n);
+	if (bytes == NULL)
+		goto cleanup;
+	hash_value = PyObject_Hash(bytes);
+	if (hash_value == -1)
+		goto cleanup;
+	key = PyInt_FromLong(hash_value);
+	if (key == NULL)
+		goto cleanup;
+
+	old_count = PyObject_CallFunctionObjArgs(get, key, NULL);
+	if (old_count == NULL)
+		goto cleanup;
+	set_result = PyObject_CallFunction(set, "(Ol)", key,
+		PyInt_AS_LONG(old_count) + n);
+	if (set_result != NULL)
+		status = 0;
+
+cleanup:
+	/* Shared by the success and failure paths; unset slots are still NULL. */
+	Py_XDECREF(bytes);
+	Py_XDECREF(key);
+	Py_XDECREF(old_count);
+	Py_XDECREF(set_result);
+	return status;
+}
+
+/**
+ * Python-callable _count_blocks(obj).
+ *
+ * Calls obj.as_raw_chunks(), which must return a list of str, and walks the
+ * bytes of all chunks in order. A block ends at a newline or once it reaches
+ * block_size bytes; blocks may span chunk boundaries. For every block, its
+ * length is added to the entry keyed by the hash of the block's bytes.
+ *
+ * Returns a new defaultdict(int) of {hash: byte count}, or NULL with an
+ * exception set.
+ */
+static PyObject *py_count_blocks(PyObject *self, PyObject *args)
+{
+	PyObject *obj, *chunks = NULL, *chunk, *counts = NULL, *get = NULL,
+		*set = NULL, *result = NULL;
+	char *chunk_str, *block = NULL;
+	Py_ssize_t num_chunks, chunk_len, i, j;
+	int n = 0;
+
+	if (!PyArg_ParseTuple(args, "O", &obj))
+		return NULL;
+	if (block_size <= 0) {
+		/* The loop below stores a byte before testing n == block_size, so
+		 * the buffer must hold at least one byte. */
+		PyErr_SetString(PyExc_ValueError, "_BLOCK_SIZE must be positive");
+		return NULL;
+	}
+
+	counts = PyObject_CallFunctionObjArgs(defaultdict_cls, int_cls, NULL);
+	if (!counts)
+		goto done;
+	/* Look the bound methods up once rather than once per block. */
+	get = PyObject_GetAttrString(counts, "__getitem__");
+	if (!get)
+		goto done;
+	set = PyObject_GetAttrString(counts, "__setitem__");
+	if (!set)
+		goto done;
+
+	chunks = PyObject_CallMethod(obj, "as_raw_chunks", NULL);
+	if (!chunks)
+		goto done;
+	if (!PyList_Check(chunks)) {
+		PyErr_SetString(PyExc_TypeError,
+			"as_raw_chunks() did not return a list");
+		goto done;
+	}
+	num_chunks = PyList_GET_SIZE(chunks);
+	block = PyMem_New(char, block_size);
+	if (!block) {
+		PyErr_SetNone(PyExc_MemoryError);
+		goto done;
+	}
+
+	for (i = 0; i < num_chunks; i++) {
+		chunk = PyList_GET_ITEM(chunks, i);  /* borrowed reference */
+		if (!PyString_Check(chunk)) {
+			PyErr_SetString(PyExc_TypeError, "chunk is not a string");
+			goto done;
+		}
+		if (PyString_AsStringAndSize(chunk, &chunk_str, &chunk_len) == -1)
+			goto done;
+
+		for (j = 0; j < chunk_len; j++) {
+			block[n++] = chunk_str[j];
+			/* A block ends at a newline or when the buffer is full. */
+			if (chunk_str[j] == '\n' || n == block_size) {
+				if (add_hash(get, set, block, n) == -1)
+					goto done;
+				n = 0;
+			}
+		}
+	}
+	/* Flush a trailing block that did not end in a newline. */
+	if (n && add_hash(get, set, block, n) == -1)
+		goto done;
+
+	/* Success: hand ownership of counts to the caller. */
+	result = counts;
+	counts = NULL;
+
+done:
+	Py_XDECREF(chunks);
+	Py_XDECREF(get);
+	Py_XDECREF(set);
+	Py_XDECREF(counts);
+	PyMem_Free(block);
+	return result;
+}
+
+/* Functions exported by the _diff module. Every entry is registered with
+ * METH_VARARGS and a NULL docstring. */
 static PyMethodDef py_diff_methods[] = {
 	{ "_is_tree", (PyCFunction)py_is_tree, METH_VARARGS, NULL },
 	{ "_merge_entries", (PyCFunction)py_merge_entries, METH_VARARGS, NULL },
+	{ "_count_blocks", (PyCFunction)py_count_blocks, METH_VARARGS, NULL },
 	{ NULL, NULL, 0, NULL }
 };
 
 PyMODINIT_FUNC
 init_diff(void)
 {
-	PyObject *m, *objects_mod, *diff_mod;
+	PyObject *m, *objects_mod = NULL, *diff_mod = NULL, *block_size_obj = NULL;
 	m = Py_InitModule("_diff", py_diff_methods);
 	if (!m)
-		return;
+		goto error;
 
 	objects_mod = PyImport_ImportModule("dulwich.objects");
 	if (!objects_mod)
-		return;
+		goto error;
 
 	tree_entry_cls = PyObject_GetAttrString(objects_mod, "TreeEntry");
 	Py_DECREF(objects_mod);
 	if (!tree_entry_cls)
-		return;
+		goto error;
 
 	diff_mod = PyImport_ImportModule("dulwich.diff");
 	if (!diff_mod)
-		return;
+		goto error;
+
 	null_entry = PyObject_GetAttrString(diff_mod, "_NULL_ENTRY");
-	Py_DECREF(diff_mod);
 	if (!null_entry)
-		return;
+		goto error;
+
+	block_size_obj = PyObject_GetAttrString(diff_mod, "_BLOCK_SIZE");
+	if (!block_size_obj)
+		goto error;
+	block_size = (int)PyInt_AsLong(block_size_obj);
+
+	if (PyErr_Occurred())
+		goto error;
+
+	defaultdict_cls = PyObject_GetAttrString(diff_mod, "defaultdict");
+	if (!defaultdict_cls)
+		goto error;
+
+	/* This is kind of hacky, but I don't know of a better way to get the
+	 * PyObject* version of int. */
+	int_cls = PyDict_GetItemString(PyEval_GetBuiltins(), "int");
+	if (!int_cls) {
+		PyErr_SetString(PyExc_NameError, "int");
+		goto error;
+	}
+
+	Py_DECREF(objects_mod);
+	Py_DECREF(diff_mod);
+	return;
+
+error:
+	Py_XDECREF(objects_mod);
+	Py_XDECREF(diff_mod);
+	Py_XDECREF(null_entry);
+	Py_XDECREF(block_size_obj);
+	Py_XDECREF(defaultdict_cls);
+	Py_XDECREF(int_cls);
+	return;
 }
diff --git a/dulwich/diff.py b/dulwich/diff.py
index c474d3a..f88f06a 100644
--- a/dulwich/diff.py
+++ b/dulwich/diff.py
@@ -456,8 +456,9 @@ class RenameDetector(object):
 # Hold on to the pure-python implementations for testing.
 _is_tree_py = _is_tree
 _merge_entries_py = _merge_entries
+_count_blocks_py = _count_blocks
 try:
     # Try to import C versions
-    from dulwich._diff import _is_tree, _merge_entries
+    from dulwich._diff import _is_tree, _merge_entries, _count_blocks
 except ImportError:
+    # The C extension could not be imported: the names above keep referring
+    # to the pure-python implementations.
     pass
diff --git a/dulwich/tests/test_diff.py b/dulwich/tests/test_diff.py
index 4862708..aab2831 100644
--- a/dulwich/tests/test_diff.py
+++ b/dulwich/tests/test_diff.py
@@ -28,6 +28,7 @@ from dulwich.diff import (
     _merge_entries_py,
     tree_changes,
     _count_blocks,
+    _count_blocks_py,
     _similarity_score,
     _tree_change_key,
     RenameDetector,
@@ -291,19 +292,34 @@ class TreeChangesTest(DiffTestCase):
 
 class RenameDetectionTest(DiffTestCase):
 
-    def test_count_blocks(self):
+    def _do_test_count_blocks(self, count_blocks):
+        # Keys are hashes of individual lines; values are the total number of
+        # bytes those lines contribute ('a\n' occurs twice, hence 4).
         blob = make_object(Blob, data='a\nb\na\n')
-        self.assertEqual({hash('a\n'): 4, hash('b\n'): 2}, _count_blocks(blob))
+        self.assertEqual({hash('a\n'): 4, hash('b\n'): 2}, count_blocks(blob))
+
+    test_count_blocks = functest_builder(_do_test_count_blocks,
+                                         _count_blocks_py)
+    test_count_blocks_extension = ext_functest_builder(_do_test_count_blocks,
+                                                       _count_blocks)
 
-    def test_count_blocks_no_newline(self):
+    def _do_test_count_blocks_no_newline(self, count_blocks):
+        # A final block without a trailing newline is counted under its own
+        # hash. Use the implementation passed in by the test builder, not the
+        # module-level _count_blocks, so both variants are really exercised.
         blob = make_object(Blob, data='a\na')
-        self.assertEqual({hash('a\n'): 2, hash('a'): 1}, _count_blocks(blob))
+        self.assertEqual({hash('a\n'): 2, hash('a'): 1}, count_blocks(blob))
 
-    def test_count_blocks_chunks(self):
+    test_count_blocks_no_newline = functest_builder(
+      _do_test_count_blocks_no_newline, _count_blocks_py)
+    test_count_blocks_no_newline_extension = ext_functest_builder(
+       _do_test_count_blocks_no_newline, _count_blocks)
+
+    def _do_test_count_blocks_chunks(self, count_blocks):
+        # A line split across two raw chunks ('b' + '\n') must still be
+        # counted as one block. Use the implementation passed in by the test
+        # builder, not the module-level _count_blocks.
         blob = ShaFile.from_raw_chunks(Blob.type_num, ['a\nb', '\na\n'])
-        self.assertEqual({hash('a\n'): 4, hash('b\n'): 2}, _count_blocks(blob))
+        self.assertEqual({hash('a\n'): 4, hash('b\n'): 2}, count_blocks(blob))
 
-    def test_count_blocks_long_lines(self):
+    test_count_blocks_chunks = functest_builder(_do_test_count_blocks_chunks,
+                                                _count_blocks_py)
+    test_count_blocks_chunks_extension = ext_functest_builder(
+      _do_test_count_blocks_chunks, _count_blocks)
+
+    def _do_test_count_blocks_long_lines(self, count_blocks):
+        # Lines longer than the block size are split, so the tail 'zzz\n' is
+        # counted under its own hash. Use the implementation passed in by the
+        # test builder, not the module-level _count_blocks.
         a = 'a' * 64
         data = a + 'xxx\ny\n' + a + 'zzz\n'
         blob = make_object(Blob, data=data)
@@ -311,6 +327,11 @@ class RenameDetectionTest(DiffTestCase):
                           hash('zzz\n'): 4},
-                         _count_blocks(blob))
+                         count_blocks(blob))
 
+    test_count_blocks_long_lines = functest_builder(
+      _do_test_count_blocks_long_lines, _count_blocks_py)
+    test_count_blocks_long_lines_extension = ext_functest_builder(
+      _do_test_count_blocks_long_lines, _count_blocks)
+
     def assertSimilar(self, expected_score, blob1, blob2):
         self.assertEqual(expected_score, _similarity_score(blob1, blob2))
         self.assertEqual(expected_score, _similarity_score(blob2, blob1))
-- 
1.7.3.2.168.gd6b63




References