← Back to team overview

dulwich-users team mailing list archive

[PATCH 02/28] Add tree-diffing functionality in a 'diff' module.

 

From: Dave Borowitz <dborowitz@xxxxxxxxxx>

This is a rewrite of some of the functionality in
BaseObjectStore.iter_tree_contents. It mainly differs in being more
structured and providing the same depth-first pre-order traversal as in
BaseObjectStore.iter_tree_contents.

Change-Id: I6c1504ebf508c485259072b9033c644123c12752
---
 dulwich/diff.py            |  164 +++++++++++++++++++++++++++++++
 dulwich/misc.py            |   41 ++++++++
 dulwich/tests/test_diff.py |  230 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 435 insertions(+), 0 deletions(-)
 create mode 100644 dulwich/diff.py
 create mode 100644 dulwich/tests/test_diff.py

diff --git a/dulwich/diff.py b/dulwich/diff.py
new file mode 100644
index 0000000..5c86fa1
--- /dev/null
+++ b/dulwich/diff.py
@@ -0,0 +1,164 @@
+# diff.py -- Utilities for diffing files and trees.
+# Copyright (C) 2010 Google, Inc.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# or (at your option) a later version of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA  02110-1301, USA.
+
+"""Utilities for diffing files and trees."""
+
+import stat
+
+from dulwich.misc import (
+    TreeChangeTuple,
+    )
+from dulwich.objects import (
+    TreeEntry,
+    )
+
+# TreeChange type constants.
+CHANGE_ADD = 'add'
+CHANGE_MODIFY = 'modify'
+CHANGE_DELETE = 'delete'
+CHANGE_RENAME = 'rename'
+CHANGE_COPY = 'copy'
+CHANGE_UNCHANGED = 'unchanged'
+
+_NULL_ENTRY = TreeEntry(None, None, None)
+
+
+class TreeChange(TreeChangeTuple):
+    """Class encapsulating a single change between two trees."""
+
+
+def _tree_entries(path, tree):
+    result = []
+    if not tree:
+        return result
+    for entry in tree.iteritems(name_order=True):
+        result.append(entry.in_path(path))
+    return result
+
+
+def _merge_entries(path, tree1, tree2):
+    """Merge the entries of two trees.
+
+    :param path: A path to prepend to all tree entry names.
+    :param tree1: The first Tree object to iterate, or None.
+    :param tree2: The second Tree object to iterate, or None.
+    :return: A list of pairs of TreeEntry objects for each pair of entries in
+        the trees. If an entry exists in one tree but not the other, the other
+        entry will have all attributes set to None. If neither entry's path is
+        None, they are guaranteed to match.
+    """
+    entries1 = _tree_entries(path, tree1)
+    entries2 = _tree_entries(path, tree2)
+    i1 = i2 = 0
+    len1 = len(entries1)
+    len2 = len(entries2)
+
+    result = []
+    while i1 < len1 and i2 < len2:
+        entry1 = entries1[i1]
+        entry2 = entries2[i2]
+        if entry1.path < entry2.path:
+            result.append((entry1, _NULL_ENTRY))
+            i1 += 1
+        elif entry1.path > entry2.path:
+            result.append((_NULL_ENTRY, entry2))
+            i2 += 1
+        else:
+            result.append((entry1, entry2))
+            i1 += 1
+            i2 += 1
+    for i in xrange(i1, len1):
+        result.append((entries1[i], _NULL_ENTRY))
+    for i in xrange(i2, len2):
+        result.append((_NULL_ENTRY, entries2[i]))
+    return result
+
+
+def walk_trees(store, tree1_id, tree2_id):
+    """Recursively walk all the entries of two trees.
+
+    Iteration is depth-first pre-order, as in e.g. os.walk.
+
+    :param store: An ObjectStore for looking up objects.
+    :param tree1_id: The SHA of the first Tree object to iterate, or None.
+    :param tree2_id: The SHA of the second Tree object to iterate, or None.
+    :yield: Pairs of TreeEntry objects for each pair of entries in the trees and
+        their subtrees recursively. If an entry exists in one tree but not the
+        other, the other entry will have all attributes set to None. If neither
+        entry's path is None, they are guaranteed to match.
+    """
+    # This could be fairly easily generalized to >2 trees if we find a use case.
+    mode1 = tree1_id and stat.S_IFDIR or None
+    mode2 = tree2_id and stat.S_IFDIR or None
+    todo = [(TreeEntry('', mode1, tree1_id), TreeEntry('', mode2, tree2_id))]
+    while todo:
+        entry1, entry2 = todo.pop()
+        is_tree1 = entry1.mode and stat.S_ISDIR(entry1.mode)
+        is_tree2 = entry2.mode and stat.S_ISDIR(entry2.mode)
+
+        tree1 = is_tree1 and store[entry1.sha] or None
+        tree2 = is_tree2 and store[entry2.sha] or None
+        path = entry1.path or entry2.path
+        todo.extend(reversed(_merge_entries(path, tree1, tree2)))
+        yield entry1, entry2
+
+
+def _skip_tree(entry):
+    if entry.mode is None or stat.S_ISDIR(entry.mode):
+        return _NULL_ENTRY
+    return entry
+
+
+def tree_changes(store, tree1_id, tree2_id, want_unchanged=False):
+    """Find the differences between the contents of two trees.
+
+    :param store: An ObjectStore for looking up objects.
+    :param tree1_id: The SHA of the source tree.
+    :param tree2_id: The SHA of the target tree.
+    :param want_unchanged: If True, include TreeChanges for unmodified entries
+        as well.
+    :yield: TreeChange instances for each change between the source and target
+        tree.
+    """
+    entries = walk_trees(store, tree1_id, tree2_id)
+    for entry1, entry2 in entries:
+        if entry1 == entry2 and not want_unchanged:
+            continue
+
+        # Treat entries for trees as missing.
+        entry1 = _skip_tree(entry1)
+        entry2 = _skip_tree(entry2)
+
+        if entry1 != _NULL_ENTRY and entry2 != _NULL_ENTRY:
+            if stat.S_IFMT(entry1.mode) != stat.S_IFMT(entry2.mode):
+                # File type changed: report as delete/add.
+                yield TreeChange(CHANGE_DELETE, entry1, _NULL_ENTRY)
+                entry1 = _NULL_ENTRY
+                change_type = CHANGE_ADD
+            elif entry1 == entry2:
+                change_type = CHANGE_UNCHANGED
+            else:
+                change_type = CHANGE_MODIFY
+        elif entry1 != _NULL_ENTRY:
+            change_type = CHANGE_DELETE
+        elif entry2 != _NULL_ENTRY:
+            change_type = CHANGE_ADD
+        else:
+            # Both were None because at least one was a tree.
+            continue
+        yield TreeChange(change_type, entry1, entry2)
diff --git a/dulwich/misc.py b/dulwich/misc.py
index 7c74751..c7f00c0 100644
--- a/dulwich/misc.py
+++ b/dulwich/misc.py
@@ -105,6 +105,7 @@ try:
     from collections import namedtuple
 
     TreeEntryTuple = namedtuple('TreeEntryTuple', ['path', 'mode', 'sha'])
+    TreeChangeTuple = namedtuple('TreeChangeTuple', ['type', 'old', 'new'])
 except ImportError:
     # Provide manual implementations of namedtuples for Python <2.5.
     # If the class definitions change, be sure to keep these in sync by running
@@ -153,3 +154,43 @@ except ImportError:
             path = _property(_itemgetter(0))
             mode = _property(_itemgetter(1))
             sha = _property(_itemgetter(2))
+
+
+    class TreeChangeTuple(tuple):
+            'TreeChangeTuple(type, old, new)'
+
+            __slots__ = ()
+
+            _fields = ('type', 'old', 'new')
+
+            def __new__(_cls, type, old, new):
+                return _tuple.__new__(_cls, (type, old, new))
+
+            @classmethod
+            def _make(cls, iterable, new=tuple.__new__, len=len):
+                'Make a new TreeChangeTuple object from a sequence or iterable'
+                result = new(cls, iterable)
+                if len(result) != 3:
+                    raise TypeError('Expected 3 arguments, got %d' % len(result))
+                return result
+
+            def __repr__(self):
+                return 'TreeChangeTuple(type=%r, old=%r, new=%r)' % self
+
+            def _asdict(t):
+                'Return a new dict which maps field names to their values'
+                return {'type': t[0], 'old': t[1], 'new': t[2]}
+
+            def _replace(_self, **kwds):
+                'Return a new TreeChangeTuple object replacing specified fields with new values'
+                result = _self._make(map(kwds.pop, ('type', 'old', 'new'), _self))
+                if kwds:
+                    raise ValueError('Got unexpected field names: %r' % kwds.keys())
+                return result
+
+            def __getnewargs__(self):
+                return tuple(self)
+
+            type = _property(_itemgetter(0))
+            old = _property(_itemgetter(1))
+            new = _property(_itemgetter(2))
diff --git a/dulwich/tests/test_diff.py b/dulwich/tests/test_diff.py
new file mode 100644
index 0000000..e17ce49
--- /dev/null
+++ b/dulwich/tests/test_diff.py
@@ -0,0 +1,230 @@
+# diff.py -- Utilities for diffing files and trees.
+# Copyright (C) 2010 Google, Inc.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# or (at your option) a later version of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA  02110-1301, USA.
+
+"""Tests for diff utilities."""
+
+from dulwich.diff import (
+    CHANGE_ADD,
+    CHANGE_MODIFY,
+    CHANGE_DELETE,
+    CHANGE_UNCHANGED,
+    _NULL_ENTRY,
+    TreeChange,
+    _merge_entries,
+    tree_changes,
+    )
+from dulwich.index import (
+    commit_tree,
+    )
+from dulwich.object_store import (
+    MemoryObjectStore,
+    )
+from dulwich.objects import (
+    Blob,
+    )
+from dulwich.tests import (
+    TestCase,
+    )
+from dulwich.tests.utils import (
+    make_object,
+    )
+
+
+class TreeChangesTest(TestCase):
+
+    def setUp(self):
+        self.store = MemoryObjectStore()
+        self.empty_tree = self.commit_tree([])
+
+    def commit_tree(self, blobs):
+        commit_blobs = []
+        for path, blob, mode in blobs:
+            self.store.add_object(blob)
+            commit_blobs.append((path, blob.id, mode))
+        return self.store[commit_tree(self.store, commit_blobs)]
+
+    def test_merge_entries(self):
+        blob_a1 = make_object(Blob, data='a1')
+        blob_a2 = make_object(Blob, data='a2')
+        blob_b1 = make_object(Blob, data='b1')
+        blob_c2 = make_object(Blob, data='c2')
+        tree1 = self.commit_tree([('a', blob_a1, 0100644),
+                                  ('b', blob_b1, 0100755)])
+        tree2 = self.commit_tree([('a', blob_a2, 0100644),
+                                  ('c', blob_c2, 0100755)])
+
+        self.assertEqual([], _merge_entries('', self.empty_tree,
+                                            self.empty_tree))
+        self.assertEqual([
+          ((None, None, None), ('a', 0100644, blob_a1.id)),
+          ((None, None, None), ('b', 0100755, blob_b1.id)),
+          ], _merge_entries('', self.empty_tree, tree1))
+        self.assertEqual([
+          ((None, None, None), ('x/a', 0100644, blob_a1.id)),
+          ((None, None, None), ('x/b', 0100755, blob_b1.id)),
+          ], _merge_entries('x', self.empty_tree, tree1))
+
+        self.assertEqual([
+          (('a', 0100644, blob_a2.id), (None, None, None)),
+          (('c', 0100755, blob_c2.id), (None, None, None)),
+          ], _merge_entries('', tree2, self.empty_tree))
+
+        self.assertEqual([
+          (('a', 0100644, blob_a1.id), ('a', 0100644, blob_a2.id)),
+          (('b', 0100755, blob_b1.id), (None, None, None)),
+          ((None, None, None), ('c', 0100755, blob_c2.id)),
+          ], _merge_entries('', tree1, tree2))
+
+        self.assertEqual([
+          (('a', 0100644, blob_a2.id), ('a', 0100644, blob_a1.id)),
+          ((None, None, None), ('b', 0100755, blob_b1.id)),
+          (('c', 0100755, blob_c2.id), (None, None, None)),
+          ], _merge_entries('', tree2, tree1))
+
+    def assertChangesEqual(self, expected, tree1, tree2, **kwargs):
+        actual = list(tree_changes(self.store, tree1.id, tree2.id, **kwargs))
+        self.assertEqual(expected, actual)
+
+    # For brevity, the following tests use tuples instead of TreeEntry objects.
+
+    def test_tree_changes_empty(self):
+        self.assertChangesEqual([], self.empty_tree, self.empty_tree)
+
+    def test_tree_changes_no_changes(self):
+        blob = make_object(Blob, data='blob')
+        tree = self.commit_tree([('a', blob, 0100644),
+                                 ('b/c', blob, 0100644)])
+        self.assertChangesEqual([], self.empty_tree, self.empty_tree)
+        self.assertChangesEqual([], tree, tree)
+        self.assertChangesEqual(
+          [TreeChange(CHANGE_UNCHANGED, ('a', 0100644, blob.id),
+                      ('a', 0100644, blob.id)),
+           TreeChange(CHANGE_UNCHANGED, ('b/c', 0100644, blob.id),
+                      ('b/c', 0100644, blob.id))],
+          tree, tree, want_unchanged=True)
+
+    def test_tree_changes_add_delete(self):
+        blob_a = make_object(Blob, data='a')
+        blob_b = make_object(Blob, data='b')
+        tree = self.commit_tree([('a', blob_a, 0100644),
+                                 ('x/b', blob_b, 0100755)])
+        self.assertChangesEqual(
+          [TreeChange(CHANGE_ADD, _NULL_ENTRY, ('a', 0100644, blob_a.id)),
+           TreeChange(CHANGE_ADD, _NULL_ENTRY, ('x/b', 0100755, blob_b.id))],
+          self.empty_tree, tree)
+        self.assertChangesEqual(
+          [TreeChange(CHANGE_DELETE, ('a', 0100644, blob_a.id), _NULL_ENTRY),
+           TreeChange(CHANGE_DELETE, ('x/b', 0100755, blob_b.id), _NULL_ENTRY)],
+          tree, self.empty_tree)
+
+    def test_tree_changes_modify_contents(self):
+        blob_a1 = make_object(Blob, data='a1')
+        blob_a2 = make_object(Blob, data='a2')
+        tree1 = self.commit_tree([('a', blob_a1, 0100644)])
+        tree2 = self.commit_tree([('a', blob_a2, 0100644)])
+        self.assertChangesEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', 0100644, blob_a1.id),
+                      ('a', 0100644, blob_a2.id))], tree1, tree2)
+
+    def test_tree_changes_modify_mode(self):
+        blob_a = make_object(Blob, data='a')
+        tree1 = self.commit_tree([('a', blob_a, 0100644)])
+        tree2 = self.commit_tree([('a', blob_a, 0100755)])
+        self.assertChangesEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', 0100644, blob_a.id),
+                      ('a', 0100755, blob_a.id))], tree1, tree2)
+
+    def test_tree_changes_change_type(self):
+        blob_a1 = make_object(Blob, data='a')
+        blob_a2 = make_object(Blob, data='/foo/bar')
+        tree1 = self.commit_tree([('a', blob_a1, 0100644)])
+        tree2 = self.commit_tree([('a', blob_a2, 0120000)])
+        self.assertChangesEqual(
+          [TreeChange(CHANGE_DELETE, ('a', 0100644, blob_a1.id), _NULL_ENTRY),
+           TreeChange(CHANGE_ADD, _NULL_ENTRY, ('a', 0120000, blob_a2.id))],
+          tree1, tree2)
+
+    def test_tree_changes_to_tree(self):
+        blob_a = make_object(Blob, data='a')
+        blob_x = make_object(Blob, data='x')
+        tree1 = self.commit_tree([('a', blob_a, 0100644)])
+        tree2 = self.commit_tree([('a/x', blob_x, 0100644)])
+        self.assertChangesEqual(
+          [TreeChange(CHANGE_DELETE, ('a', 0100644, blob_a.id), _NULL_ENTRY),
+           TreeChange(CHANGE_ADD, _NULL_ENTRY, ('a/x', 0100644, blob_x.id))],
+          tree1, tree2)
+
+    def test_tree_changes_complex(self):
+        blob_a_1 = make_object(Blob, data='a1_1')
+        blob_bx1_1 = make_object(Blob, data='bx1_1')
+        blob_bx2_1 = make_object(Blob, data='bx2_1')
+        blob_by1_1 = make_object(Blob, data='by1_1')
+        blob_by2_1 = make_object(Blob, data='by2_1')
+        tree1 = self.commit_tree([
+          ('a', blob_a_1, 0100644),
+          ('b/x/1', blob_bx1_1, 0100644),
+          ('b/x/2', blob_bx2_1, 0100644),
+          ('b/y/1', blob_by1_1, 0100644),
+          ('b/y/2', blob_by2_1, 0100644),
+          ])
+
+        blob_a_2 = make_object(Blob, data='a1_2')
+        blob_bx1_2 = blob_bx1_1
+        blob_by_2 = make_object(Blob, data='by_2')
+        blob_c_2 = make_object(Blob, data='c_2')
+        tree2 = self.commit_tree([
+          ('a', blob_a_2, 0100644),
+          ('b/x/1', blob_bx1_2, 0100644),
+          ('b/y', blob_by_2, 0100644),
+          ('c', blob_c_2, 0100644),
+          ])
+
+        self.assertChangesEqual(
+          [TreeChange(CHANGE_MODIFY, ('a', 0100644, blob_a_1.id),
+                      ('a', 0100644, blob_a_2.id)),
+           TreeChange(CHANGE_DELETE, ('b/x/2', 0100644, blob_bx2_1.id),
+                      _NULL_ENTRY),
+           TreeChange(CHANGE_ADD, _NULL_ENTRY, ('b/y', 0100644, blob_by_2.id)),
+           TreeChange(CHANGE_DELETE, ('b/y/1', 0100644, blob_by1_1.id),
+                      _NULL_ENTRY),
+           TreeChange(CHANGE_DELETE, ('b/y/2', 0100644, blob_by2_1.id),
+                      _NULL_ENTRY),
+           TreeChange(CHANGE_ADD, _NULL_ENTRY, ('c', 0100644, blob_c_2.id))],
+          tree1, tree2)
+
+    def test_tree_changes_name_order(self):
+        blob = make_object(Blob, data='a')
+        tree1 = self.commit_tree([
+          ('a', blob, 0100644),
+          ('a.', blob, 0100644),
+          ('a..', blob, 0100644),
+          ])
+        # Tree order is the reverse of this, so if we used tree order, 'a..'
+        # would not be merged.
+        tree2 = self.commit_tree([
+          ('a/x', blob, 0100644),
+          ('a./x', blob, 0100644),
+          ('a..', blob, 0100644),
+          ])
+
+        self.assertChangesEqual(
+          [TreeChange(CHANGE_DELETE, ('a', 0100644, blob.id), _NULL_ENTRY),
+           TreeChange(CHANGE_ADD, _NULL_ENTRY, ('a/x', 0100644, blob.id)),
+           TreeChange(CHANGE_DELETE, ('a.', 0100644, blob.id), _NULL_ENTRY),
+           TreeChange(CHANGE_ADD, _NULL_ENTRY, ('a./x', 0100644, blob.id))],
+          tree1, tree2)
-- 
1.7.3.2.168.gd6b63




Follow ups

References