← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] lp:~jelmer/launchpad/avoid-revision-history-2 into lp:launchpad

 

Jelmer Vernooij has proposed merging lp:~jelmer/launchpad/avoid-revision-history-2 into lp:launchpad.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)
Related bugs:
  Bug #903703 in Launchpad itself: "uses deprecated Branch.revision_history call"
  https://bugs.launchpad.net/launchpad/+bug/903703

For more details, see:
https://code.launchpad.net/~jelmer/launchpad/avoid-revision-history-2/+merge/85501

Avoid Branch.revision_history and Repository.get_ancestry calls in the Launchpad codebase.

These calls are both deprecated in newer versions of bzr, so their presence blocks Launchpad using newer versions of Bazaar. They also both scale with the size of history (the reason we deprecated them in Bazaar), so there is a good reason to avoid them anway.

Where I removed Branch.revision_history I made sure the new code wouldn't access the entire branch history, but only those parts it needed.

In particular, this should improve the performance of the branch scanner and help towards fixing bug #808930. It should be noted that this fixes the bzr side of things to not scale with the size of history - the scanner still fetches the full branch history when it retrieves data from the database.


-- 
https://code.launchpad.net/~jelmer/launchpad/avoid-revision-history-2/+merge/85501
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~jelmer/launchpad/avoid-revision-history-2 into lp:launchpad.
=== modified file 'lib/lp/code/interfaces/branch.py'
--- lib/lp/code/interfaces/branch.py	2011-10-27 06:51:38 +0000
+++ lib/lp/code/interfaces/branch.py	2011-12-13 14:52:27 +0000
@@ -898,8 +898,7 @@
 
         :return: tuple of three items.
             1. Ancestry set of bzr revision-ids.
-            2. History list of bzr revision-ids. Similar to the result of
-               bzrlib.Branch.revision_history().
+            2. History list of bzr revision-ids, oldest first.
             3. Dictionnary mapping bzr bzr revision-ids to the database ids of
                the corresponding BranchRevision rows for this branch.
         """

=== modified file 'lib/lp/code/model/branchjob.py'
--- lib/lp/code/model/branchjob.py	2011-08-26 00:43:31 +0000
+++ lib/lp/code/model/branchjob.py	2011-12-13 14:52:27 +0000
@@ -486,15 +486,20 @@
     def iterAddedMainline(self):
         """Iterate through revisions added to the mainline."""
         repository = self.bzr_branch.repository
-        added_revisions = repository.get_graph().find_unique_ancestors(
+        graph = repository.get_graph()
+        branch_last_revinfo = self.bzr_branch.last_revision_info()
+        last_revno = graph.find_distance_to_null(
+            self.last_revision_id,
+            [(branch_last_revinfo[1], branch_last_revinfo[0])])
+        added_revisions = graph.find_unique_ancestors(
             self.last_revision_id, [self.last_scanned_id])
         # Avoid hitting the database since bzrlib makes it easy to check.
-        # There are possibly more efficient ways to get the mainline
-        # revisions, but this is simple and it works.
-        history = self.bzr_branch.revision_history()
-        for num, revid in enumerate(history):
-            if revid in added_revisions:
-                yield repository.get_revision(revid), num + 1
+        history = graph.iter_lefthand_ancestry(
+            self.last_revision_id, (NULL_REVISION, None))
+        for distance, revid in enumerate(history):
+            if not revid in added_revisions:
+                break
+            yield revid, last_revno - distance
 
     def generateDiffs(self):
         """Determine whether to generate diffs."""
@@ -515,8 +520,11 @@
 
         self.bzr_branch.lock_read()
         try:
-            for revision, revno in self.iterAddedMainline():
+            for revision_id, revno in reversed(
+                    list(self.iterAddedMainline())):
                 assert revno is not None
+                revision = self.bzr_branch.repository.get_revision(
+                    revision_id)
                 mailer = self.getMailerForRevision(
                     revision, revno, self.generateDiffs())
                 mailer.sendAll()

=== modified file 'lib/lp/code/model/tests/test_branchjob.py'
--- lib/lp/code/model/tests/test_branchjob.py	2011-10-18 13:41:18 +0000
+++ lib/lp/code/model/tests/test_branchjob.py	2011-12-13 14:52:27 +0000
@@ -377,16 +377,13 @@
         job = RevisionsAddedJob.create(branch, 'rev1', 'rev2', '')
         self.assertEqual([job], list(RevisionsAddedJob.iterReady()))
 
-    def updateDBRevisions(self, branch, bzr_branch, revision_ids=None):
+    def updateDBRevisions(self, branch, bzr_branch, revision_ids):
         """Update the database for the revisions.
 
         :param branch: The database branch associated with the revisions.
         :param bzr_branch: The Bazaar branch associated with the revisions.
-        :param revision_ids: The ids of the revisions to update.  If not
-            supplied, the branch revision history is used.
+        :param revision_ids: The ids of the revisions to update.
         """
-        if revision_ids is None:
-            revision_ids = bzr_branch.revision_history()
         for bzr_revision in bzr_branch.repository.get_revisions(revision_ids):
             existing = branch.getBranchRevision(
                 revision_id=bzr_revision.revision_id)
@@ -433,7 +430,7 @@
         job = RevisionsAddedJob.create(branch, 'rev1', 'rev2', '')
         job.bzr_branch.lock_read()
         self.addCleanup(job.bzr_branch.unlock)
-        [(revision, revno)] = list(job.iterAddedMainline())
+        [(revision_id, revno)] = list(job.iterAddedMainline())
         self.assertEqual(2, revno)
 
     def test_iterAddedNonMainline(self):
@@ -447,24 +444,24 @@
         with override_environ(BZR_EMAIL='me@xxxxxxxxxxx'):
             tree.commit('rev3a', rev_id='rev3a')
         self.updateDBRevisions(branch, tree.branch, ['rev3', 'rev3a'])
-        job = RevisionsAddedJob.create(branch, 'rev1', 'rev3', '')
+        job = RevisionsAddedJob.create(branch, 'rev1', 'rev2', '')
         job.bzr_branch.lock_read()
         self.addCleanup(job.bzr_branch.unlock)
-        out = [x.revision_id for x, y in job.iterAddedMainline()]
+        out = [x for x, y in job.iterAddedMainline()]
         self.assertEqual(['rev2'], out)
 
     def test_iterAddedMainline_order(self):
-        """iterAddedMainline iterates in commit order."""
+        """iterAddedMainline iterates in reverse commit order."""
         self.useBzrBranches(direct_database=True)
         branch, tree = self.create3CommitsBranch()
         job = RevisionsAddedJob.create(branch, 'rev1', 'rev3', '')
         job.bzr_branch.lock_read()
         self.addCleanup(job.bzr_branch.unlock)
         # Since we've gone from rev1 to rev3, we've added rev2 and rev3.
-        [(rev2, revno2), (rev3, revno3)] = list(job.iterAddedMainline())
-        self.assertEqual('rev2', rev2.revision_id)
+        [(rev3, revno3), (rev2, revno2)] = list(job.iterAddedMainline())
+        self.assertEqual('rev2', rev2)
         self.assertEqual(2, revno2)
-        self.assertEqual('rev3', rev3.revision_id)
+        self.assertEqual('rev3', rev3)
         self.assertEqual(3, revno3)
 
     def makeBranchWithCommit(self):
@@ -780,7 +777,8 @@
                 timestamp=1000100000.0, timezone=0)
         transaction.commit()
         self.layer.switchDbUser('branchscanner')
-        self.updateDBRevisions(db_branch, tree.branch)
+        self.updateDBRevisions(
+            db_branch, tree.branch, [first_revision, second_revision])
         expected = (
             u"-" * 60 + '\n'
             "revno: 1" '\n'
@@ -824,7 +822,8 @@
                 timezone=0)
         transaction.commit()
         self.layer.switchDbUser('branchscanner')
-        self.updateDBRevisions(db_branch, tree.branch)
+        self.updateDBRevisions(
+            db_branch, tree.branch, [rev_id])
         job = RevisionsAddedJob.create(db_branch, '', '', '')
         message = job.getRevisionMessage(rev_id, 1)
         # The revision message must be a unicode object.

=== modified file 'lib/lp/codehosting/codeimport/tests/test_worker.py'
--- lib/lp/codehosting/codeimport/tests/test_worker.py	2011-12-11 21:02:54 +0000
+++ lib/lp/codehosting/codeimport/tests/test_worker.py	2011-12-13 14:52:27 +0000
@@ -185,7 +185,7 @@
         store = self.makeBranchStore()
         bzr_branch = store.pull(
             self.arbitrary_branch_id, self.temp_dir, default_format)
-        self.assertEqual([], bzr_branch.revision_history())
+        self.assertEqual((0, 'null:'), bzr_branch.last_revision_info())
 
     def test_getNewBranch_without_tree(self):
         # If pull() with needs_tree=False creates a new branch, it doesn't
@@ -657,7 +657,7 @@
         # import.
         worker = self.makeImportWorker()
         bzr_branch = worker.getBazaarBranch()
-        self.assertEqual([], bzr_branch.revision_history())
+        self.assertEqual((0, 'null:'), bzr_branch.last_revision_info())
 
     def test_bazaarBranchLocation(self):
         # getBazaarBranch makes the working tree under the current working
@@ -844,7 +844,7 @@
         worker.run()
         branch = self.getStoredBazaarBranch(worker)
         self.assertEqual(
-            self.foreign_commit_count, len(branch.revision_history()))
+            self.foreign_commit_count, branch.revno())
 
     def test_sync(self):
         # Do an import.
@@ -854,7 +854,7 @@
         worker.run()
         branch = self.getStoredBazaarBranch(worker)
         self.assertEqual(
-            self.foreign_commit_count, len(branch.revision_history()))
+            self.foreign_commit_count, branch.revno())
 
         # Change the remote branch.
         self.makeForeignCommit(worker.source_details)
@@ -865,7 +865,7 @@
         # Check that the new revisions are in the Bazaar branch.
         branch = self.getStoredBazaarBranch(worker)
         self.assertEqual(
-            self.foreign_commit_count, len(branch.revision_history()))
+            self.foreign_commit_count, branch.revno())
 
     def test_import_script(self):
         # Like test_import, but using the code-import-worker.py script
@@ -902,7 +902,7 @@
         branch = Branch.open(branch_url)
 
         self.assertEqual(
-            self.foreign_commit_count, len(branch.revision_history()))
+            self.foreign_commit_count, branch.revno())
 
     def test_script_exit_codes(self):
         # After a successful import that imports revisions, the worker exits
@@ -1381,8 +1381,7 @@
         self.assertEqual(
             CodeImportWorkerExitCode.SUCCESS, worker.run())
         branch = self.getStoredBazaarBranch(worker)
-        self.assertEqual(
-            1, len(branch.revision_history()))
+        self.assertEqual(1, branch.revno())
         self.assertEqual(
             "Some Random Hacker <jane@xxxxxxxxxxx>",
             branch.repository.get_revision(branch.last_revision()).committer)

=== modified file 'lib/lp/codehosting/codeimport/tests/test_workermonitor.py'
--- lib/lp/codehosting/codeimport/tests/test_workermonitor.py	2011-12-10 23:19:26 +0000
+++ lib/lp/codehosting/codeimport/tests/test_workermonitor.py	2011-12-13 14:52:27 +0000
@@ -803,8 +803,7 @@
         url = get_default_bazaar_branch_store()._getMirrorURL(
             code_import.branch.id)
         branch = Branch.open(url)
-        self.assertEqual(
-            self.foreign_commit_count, len(branch.revision_history()))
+        self.assertEqual(self.foreign_commit_count, branch.revno())
 
     def assertImported(self, ignored, code_import_id):
         """Assert that the `CodeImport` of the given id was imported."""

=== modified file 'lib/lp/codehosting/scanner/bzrsync.py'
--- lib/lp/codehosting/scanner/bzrsync.py	2011-06-14 18:36:07 +0000
+++ lib/lp/codehosting/scanner/bzrsync.py	2011-12-13 14:52:27 +0000
@@ -85,7 +85,7 @@
         # Get the history and ancestry from the branch first, to fail early
         # if something is wrong with the branch.
         self.logger.info("Retrieving history from bzrlib.")
-        bzr_history = bzr_branch.revision_history()
+        last_revision_info = bzr_branch.last_revision_info()
         # The BranchRevision, Revision and RevisionParent tables are only
         # written to by the branch-scanner, so they are not subject to
         # write-lock contention. Update them all in a single transaction to
@@ -94,7 +94,7 @@
 
         (new_ancestry, branchrevisions_to_delete,
             revids_to_insert) = self.planDatabaseChanges(
-            bzr_branch, bzr_history, db_ancestry, db_history)
+            bzr_branch, last_revision_info, db_ancestry, db_history)
         new_db_revs = (
             new_ancestry - getUtility(IRevisionSet).onlyPresent(new_ancestry))
         self.logger.info("Adding %s new revisions.", len(new_db_revs))
@@ -126,7 +126,7 @@
         # not been updated. Since this has no ill-effect, and can only err on
         # the pessimistic side (tell the user the data has not yet been
         # updated although it has), the race is acceptable.
-        self.updateBranchStatus(bzr_history)
+        self.updateBranchStatus(last_revision_info)
         notify(
             events.ScanCompleted(
                 self.db_branch, bzr_branch, self.logger, new_ancestry))
@@ -156,41 +156,40 @@
 
         return bzr_branch.repository.get_graph(PPSource)
 
-    def getAncestryDelta(self, bzr_branch):
-        bzr_last = bzr_branch.last_revision()
+    def getAncestryDelta(self, bzr_branch, bzr_last_revinfo):
+        bzr_last = bzr_last_revinfo[1]
         db_last = self.db_branch.last_scanned_id
         if db_last is None:
-            added_ancestry = set(bzr_branch.repository.get_ancestry(bzr_last))
-            added_ancestry.discard(None)
-            removed_ancestry = set()
-        else:
-            graph = self._getRevisionGraph(bzr_branch, db_last)
+            db_last = NULL_REVISION
+        graph = self._getRevisionGraph(bzr_branch, db_last)
+        bzr_branch.lock_read()
+        try:
             added_ancestry, removed_ancestry = (
                 graph.find_difference(bzr_last, db_last))
-            added_ancestry.discard(NULL_REVISION)
+        finally:
+            bzr_branch.unlock()
+        added_ancestry.discard(NULL_REVISION)
         return added_ancestry, removed_ancestry
 
-    def getHistoryDelta(self, bzr_history, db_history):
+    def getHistoryDelta(self, bzr_branch, bzr_last_revinfo, db_history):
         self.logger.info("Calculating history delta.")
-        common_len = min(len(bzr_history), len(db_history))
+        common_len = min(bzr_last_revinfo[0], len(db_history))
+        common_revid = NULL_REVISION
         while common_len > 0:
-            # The outer conditional improves efficiency. Without it, the
-            # algorithm is O(history-size * change-size), which can be
-            # excessive if a long branch is replaced by another long branch
-            # with a distant (or no) common mainline parent. The inner
-            # conditional is needed for correctness with branches where the
-            # history does not follow the line of leftmost parents.
-            if db_history[common_len - 1] == bzr_history[common_len - 1]:
-                if db_history[:common_len] == bzr_history[:common_len]:
-                    break
+            if db_history[common_len - 1] == bzr_branch.get_rev_id(common_len - 1):
+                common_revid = db_history[common_len - 1]
+                break
             common_len -= 1
         # Revision added or removed from the branch's history. These lists may
         # include revisions whose history position has merely changed.
         removed_history = db_history[common_len:]
-        added_history = bzr_history[common_len:]
+        bzr_graph = bzr_branch.repository.get_graph()
+        added_history = list(bzr_graph.iter_lefthand_ancestry(bzr_last_revinfo[1],
+            (common_revid, )))
+        added_history.reverse()
         return added_history, removed_history
 
-    def planDatabaseChanges(self, bzr_branch, bzr_history, db_ancestry,
+    def planDatabaseChanges(self, bzr_branch, bzr_last_revinfo, db_ancestry,
                             db_history):
         """Plan database changes to synchronize with bzrlib data.
 
@@ -200,8 +199,9 @@
         self.logger.info("Planning changes.")
         # Find the length of the common history.
         added_history, removed_history = self.getHistoryDelta(
-            bzr_history, db_history)
-        added_ancestry, removed_ancestry = self.getAncestryDelta(bzr_branch)
+            bzr_branch, bzr_last_revinfo, db_history)
+        added_ancestry, removed_ancestry = self.getAncestryDelta(
+            bzr_branch, bzr_last_revinfo)
 
         notify(
             events.RevisionsRemoved(
@@ -216,10 +216,9 @@
 
         # We must insert BranchRevision rows for all revisions which were
         # added to the ancestry or whose sequence value has changed.
-        last_revno = len(bzr_history)
         revids_to_insert = dict(
             self.revisionsToInsert(
-                added_history, last_revno, added_ancestry))
+                added_history, bzr_last_revinfo[0], added_ancestry))
         # We must remove any stray BranchRevisions that happen to already be
         # present.
         existing_branchrevisions = Store.of(self.db_branch).find(
@@ -297,12 +296,10 @@
         for revid_seq_pair_chunk in iter_list_chunks(revid_seq_pairs, 1000):
             self.db_branch.createBranchRevisionFromIDs(revid_seq_pair_chunk)
 
-    def updateBranchStatus(self, bzr_history):
+    def updateBranchStatus(self, (revision_count, last_revision)):
         """Update the branch-scanner status in the database Branch table."""
         # Record that the branch has been updated.
-        revision_count = len(bzr_history)
         if revision_count > 0:
-            last_revision = bzr_history[-1]
             revision = getUtility(IRevisionSet).getByRevisionId(last_revision)
         else:
             revision = None

=== modified file 'lib/lp/codehosting/scanner/tests/test_bzrsync.py'
--- lib/lp/codehosting/scanner/tests/test_bzrsync.py	2011-09-26 07:21:53 +0000
+++ lib/lp/codehosting/scanner/tests/test_bzrsync.py	2011-12-13 14:52:27 +0000
@@ -429,7 +429,8 @@
             else:
                 bzr_branch.set_last_revision_info(revno, bzr_rev)
                 delta_branch = bzr_branch
-            return sync.getAncestryDelta(delta_branch)
+            return sync.getAncestryDelta(
+                delta_branch, delta_branch.last_revision_info())
 
         added_ancestry, removed_ancestry = get_delta('merge', None)
         # All revisions are new for an unscanned branch
@@ -472,8 +473,9 @@
         # yield each revision along with a sequence number, starting at 1.
         self.commitRevision(rev_id='rev-1')
         bzrsync = self.makeBzrSync(self.db_branch)
-        bzr_history = self.bzr_branch.revision_history()
-        added_ancestry = bzrsync.getAncestryDelta(self.bzr_branch)[0]
+        bzr_history = ['rev-1']
+        added_ancestry = bzrsync.getAncestryDelta(
+            self.bzr_branch, self.bzr_branch.last_revision_info())[0]
         result = bzrsync.revisionsToInsert(
             bzr_history, self.bzr_branch.revno(), added_ancestry)
         self.assertEqual({'rev-1': 1}, dict(result))
@@ -484,8 +486,9 @@
         (db_branch, bzr_tree), ignored = self.makeBranchWithMerge(
             'base', 'trunk', 'branch', 'merge')
         bzrsync = self.makeBzrSync(db_branch)
-        bzr_history = bzr_tree.branch.revision_history()
-        added_ancestry = bzrsync.getAncestryDelta(bzr_tree.branch)[0]
+        bzr_history = ['base', 'trunk', 'merge']
+        added_ancestry = bzrsync.getAncestryDelta(
+            bzr_tree.branch, bzr_tree.branch.last_revision_info())[0]
         expected = {'base': 1, 'trunk': 2, 'merge': 3, 'branch': None}
         self.assertEqual(
             expected, dict(bzrsync.revisionsToInsert(bzr_history,
@@ -579,7 +582,7 @@
         self.db_branch.last_scanned_id = rev1_id
         db_ancestry, db_history = self.db_branch.getScannerData()
         branchrevisions_to_delete = syncer.planDatabaseChanges(
-            self.bzr_branch, [rev1_id, rev2_id], db_ancestry, db_history)[1]
+            self.bzr_branch, (2, rev2_id), db_ancestry, db_history)[1]
         self.assertIn(merge_id, branchrevisions_to_delete)
 
 

=== modified file 'lib/lp/testing/factory.py'
--- lib/lp/testing/factory.py	2011-12-12 04:47:21 +0000
+++ lib/lp/testing/factory.py	2011-12-13 14:52:27 +0000
@@ -1564,7 +1564,7 @@
             source_branch = merge_proposal.source_branch
 
         def make_revision(parent=None):
-            sequence = source_branch.revision_history.count() + 1
+            sequence = source_branch.revno() + 1
             if parent is None:
                 parent_ids = []
             else:


Follow ups