← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] lp:~wgrant/launchpad/commit-ALL-the-build-changes into lp:launchpad

 

William Grant has proposed merging lp:~wgrant/launchpad/commit-ALL-the-build-changes into lp:launchpad.

Commit message:
Fix all BuildFarmJobBehaviors to commit any pending database changes before yielding to the reactor. Otherwise an intermediate scan may fail and cause the changes to be aborted.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)
Related bugs:
  Bug #717969 in Launchpad itself: "storeBuildInfo is sometimes ineffective"
  https://bugs.launchpad.net/launchpad/+bug/717969

For more details, see:
https://code.launchpad.net/~wgrant/launchpad/commit-ALL-the-build-changes/+merge/144440

This branch hopefully fixes bug #717969 and various other build inconsistencies that we see from buildd-manager.

buildd-manager is thoroughly Twisted and interacts with the DB. All the main operations occur in a single thread, so there's a single current transaction shared across all of the builders' scanners. Because all the build handling code was ported from the pre-Twisted buildd-manager/sequencer, it wasn't designed with this transaction sharing in mind. If a BuildFarmJobBehavior yielded back to the reactor before committing, it was possible that a scan from another builder would fail and abort the transaction before the original scan resumed to perform the commit.

Since I moved all the Twisted code into BFJB and derivatives last week, it's now relatively easy to ensure that a commit precedes each yield to the reactor. So I did that manually, inserting a transaction.commit() between any DB change and a call that might yield.
-- 
https://code.launchpad.net/~wgrant/launchpad/commit-ALL-the-build-changes/+merge/144440
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~wgrant/launchpad/commit-ALL-the-build-changes into lp:launchpad.
=== modified file 'lib/lp/buildmaster/model/buildfarmjobbehavior.py'
--- lib/lp/buildmaster/model/buildfarmjobbehavior.py	2013-01-22 07:24:05 +0000
+++ lib/lp/buildmaster/model/buildfarmjobbehavior.py	2013-01-23 06:50:26 +0000
@@ -16,7 +16,7 @@
 import socket
 import xmlrpclib
 
-from storm.store import Store
+import transaction
 from twisted.internet import defer
 from zope.component import getUtility
 from zope.interface import implements
@@ -103,6 +103,7 @@
         lfa_id = yield self.getLogFromSlave(
             self.build, build_queue or self.build.buildqueue_record)
         self.build.setLog(lfa_id)
+        transaction.commit()
 
     def updateBuild(self, queueItem):
         """See `IBuildFarmJobBehavior`."""
@@ -139,6 +140,7 @@
                 # known.
                 queueItem._builder = None
                 queueItem.setDateStarted(None)
+                transaction.commit()
                 return
 
             # Since logtail is a xmlrpclib.Binary container and it is
@@ -165,12 +167,14 @@
             "Builder %s forgot about buildqueue %d -- resetting buildqueue "
             "record" % (queueItem.builder.url, queueItem.id))
         queueItem.reset()
+        transaction.commit()
 
     def updateBuild_BUILDING(self, queueItem, slave_status, logtail, logger):
         """Build still building, collect the logtail"""
         if queueItem.job.status != JobStatus.RUNNING:
             queueItem.job.start()
         queueItem.logtail = encoding.guess(str(logtail))
+        transaction.commit()
 
     def updateBuild_ABORTING(self, queueItem, slave_status, logtail, logger):
         """Build was ABORTED.
@@ -178,6 +182,7 @@
         Master-side should wait until the slave finish the process correctly.
         """
         queueItem.logtail = "Waiting for slave process to be terminated"
+        transaction.commit()
 
     def updateBuild_ABORTED(self, queueItem, slave_status, logtail, logger):
         """ABORTING process has successfully terminated.
@@ -191,6 +196,7 @@
             if queueItem.job.status != JobStatus.FAILED:
                 queueItem.job.fail()
             queueItem.specific_job.jobAborted()
+            transaction.commit()
         return d.addCallback(got_cleaned)
 
     def extractBuildStatus(self, slave_status):
@@ -269,8 +275,8 @@
         if build.build_farm_job_type == BuildFarmJobType.PACKAGEBUILD:
             build = build.buildqueue_record.specific_job.build
             if not build.current_source_publication:
+                yield self.build.buildqueue_record.builder.cleanSlave()
                 build.updateStatus(BuildStatus.SUPERSEDED)
-                yield self.build.buildqueue_record.builder.cleanSlave()
                 self.build.buildqueue_record.destroySelf()
                 return
 
@@ -324,6 +330,8 @@
         build.updateStatus(
             status, builder=build.buildqueue_record.builder,
             slave_status=slave_status)
+        transaction.commit()
+
         yield self.storeLogFromSlave()
 
         # We only attempt the upload if we successfully copied all the
@@ -346,10 +354,7 @@
 
         yield self.build.buildqueue_record.builder.cleanSlave()
         self.build.buildqueue_record.destroySelf()
-
-        # Commit so there are no race conditions with archiveuploader
-        # about build.status.
-        Store.of(build).commit()
+        transaction.commit()
 
         # Move the directory used to grab the binaries into
         # the incoming directory so the upload processor never
@@ -369,11 +374,13 @@
         self.build.updateStatus(
             status, builder=self.build.buildqueue_record.builder,
             slave_status=slave_status)
+        transaction.commit()
         yield self.storeLogFromSlave()
         if send_notification:
             self.build.notify()
         yield self.build.buildqueue_record.builder.cleanSlave()
         self.build.buildqueue_record.destroySelf()
+        transaction.commit()
 
     def _handleStatus_PACKAGEFAIL(self, librarian, slave_status, logger,
                                   send_notification):
@@ -405,6 +412,7 @@
         self.build.buildqueue_record.builder.failBuilder(
             "Builder returned BUILDERFAIL when asked for its status")
         self.build.buildqueue_record.reset()
+        transaction.commit()
 
     @defer.inlineCallbacks
     def _handleStatus_GIVENBACK(self, librarian, slave_status, logger,
@@ -417,6 +425,7 @@
         """
         yield self.build.buildqueue_record.builder.cleanSlave()
         self.build.buildqueue_record.reset()
+        transaction.commit()
 
 
 class IdleBuildBehavior(BuildFarmJobBehaviorBase):

=== modified file 'lib/lp/translations/model/translationtemplatesbuildbehavior.py'
--- lib/lp/translations/model/translationtemplatesbuildbehavior.py	2013-01-22 07:27:17 +0000
+++ lib/lp/translations/model/translationtemplatesbuildbehavior.py	2013-01-23 06:50:26 +0000
@@ -14,6 +14,7 @@
 import os
 import tempfile
 
+import transaction
 from twisted.internet import defer
 from zope.component import getUtility
 from zope.interface import implements
@@ -138,6 +139,7 @@
         if build_status == 'OK':
             self.build.updateStatus(
                 BuildStatus.UPLOADING, builder=queue_item.builder)
+            transaction.commit()
             logger.debug("Processing successful templates build.")
             filemap = slave_status.get('filemap')
             filename = yield self._readTarball(queue_item, filemap, logger)
@@ -168,8 +170,10 @@
         else:
             self.build.updateStatus(
                 BuildStatus.FAILEDTOBUILD, builder=queue_item.builder)
+        transaction.commit()
 
         yield self.storeLogFromSlave(build_queue=queue_item)
 
         yield queue_item.builder.cleanSlave()
         queue_item.destroySelf()
+        transaction.commit()