← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] lp:~wgrant/launchpad/buildd-manager-robustification into lp:launchpad

 

William Grant has proposed merging lp:~wgrant/launchpad/buildd-manager-robustification into lp:launchpad.

Commit message:
Verify that a transition makes sense in BuildFarmJob.updateStatus, and improve buildd-manager resume and cycle logging.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)
Related bugs:
  Bug #1422303 in Launchpad itself: "buildd-manager can make insane status transitions"
  https://bugs.launchpad.net/launchpad/+bug/1422303

For more details, see:
https://code.launchpad.net/~wgrant/launchpad/buildd-manager-robustification/+merge/249930

Verify that a transition makes sense in BuildFarmJob.updateStatus, and improve buildd-manager resume and cycle logging.

buildd-manager failure counting was also adjusted to avoid crashing when attempting to fail a build that cannot be failed. It will now force the transition unless it's FULLYBUILT, since that status is out of buildd-manager's hands and a retry can't succeed without DB surgery.
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~wgrant/launchpad/buildd-manager-robustification into lp:launchpad.
=== modified file 'lib/lp/buildmaster/interactor.py'
--- lib/lp/buildmaster/interactor.py	2014-06-25 11:32:15 +0000
+++ lib/lp/buildmaster/interactor.py	2015-02-17 09:01:30 +0000
@@ -316,6 +316,8 @@
                     builder_factory[vitals.name].setCleanStatus(
                         BuilderCleanStatus.CLEANING)
                     transaction.commit()
+                    logger = cls._getSlaveScannerLogger()
+                    logger.info("%s is being cleaned.", vitals.name)
                 defer.returnValue(False)
             raise CannotResumeHost(
                 "Invalid vm_reset_protocol: %r" % vitals.vm_reset_protocol)

=== modified file 'lib/lp/buildmaster/interfaces/buildfarmjob.py'
--- lib/lp/buildmaster/interfaces/buildfarmjob.py	2014-06-26 06:26:29 +0000
+++ lib/lp/buildmaster/interfaces/buildfarmjob.py	2015-02-17 09:01:30 +0000
@@ -180,13 +180,19 @@
         """Set the `LibraryFileAlias` that contains the job log."""
 
     def updateStatus(status, builder=None, slave_status=None,
-                     date_started=None, date_finished=None):
+                     date_started=None, date_finished=None,
+                     force_invalid_transition=False):
         """Update job metadata when the build status changes.
 
         This automatically handles setting status, date_finished, builder,
         dependencies. Later it will manage the denormalised search schema.
 
         date_started and date_finished override the default (now).
+
+        Only sensible transitions are permitted unless
+        force_invalid_transition is set. The override only exists for
+        tests and as an escape hatch for buildd-manager's failure
+        counting. You do not want to use it.
         """
 
     def gotFailure():

=== modified file 'lib/lp/buildmaster/manager.py'
--- lib/lp/buildmaster/manager.py	2014-06-26 12:52:06 +0000
+++ lib/lp/buildmaster/manager.py	2015-02-17 09:01:30 +0000
@@ -219,7 +219,20 @@
         elif job_action == False:
             # Fail and dequeue the job.
             logger.info("Failing job %s.", job.build_cookie)
-            job.specific_build.updateStatus(BuildStatus.FAILEDTOBUILD)
+            if job.specific_build.status == BuildStatus.FULLYBUILT:
+                # A FULLYBUILT build should be out of our hands, and
+                # probably has artifacts like binaries attached. It's
+                # impossible to enter the state twice, so don't revert
+                # the status. Something's wrong, so log an OOPS and get
+                # it out of the queue to avoid further corruption.
+                logger.warning(
+                    "Build is already successful! Dequeuing but leaving build "
+                    "status alone. Something is very wrong.")
+            else:
+                # Whatever it was before, we want it failed. We're an
+                # error handler, so let's not risk more errors.
+                job.specific_build.updateStatus(
+                    BuildStatus.FAILEDTOBUILD, force_invalid_transition=True)
             job.destroySelf()
         elif job_action == True:
             # Reset the job so it will be retried elsewhere.
@@ -319,6 +332,7 @@
         return d
 
     def _updateDateScanned(self, ignored):
+        self.logger.debug("Scan finished for builder %s" % self.builder_name)
         self.date_scanned = datetime.datetime.utcnow()
 
     def _scanFailed(self, retry, failure):
@@ -541,6 +555,7 @@
 
     def scan(self):
         """If a new builder appears, create a SlaveScanner for it."""
+        self.manager.logger.debug("Refreshing builders from the database.")
         try:
             self.manager.builder_factory.update()
             new_builders = self.checkForNewBuilders()
@@ -550,6 +565,7 @@
                 "Failure while updating builders:\n",
                 exc_info=True)
             transaction.abort()
+        self.manager.logger.debug("Builder refresh complete.")
 
     def checkForNewBuilders(self):
         """See if any new builders were added."""

=== modified file 'lib/lp/buildmaster/model/buildfarmjob.py'
--- lib/lp/buildmaster/model/buildfarmjob.py	2014-06-26 06:26:29 +0000
+++ lib/lp/buildmaster/model/buildfarmjob.py	2015-02-17 09:01:30 +0000
@@ -50,6 +50,23 @@
     )
 
 
+VALID_STATUS_TRANSITIONS = {
+    BuildStatus.NEEDSBUILD: tuple(BuildStatus.items),
+    BuildStatus.FULLYBUILT: (),
+    BuildStatus.FAILEDTOBUILD: (BuildStatus.NEEDSBUILD,),
+    BuildStatus.MANUALDEPWAIT: (BuildStatus.NEEDSBUILD,),
+    BuildStatus.CHROOTWAIT: (BuildStatus.NEEDSBUILD,),
+    BuildStatus.SUPERSEDED: (),
+    BuildStatus.BUILDING: tuple(BuildStatus.items),
+    BuildStatus.FAILEDTOUPLOAD: (BuildStatus.NEEDSBUILD,),
+    BuildStatus.UPLOADING: (
+        BuildStatus.FULLYBUILT, BuildStatus.FAILEDTOUPLOAD,
+        BuildStatus.SUPERSEDED),
+    BuildStatus.CANCELLING: (BuildStatus.CANCELLED,),
+    BuildStatus.CANCELLED: (BuildStatus.NEEDSBUILD,),
+    }
+
+
 class BuildFarmJob(Storm):
     """A base implementation for `IBuildFarmJob` classes."""
     __storm_table__ = 'BuildFarmJob'
@@ -157,8 +174,17 @@
         self.log = log
 
     def updateStatus(self, status, builder=None, slave_status=None,
-                     date_started=None, date_finished=None):
+                     date_started=None, date_finished=None,
+                     force_invalid_transition=False):
         """See `IBuildFarmJob`."""
+        if (not force_invalid_transition
+                and status != self.build_farm_job.status
+                and status not in VALID_STATUS_TRANSITIONS[
+                    self.build_farm_job.status]):
+            raise AssertionError(
+                "Can't change build status from %s to %s."
+                % (self.build_farm_job.status.name, status.name))
+
         self.build_farm_job.status = self.status = status
 
         # If there's a builder provided, set it if we don't already have

=== modified file 'lib/lp/buildmaster/model/packagebuild.py'
--- lib/lp/buildmaster/model/packagebuild.py	2013-11-15 04:18:59 +0000
+++ lib/lp/buildmaster/model/packagebuild.py	2015-02-17 09:01:30 +0000
@@ -49,10 +49,12 @@
         return self.archive.private
 
     def updateStatus(self, status, builder=None, slave_status=None,
-                     date_started=None, date_finished=None):
+                     date_started=None, date_finished=None,
+                     force_invalid_transition=False):
         super(PackageBuildMixin, self).updateStatus(
             status, builder=builder, slave_status=slave_status,
-            date_started=date_started, date_finished=date_finished)
+            date_started=date_started, date_finished=date_finished,
+            force_invalid_transition=force_invalid_transition)
 
         if (status == BuildStatus.MANUALDEPWAIT and slave_status is not None
             and slave_status.get('dependencies') is not None):

=== modified file 'lib/lp/buildmaster/tests/test_buildfarmjob.py'
--- lib/lp/buildmaster/tests/test_buildfarmjob.py	2013-01-31 23:43:04 +0000
+++ lib/lp/buildmaster/tests/test_buildfarmjob.py	2015-02-17 09:01:30 +0000
@@ -180,6 +180,7 @@
 
         # Setting it to BUILDING for the first time sets date_started
         # and date_first_dispatched.
+        self.build_farm_job.updateStatus(BuildStatus.NEEDSBUILD)
         self.build_farm_job.updateStatus(BuildStatus.BUILDING)
         self.assertIsNot(None, self.build_farm_job.date_started)
         first = self.build_farm_job.date_started
@@ -211,6 +212,8 @@
             build.updateStatus(status)
             self.assertIs(None, build.date_started)
             self.assertIs(None, build.date_finished)
+            build.updateStatus(
+                BuildStatus.NEEDSBUILD, force_invalid_transition=True)
             build.updateStatus(BuildStatus.BUILDING)
             self.assertIsNot(None, build.date_started)
             self.assertIs(None, build.date_finished)

=== modified file 'lib/lp/buildmaster/tests/test_manager.py'
--- lib/lp/buildmaster/tests/test_manager.py	2014-06-26 12:36:03 +0000
+++ lib/lp/buildmaster/tests/test_manager.py	2015-02-17 09:01:30 +0000
@@ -1107,6 +1107,26 @@
         self.assertEqual(self.build.status, BuildStatus.FAILEDTOBUILD)
         self.assertEqual(0, self.builder.failure_count)
 
+    def test_bad_job_does_not_unsucceed(self):
+        # If a FULLYBUILT build somehow ends up back in buildd-manager,
+        # all manner of failures can occur as invariants are violated.
+        # But we can't just fail and later retry the build as normal, as
+        # a FULLYBUILT build has binaries. Instead, failure handling
+        # just destroys the BuildQueue and leaves the status as
+        # FULLYBUILT.
+        self.build.updateStatus(BuildStatus.FULLYBUILT)
+        self.build.gotFailure()
+        self.build.gotFailure()
+        self.builder.gotFailure()
+
+        log = self._recover_failure("failnotes")
+        self.assertIn("Failing job", log)
+        self.assertIn("Build is already successful!", log)
+        self.assertIn("Resetting failure count of builder", log)
+        self.assertIs(None, self.builder.currentjob)
+        self.assertEqual(self.build.status, BuildStatus.FULLYBUILT)
+        self.assertEqual(0, self.builder.failure_count)
+
     def test_failure_during_cancellation_cancels(self):
         self.buildqueue.cancel()
         self.assertEqual(BuildStatus.CANCELLING, self.build.status)

=== modified file 'lib/lp/buildmaster/tests/test_packagebuild.py'
--- lib/lp/buildmaster/tests/test_packagebuild.py	2014-07-24 09:37:03 +0000
+++ lib/lp/buildmaster/tests/test_packagebuild.py	2015-02-17 09:01:30 +0000
@@ -54,7 +54,7 @@
         self.package_build.updateStatus(
             BuildStatus.MANUALDEPWAIT, slave_status={'dependencies': u'deps'})
         self.assertEqual(u'deps', self.package_build.dependencies)
-        self.package_build.updateStatus(BuildStatus.FULLYBUILT)
+        self.package_build.updateStatus(BuildStatus.NEEDSBUILD)
         self.assertEqual(None, self.package_build.dependencies)
 
     def test_log_url(self):

=== modified file 'lib/lp/code/model/tests/test_recipebuilder.py'
--- lib/lp/code/model/tests/test_recipebuilder.py	2014-06-26 07:39:03 +0000
+++ lib/lp/code/model/tests/test_recipebuilder.py	2015-02-17 09:01:30 +0000
@@ -284,7 +284,6 @@
         self.queue_record = (
             self.factory.makeSourcePackageRecipeBuild().queueBuild())
         build = self.queue_record.specific_build
-        build.updateStatus(BuildStatus.FULLYBUILT)
         if fake_successful_upload:
             removeSecurityProxy(build).verifySuccessfulUpload = FakeMethod(
                 result=True)

=== modified file 'lib/lp/soyuz/browser/tests/test_build_views.py'
--- lib/lp/soyuz/browser/tests/test_build_views.py	2014-10-31 07:00:29 +0000
+++ lib/lp/soyuz/browser/tests/test_build_views.py	2015-02-17 09:01:30 +0000
@@ -399,7 +399,8 @@
             # BPBs in certain states need a bit tweaking to appear in
             # the result of getBuildRecords().
             if status == BuildStatus.FULLYBUILT:
-                build.updateStatus(BuildStatus.BUILDING)
+                build.updateStatus(
+                    BuildStatus.BUILDING, force_invalid_transition=True)
                 build.updateStatus(BuildStatus.FULLYBUILT)
             elif status in (BuildStatus.NEEDSBUILD, BuildStatus.BUILDING):
                 build.queueBuild()

=== modified file 'lib/lp/soyuz/browser/tests/test_builder.py'
--- lib/lp/soyuz/browser/tests/test_builder.py	2014-06-23 10:07:18 +0000
+++ lib/lp/soyuz/browser/tests/test_builder.py	2015-02-17 09:01:30 +0000
@@ -8,6 +8,7 @@
 from testtools.matchers import Equals
 from zope.component import getUtility
 
+from lp.buildmaster.enums import BuildStatus
 from lp.buildmaster.interfaces.builder import IBuilderSet
 from lp.services.job.model.job import Job
 from lp.soyuz.browser.tests.test_builder_views import BuildCreationMixin
@@ -43,6 +44,8 @@
     def test_builders_binary_package_build_query_count(self):
         def create_build():
             build = self.createBinaryPackageBuild()
+            build.updateStatus(
+                BuildStatus.NEEDSBUILD, force_invalid_transition=True)
             queue = build.queueBuild()
             queue.markAsBuilding(build.builder)
 
@@ -54,6 +57,8 @@
     def test_builders_recipe_build_query_count(self):
         def create_build():
             build = self.createRecipeBuildWithBuilder()
+            build.updateStatus(
+                BuildStatus.NEEDSBUILD, force_invalid_transition=True)
             queue = build.queueBuild()
             queue.markAsBuilding(build.builder)
 

=== modified file 'lib/lp/soyuz/doc/build-failedtoupload-workflow.txt'
--- lib/lp/soyuz/doc/build-failedtoupload-workflow.txt	2014-07-09 13:15:47 +0000
+++ lib/lp/soyuz/doc/build-failedtoupload-workflow.txt	2015-02-17 09:01:30 +0000
@@ -103,87 +103,3 @@
   i386 build of cdrkit 1.0 in ubuntu breezy-autotest RELEASE
   http://launchpad.dev/ubuntu/+source/cdrkit/1.0/+build/22
   <BLANKLINE>
-
-
-== Rescuing FAILEDTOBUILD record ==
-
-Let's emulate the procedure of rescuing an FAILEDTOUPLOAD build.
-A FAILEDTOUPLOAD build obviously has no binaries:
-
-  >>> print failedtoupload_candidate.status.name
-  FAILEDTOUPLOAD
-
-  >>> failedtoupload_candidate.binarypackages.count()
-  0
-
-  >>> failedtoupload_candidate.verifySuccessfulUpload()
-  False
-
-But once the person responsible have received the information about
-the binary upload failure via build notification, or accessing the
-build page, he will be aware of:
-
- * The targeted Build ID
- * Path where the rejected/failed upload is, usually in some
-   destination inside buildd upload queue (/srv/launchpad.net/builddmaster/*)
-
-Having those informations, one of buildd-admin team members will be
-able to reprocess the binary upload via an shell account following
-these steps:
-
- * Move the rejected/failed upload directory to 'incoming' queue.
- * Issue a process-upload command as:
-   `process-upload.py
-        -C buildd
-        -r <DISTRORELEASE>
-        -b <BUILD_ID>
-        -J <LEAF_NAME>
-        -Mvv
-        <UPLOAD_FUL_PATH>`
- * LEAF_NAME is the upload directory name, required to not affect
-   other uploads in progress.
- * '-Mvv' will produce descriptive debug information and suppress
-   upload email notifications, that are definetelly *not wanted*  for
-   binary uploads.
- * It's possible to use '-n' (dry-run) if he is unsure about the parameters.
-
-For simplicity we will process the binary upload using the
-NascentUpload class instead of calling the script:
-
-  >>> login('foo.bar@xxxxxxxxxxxxx')
-
-  >>> from lp.archiveuploader.nascentupload import NascentUpload
-  >>> from lp.archiveuploader.tests import datadir, getPolicy
-
-The policy contains the right arguments to reprocess the build for the
-right target location, they are passed via process-upload command-line:
-
-  >>> buildd_policy = getPolicy(
-  ...     name='buildd',
-  ...     distro=failedtoupload_candidate.distribution.name,
-  ...     distroseries=failedtoupload_candidate.distro_series.name)
-
-  >>> from lp.services.log.logger import DevNullLogger
-  >>> cdrkit_bin_upload = NascentUpload.from_changesfile_path(
-  ...     datadir('suite/cdrkit_1.0/cdrkit_1.0_i386.changes'),
-  ...     buildd_policy, DevNullLogger())
-  >>> cdrkit_bin_upload.process()
-  >>> cdrkit_bin_upload.is_rejected
-  False
-  >>> success = cdrkit_bin_upload.do_accept(build=failedtoupload_candidate)
-  >>> print cdrkit_bin_upload.queue_root.status.name
-  NEW
-
-After successfully reprocessing binary upload the originally
-FAILEDTOUPLOAD build record is already in FULLYBUILT state.
-
-  >>> print failedtoupload_candidate.status.name
-  FULLYBUILT
-
-And contains the associated binaries.
-
-  >>> failedtoupload_candidate.binarypackages.count()
-  1
-  >>> failedtoupload_candidate.verifySuccessfulUpload()
-  True
-

=== modified file 'lib/lp/soyuz/tests/test_publishing.py'
--- lib/lp/soyuz/tests/test_publishing.py	2014-10-31 10:34:51 +0000
+++ lib/lp/soyuz/tests/test_publishing.py	2015-02-17 09:01:30 +0000
@@ -431,7 +431,9 @@
         # Adjust the build record in way it looks complete.
         date_finished = datetime.datetime(2008, 1, 1, 0, 5, 0, tzinfo=pytz.UTC)
         date_started = date_finished - datetime.timedelta(minutes=5)
-        build.updateStatus(BuildStatus.BUILDING, date_started=date_started)
+        build.updateStatus(
+            BuildStatus.BUILDING, date_started=date_started,
+            force_invalid_transition=True)
         build.updateStatus(BuildStatus.FULLYBUILT, date_finished=date_finished)
         buildlog_filename = 'buildlog_%s-%s-%s.%s_%s_%s.txt.gz' % (
             build.distribution.name,


Follow ups