launchpad-reviewers team mailing list archive
-
launchpad-reviewers team
-
Mailing list archive
-
Message #17889
[Merge] lp:~wgrant/launchpad/buildd-manager-robustification into lp:launchpad
William Grant has proposed merging lp:~wgrant/launchpad/buildd-manager-robustification into lp:launchpad.
Commit message:
Verify that a transition makes sense in BuildFarmJob.updateStatus, and improve buildd-manager resume and cycle logging.
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
Related bugs:
Bug #1422303 in Launchpad itself: "buildd-manager can make insane status transitions"
https://bugs.launchpad.net/launchpad/+bug/1422303
For more details, see:
https://code.launchpad.net/~wgrant/launchpad/buildd-manager-robustification/+merge/249930
Verify that a transition makes sense in BuildFarmJob.updateStatus, and improve buildd-manager resume and cycle logging.
buildd-manager failure counting was also adjusted to avoid crashing when attempting to fail a build that cannot be failed. It will now force the transition unless it's FULLYBUILT, since that status is out of buildd-manager's hands and a retry can't succeed without DB surgery.
--
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~wgrant/launchpad/buildd-manager-robustification into lp:launchpad.
=== modified file 'lib/lp/buildmaster/interactor.py'
--- lib/lp/buildmaster/interactor.py 2014-06-25 11:32:15 +0000
+++ lib/lp/buildmaster/interactor.py 2015-02-17 09:01:30 +0000
@@ -316,6 +316,8 @@
builder_factory[vitals.name].setCleanStatus(
BuilderCleanStatus.CLEANING)
transaction.commit()
+ logger = cls._getSlaveScannerLogger()
+ logger.info("%s is being cleaned.", vitals.name)
defer.returnValue(False)
raise CannotResumeHost(
"Invalid vm_reset_protocol: %r" % vitals.vm_reset_protocol)
=== modified file 'lib/lp/buildmaster/interfaces/buildfarmjob.py'
--- lib/lp/buildmaster/interfaces/buildfarmjob.py 2014-06-26 06:26:29 +0000
+++ lib/lp/buildmaster/interfaces/buildfarmjob.py 2015-02-17 09:01:30 +0000
@@ -180,13 +180,19 @@
"""Set the `LibraryFileAlias` that contains the job log."""
def updateStatus(status, builder=None, slave_status=None,
- date_started=None, date_finished=None):
+ date_started=None, date_finished=None,
+ force_invalid_transition=False):
"""Update job metadata when the build status changes.
This automatically handles setting status, date_finished, builder,
dependencies. Later it will manage the denormalised search schema.
date_started and date_finished override the default (now).
+
+ Only sensible transitions are permitted unless
+ force_invalid_transition is set. The override only exists for
+ tests and as an escape hatch for buildd-manager's failure
+ counting. You do not want to use it.
"""
def gotFailure():
=== modified file 'lib/lp/buildmaster/manager.py'
--- lib/lp/buildmaster/manager.py 2014-06-26 12:52:06 +0000
+++ lib/lp/buildmaster/manager.py 2015-02-17 09:01:30 +0000
@@ -219,7 +219,20 @@
elif job_action == False:
# Fail and dequeue the job.
logger.info("Failing job %s.", job.build_cookie)
- job.specific_build.updateStatus(BuildStatus.FAILEDTOBUILD)
+ if job.specific_build.status == BuildStatus.FULLYBUILT:
+ # A FULLYBUILT build should be out of our hands, and
+ # probably has artifacts like binaries attached. It's
+ # impossible to enter the state twice, so don't revert
+ # the status. Something's wrong, so log an OOPS and get
+ # it out of the queue to avoid further corruption.
+ logger.warning(
+ "Build is already successful! Dequeuing but leaving build "
+ "status alone. Something is very wrong.")
+ else:
+ # Whatever it was before, we want it failed. We're an
+ # error handler, so let's not risk more errors.
+ job.specific_build.updateStatus(
+ BuildStatus.FAILEDTOBUILD, force_invalid_transition=True)
job.destroySelf()
elif job_action == True:
# Reset the job so it will be retried elsewhere.
@@ -319,6 +332,7 @@
return d
def _updateDateScanned(self, ignored):
+ self.logger.debug("Scan finished for builder %s" % self.builder_name)
self.date_scanned = datetime.datetime.utcnow()
def _scanFailed(self, retry, failure):
@@ -541,6 +555,7 @@
def scan(self):
"""If a new builder appears, create a SlaveScanner for it."""
+ self.manager.logger.debug("Refreshing builders from the database.")
try:
self.manager.builder_factory.update()
new_builders = self.checkForNewBuilders()
@@ -550,6 +565,7 @@
"Failure while updating builders:\n",
exc_info=True)
transaction.abort()
+ self.manager.logger.debug("Builder refresh complete.")
def checkForNewBuilders(self):
"""See if any new builders were added."""
=== modified file 'lib/lp/buildmaster/model/buildfarmjob.py'
--- lib/lp/buildmaster/model/buildfarmjob.py 2014-06-26 06:26:29 +0000
+++ lib/lp/buildmaster/model/buildfarmjob.py 2015-02-17 09:01:30 +0000
@@ -50,6 +50,23 @@
)
+VALID_STATUS_TRANSITIONS = {
+ BuildStatus.NEEDSBUILD: tuple(BuildStatus.items),
+ BuildStatus.FULLYBUILT: (),
+ BuildStatus.FAILEDTOBUILD: (BuildStatus.NEEDSBUILD,),
+ BuildStatus.MANUALDEPWAIT: (BuildStatus.NEEDSBUILD,),
+ BuildStatus.CHROOTWAIT: (BuildStatus.NEEDSBUILD,),
+ BuildStatus.SUPERSEDED: (),
+ BuildStatus.BUILDING: tuple(BuildStatus.items),
+ BuildStatus.FAILEDTOUPLOAD: (BuildStatus.NEEDSBUILD,),
+ BuildStatus.UPLOADING: (
+ BuildStatus.FULLYBUILT, BuildStatus.FAILEDTOUPLOAD,
+ BuildStatus.SUPERSEDED),
+ BuildStatus.CANCELLING: (BuildStatus.CANCELLED,),
+ BuildStatus.CANCELLED: (BuildStatus.NEEDSBUILD,),
+ }
+
+
class BuildFarmJob(Storm):
"""A base implementation for `IBuildFarmJob` classes."""
__storm_table__ = 'BuildFarmJob'
@@ -157,8 +174,17 @@
self.log = log
def updateStatus(self, status, builder=None, slave_status=None,
- date_started=None, date_finished=None):
+ date_started=None, date_finished=None,
+ force_invalid_transition=False):
"""See `IBuildFarmJob`."""
+ if (not force_invalid_transition
+ and status != self.build_farm_job.status
+ and status not in VALID_STATUS_TRANSITIONS[
+ self.build_farm_job.status]):
+ raise AssertionError(
+ "Can't change build status from %s to %s."
+ % (self.build_farm_job.status.name, status.name))
+
self.build_farm_job.status = self.status = status
# If there's a builder provided, set it if we don't already have
=== modified file 'lib/lp/buildmaster/model/packagebuild.py'
--- lib/lp/buildmaster/model/packagebuild.py 2013-11-15 04:18:59 +0000
+++ lib/lp/buildmaster/model/packagebuild.py 2015-02-17 09:01:30 +0000
@@ -49,10 +49,12 @@
return self.archive.private
def updateStatus(self, status, builder=None, slave_status=None,
- date_started=None, date_finished=None):
+ date_started=None, date_finished=None,
+ force_invalid_transition=False):
super(PackageBuildMixin, self).updateStatus(
status, builder=builder, slave_status=slave_status,
- date_started=date_started, date_finished=date_finished)
+ date_started=date_started, date_finished=date_finished,
+ force_invalid_transition=force_invalid_transition)
if (status == BuildStatus.MANUALDEPWAIT and slave_status is not None
and slave_status.get('dependencies') is not None):
=== modified file 'lib/lp/buildmaster/tests/test_buildfarmjob.py'
--- lib/lp/buildmaster/tests/test_buildfarmjob.py 2013-01-31 23:43:04 +0000
+++ lib/lp/buildmaster/tests/test_buildfarmjob.py 2015-02-17 09:01:30 +0000
@@ -180,6 +180,7 @@
# Setting it to BUILDING for the first time sets date_started
# and date_first_dispatched.
+ self.build_farm_job.updateStatus(BuildStatus.NEEDSBUILD)
self.build_farm_job.updateStatus(BuildStatus.BUILDING)
self.assertIsNot(None, self.build_farm_job.date_started)
first = self.build_farm_job.date_started
@@ -211,6 +212,8 @@
build.updateStatus(status)
self.assertIs(None, build.date_started)
self.assertIs(None, build.date_finished)
+ build.updateStatus(
+ BuildStatus.NEEDSBUILD, force_invalid_transition=True)
build.updateStatus(BuildStatus.BUILDING)
self.assertIsNot(None, build.date_started)
self.assertIs(None, build.date_finished)
=== modified file 'lib/lp/buildmaster/tests/test_manager.py'
--- lib/lp/buildmaster/tests/test_manager.py 2014-06-26 12:36:03 +0000
+++ lib/lp/buildmaster/tests/test_manager.py 2015-02-17 09:01:30 +0000
@@ -1107,6 +1107,26 @@
self.assertEqual(self.build.status, BuildStatus.FAILEDTOBUILD)
self.assertEqual(0, self.builder.failure_count)
+ def test_bad_job_does_not_unsucceed(self):
+ # If a FULLYBUILT build somehow ends up back in buildd-manager,
+ # all manner of failures can occur as invariants are violated.
+ # But we can't just fail and later retry the build as normal, as
+ # a FULLYBUILT build has binaries. Instead, failure handling
+ # just destroys the BuildQueue and leaves the status as
+ # FULLYBUILT.
+ self.build.updateStatus(BuildStatus.FULLYBUILT)
+ self.build.gotFailure()
+ self.build.gotFailure()
+ self.builder.gotFailure()
+
+ log = self._recover_failure("failnotes")
+ self.assertIn("Failing job", log)
+ self.assertIn("Build is already successful!", log)
+ self.assertIn("Resetting failure count of builder", log)
+ self.assertIs(None, self.builder.currentjob)
+ self.assertEqual(self.build.status, BuildStatus.FULLYBUILT)
+ self.assertEqual(0, self.builder.failure_count)
+
def test_failure_during_cancellation_cancels(self):
self.buildqueue.cancel()
self.assertEqual(BuildStatus.CANCELLING, self.build.status)
=== modified file 'lib/lp/buildmaster/tests/test_packagebuild.py'
--- lib/lp/buildmaster/tests/test_packagebuild.py 2014-07-24 09:37:03 +0000
+++ lib/lp/buildmaster/tests/test_packagebuild.py 2015-02-17 09:01:30 +0000
@@ -54,7 +54,7 @@
self.package_build.updateStatus(
BuildStatus.MANUALDEPWAIT, slave_status={'dependencies': u'deps'})
self.assertEqual(u'deps', self.package_build.dependencies)
- self.package_build.updateStatus(BuildStatus.FULLYBUILT)
+ self.package_build.updateStatus(BuildStatus.NEEDSBUILD)
self.assertEqual(None, self.package_build.dependencies)
def test_log_url(self):
=== modified file 'lib/lp/code/model/tests/test_recipebuilder.py'
--- lib/lp/code/model/tests/test_recipebuilder.py 2014-06-26 07:39:03 +0000
+++ lib/lp/code/model/tests/test_recipebuilder.py 2015-02-17 09:01:30 +0000
@@ -284,7 +284,6 @@
self.queue_record = (
self.factory.makeSourcePackageRecipeBuild().queueBuild())
build = self.queue_record.specific_build
- build.updateStatus(BuildStatus.FULLYBUILT)
if fake_successful_upload:
removeSecurityProxy(build).verifySuccessfulUpload = FakeMethod(
result=True)
=== modified file 'lib/lp/soyuz/browser/tests/test_build_views.py'
--- lib/lp/soyuz/browser/tests/test_build_views.py 2014-10-31 07:00:29 +0000
+++ lib/lp/soyuz/browser/tests/test_build_views.py 2015-02-17 09:01:30 +0000
@@ -399,7 +399,8 @@
# BPBs in certain states need a bit tweaking to appear in
# the result of getBuildRecords().
if status == BuildStatus.FULLYBUILT:
- build.updateStatus(BuildStatus.BUILDING)
+ build.updateStatus(
+ BuildStatus.BUILDING, force_invalid_transition=True)
build.updateStatus(BuildStatus.FULLYBUILT)
elif status in (BuildStatus.NEEDSBUILD, BuildStatus.BUILDING):
build.queueBuild()
=== modified file 'lib/lp/soyuz/browser/tests/test_builder.py'
--- lib/lp/soyuz/browser/tests/test_builder.py 2014-06-23 10:07:18 +0000
+++ lib/lp/soyuz/browser/tests/test_builder.py 2015-02-17 09:01:30 +0000
@@ -8,6 +8,7 @@
from testtools.matchers import Equals
from zope.component import getUtility
+from lp.buildmaster.enums import BuildStatus
from lp.buildmaster.interfaces.builder import IBuilderSet
from lp.services.job.model.job import Job
from lp.soyuz.browser.tests.test_builder_views import BuildCreationMixin
@@ -43,6 +44,8 @@
def test_builders_binary_package_build_query_count(self):
def create_build():
build = self.createBinaryPackageBuild()
+ build.updateStatus(
+ BuildStatus.NEEDSBUILD, force_invalid_transition=True)
queue = build.queueBuild()
queue.markAsBuilding(build.builder)
@@ -54,6 +57,8 @@
def test_builders_recipe_build_query_count(self):
def create_build():
build = self.createRecipeBuildWithBuilder()
+ build.updateStatus(
+ BuildStatus.NEEDSBUILD, force_invalid_transition=True)
queue = build.queueBuild()
queue.markAsBuilding(build.builder)
=== modified file 'lib/lp/soyuz/doc/build-failedtoupload-workflow.txt'
--- lib/lp/soyuz/doc/build-failedtoupload-workflow.txt 2014-07-09 13:15:47 +0000
+++ lib/lp/soyuz/doc/build-failedtoupload-workflow.txt 2015-02-17 09:01:30 +0000
@@ -103,87 +103,3 @@
i386 build of cdrkit 1.0 in ubuntu breezy-autotest RELEASE
http://launchpad.dev/ubuntu/+source/cdrkit/1.0/+build/22
<BLANKLINE>
-
-
-== Rescuing FAILEDTOBUILD record ==
-
-Let's emulate the procedure of rescuing an FAILEDTOUPLOAD build.
-A FAILEDTOUPLOAD build obviously has no binaries:
-
- >>> print failedtoupload_candidate.status.name
- FAILEDTOUPLOAD
-
- >>> failedtoupload_candidate.binarypackages.count()
- 0
-
- >>> failedtoupload_candidate.verifySuccessfulUpload()
- False
-
-But once the person responsible have received the information about
-the binary upload failure via build notification, or accessing the
-build page, he will be aware of:
-
- * The targeted Build ID
- * Path where the rejected/failed upload is, usually in some
- destination inside buildd upload queue (/srv/launchpad.net/builddmaster/*)
-
-Having those informations, one of buildd-admin team members will be
-able to reprocess the binary upload via an shell account following
-these steps:
-
- * Move the rejected/failed upload directory to 'incoming' queue.
- * Issue a process-upload command as:
- `process-upload.py
- -C buildd
- -r <DISTRORELEASE>
- -b <BUILD_ID>
- -J <LEAF_NAME>
- -Mvv
- <UPLOAD_FUL_PATH>`
- * LEAF_NAME is the upload directory name, required to not affect
- other uploads in progress.
- * '-Mvv' will produce descriptive debug information and suppress
- upload email notifications, that are definetelly *not wanted* for
- binary uploads.
- * It's possible to use '-n' (dry-run) if he is unsure about the parameters.
-
-For simplicity we will process the binary upload using the
-NascentUpload class instead of calling the script:
-
- >>> login('foo.bar@xxxxxxxxxxxxx')
-
- >>> from lp.archiveuploader.nascentupload import NascentUpload
- >>> from lp.archiveuploader.tests import datadir, getPolicy
-
-The policy contains the right arguments to reprocess the build for the
-right target location, they are passed via process-upload command-line:
-
- >>> buildd_policy = getPolicy(
- ... name='buildd',
- ... distro=failedtoupload_candidate.distribution.name,
- ... distroseries=failedtoupload_candidate.distro_series.name)
-
- >>> from lp.services.log.logger import DevNullLogger
- >>> cdrkit_bin_upload = NascentUpload.from_changesfile_path(
- ... datadir('suite/cdrkit_1.0/cdrkit_1.0_i386.changes'),
- ... buildd_policy, DevNullLogger())
- >>> cdrkit_bin_upload.process()
- >>> cdrkit_bin_upload.is_rejected
- False
- >>> success = cdrkit_bin_upload.do_accept(build=failedtoupload_candidate)
- >>> print cdrkit_bin_upload.queue_root.status.name
- NEW
-
-After successfully reprocessing binary upload the originally
-FAILEDTOUPLOAD build record is already in FULLYBUILT state.
-
- >>> print failedtoupload_candidate.status.name
- FULLYBUILT
-
-And contains the associated binaries.
-
- >>> failedtoupload_candidate.binarypackages.count()
- 1
- >>> failedtoupload_candidate.verifySuccessfulUpload()
- True
-
=== modified file 'lib/lp/soyuz/tests/test_publishing.py'
--- lib/lp/soyuz/tests/test_publishing.py 2014-10-31 10:34:51 +0000
+++ lib/lp/soyuz/tests/test_publishing.py 2015-02-17 09:01:30 +0000
@@ -431,7 +431,9 @@
# Adjust the build record in way it looks complete.
date_finished = datetime.datetime(2008, 1, 1, 0, 5, 0, tzinfo=pytz.UTC)
date_started = date_finished - datetime.timedelta(minutes=5)
- build.updateStatus(BuildStatus.BUILDING, date_started=date_started)
+ build.updateStatus(
+ BuildStatus.BUILDING, date_started=date_started,
+ force_invalid_transition=True)
build.updateStatus(BuildStatus.FULLYBUILT, date_finished=date_finished)
buildlog_filename = 'buildlog_%s-%s-%s.%s_%s_%s.txt.gz' % (
build.distribution.name,
Follow ups