launchpad-reviewers team mailing list archive
Message #15854
[Merge] lp:~cjwatson/launchpad/failure-count-reset into lp:launchpad
Colin Watson has proposed merging lp:~cjwatson/launchpad/failure-count-reset into lp:launchpad.
Commit message:
Try to reset virtual builders a couple of times before giving up and failing them.
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
Related bugs:
Bug #1039935 in Launchpad itself: "buildd-manager failure counting is too trigger-happy when killing virtual builders"
https://bugs.launchpad.net/launchpad/+bug/1039935
For more details, see:
https://code.launchpad.net/~cjwatson/launchpad/failure-count-reset/+merge/183320
buildd-manager often determines that a virtual builder has failed without trying to reset it first. As discussed with William, this branch arranges to try to ppa-reset it twice before giving up, going through the usual failure counting in between resets. It will take three times as long to fail a builder completely if it's really unrecoverable, but hopefully many of them can be brought back by way of these automatic ppa-resets.
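
For reviewers skimming the diff below, the builder-at-fault branch of assessFailureCounts reduces to the following minimal sketch. It is a standalone illustration only: the threshold names and actions are taken from the diff, while handle_builder_failure itself is a stand-in, not the real Launchpad code.

# Sketch of the new builder-failure decision, using the thresholds
# added in lib/lp/buildmaster/model/builder.py.
RESET_THRESHOLD = 5          # consecutive failures before a reset attempt
RESET_FAILURE_THRESHOLD = 3  # reset attempts before the builder is failed

def handle_builder_failure(failure_count):
    """Return the action taken when the builder is judged to be at fault."""
    if failure_count >= RESET_THRESHOLD * RESET_FAILURE_THRESHOLD:
        return "failBuilder"   # give up and mark builderok=False
    elif failure_count % RESET_THRESHOLD == 0:
        return "resetOrFail"   # virtual: ppa-reset; non-virtual: fail
    else:
        return "keep counting" # below both thresholds; no builder action yet

# With these defaults a builder is ppa-reset at 5 and 10 consecutive
# failures and only failed outright at 15, i.e. three times the old
# FAILURE_THRESHOLD of 5.
assert [handle_builder_failure(n) for n in (5, 10, 15)] == [
    "resetOrFail", "resetOrFail", "failBuilder"]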
--
https://code.launchpad.net/~cjwatson/launchpad/failure-count-reset/+merge/183320
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~cjwatson/launchpad/failure-count-reset into lp:launchpad.
=== modified file 'lib/lp/buildmaster/manager.py'
--- lib/lp/buildmaster/manager.py 2013-08-29 12:09:40 +0000
+++ lib/lp/buildmaster/manager.py 2013-08-31 11:16:12 +0000
@@ -45,11 +45,12 @@
return getUtility(IBuilderSet)[name]
-def assessFailureCounts(builder, fail_notes):
+def assessFailureCounts(logger, interactor, exception):
"""View builder/job failure_count and work out which needs to die. """
# builder.currentjob hides a complicated query, don't run it twice.
# See bug 623281 (Note that currentjob is a cachedproperty).
+ builder = interactor.builder
del get_property_cache(builder).currentjob
current_job = builder.currentjob
if current_job is None:
@@ -79,9 +80,15 @@
# failing jobs because sometimes they get unresponsive due to
# human error, flaky networks etc. We expect the builder to get
# better, whereas jobs are very unlikely to get better.
- if builder.failure_count >= Builder.FAILURE_THRESHOLD:
- # It's also gone over the threshold so let's disable it.
- builder.failBuilder(fail_notes)
+ if builder.failure_count >= (
+ Builder.RESET_THRESHOLD * Builder.RESET_FAILURE_THRESHOLD):
+ # We've already tried resetting it enough times, so we have
+ # little choice but to give up.
+ builder.failBuilder(str(exception))
+ elif builder.failure_count % Builder.RESET_THRESHOLD == 0:
+ # The builder is dead, but in the virtual case it might be worth
+ # resetting it.
+ interactor.resetOrFail(logger, exception)
else:
# The job is the culprit! Override its status to 'failed'
# to make sure it won't get automatically dispatched again,
@@ -175,7 +182,8 @@
builder = get_builder(self.builder_name)
try:
builder.handleFailure(self.logger)
- assessFailureCounts(builder, failure.getErrorMessage())
+ assessFailureCounts(
+ self.logger, BuilderInteractor(builder), failure.value)
transaction.commit()
except Exception:
# Catastrophic code failure! Not much we can do.
=== modified file 'lib/lp/buildmaster/model/builder.py'
--- lib/lp/buildmaster/model/builder.py 2013-08-28 12:03:57 +0000
+++ lib/lp/buildmaster/model/builder.py 2013-08-31 11:16:12 +0000
@@ -84,9 +84,14 @@
active = BoolCol(dbName='active', notNull=True, default=True)
failure_count = IntCol(dbName='failure_count', default=0, notNull=True)
- # The number of times a builder can consecutively fail before we
- # give up and mark it builderok=False.
- FAILURE_THRESHOLD = 5
+ # The number of times a builder can consecutively fail before we try
+ # resetting it (if virtual) or marking it builderok=False (if not).
+ RESET_THRESHOLD = 5
+
+ # The number of times a virtual builder can reach its reset threshold
+ # due to consecutive failures before we give up and mark it
+ # builderok=False.
+ RESET_FAILURE_THRESHOLD = 3
def _getBuilderok(self):
return self._builderok
=== modified file 'lib/lp/buildmaster/tests/test_manager.py'
--- lib/lp/buildmaster/tests/test_manager.py 2013-08-28 12:03:57 +0000
+++ lib/lp/buildmaster/tests/test_manager.py 2013-08-31 11:16:12 +0000
@@ -260,6 +260,7 @@
d.addCallback(self._checkNoDispatch, builder)
return d
+ @defer.inlineCallbacks
def test_scan_with_not_ok_builder(self):
# Reset sampledata builder.
builder = getUtility(IBuilderSet)[BOB_THE_BUILDER_NAME]
@@ -267,11 +268,9 @@
self.patch(BuilderSlave, 'makeBuilderSlave', FakeMethod(OkSlave()))
builder.builderok = False
scanner = self._getScanner()
- d = scanner.scan()
+ yield scanner.scan()
# Because the builder is not ok, we can't use _checkNoDispatch.
- d.addCallback(
- lambda ignored: self.assertIs(None, builder.currentjob))
- return d
+ self.assertIsNone(builder.currentjob)
def test_scan_of_broken_slave(self):
builder = getUtility(IBuilderSet)[BOB_THE_BUILDER_NAME]
@@ -283,6 +282,7 @@
d = scanner.scan()
return assert_fails_with(d, xmlrpclib.Fault)
+ @defer.inlineCallbacks
def _assertFailureCounting(self, builder_count, job_count,
expected_builder_count, expected_job_count):
# If scan() fails with an exception, failure_counts should be
@@ -306,20 +306,16 @@
# singleCycle() calls scan() which is our fake one that throws an
# exception.
- d = scanner.singleCycle()
+ yield scanner.singleCycle()
# Failure counts should be updated, and the assessment method
# should have been called. The actual behaviour is tested below
# in TestFailureAssessments.
- def got_scan(ignored):
- self.assertEqual(expected_builder_count, builder.failure_count)
- self.assertEqual(
- expected_job_count,
- builder.currentjob.specific_job.build.failure_count)
- self.assertEqual(
- 1, manager_module.assessFailureCounts.call_count)
-
- return d.addCallback(got_scan)
+ self.assertEqual(expected_builder_count, builder.failure_count)
+ self.assertEqual(
+ expected_job_count,
+ builder.currentjob.specific_job.build.failure_count)
+ self.assertEqual(1, manager_module.assessFailureCounts.call_count)
def test_scan_first_fail(self):
# The first failure of a job should result in the failure_count
@@ -342,23 +338,22 @@
builder_count=0, job_count=1, expected_builder_count=1,
expected_job_count=2)
+ @defer.inlineCallbacks
def test_scanFailed_handles_lack_of_a_job_on_the_builder(self):
def failing_scan():
return defer.fail(Exception("fake exception"))
scanner = self._getScanner()
scanner.scan = failing_scan
builder = getUtility(IBuilderSet)[scanner.builder_name]
- builder.failure_count = Builder.FAILURE_THRESHOLD
+ builder.failure_count = (
+ Builder.RESET_THRESHOLD * Builder.RESET_FAILURE_THRESHOLD)
builder.currentjob.reset()
self.layer.txn.commit()
- d = scanner.singleCycle()
-
- def scan_finished(ignored):
- self.assertFalse(builder.builderok)
-
- return d.addCallback(scan_finished)
-
+ yield scanner.singleCycle()
+ self.assertFalse(builder.builderok)
+
+ @defer.inlineCallbacks
def test_fail_to_resume_slave_resets_job(self):
# If an attempt to resume and dispatch a slave fails, it should
# reset the job via job.reset()
@@ -382,19 +377,15 @@
job = removeSecurityProxy(builder._findBuildCandidate())
job.virtualized = True
builder.virtualized = True
- d = scanner.singleCycle()
-
- def check(ignored):
- # The failure_count will have been incremented on the
- # builder, we can check that to see that a dispatch attempt
- # did indeed occur.
- self.assertEqual(1, builder.failure_count)
- # There should also be no builder set on the job.
- self.assertTrue(job.builder is None)
- build = getUtility(IBinaryPackageBuildSet).getByQueueEntry(job)
- self.assertEqual(build.status, BuildStatus.NEEDSBUILD)
-
- return d.addCallback(check)
+ yield scanner.singleCycle()
+
+ # The failure_count will have been incremented on the builder, we
+ # can check that to see that a dispatch attempt did indeed occur.
+ self.assertEqual(1, builder.failure_count)
+ # There should also be no builder set on the job.
+ self.assertIsNone(job.builder)
+ build = getUtility(IBinaryPackageBuildSet).getByQueueEntry(job)
+ self.assertEqual(build.status, BuildStatus.NEEDSBUILD)
@defer.inlineCallbacks
def test_cancelling_a_build(self):
@@ -573,12 +564,18 @@
self.build = self.factory.makeSourcePackageRecipeBuild()
self.buildqueue = self.build.queueBuild()
self.buildqueue.markAsBuilding(self.builder)
+ self.interactor = BuilderInteractor(self.builder)
+
+ def _assessFailureCounts(self, fail_notes):
+ # Helper for assessFailureCounts boilerplate.
+ assessFailureCounts(
+ BufferLogger(), self.interactor, Exception(fail_notes))
def test_equal_failures_reset_job(self):
self.builder.gotFailure()
self.builder.getCurrentBuildFarmJob().gotFailure()
- assessFailureCounts(self.builder, "failnotes")
+ self._assessFailureCounts("failnotes")
self.assertIs(None, self.builder.currentjob)
self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
@@ -587,33 +584,63 @@
self.builder.getCurrentBuildFarmJob().gotFailure()
self.builder.gotFailure()
- assessFailureCounts(self.builder, "failnotes")
+ self._assessFailureCounts("failnotes")
self.assertIs(None, self.builder.currentjob)
self.assertEqual(self.build.status, BuildStatus.FAILEDTOBUILD)
self.assertEqual(0, self.builder.failure_count)
- def test_builder_failing_more_than_job_but_under_fail_threshold(self):
- self.builder.failure_count = Builder.FAILURE_THRESHOLD - 1
-
- assessFailureCounts(self.builder, "failnotes")
- self.assertIs(None, self.builder.currentjob)
- self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
+ def test_virtual_builder_reset_thresholds(self):
+ self.builder.virtualized = True
+ self.patch(self.interactor, "resumeSlaveHost", FakeMethod())
+
+ for failure_count in range(
+ Builder.RESET_THRESHOLD - 1,
+ Builder.RESET_THRESHOLD * Builder.RESET_FAILURE_THRESHOLD):
+ self.builder.failure_count = failure_count
+ self._assessFailureCounts("failnotes")
+ self.assertIs(None, self.builder.currentjob)
+ self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
+ self.assertEqual(
+ failure_count // Builder.RESET_THRESHOLD,
+ self.interactor.resumeSlaveHost.call_count)
+ self.assertTrue(self.builder.builderok)
+
+ self.builder.failure_count = (
+ Builder.RESET_THRESHOLD * Builder.RESET_FAILURE_THRESHOLD)
+ self._assessFailureCounts("failnotes")
+ self.assertIs(None, self.builder.currentjob)
+ self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
+ self.assertEqual(
+ Builder.RESET_FAILURE_THRESHOLD - 1,
+ self.interactor.resumeSlaveHost.call_count)
+ self.assertFalse(self.builder.builderok)
+ self.assertEqual("failnotes", self.builder.failnotes)
+
+ def test_non_virtual_builder_reset_thresholds(self):
+ self.builder.virtualized = False
+ self.patch(self.interactor, "resumeSlaveHost", FakeMethod())
+
+ self.builder.failure_count = Builder.RESET_THRESHOLD - 1
+ self._assessFailureCounts("failnotes")
+ self.assertIs(None, self.builder.currentjob)
+ self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
+ self.assertEqual(0, self.interactor.resumeSlaveHost.call_count)
self.assertTrue(self.builder.builderok)
- def test_builder_failing_more_than_job_but_over_fail_threshold(self):
- self.builder.failure_count = Builder.FAILURE_THRESHOLD
-
- assessFailureCounts(self.builder, "failnotes")
+ self.builder.failure_count = Builder.RESET_THRESHOLD
+ self._assessFailureCounts("failnotes")
self.assertIs(None, self.builder.currentjob)
self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
+ self.assertEqual(0, self.interactor.resumeSlaveHost.call_count)
self.assertFalse(self.builder.builderok)
self.assertEqual("failnotes", self.builder.failnotes)
def test_builder_failing_with_no_attached_job(self):
self.buildqueue.reset()
- self.builder.failure_count = Builder.FAILURE_THRESHOLD
+ self.builder.failure_count = (
+ Builder.RESET_THRESHOLD * Builder.RESET_FAILURE_THRESHOLD)
- assessFailureCounts(self.builder, "failnotes")
+ self._assessFailureCounts("failnotes")
self.assertFalse(self.builder.builderok)
self.assertEqual("failnotes", self.builder.failnotes)