[Merge] lp:~wgrant/launchpad/bug-1039927 into lp:launchpad
William Grant has proposed merging lp:~wgrant/launchpad/bug-1039927 into lp:launchpad.
Commit message:
Adjust assessFailureCounts to ignore the first few failures, rather than resetting a build due to a single transient connection failure.
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
Related bugs:
Bug #1039927 in Launchpad itself: "buildd-manager failure counting is too trigger-happy when killing builds"
https://bugs.launchpad.net/launchpad/+bug/1039927
For more details, see:
https://code.launchpad.net/~wgrant/launchpad/bug-1039927/+merge/190304
Resetting builds after a single connection failure is a bit silly. Let's give them a few retries to recover, in case of a dropped packet or two.
Dispatch now handles errors itself without a retry, since the next scan would just cause the job to be rescued from the idle slave anyway.
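
Roughly, the decision now behaves like the standalone sketch below. This is illustrative only: should_reset_job is a made-up helper for this email, the real check lives inside assessFailureCounts in the diff, and JOB_RESET_THRESHOLD mirrors the new Builder.JOB_RESET_THRESHOLD constant (3).

JOB_RESET_THRESHOLD = 3  # mirrors Builder.JOB_RESET_THRESHOLD in the diff below


def should_reset_job(builder_failures, job_failures, retry):
    """Return True if the builder's current job should be reset.

    When the builder and the job have failed the same number of times we
    can't tell which is at fault. With retry=True we tolerate up to
    JOB_RESET_THRESHOLD - 1 such failures before resetting; dispatch
    errors pass retry=False and reset immediately.
    """
    if builder_failures != job_failures:
        # Other branches of assessFailureCounts handle this case.
        return False
    return not retry or builder_failures >= JOB_RESET_THRESHOLD


# With retry enabled, the first couple of matching failures are ignored:
assert not should_reset_job(1, 1, retry=True)
assert not should_reset_job(2, 2, retry=True)
assert should_reset_job(3, 3, retry=True)
# A dispatch failure (retry=False) resets straight away:
assert should_reset_job(1, 1, retry=False)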
--
https://code.launchpad.net/~wgrant/launchpad/bug-1039927/+merge/190304
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~wgrant/launchpad/bug-1039927 into lp:launchpad.
=== modified file 'lib/lp/buildmaster/manager.py'
--- lib/lp/buildmaster/manager.py 2013-10-07 05:09:49 +0000
+++ lib/lp/buildmaster/manager.py 2013-10-10 08:16:41 +0000
@@ -11,6 +11,7 @@
]
import datetime
+import functools
import logging
from storm.expr import LeftJoin
@@ -127,7 +128,8 @@
@defer.inlineCallbacks
-def assessFailureCounts(logger, vitals, builder, slave, interactor, exception):
+def assessFailureCounts(logger, vitals, builder, slave, interactor, retry,
+ exception):
"""View builder/job failure_count and work out which needs to die.
:return: A Deferred that fires either immediately or after a virtual
@@ -146,11 +148,13 @@
if builder.failure_count == job_failure_count and current_job is not None:
# If the failure count for the builder is the same as the
# failure count for the job being built, then we cannot
- # tell whether the job or the builder is at fault. The best
- # we can do is try them both again, and hope that the job
- # runs against a different builder.
- current_job.reset()
- del get_property_cache(builder).currentjob
+ # tell whether the job or the builder is at fault. We retry the
+ # scan a few times, but once we give up the best we can do is
+ # reset the job and hope it runs against a different builder,
+ # giving us a judgement on which is at fault.
+ if not retry or builder.failure_count >= Builder.JOB_RESET_THRESHOLD:
+ current_job.reset()
+ del get_property_cache(builder).currentjob
return
if builder.failure_count > job_failure_count:
@@ -259,8 +263,11 @@
return defer.succeed(None)
self.logger.debug("Scanning builder %s" % self.builder_name)
+ # Errors should normally be able to be retried a few times. Bits
+ # of scan() which don't want retries will call _scanFailed
+ # directly.
d = self.scan()
- d.addErrback(self._scanFailed)
+ d.addErrback(functools.partial(self._scanFailed, True))
d.addBoth(self._updateDateScanned)
return d
@@ -268,11 +275,12 @@
self.date_scanned = datetime.datetime.utcnow()
@defer.inlineCallbacks
- def _scanFailed(self, failure):
+ def _scanFailed(self, retry, failure):
"""Deal with failures encountered during the scan cycle.
1. Print the error in the log
2. Increment and assess failure counts on the builder and job.
+ If asked to retry, a single failure may not be considered fatal.
:return: A Deferred that fires either immediately or after a virtual
slave has been reset.
@@ -302,7 +310,7 @@
builder.handleFailure(self.logger)
yield assessFailureCounts(
self.logger, vitals, builder, self.slave_factory(vitals),
- self.interactor_factory(), failure.value)
+ self.interactor_factory(), retry, failure.value)
transaction.commit()
except Exception:
# Catastrophic code failure! Not much we can do.
@@ -383,7 +391,6 @@
:return: A Deferred that fires when the scan is complete.
"""
- self.logger.debug("Scanning %s." % self.builder_name)
self.builder_factory.prescanUpdate()
vitals = self.builder_factory.getVitals(self.builder_name)
interactor = self.interactor_factory()
@@ -395,6 +402,7 @@
if not vitals.builderok:
lost_reason = '%s is disabled' % vitals.name
else:
+ self.logger.debug("Scanning %s." % self.builder_name)
cancelled = yield self.checkCancellation(vitals, slave, interactor)
if cancelled:
return
@@ -431,7 +439,12 @@
else:
# See if there is a job we can dispatch to the builder slave.
builder = self.builder_factory[self.builder_name]
- yield interactor.findAndStartJob(vitals, builder, slave)
+ # Try to dispatch the job. If it fails, don't attempt to
+ # just retry the scan; we need to reset the job so the
+ # dispatch will be reattempted.
+ d = interactor.findAndStartJob(vitals, builder, slave)
+ d.addErrback(functools.partial(self._scanFailed, False))
+ yield d
if builder.currentjob is not None:
# After a successful dispatch we can reset the
# failure_count.
=== modified file 'lib/lp/buildmaster/model/builder.py'
--- lib/lp/buildmaster/model/builder.py 2013-09-02 07:30:53 +0000
+++ lib/lp/buildmaster/model/builder.py 2013-10-10 08:16:41 +0000
@@ -84,6 +84,10 @@
active = BoolCol(dbName='active', notNull=True, default=True)
failure_count = IntCol(dbName='failure_count', default=0, notNull=True)
+ # The number of times a builder can consecutively fail before we
+ # reset its current job.
+ JOB_RESET_THRESHOLD = 3
+
# The number of times a builder can consecutively fail before we try
# resetting it (if virtual) or marking it builderok=False (if not).
RESET_THRESHOLD = 5
=== modified file 'lib/lp/buildmaster/tests/test_manager.py'
--- lib/lp/buildmaster/tests/test_manager.py 2013-10-07 05:09:49 +0000
+++ lib/lp/buildmaster/tests/test_manager.py 2013-10-10 08:16:41 +0000
@@ -876,18 +876,36 @@
self.buildqueue.markAsBuilding(self.builder)
self.slave = OkSlave()
- def _assessFailureCounts(self, fail_notes):
+ def _assessFailureCounts(self, fail_notes, retry=True):
# Helper for assessFailureCounts boilerplate.
return assessFailureCounts(
BufferLogger(), extract_vitals_from_db(self.builder), self.builder,
- self.slave, BuilderInteractor(), Exception(fail_notes))
-
- @defer.inlineCallbacks
- def test_equal_failures_reset_job(self):
- self.builder.gotFailure()
- self.build.gotFailure()
-
- yield self._assessFailureCounts("failnotes")
+ self.slave, BuilderInteractor(), retry, Exception(fail_notes))
+
+ @defer.inlineCallbacks
+ def test_job_reset_threshold_with_retry(self):
+ naked_build = removeSecurityProxy(self.build)
+ self.builder.failure_count = Builder.JOB_RESET_THRESHOLD - 1
+ naked_build.failure_count = Builder.JOB_RESET_THRESHOLD - 1
+
+ yield self._assessFailureCounts("failnotes")
+ self.assertIsNot(None, self.builder.currentjob)
+ self.assertEqual(self.build.status, BuildStatus.BUILDING)
+
+ self.builder.failure_count += 1
+ naked_build.failure_count += 1
+
+ yield self._assessFailureCounts("failnotes")
+ self.assertIs(None, self.builder.currentjob)
+ self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
+
+ @defer.inlineCallbacks
+ def test_job_reset_threshold_no_retry(self):
+ naked_build = removeSecurityProxy(self.build)
+ self.builder.failure_count = 1
+ naked_build.failure_count = 1
+
+ yield self._assessFailureCounts("failnotes", retry=False)
self.assertIs(None, self.builder.currentjob)
self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)