launchpad-reviewers team mailing list archive
-
launchpad-reviewers team
-
Mailing list archive
-
Message #30654
[Merge] ~wgrant/launchpad:buildd-manager-failure-refactor into launchpad:master
William Grant has proposed merging ~wgrant/launchpad:buildd-manager-failure-refactor into launchpad:master.
Commit message:
Refactor buildd-manager job dispatch error handling
manager.py is now entirely inlineCallbacks.
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
For more details, see:
https://code.launchpad.net/~wgrant/launchpad/+git/launchpad/+merge/454692
--
Your team Launchpad code reviewers is requested to review the proposed merge of ~wgrant/launchpad:buildd-manager-failure-refactor into launchpad:master.
diff --git a/lib/lp/buildmaster/manager.py b/lib/lp/buildmaster/manager.py
index 7f2b870..0c6d335 100644
--- a/lib/lp/buildmaster/manager.py
+++ b/lib/lp/buildmaster/manager.py
@@ -11,7 +11,6 @@ __all__ = [
]
import datetime
-import functools
import logging
import os.path
import shutil
@@ -502,6 +501,8 @@ class WorkerScanner:
self.date_cancel = None
self.date_scanned = None
+ self.can_retry = True
+
# We cache the build cookie, keyed on the BuildQueue, to avoid
# hitting the DB on every scan.
self._cached_build_cookie = None
@@ -520,6 +521,7 @@ class WorkerScanner:
"""Terminate the LoopingCall."""
self.loop.stop()
+ @defer.inlineCallbacks
def singleCycle(self):
# Inhibit scanning if the BuilderFactory hasn't updated since
# the last run. This doesn't matter for the base BuilderFactory,
@@ -533,22 +535,19 @@ class WorkerScanner:
self.logger.debug(
"Skipping builder %s (cache out of date)" % self.builder_name
)
- return defer.succeed(None)
+ return
self.logger.debug("Scanning builder %s" % self.builder_name)
- # Errors should normally be able to be retried a few times. Bits
- # of scan() which don't want retries will call _scanFailed
- # directly.
- d = self.scan()
- d.addErrback(functools.partial(self._scanFailed, True))
- d.addBoth(self._updateDateScanned)
- return d
- def _updateDateScanned(self, ignored):
+ try:
+ yield self.scan()
+ except Exception as e:
+ self._scanFailed(self.can_retry, e)
+
self.logger.debug("Scan finished for builder %s" % self.builder_name)
self.date_scanned = datetime.datetime.utcnow()
- def _scanFailed(self, retry, failure):
+ def _scanFailed(self, retry, exc):
"""Deal with failures encountered during the scan cycle.
1. Print the error in the log
@@ -562,26 +561,22 @@ class WorkerScanner:
# If we don't recognise the exception include a stack trace with
# the error.
- error_message = failure.getErrorMessage()
- if failure.check(
- BuildWorkerFailure,
- CannotBuild,
- CannotResumeHost,
- BuildDaemonError,
- CannotFetchFile,
+ if isinstance(
+ exc,
+ (
+ BuildWorkerFailure,
+ CannotBuild,
+ CannotResumeHost,
+ BuildDaemonError,
+ CannotFetchFile,
+ ),
):
self.logger.info(
- "Scanning %s failed with: %s"
- % (self.builder_name, error_message)
+ "Scanning %s failed with: %r" % (self.builder_name, exc)
)
else:
self.logger.info(
- "Scanning %s failed with: %s\n%s"
- % (
- self.builder_name,
- failure.getErrorMessage(),
- failure.getTraceback(),
- )
+ "Scanning %s failed" % self.builder_name, exc_info=exc
)
# Decide if we need to terminate the job or reset/fail the builder.
@@ -602,7 +597,7 @@ class WorkerScanner:
else:
labels["build"] = False
self.statsd_client.incr("builders.judged_failed", labels=labels)
- recover_failure(self.logger, vitals, builder, retry, failure.value)
+ recover_failure(self.logger, vitals, builder, retry, exc)
transaction.commit()
except Exception:
# Catastrophic code failure! Not much we can do.
@@ -687,6 +682,7 @@ class WorkerScanner:
vitals = self.builder_factory.getVitals(self.builder_name)
interactor = self.interactor_factory()
worker = self.worker_factory(vitals)
+ self.can_retry = True
if vitals.build_queue is not None:
if vitals.clean_status != BuilderCleanStatus.DIRTY:
@@ -759,15 +755,14 @@ class WorkerScanner:
"%s is in manual mode, not dispatching.", vitals.name
)
return
- # Try to find and dispatch a job. If it fails, don't
- # attempt to just retry the scan; we need to reset
- # the job so the dispatch will be reattempted.
+ # Try to find and dispatch a job. If it fails, don't attempt to
+ # just retry the scan; we need to reset the job so the dispatch
+ # will be reattempted.
builder = self.builder_factory[self.builder_name]
- d = interactor.findAndStartJob(
+ self.can_retry = False
+ yield interactor.findAndStartJob(
vitals, builder, worker, self.builder_factory
)
- d.addErrback(functools.partial(self._scanFailed, False))
- yield d
if builder.currentjob is not None:
# After a successful dispatch we can reset the
# failure_count.