launchpad-reviewers team mailing list archive
Message #30658
[Merge] ~wgrant/launchpad:buildd-manager-nicer-retries into launchpad:master
William Grant has proposed merging ~wgrant/launchpad:buildd-manager-nicer-retries into launchpad:master with ~wgrant/launchpad:buildd-manager-failure-metrics as a prerequisite.
Commit message:
Cope more gracefully with intermittent builder glitches
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
For more details, see:
https://code.launchpad.net/~wgrant/launchpad/+git/launchpad/+merge/454694
buildd-manager would previously immediately count any single scan
failure against the builder and job. This meant that three glitches --
say, network timeouts -- over the course of a job would result in the
build being requeued. A builder's failure count is reset on successful
dispatch, but a job's deliberately isn't since we want to fail builds
that are repeatedly killing builders. This meant that a single network
glitch in the second attempt at a build would cause it to be failed.
This added layer of failure counting substantially reduces the
likelihood of those two scenarios, by requiring five consecutive
unsuccessful scans before a single failure is counted against a builder
or job. This means that brief network interruptions, or indeed temporary
insanity on buildd-manager's part, should no longer cause builds to be
requeued or failed at all.
The only significant downside of this change is that recovery from
legitimate failures will now take a few minutes longer. But that's much
less of a concern with the very large build farm we have nowadays.
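
As a rough illustration of the behaviour described above, here is a minimal sketch in plain Python (this is not the actual WorkerScanner code: only SCAN_FAILURE_THRESHOLD and scan_failure_count correspond to names in the diff below, and the builder/job objects are simplified stand-ins):

    # Minimal sketch of the new retry layer. Only the consecutive-failure
    # counting is modelled; dispatch, cancellation and statsd are omitted.
    from types import SimpleNamespace

    SCAN_FAILURE_THRESHOLD = 5  # consecutive failed scans before anyone is blamed

    class SketchScanner:
        def __init__(self, builder, job):
            self.builder = builder
            self.job = job
            # Not persisted: only a run of consecutive failures matters.
            self.scan_failure_count = 0

        def scan_succeeded(self):
            # A clean scan wipes the consecutive count, but leaves the
            # persistent builder/job failure counts untouched.
            self.scan_failure_count = 0

        def scan_failed(self, retryable=True):
            self.scan_failure_count += 1
            if retryable and self.scan_failure_count < SCAN_FAILURE_THRESHOLD:
                return "retry"  # treat it as a glitch and try again next cycle
            # Threshold reached (or the error is not retryable): record one
            # real failure against both parties and start counting afresh.
            self.scan_failure_count = 0
            self.builder.failure_count += 1
            self.job.failure_count += 1
            return "failure recorded"

    builder = SimpleNamespace(failure_count=0)
    job = SimpleNamespace(failure_count=0)
    scanner = SketchScanner(builder, job)
    for _ in range(SCAN_FAILURE_THRESHOLD - 1):
        assert scanner.scan_failed() == "retry"  # glitches: no penalty yet
    assert scanner.scan_failed() == "failure recorded"
    assert (builder.failure_count, job.failure_count) == (1, 1)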
--
Your team Launchpad code reviewers is requested to review the proposed merge of ~wgrant/launchpad:buildd-manager-nicer-retries into launchpad:master.
diff --git a/lib/lp/buildmaster/manager.py b/lib/lp/buildmaster/manager.py
index 2bc0b81..372b637 100644
--- a/lib/lp/buildmaster/manager.py
+++ b/lib/lp/buildmaster/manager.py
@@ -54,6 +54,10 @@ from lp.services.statsd.interfaces.statsd_client import IStatsdClient
BUILDD_MANAGER_LOG_NAME = "worker-scanner"
+# The number of times the scan of a builder can fail before we start
+# attributing it to the builder and job.
+SCAN_FAILURE_THRESHOLD = 5
+
# The number of times a builder can consecutively fail before we
# reset its current job.
JOB_RESET_THRESHOLD = 3
@@ -515,6 +519,12 @@ class WorkerScanner:
self.can_retry = True
+ # The build and job failure counts are persisted, but we only really
+ # care about the consecutive scan failure count over the space of a
+ # couple of minutes, so it's okay if it resets on buildd-manager
+ # restart.
+ self.scan_failure_count = 0
+
# We cache the build cookie, keyed on the BuildQueue, to avoid
# hitting the DB on every scan.
self._cached_build_cookie = None
@@ -553,6 +563,12 @@ class WorkerScanner:
try:
yield self.scan()
+
+ # We got through a scan without error, so reset the consecutive
+ # failure count. We don't reset the persistent builder or job
+ # failure counts, since the build might consistently break the
+ # builder later in the build.
+ self.scan_failure_count = 0
except Exception as e:
self._scanFailed(self.can_retry, e)
@@ -591,12 +607,35 @@ class WorkerScanner:
"Scanning %s failed" % self.builder_name, exc_info=exc
)
+ # Certain errors can't possibly be a glitch, and they can insta-fail
+ # even if the scan phase would normally allow a retry.
+ if isinstance(exc, (BuildDaemonIsolationError, CannotResumeHost)):
+ retry = False
+
# Decide if we need to terminate the job or reset/fail the builder.
vitals = self.builder_factory.getVitals(self.builder_name)
builder = self.builder_factory[self.builder_name]
try:
labels = get_statsd_labels(builder, builder.current_build)
self.statsd_client.incr("builders.failure", labels=labels)
+ self.scan_failure_count += 1
+
+ # To avoid counting network glitches or similar against innocent
+ # builds and jobs, we allow a scan to fail a few times in a row
+ # without consequence, and just retry. If we exceed the threshold,
+ # we then persistently record a single failure against the build
+ # and job.
+ if retry and self.scan_failure_count < SCAN_FAILURE_THRESHOLD:
+ self.statsd_client.incr(
+ "builders.failure.scan_retried",
+ labels={
+ "failures": str(self.scan_failure_count),
+ **labels,
+ },
+ )
+ return
+
+ self.scan_failure_count = 0
builder.gotFailure()
if builder.current_build is not None:
@@ -610,6 +649,7 @@ class WorkerScanner:
"Miserable failure when trying to handle failure:\n",
exc_info=True,
)
+ finally:
transaction.abort()
@defer.inlineCallbacks
diff --git a/lib/lp/buildmaster/tests/test_manager.py b/lib/lp/buildmaster/tests/test_manager.py
index 01e6044..d4a4723 100644
--- a/lib/lp/buildmaster/tests/test_manager.py
+++ b/lib/lp/buildmaster/tests/test_manager.py
@@ -40,6 +40,7 @@ from lp.buildmaster.interfaces.buildqueue import IBuildQueueSet
from lp.buildmaster.manager import (
BUILDER_FAILURE_THRESHOLD,
JOB_RESET_THRESHOLD,
+ SCAN_FAILURE_THRESHOLD,
BuilddManager,
BuilderFactory,
PrefetchedBuilderFactory,
@@ -133,7 +134,13 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
self.assertEqual(job.specific_build.status, BuildStatus.BUILDING)
self.assertEqual(job.logtail, logtail)
- def _getScanner(self, builder_name=None, clock=None, builder_factory=None):
+ def _getScanner(
+ self,
+ builder_name=None,
+ clock=None,
+ builder_factory=None,
+ scan_failure_count=0,
+ ):
"""Instantiate a WorkerScanner object.
Replace its default logging handler by a testing version.
@@ -148,6 +155,7 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
builder_name, builder_factory, manager, BufferLogger(), clock=clock
)
scanner.logger.name = "worker-scanner"
+ scanner.scan_failure_count = scan_failure_count
return scanner
@@ -458,6 +466,7 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
@defer.inlineCallbacks
def _assertFailureCounting(
self,
+ scan_count,
builder_count,
job_count,
expected_builder_count,
@@ -470,7 +479,7 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
def failing_scan():
return defer.fail(Exception("fake exception"))
- scanner = self._getScanner()
+ scanner = self._getScanner(scan_failure_count=scan_count)
scanner.scan = failing_scan
from lp.buildmaster import manager as manager_module
@@ -497,10 +506,101 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
)
self.assertEqual(1, manager_module.recover_failure.call_count)
+ @defer.inlineCallbacks
+ def test_scan_persistent_failure_counts(self):
+ # The first few scan exceptions just result in retries, not penalties
+ # for the involved parties. A builder or job failure is only recorded
+ # after several scan failures.
+ scanner = self._getScanner()
+ builder = getUtility(IBuilderSet)[scanner.builder_name]
+ transaction.commit()
+
+ self.patch(scanner, "scan", lambda: defer.fail(Exception("fake")))
+
+ # Rack up almost enough failures.
+ for i in range(1, SCAN_FAILURE_THRESHOLD):
+ yield scanner.singleCycle()
+ self.assertEqual(i, scanner.scan_failure_count)
+ self.assertEqual(0, builder.failure_count)
+ self.assertEqual(0, builder.current_build.failure_count)
+
+ # Once we reach the consecutive failure threshold, the builder and
+ # build each get a failure and the count is reset.
+ yield scanner.singleCycle()
+ self.assertEqual(0, scanner.scan_failure_count)
+ self.assertEqual(1, builder.failure_count)
+ self.assertEqual(1, builder.current_build.failure_count)
+
+ @defer.inlineCallbacks
+ def test_scan_intermittent_failure_retries(self):
+ # The first few scan exceptions just result in retries, not penalties
+ # for the involved parties. A builder or job failure is only recorded
+ # after several scan failures.
+ def failing_scan():
+ return defer.fail(Exception("fake exception"))
+
+ scanner = self._getScanner()
+ builder = getUtility(IBuilderSet)[scanner.builder_name]
+ transaction.commit()
+
+ # Rack up a couple of failures.
+ self.patch(scanner, "scan", lambda: defer.fail(Exception("fake")))
+
+ yield scanner.singleCycle()
+ self.assertEqual(1, scanner.scan_failure_count)
+ self.assertEqual(0, builder.failure_count)
+ self.assertEqual(0, builder.current_build.failure_count)
+ self.assertEqual(
+ [
+ mock.call(
+ "builders.failure,arch=386,build=True,builder_name=bob,"
+ "env=test,job_type=PACKAGEBUILD,region=,virtualized=False"
+ ),
+ mock.call(
+ "builders.failure.scan_retried,arch=386,build=True,"
+ "builder_name=bob,env=test,failures=1,"
+ "job_type=PACKAGEBUILD,region=,virtualized=False"
+ ),
+ ],
+ self.stats_client.incr.mock_calls,
+ )
+ self.stats_client.incr.reset_mock()
+
+ yield scanner.singleCycle()
+ self.assertEqual(2, scanner.scan_failure_count)
+ self.assertEqual(0, builder.failure_count)
+ self.assertEqual(0, builder.current_build.failure_count)
+ self.assertEqual(
+ [
+ mock.call(
+ "builders.failure,arch=386,build=True,builder_name=bob,"
+ "env=test,job_type=PACKAGEBUILD,region=,virtualized=False"
+ ),
+ mock.call(
+ "builders.failure.scan_retried,arch=386,build=True,"
+ "builder_name=bob,env=test,failures=2,"
+ "job_type=PACKAGEBUILD,region=,virtualized=False"
+ ),
+ ],
+ self.stats_client.incr.mock_calls,
+ )
+ self.stats_client.incr.reset_mock()
+
+ # Since we didn't meet SCAN_FAILURE_THRESHOLD, a success just resets
+ # the count and no harm is done to innocent bystanders.
+ self.patch(scanner, "scan", lambda: defer.succeed(None))
+ yield scanner.singleCycle()
+ self.assertEqual(0, scanner.scan_failure_count)
+ self.assertEqual(0, builder.failure_count)
+ self.assertEqual(0, builder.current_build.failure_count)
+ self.assertEqual([], self.stats_client.incr.mock_calls)
+ self.stats_client.incr.reset_mock()
+
def test_scan_first_fail(self):
# The first failure of a job should result in the failure_count
# on the job and the builder both being incremented.
return self._assertFailureCounting(
+ scan_count=SCAN_FAILURE_THRESHOLD - 1,
builder_count=0,
job_count=0,
expected_builder_count=1,
@@ -511,6 +611,7 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
# The first failure of a job should result in the failure_count
# on the job and the builder both being incremented.
return self._assertFailureCounting(
+ scan_count=SCAN_FAILURE_THRESHOLD - 1,
builder_count=1,
job_count=0,
expected_builder_count=2,
@@ -521,6 +622,7 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
# The first failure of a job should result in the failure_count
# on the job and the builder both being incremented.
return self._assertFailureCounting(
+ scan_count=SCAN_FAILURE_THRESHOLD - 1,
builder_count=0,
job_count=1,
expected_builder_count=1,
@@ -532,7 +634,9 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
def failing_scan():
return defer.fail(Exception("fake exception"))
- scanner = self._getScanner()
+ scanner = self._getScanner(
+ scan_failure_count=SCAN_FAILURE_THRESHOLD - 1
+ )
scanner.scan = failing_scan
builder = getUtility(IBuilderSet)[scanner.builder_name]
builder.failure_count = BUILDER_FAILURE_THRESHOLD
@@ -547,7 +651,10 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
def failing_scan():
return defer.fail(Exception("fake exception"))
- scanner = self._getScanner()
+ # TODO: Add and test metrics for retried scan failures.
+ scanner = self._getScanner(
+ scan_failure_count=SCAN_FAILURE_THRESHOLD - 1
+ )
scanner.scan = failing_scan
builder = getUtility(IBuilderSet)[scanner.builder_name]
builder.failure_count = BUILDER_FAILURE_THRESHOLD
@@ -680,7 +787,9 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
# Run 'scan' and check its results.
switch_dbuser(config.builddmaster.dbuser)
clock = task.Clock()
- scanner = self._getScanner(clock=clock)
+ scanner = self._getScanner(
+ clock=clock, scan_failure_count=SCAN_FAILURE_THRESHOLD - 1
+ )
yield scanner.scan()
# An abort request should be sent.
@@ -689,6 +798,7 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
# Advance time a little. Nothing much should happen.
clock.advance(1)
+ scanner.scan_failure_count = SCAN_FAILURE_THRESHOLD - 1
yield scanner.scan()
self.assertEqual(1, worker.call_log.count("abort"))
self.assertEqual(BuildStatus.CANCELLING, build.status)
@@ -697,6 +807,7 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
# we should have also called the resume() method on the worker that
# resets the virtual machine.
clock.advance(WorkerScanner.CANCEL_TIMEOUT)
+ scanner.scan_failure_count = SCAN_FAILURE_THRESHOLD - 1
yield scanner.singleCycle()
self.assertEqual(1, worker.call_log.count("abort"))
self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
@@ -916,7 +1027,8 @@ class TestPrefetchedBuilderFactory(TestCaseWithFactory):
# with the non-prefetching BuilderFactory.
self.assertEqual(7, len(all_vitals))
self.assertEqual(
- 4, len([v for v in all_vitals if v.build_queue is not None])
+ SCAN_FAILURE_THRESHOLD - 1,
+ len([v for v in all_vitals if v.build_queue is not None]),
)
self.assertContentEqual(BuilderFactory().iterVitals(), all_vitals)
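
For a rough sense of scale, a back-of-the-envelope calculation based only on the two thresholds visible in the manager.py hunks above (it deliberately ignores the rest of the recover_failure logic):

    # Thresholds as they appear in lib/lp/buildmaster/manager.py above.
    JOB_RESET_THRESHOLD = 3     # counted failures before the current job is reset
    SCAN_FAILURE_THRESHOLD = 5  # consecutive failed scans per counted failure

    # Previously every failed scan was counted immediately, so this many
    # glitches over the lifetime of a job were enough to requeue it:
    old_failed_scans_to_requeue = JOB_RESET_THRESHOLD  # 3

    # Now each counted failure first requires an unbroken run of failed scans,
    # and a single successful scan resets that run:
    new_failed_scans_to_requeue = (
        JOB_RESET_THRESHOLD * SCAN_FAILURE_THRESHOLD
    )  # at least 15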