

[Merge] ~wgrant/launchpad:buildd-manager-nicer-retries into launchpad:master

 

William Grant has proposed merging ~wgrant/launchpad:buildd-manager-nicer-retries into launchpad:master with ~wgrant/launchpad:buildd-manager-failure-metrics as a prerequisite.

Commit message:
Cope more gracefully with intermittent builder glitches

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~wgrant/launchpad/+git/launchpad/+merge/454694

buildd-manager previously counted any single scan failure against the
builder and job straight away. This meant that three glitches -- say,
network timeouts -- over the course of a job would result in the build
being requeued. A builder's failure count is reset on successful
dispatch, but a job's deliberately isn't, since we want to fail builds
that repeatedly kill builders. As a result, a single network glitch
during the second attempt at a build would cause it to be failed.

This added layer of failure counting substantially reduces the
likelihood of those two scenarios, by requiring five consecutive
unsuccessful scans before a single failure is counted against a builder
or job. This means that brief network interruptions, or indeed temporary
insanity on buildd-manager's part, should no longer cause builds to be
requeued or failed at all.
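
To make the mechanism concrete, here is a minimal standalone sketch of
the thresholding pattern -- the class and method names below are
invented for this description only; the real change lives in
WorkerScanner, in the diff further down:

    SCAN_FAILURE_THRESHOLD = 5

    class ScanRetryTracker:
        """Count consecutive scan failures; escalate only past a threshold."""

        def __init__(self):
            self.scan_failure_count = 0

        def scan_succeeded(self):
            # A clean scan clears the consecutive counter; the persistent
            # builder and job failure counts are deliberately left alone.
            self.scan_failure_count = 0

        def scan_failed(self, retryable=True):
            self.scan_failure_count += 1
            if retryable and self.scan_failure_count < SCAN_FAILURE_THRESHOLD:
                # Probably a transient glitch: retry without penalty.
                return "retry"
            # Threshold reached, or the error is never retryable: reset the
            # counter and record one persistent failure against the builder
            # and the job.
            self.scan_failure_count = 0
            return "record_failure"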

The only significant downside of this change is that recovery from
legitimate failures will now take a few minutes longer. But that's much
less of a concern with the very large build farm we have nowadays.
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of ~wgrant/launchpad:buildd-manager-nicer-retries into launchpad:master.
diff --git a/lib/lp/buildmaster/manager.py b/lib/lp/buildmaster/manager.py
index 2bc0b81..372b637 100644
--- a/lib/lp/buildmaster/manager.py
+++ b/lib/lp/buildmaster/manager.py
@@ -54,6 +54,10 @@ from lp.services.statsd.interfaces.statsd_client import IStatsdClient
 BUILDD_MANAGER_LOG_NAME = "worker-scanner"
 
 
+# The number of times the scan of a builder can fail before we start
+# attributing it to the builder and job.
+SCAN_FAILURE_THRESHOLD = 5
+
 # The number of times a builder can consecutively fail before we
 # reset its current job.
 JOB_RESET_THRESHOLD = 3
@@ -515,6 +519,12 @@ class WorkerScanner:
 
         self.can_retry = True
 
+        # The build and job failure counts are persisted, but we only really
+        # care about the consecutive scan failure count over the space of a
+        # couple of minutes, so it's okay if it resets on buildd-manager
+        # restart.
+        self.scan_failure_count = 0
+
         # We cache the build cookie, keyed on the BuildQueue, to avoid
         # hitting the DB on every scan.
         self._cached_build_cookie = None
@@ -553,6 +563,12 @@ class WorkerScanner:
 
         try:
             yield self.scan()
+
+            # We got through a scan without error, so reset the consecutive
+            # failure count. We don't reset the persistent builder or job
+            # failure counts, since the build might consistently break the
+            # builder later in the build.
+            self.scan_failure_count = 0
         except Exception as e:
             self._scanFailed(self.can_retry, e)
 
@@ -591,12 +607,35 @@ class WorkerScanner:
                 "Scanning %s failed" % self.builder_name, exc_info=exc
             )
 
+        # Certain errors can't possibly be a glitch, and they can insta-fail
+        # even if the scan phase would normally allow a retry.
+        if isinstance(exc, (BuildDaemonIsolationError, CannotResumeHost)):
+            retry = False
+
         # Decide if we need to terminate the job or reset/fail the builder.
         vitals = self.builder_factory.getVitals(self.builder_name)
         builder = self.builder_factory[self.builder_name]
         try:
             labels = get_statsd_labels(builder, builder.current_build)
             self.statsd_client.incr("builders.failure", labels=labels)
+            self.scan_failure_count += 1
+
+            # To avoid counting network glitches or similar against innocent
+            # builds and jobs, we allow a scan to fail a few times in a row
+            # without consequence, and just retry. If we exceed the threshold,
+            # we then persistently record a single failure against the build
+            # and job.
+            if retry and self.scan_failure_count < SCAN_FAILURE_THRESHOLD:
+                self.statsd_client.incr(
+                    "builders.failure.scan_retried",
+                    labels={
+                        "failures": str(self.scan_failure_count),
+                        **labels,
+                    },
+                )
+                return
+
+            self.scan_failure_count = 0
 
             builder.gotFailure()
             if builder.current_build is not None:
@@ -610,6 +649,7 @@ class WorkerScanner:
                 "Miserable failure when trying to handle failure:\n",
                 exc_info=True,
             )
+        finally:
             transaction.abort()
 
     @defer.inlineCallbacks
diff --git a/lib/lp/buildmaster/tests/test_manager.py b/lib/lp/buildmaster/tests/test_manager.py
index 01e6044..d4a4723 100644
--- a/lib/lp/buildmaster/tests/test_manager.py
+++ b/lib/lp/buildmaster/tests/test_manager.py
@@ -40,6 +40,7 @@ from lp.buildmaster.interfaces.buildqueue import IBuildQueueSet
 from lp.buildmaster.manager import (
     BUILDER_FAILURE_THRESHOLD,
     JOB_RESET_THRESHOLD,
+    SCAN_FAILURE_THRESHOLD,
     BuilddManager,
     BuilderFactory,
     PrefetchedBuilderFactory,
@@ -133,7 +134,13 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
         self.assertEqual(job.specific_build.status, BuildStatus.BUILDING)
         self.assertEqual(job.logtail, logtail)
 
-    def _getScanner(self, builder_name=None, clock=None, builder_factory=None):
+    def _getScanner(
+        self,
+        builder_name=None,
+        clock=None,
+        builder_factory=None,
+        scan_failure_count=0,
+    ):
         """Instantiate a WorkerScanner object.
 
         Replace its default logging handler by a testing version.
@@ -148,6 +155,7 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
             builder_name, builder_factory, manager, BufferLogger(), clock=clock
         )
         scanner.logger.name = "worker-scanner"
+        scanner.scan_failure_count = scan_failure_count
 
         return scanner
 
@@ -458,6 +466,7 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
     @defer.inlineCallbacks
     def _assertFailureCounting(
         self,
+        scan_count,
         builder_count,
         job_count,
         expected_builder_count,
@@ -470,7 +479,7 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
         def failing_scan():
             return defer.fail(Exception("fake exception"))
 
-        scanner = self._getScanner()
+        scanner = self._getScanner(scan_failure_count=scan_count)
         scanner.scan = failing_scan
         from lp.buildmaster import manager as manager_module
 
@@ -497,10 +506,101 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
         )
         self.assertEqual(1, manager_module.recover_failure.call_count)
 
+    @defer.inlineCallbacks
+    def test_scan_persistent_failure_counts(self):
+        # The first few scan exceptions just result in retries, not penalties
+        # for the involved parties. A builder or job failure is only recorded
+        # after several scan failures.
+        scanner = self._getScanner()
+        builder = getUtility(IBuilderSet)[scanner.builder_name]
+        transaction.commit()
+
+        self.patch(scanner, "scan", lambda: defer.fail(Exception("fake")))
+
+        # Rack up almost enough failures.
+        for i in range(1, SCAN_FAILURE_THRESHOLD):
+            yield scanner.singleCycle()
+            self.assertEqual(i, scanner.scan_failure_count)
+            self.assertEqual(0, builder.failure_count)
+            self.assertEqual(0, builder.current_build.failure_count)
+
+        # Once we reach the consecutive failure threshold, the builder and
+        # build each get a failure and the count is reset.
+        yield scanner.singleCycle()
+        self.assertEqual(0, scanner.scan_failure_count)
+        self.assertEqual(1, builder.failure_count)
+        self.assertEqual(1, builder.current_build.failure_count)
+
+    @defer.inlineCallbacks
+    def test_scan_intermittent_failure_retries(self):
+        # The first few scan exceptions just result in retries, not penalties
+        # for the involved parties. A builder or job failure is only recorded
+        # after several scan failures.
+        def failing_scan():
+            return defer.fail(Exception("fake exception"))
+
+        scanner = self._getScanner()
+        builder = getUtility(IBuilderSet)[scanner.builder_name]
+        transaction.commit()
+
+        # Rack up a couple of failures.
+        self.patch(scanner, "scan", lambda: defer.fail(Exception("fake")))
+
+        yield scanner.singleCycle()
+        self.assertEqual(1, scanner.scan_failure_count)
+        self.assertEqual(0, builder.failure_count)
+        self.assertEqual(0, builder.current_build.failure_count)
+        self.assertEqual(
+            [
+                mock.call(
+                    "builders.failure,arch=386,build=True,builder_name=bob,"
+                    "env=test,job_type=PACKAGEBUILD,region=,virtualized=False"
+                ),
+                mock.call(
+                    "builders.failure.scan_retried,arch=386,build=True,"
+                    "builder_name=bob,env=test,failures=1,"
+                    "job_type=PACKAGEBUILD,region=,virtualized=False"
+                ),
+            ],
+            self.stats_client.incr.mock_calls,
+        )
+        self.stats_client.incr.reset_mock()
+
+        yield scanner.singleCycle()
+        self.assertEqual(2, scanner.scan_failure_count)
+        self.assertEqual(0, builder.failure_count)
+        self.assertEqual(0, builder.current_build.failure_count)
+        self.assertEqual(
+            [
+                mock.call(
+                    "builders.failure,arch=386,build=True,builder_name=bob,"
+                    "env=test,job_type=PACKAGEBUILD,region=,virtualized=False"
+                ),
+                mock.call(
+                    "builders.failure.scan_retried,arch=386,build=True,"
+                    "builder_name=bob,env=test,failures=2,"
+                    "job_type=PACKAGEBUILD,region=,virtualized=False"
+                ),
+            ],
+            self.stats_client.incr.mock_calls,
+        )
+        self.stats_client.incr.reset_mock()
+
+        # Since we didn't meet SCAN_FAILURE_THRESHOLD, a success just resets
+        # the count and no harm is done to innocent bystanders.
+        self.patch(scanner, "scan", lambda: defer.succeed(None))
+        yield scanner.singleCycle()
+        self.assertEqual(0, scanner.scan_failure_count)
+        self.assertEqual(0, builder.failure_count)
+        self.assertEqual(0, builder.current_build.failure_count)
+        self.assertEqual([], self.stats_client.incr.mock_calls)
+        self.stats_client.incr.reset_mock()
+
     def test_scan_first_fail(self):
         # The first failure of a job should result in the failure_count
         # on the job and the builder both being incremented.
         return self._assertFailureCounting(
+            scan_count=SCAN_FAILURE_THRESHOLD - 1,
             builder_count=0,
             job_count=0,
             expected_builder_count=1,
@@ -511,6 +611,7 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
         # The first failure of a job should result in the failure_count
         # on the job and the builder both being incremented.
         return self._assertFailureCounting(
+            scan_count=SCAN_FAILURE_THRESHOLD - 1,
             builder_count=1,
             job_count=0,
             expected_builder_count=2,
@@ -521,6 +622,7 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
         # The first failure of a job should result in the failure_count
         # on the job and the builder both being incremented.
         return self._assertFailureCounting(
+            scan_count=SCAN_FAILURE_THRESHOLD - 1,
             builder_count=0,
             job_count=1,
             expected_builder_count=1,
@@ -532,7 +634,9 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
         def failing_scan():
             return defer.fail(Exception("fake exception"))
 
-        scanner = self._getScanner()
+        scanner = self._getScanner(
+            scan_failure_count=SCAN_FAILURE_THRESHOLD - 1
+        )
         scanner.scan = failing_scan
         builder = getUtility(IBuilderSet)[scanner.builder_name]
         builder.failure_count = BUILDER_FAILURE_THRESHOLD
@@ -547,7 +651,10 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
         def failing_scan():
             return defer.fail(Exception("fake exception"))
 
-        scanner = self._getScanner()
+        # TODO: Add and test metrics for retried scan failures.
+        scanner = self._getScanner(
+            scan_failure_count=SCAN_FAILURE_THRESHOLD - 1
+        )
         scanner.scan = failing_scan
         builder = getUtility(IBuilderSet)[scanner.builder_name]
         builder.failure_count = BUILDER_FAILURE_THRESHOLD
@@ -680,7 +787,9 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
         # Run 'scan' and check its results.
         switch_dbuser(config.builddmaster.dbuser)
         clock = task.Clock()
-        scanner = self._getScanner(clock=clock)
+        scanner = self._getScanner(
+            clock=clock, scan_failure_count=SCAN_FAILURE_THRESHOLD - 1
+        )
         yield scanner.scan()
 
         # An abort request should be sent.
@@ -689,6 +798,7 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
 
         # Advance time a little.  Nothing much should happen.
         clock.advance(1)
+        scanner.scan_failure_count = SCAN_FAILURE_THRESHOLD - 1
         yield scanner.scan()
         self.assertEqual(1, worker.call_log.count("abort"))
         self.assertEqual(BuildStatus.CANCELLING, build.status)
@@ -697,6 +807,7 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
         # we should have also called the resume() method on the worker that
         # resets the virtual machine.
         clock.advance(WorkerScanner.CANCEL_TIMEOUT)
+        scanner.scan_failure_count = SCAN_FAILURE_THRESHOLD - 1
         yield scanner.singleCycle()
         self.assertEqual(1, worker.call_log.count("abort"))
         self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
@@ -916,7 +1027,8 @@ class TestPrefetchedBuilderFactory(TestCaseWithFactory):
         # with the non-prefetching BuilderFactory.
         self.assertEqual(7, len(all_vitals))
         self.assertEqual(
-            4, len([v for v in all_vitals if v.build_queue is not None])
+            SCAN_FAILURE_THRESHOLD - 1,
+            len([v for v in all_vitals if v.build_queue is not None]),
         )
         self.assertContentEqual(BuilderFactory().iterVitals(), all_vitals)