← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] lp:~wgrant/launchpad/BuildDaemonIsolationError into lp:launchpad

 

William Grant has proposed merging lp:~wgrant/launchpad/BuildDaemonIsolationError into lp:launchpad with lp:~wgrant/launchpad/re-assessFailureCounts as a prerequisite.

Commit message:
Promote high-risk exceptions to BuildDaemonIsolationErrors, which insta-fail both the job and the builder for safety.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~wgrant/launchpad/BuildDaemonIsolationError/+merge/224161

We have a number of sanity checks in buildd-manager which are assertions about security and isolation aspects of the build farm. For example, a builder that's building something must also be dirty in the database, and a builder that's clean in the database must have an idle slave. This branch promotes those exceptions from BuildDaemonError to BuildDaemonIsolationError, and makes the new exception immediately fatal to both the builder and the job, regardless of previous failures.
-- 
https://code.launchpad.net/~wgrant/launchpad/BuildDaemonIsolationError/+merge/224161
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~wgrant/launchpad/BuildDaemonIsolationError into lp:launchpad.
=== modified file 'lib/lp/buildmaster/interactor.py'
--- lib/lp/buildmaster/interactor.py	2014-06-23 15:35:56 +0000
+++ lib/lp/buildmaster/interactor.py	2014-06-23 15:35:56 +0000
@@ -27,6 +27,7 @@
     )
 from lp.buildmaster.interfaces.builder import (
     BuildDaemonError,
+    BuildDaemonIsolationError,
     CannotFetchFile,
     CannotResumeHost,
     )
@@ -366,11 +367,11 @@
 
         # Set the build behaviour depending on the provided build queue item.
         if not builder.builderok:
-            raise BuildDaemonError(
+            raise BuildDaemonIsolationError(
                 "Attempted to start a build on a known-bad builder.")
 
         if builder.clean_status != BuilderCleanStatus.CLEAN:
-            raise BuildDaemonError(
+            raise BuildDaemonIsolationError(
                 "Attempted to start build on a dirty slave.")
 
         builder.setCleanStatus(BuilderCleanStatus.DIRTY)

=== modified file 'lib/lp/buildmaster/interfaces/builder.py'
--- lib/lp/buildmaster/interfaces/builder.py	2014-06-20 12:02:25 +0000
+++ lib/lp/buildmaster/interfaces/builder.py	2014-06-23 15:35:56 +0000
@@ -7,6 +7,7 @@
 
 __all__ = [
     'BuildDaemonError',
+    'BuildDaemonIsolationError',
     'BuildSlaveFailure',
     'CannotBuild',
     'CannotFetchFile',
@@ -71,6 +72,10 @@
     """The class of errors raised by the buildd classes"""
 
 
+class BuildDaemonIsolationError(BuildDaemonError):
+    """A build isolation violation has been detected."""
+
+
 class CannotFetchFile(BuildDaemonError):
     """The slave was unable to fetch the file."""
 

=== modified file 'lib/lp/buildmaster/manager.py'
--- lib/lp/buildmaster/manager.py	2014-06-23 15:35:56 +0000
+++ lib/lp/buildmaster/manager.py	2014-06-23 15:35:56 +0000
@@ -36,6 +36,7 @@
     )
 from lp.buildmaster.interfaces.builder import (
     BuildDaemonError,
+    BuildDaemonIsolationError,
     BuildSlaveFailure,
     CannotBuild,
     CannotFetchFile,
@@ -131,7 +132,7 @@
         return (b for n, b in sorted(self.vitals_map.iteritems()))
 
 
-def judge_failure(builder_count, job_count, retry=True):
+def judge_failure(builder_count, job_count, exc, retry=True):
     """Judge how to recover from a scan failure.
 
     Assesses the failure counts of a builder and its current job, and
@@ -144,6 +145,11 @@
     :return: A tuple of (builder action, job action). True means reset,
         False means fail, None means take no action.
     """
+    if isinstance(exc, BuildDaemonIsolationError):
+        # We have a potential security issue. Insta-kill both regardless
+        # of any failure counts.
+        return (False, False)
+
     if builder_count == job_count:
         # We can't tell which is to blame. Retry a few times, and then
         # reset the job so it can be retried elsewhere. If the job is at
@@ -186,7 +192,7 @@
     job = builder.currentjob
     builder_action, job_action = judge_failure(
         builder.failure_count, job.specific_build.failure_count if job else 0,
-        retry=retry)
+        exception, retry=retry)
 
     if job is not None:
         if job_action == False:
@@ -427,7 +433,8 @@
             if vitals.clean_status != BuilderCleanStatus.DIRTY:
                 # This is probably a grave bug with security implications,
                 # as a slave that has a job must be cleaned afterwards.
-                raise BuildDaemonError("Non-dirty builder allegedly building.")
+                raise BuildDaemonIsolationError(
+                    "Non-dirty builder allegedly building.")
 
             lost_reason = None
             if not vitals.builderok:
@@ -474,7 +481,7 @@
             if vitals.clean_status == BuilderCleanStatus.CLEAN:
                 slave_status = yield slave.status()
                 if slave_status.get('builder_status') != 'BuilderStatus.IDLE':
-                    raise BuildDaemonError(
+                    raise BuildDaemonIsolationError(
                         'Allegedly clean slave not idle (%r instead)'
                         % slave_status.get('builder_status'))
                 self.updateVersion(vitals, slave_status)

=== modified file 'lib/lp/buildmaster/tests/test_interactor.py'
--- lib/lp/buildmaster/tests/test_interactor.py	2014-06-23 15:35:56 +0000
+++ lib/lp/buildmaster/tests/test_interactor.py	2014-06-23 15:35:56 +0000
@@ -40,7 +40,7 @@
     extract_vitals_from_db,
     )
 from lp.buildmaster.interfaces.builder import (
-    BuildDaemonError,
+    BuildDaemonIsolationError,
     CannotFetchFile,
     CannotResumeHost,
     )
@@ -442,7 +442,7 @@
             result=candidate)
         vitals = extract_vitals_from_db(builder)
         with ExpectedException(
-                BuildDaemonError,
+                BuildDaemonIsolationError,
                 "Attempted to start build on a dirty slave."):
             yield BuilderInteractor.findAndStartJob(vitals, builder, OkSlave())
 

=== modified file 'lib/lp/buildmaster/tests/test_manager.py'
--- lib/lp/buildmaster/tests/test_manager.py	2014-06-23 15:35:56 +0000
+++ lib/lp/buildmaster/tests/test_manager.py	2014-06-23 15:35:56 +0000
@@ -36,7 +36,7 @@
     extract_vitals_from_db,
     )
 from lp.buildmaster.interfaces.builder import (
-    BuildDaemonError,
+    BuildDaemonIsolationError,
     IBuilderSet,
     )
 from lp.buildmaster.interfaces.buildfarmjobbehaviour import (
@@ -465,6 +465,26 @@
         self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
 
     @defer.inlineCallbacks
+    def test_isolation_error_means_death(self):
+        # Certain failures immediately kill both the job and the
+        # builder. For example, a building builder that isn't dirty
+        # probably indicates some potentially grave security bug.
+        builder = getUtility(IBuilderSet)[BOB_THE_BUILDER_NAME]
+        build = builder.current_build
+        self.assertIsNotNone(build.buildqueue_record)
+        self.assertEqual(BuildStatus.BUILDING, build.status)
+        self.assertEqual(0, build.failure_count)
+        self.assertEqual(0, builder.failure_count)
+        builder.setCleanStatus(BuilderCleanStatus.CLEAN)
+        transaction.commit()
+        yield self._getScanner().singleCycle()
+        self.assertFalse(builder.builderok)
+        self.assertEqual(
+            'Non-dirty builder allegedly building.', builder.failnotes)
+        self.assertIsNone(build.buildqueue_record)
+        self.assertEqual(BuildStatus.FAILEDTOBUILD, build.status)
+
+    @defer.inlineCallbacks
     def test_update_slave_version(self):
         # If the reported slave version differs from the DB's record of it,
         # then scanning the builder updates the DB.
@@ -805,7 +825,8 @@
             slave=slave, builder_factory=MockBuilderFactory(builder, bq))
 
         with ExpectedException(
-                BuildDaemonError, "Non-dirty builder allegedly building."):
+                BuildDaemonIsolationError,
+                "Non-dirty builder allegedly building."):
             yield scanner.scan()
         self.assertEqual([], slave.call_log)
 
@@ -819,7 +840,7 @@
             slave=slave, builder_factory=MockBuilderFactory(builder, None))
 
         with ExpectedException(
-                BuildDaemonError,
+                BuildDaemonIsolationError,
                 "Allegedly clean slave not idle "
                 "\('BuilderStatus.BUILDING' instead\)"):
             yield scanner.scan()
@@ -872,7 +893,7 @@
             (None, None),
             judge_failure(
                 Builder.JOB_RESET_THRESHOLD - 1,
-                Builder.JOB_RESET_THRESHOLD - 1))
+                Builder.JOB_RESET_THRESHOLD - 1, Exception()))
 
     def test_same_count_exceeding_threshold(self):
         # Several consecutive failures suggest that something might be
@@ -880,38 +901,52 @@
         self.assertEqual(
             (None, True),
             judge_failure(
-                Builder.JOB_RESET_THRESHOLD, Builder.JOB_RESET_THRESHOLD))
+                Builder.JOB_RESET_THRESHOLD, Builder.JOB_RESET_THRESHOLD,
+                Exception()))
+
+    def test_same_count_no_retries(self):
+        self.assertEqual(
+            (None, True),
+            judge_failure(
+                Builder.JOB_RESET_THRESHOLD - 1,
+                Builder.JOB_RESET_THRESHOLD - 1, Exception(), retry=False))
 
     def test_bad_builder_below_threshold(self):
         self.assertEqual(
             (None, True),
-            judge_failure(Builder.RESET_THRESHOLD - 1, 1))
+            judge_failure(Builder.RESET_THRESHOLD - 1, 1, Exception()))
 
     def test_bad_builder_at_reset_threshold(self):
         self.assertEqual(
             (True, True),
-            judge_failure(Builder.RESET_THRESHOLD, 1))
+            judge_failure(Builder.RESET_THRESHOLD, 1, Exception()))
 
     def test_bad_builder_above_reset_threshold(self):
         self.assertEqual(
             (None, True),
             judge_failure(
-                Builder.RESET_THRESHOLD + 1, Builder.RESET_THRESHOLD))
+                Builder.RESET_THRESHOLD + 1, Builder.RESET_THRESHOLD,
+                Exception()))
 
     def test_bad_builder_second_reset(self):
         self.assertEqual(
             (True, True),
-            judge_failure(Builder.RESET_THRESHOLD * 2, 1))
+            judge_failure(Builder.RESET_THRESHOLD * 2, 1, Exception()))
 
     def test_bad_builder_gives_up(self):
         self.assertEqual(
             (False, True),
-            judge_failure(Builder.RESET_THRESHOLD * 3, 1))
+            judge_failure(Builder.RESET_THRESHOLD * 3, 1, Exception()))
 
     def test_bad_job_fails(self):
         self.assertEqual(
             (None, False),
-            judge_failure(1, 2))
+            judge_failure(1, 2, Exception()))
+
+    def test_isolation_violation_double_kills(self):
+        self.assertEqual(
+            (False, False),
+            judge_failure(1, 1, BuildDaemonIsolationError()))
 
 
 class TestCancellationChecking(TestCaseWithFactory):


Follow ups