
launchpad-reviewers team mailing list archive

[Merge] lp:~wgrant/launchpad/resetOrFail-dirties into lp:launchpad

William Grant has proposed merging lp:~wgrant/launchpad/resetOrFail-dirties into lp:launchpad.

Commit message:
Quick fix to make resetOrFail work with virt reset protocol 2.0: it now just dirties the builder, so cleanSlave is invoked as normal on the next scan. There's now exactly one path that resets a builder.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~wgrant/launchpad/resetOrFail-dirties/+merge/223917

resetOrFail failed to recover virtual builders using reset protocol 2.0, as it invoked resumeSlaveHost directly without handling the asynchronous resume. I've adjusted it to instead just ensure that the slave is dirty, triggering the normal scan() -> cleanSlave() -> resumeSlaveHost() path on the following cycle.

This means that resumption is handled in just one place. resetOrFail and assessFailureCounts can now be cleaned up significantly, but that can wait until next week.
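
For context, that single reset path lives on the scanner side. A rough sketch of the flow (simplified; helper names like getVitals and the interactor/slave attributes are assumptions, not the exact code):

    from twisted.internet import defer

    @defer.inlineCallbacks
    def scan(self):
        # Simplified: a dirty builder is cleaned before anything
        # else happens, so resetOrFail only needs to dirty it.
        vitals = self.getVitals()
        if vitals.clean_status == BuilderCleanStatus.DIRTY:
            # cleanSlave() drives the asynchronous virt reset
            # (resumeSlaveHost under protocol 2.0) and marks the
            # builder CLEAN once it completes.
            yield self.interactor.cleanSlave(vitals, self.slave)
            return
        # ...dispatch new jobs or monitor the current one as usual...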
-- 
https://code.launchpad.net/~wgrant/launchpad/resetOrFail-dirties/+merge/223917
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~wgrant/launchpad/resetOrFail-dirties into lp:launchpad.
=== modified file 'lib/lp/buildmaster/interactor.py'
--- lib/lp/buildmaster/interactor.py	2014-06-20 11:12:40 +0000
+++ lib/lp/buildmaster/interactor.py	2014-06-20 13:12:39 +0000
@@ -386,11 +386,11 @@
         the fault of failing jobs, or when the builder has entered an
         ABORTED state without having been asked to do so.
 
-        In case of a virtualized/PPA buildd slave an attempt will be made
-        to reset it (using `resumeSlaveHost`).
+        In case of a virtualized builder we ensure it's dirty so the
+        next scan will reset it.
 
-        Conversely, a non-virtualized buildd slave will be (marked as)
-        failed straightaway (using `failBuilder`).
+        A non-virtualized builder will be failed straightaway (using
+        `failBuilder`).
 
         :param logger: The logger object to be used for logging.
         :param exception: An exception to be used for logging.
@@ -399,15 +399,15 @@
         """
         error_message = str(exception)
         if vitals.virtualized:
-            # Virtualized/PPA builder: attempt a reset, unless the failure
-            # was itself a failure to reset.  (In that case, the slave
-            # scanner will try again until we reach the failure threshold.)
-            if not isinstance(exception, CannotResumeHost):
-                logger.warn(
-                    "Resetting builder: %s -- %s" % (
-                        vitals.url, error_message),
-                    exc_info=True)
-                return cls.resumeSlaveHost(vitals, slave)
+            # Virtualized/PPA builder: mark it dirty so the next
+            # scan will attempt a reset.
+            logger.warn(
+                "Dirtying builder: %s -- %s" % (vitals.url, error_message),
+                exc_info=True)
+            if builder.clean_status != BuilderCleanStatus.DIRTY:
+                builder.setCleanStatus(BuilderCleanStatus.DIRTY)
+                transaction.commit()
+            return defer.succeed(True)
         else:
             # XXX: This should really let the failure bubble up to the
             # scan() method that does the failure counting.

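With this change a caller sees resetOrFail on a virtual builder succeed immediately once the builder is dirtied; the actual resume is deferred to the next scan. Illustrative only (the call signature is taken from the existing tests; the exception text is made up):

    d = BuilderInteractor.resetOrFail(
        vitals, slave, builder, logger, Exception("builder misbehaved"))
    # d fires with True straight away; the resume itself happens when
    # the next scan sees BuilderCleanStatus.DIRTY and runs cleanSlave().
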
=== modified file 'lib/lp/buildmaster/tests/test_interactor.py'
--- lib/lp/buildmaster/tests/test_interactor.py	2014-06-20 11:55:59 +0000
+++ lib/lp/buildmaster/tests/test_interactor.py	2014-06-20 13:12:39 +0000
@@ -161,19 +161,6 @@
         d = self.resumeSlaveHost(MockBuilder(virtualized=True, vm_host="pop"))
         return assert_fails_with(d, CannotResumeHost)
 
-    def test_resetOrFail_resume_failure(self):
-        reset_fail_config = """
-            [builddmaster]
-            vm_resume_command: /bin/false"""
-        config.push('reset fail', reset_fail_config)
-        self.addCleanup(config.pop, 'reset fail')
-        builder = MockBuilder(virtualized=True, vm_host="pop", builderok=True)
-        vitals = extract_vitals_from_db(builder)
-        d = BuilderInteractor.resetOrFail(
-            vitals, BuilderInteractor.makeSlaveFromVitals(vitals), builder,
-            DevNullLogger(), Exception())
-        return assert_fails_with(d, CannotResumeHost)
-
     @defer.inlineCallbacks
     def test_resetOrFail_nonvirtual(self):
         builder = MockBuilder(virtualized=False, builderok=True)

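The removed test exercised the old direct-resume failure path, which no longer exists. A hypothetical replacement (not part of this diff; MockBuilder.setCleanStatus is assumed to behave like the real model method) would assert the dirtying instead:

    @defer.inlineCallbacks
    def test_resetOrFail_virtual_dirties(self):
        builder = MockBuilder(virtualized=True, vm_host="pop", builderok=True)
        builder.setCleanStatus(BuilderCleanStatus.CLEAN)
        vitals = extract_vitals_from_db(builder)
        yield BuilderInteractor.resetOrFail(
            vitals, BuilderInteractor.makeSlaveFromVitals(vitals), builder,
            DevNullLogger(), Exception())
        self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
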
=== modified file 'lib/lp/buildmaster/tests/test_manager.py'
--- lib/lp/buildmaster/tests/test_manager.py	2014-06-20 12:02:25 +0000
+++ lib/lp/buildmaster/tests/test_manager.py	2014-06-20 13:12:39 +0000
@@ -538,7 +538,7 @@
         clock.advance(SlaveScanner.CANCEL_TIMEOUT)
         yield scanner.scan()
         self.assertEqual(1, slave.call_log.count("abort"))
-        self.assertEqual(1, slave.call_log.count("resume"))
+        self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
         self.assertEqual(BuildStatus.CANCELLED, build.status)
 
     @defer.inlineCallbacks
@@ -910,7 +910,7 @@
     @defer.inlineCallbacks
     def test_cancelling_build_is_cancelled(self):
         # If a BuildQueue is CANCELLING and the cancel timeout expires,
-        # make sure True is returned and the slave was resumed.
+        # make sure True is returned and the slave is dirty.
         slave = OkSlave()
         self.builder.vm_host = "fake_vm_host"
         buildqueue = self.builder.currentjob
@@ -928,7 +928,7 @@
         clock.advance(SlaveScanner.CANCEL_TIMEOUT)
         result = yield scanner.checkCancellation(
             self.vitals, slave, self.interactor)
-        self.assertEqual(1, slave.call_log.count("resume"))
+        self.assertEqual(BuilderCleanStatus.DIRTY, self.builder.clean_status)
         self.assertTrue(result)
         self.assertEqual(BuildStatus.CANCELLED, build.status)
 
@@ -943,7 +943,7 @@
         build.cancel()
         result = yield self._getScanner().checkCancellation(
             self.vitals, slave, self.interactor)
-        self.assertEqual(1, slave.call_log.count("resume"))
+        self.assertEqual(BuilderCleanStatus.DIRTY, self.builder.clean_status)
         self.assertTrue(result)
         self.assertEqual(BuildStatus.CANCELLED, build.status)
 
@@ -1045,6 +1045,7 @@
     def test_virtual_builder_reset_thresholds(self):
         self.builder.virtualized = True
         self.builder.vm_host = 'foo'
+        self.builder.setCleanStatus(BuilderCleanStatus.CLEAN)
 
         for failure_count in range(
             Builder.RESET_THRESHOLD - 1,
@@ -1053,9 +1054,15 @@
             yield self._assessFailureCounts("failnotes")
             self.assertIs(None, self.builder.currentjob)
             self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
-            self.assertEqual(
-                failure_count // Builder.RESET_THRESHOLD,
-                self.slave.call_log.count('resume'))
+            # The builder should be CLEAN until it hits the
+            # RESET_THRESHOLD.
+            if failure_count % Builder.RESET_THRESHOLD != 0:
+                self.assertEqual(
+                    BuilderCleanStatus.CLEAN, self.builder.clean_status)
+            else:
+                self.assertEqual(
+                    BuilderCleanStatus.DIRTY, self.builder.clean_status)
+                self.builder.setCleanStatus(BuilderCleanStatus.CLEAN)
             self.assertTrue(self.builder.builderok)
 
         self.builder.failure_count = (
@@ -1064,8 +1071,7 @@
         self.assertIs(None, self.builder.currentjob)
         self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
         self.assertEqual(
-            Builder.RESET_FAILURE_THRESHOLD - 1,
-            self.slave.call_log.count('resume'))
+            BuilderCleanStatus.CLEAN, self.builder.clean_status)
         self.assertFalse(self.builder.builderok)
         self.assertEqual("failnotes", self.builder.failnotes)
 

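The updated threshold test encodes the expected cadence: a virtual builder is dirtied on every RESET_THRESHOLD-th consecutive failure and failed outright at RESET_THRESHOLD * RESET_FAILURE_THRESHOLD. As a sketch inferred from the assertions above (not the actual assessFailureCounts code):

    def expected_builder_state(failure_count):
        # Inferred from the test assertions; the constants are the
        # Builder class attributes used in the test.
        if failure_count >= (
                Builder.RESET_THRESHOLD * Builder.RESET_FAILURE_THRESHOLD):
            return 'failed'   # builderok False, failnotes recorded
        elif failure_count % Builder.RESET_THRESHOLD == 0:
            return 'dirty'    # clean_status DIRTY; reset on next scan
        else:
            return 'clean'    # below the threshold, nothing happens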
