← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] lp:~wgrant/launchpad/checkCancellation-nicify into lp:launchpad

 

William Grant has proposed merging lp:~wgrant/launchpad/checkCancellation-nicify into lp:launchpad.

Commit message:
Rip the custom error handling out of checkCancellation. assessFailureCounts knows when and how to mark builds as cancelled.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~wgrant/launchpad/checkCancellation-nicify/+merge/224245

This branch rips the error handling out of checkCancellation. Instead, assessFailureCounts has been taught to cancel jobs that are cancelling rather than resetting or failing them.

checkCancellation previously called resetOrFail itself, but that will now be triggered in the next scan once _scanFailed dirties the builder.
-- 
https://code.launchpad.net/~wgrant/launchpad/checkCancellation-nicify/+merge/224245
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~wgrant/launchpad/checkCancellation-nicify into lp:launchpad.
=== modified file 'lib/lp/buildmaster/manager.py'
--- lib/lp/buildmaster/manager.py	2014-06-23 14:52:15 +0000
+++ lib/lp/buildmaster/manager.py	2014-06-24 08:25:56 +0000
@@ -190,14 +190,25 @@
     """
     del get_property_cache(builder).currentjob
     job = builder.currentjob
+
+    # If a job is being cancelled we won't bother retrying a failure.
+    # Just mark it as cancelled and clear the builder for normal cleanup.
+    cancelling = job is not None and job.status == BuildQueueStatus.CANCELLING
     builder_action, job_action = judge_failure(
         builder.failure_count, job.specific_build.failure_count if job else 0,
-        exception, retry=retry)
+        exception, retry=retry and not cancelling)
 
-    if job is not None:
+    if job is not None and job_action is not None:
         if job_action == False:
+            # We've decided the job is bad, so unblame the builder.
+            builder.resetFailureCount()
+
+        if cancelling:
+            # We've previously been asked to cancel the job, so just set
+            # it to cancelled rather than retrying or failing.
+            job.markAsCancelled()
+        elif job_action == False:
             # Fail and dequeue the job.
-            builder.resetFailureCount()
             job.specific_build.updateStatus(BuildStatus.FAILEDTOBUILD)
             job.destroySelf()
         elif job_action == True:
@@ -339,60 +350,38 @@
             transaction.abort()
 
     @defer.inlineCallbacks
-    def checkCancellation(self, vitals, slave, interactor):
+    def checkCancellation(self, vitals, slave):
         """See if there is a pending cancellation request.
 
         If the current build is in status CANCELLING then terminate it
         immediately.
 
-        :return: A deferred whose value is True if we recovered the builder
-            by resuming a slave host, so that there is no need to update its
-            status.
+        :return: A deferred which fires when this cancellation cycle is done.
         """
-        if vitals.build_queue is None:
-            self.date_cancel = None
-            defer.returnValue(False)
         if vitals.build_queue.status != BuildQueueStatus.CANCELLING:
             self.date_cancel = None
-            defer.returnValue(False)
-
-        try:
-            if self.date_cancel is None:
-                self.logger.info(
-                    "Cancelling BuildQueue %d (%s) on %s",
-                    vitals.build_queue.id, self.getExpectedCookie(vitals),
-                    vitals.name)
-                yield slave.abort()
-                self.date_cancel = self._clock.seconds() + self.CANCEL_TIMEOUT
-                defer.returnValue(False)
-            else:
-                # The BuildFarmJob will normally set the build's status to
-                # something other than CANCELLING once the builder responds to
-                # the cancel request.  This timeout is in case it doesn't.
-                if self._clock.seconds() < self.date_cancel:
-                    self.logger.info(
-                        "Waiting for BuildQueue %d (%s) on %s to cancel",
-                        vitals.build_queue.id, self.getExpectedCookie(vitals),
-                        vitals.name)
-                    defer.returnValue(False)
-                else:
-                    raise BuildSlaveFailure(
-                        "Timeout waiting for BuildQueue %d (%s) on %s to "
-                        "cancel" % (
-                        vitals.build_queue.id, self.getExpectedCookie(vitals),
-                        vitals.name))
-        except Exception as e:
+        elif self.date_cancel is None:
             self.logger.info(
-                "Failure while cancelling BuildQueue %d (%s) on %s",
+                "Cancelling BuildQueue %d (%s) on %s",
                 vitals.build_queue.id, self.getExpectedCookie(vitals),
                 vitals.name)
-            self.date_cancel = None
-            vitals.build_queue.markAsCancelled()
-            transaction.commit()
-            resumed = yield interactor.resetOrFail(
-                vitals, slave, self.builder_factory[vitals.name], self.logger,
-                e)
-            defer.returnValue(resumed)
+            yield slave.abort()
+            self.date_cancel = self._clock.seconds() + self.CANCEL_TIMEOUT
+        else:
+            # The BuildFarmJob will normally set the build's status to
+            # something other than CANCELLING once the builder responds to
+            # the cancel request.  This timeout is in case it doesn't.
+            if self._clock.seconds() < self.date_cancel:
+                self.logger.info(
+                    "Waiting for BuildQueue %d (%s) on %s to cancel",
+                    vitals.build_queue.id, self.getExpectedCookie(vitals),
+                    vitals.name)
+            else:
+                raise BuildSlaveFailure(
+                    "Timeout waiting for BuildQueue %d (%s) on %s to "
+                    "cancel" % (
+                    vitals.build_queue.id, self.getExpectedCookie(vitals),
+                    vitals.name))
 
     def getExpectedCookie(self, vitals):
         """Return the build cookie expected to be held by the slave.
@@ -461,10 +450,7 @@
                 transaction.commit()
                 return
 
-            cancelled = yield self.checkCancellation(
-                vitals, slave, interactor)
-            if cancelled:
-                return
+            yield self.checkCancellation(vitals, slave)
 
             # The slave and DB agree on the builder's state.  Scan the
             # slave and get the logtail, or collect the build if it's

=== modified file 'lib/lp/buildmaster/tests/test_manager.py'
--- lib/lp/buildmaster/tests/test_manager.py	2014-06-23 15:14:46 +0000
+++ lib/lp/buildmaster/tests/test_manager.py	2014-06-24 08:25:56 +0000
@@ -37,6 +37,7 @@
     )
 from lp.buildmaster.interfaces.builder import (
     BuildDaemonIsolationError,
+    BuildSlaveFailure,
     IBuilderSet,
     )
 from lp.buildmaster.interfaces.buildfarmjobbehaviour import (
@@ -557,7 +558,7 @@
         # we should have also called the resume() method on the slave that
         # resets the virtual machine.
         clock.advance(SlaveScanner.CANCEL_TIMEOUT)
-        yield scanner.scan()
+        yield scanner.singleCycle()
         self.assertEqual(1, slave.call_log.count("abort"))
         self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
         self.assertEqual(BuildStatus.CANCELLED, build.status)
@@ -960,7 +961,6 @@
         builder_name = BOB_THE_BUILDER_NAME
         self.builder = getUtility(IBuilderSet)[builder_name]
         self.builder.virtualized = True
-        self.interactor = BuilderInteractor()
 
     @property
     def vitals(self):
@@ -972,67 +972,56 @@
         scanner.logger.name = 'slave-scanner'
         return scanner
 
-    def test_ignores_nonvirtual(self):
-        # If the builder is nonvirtual make sure we return False.
-        self.builder.virtualized = False
-        d = self._getScanner().checkCancellation(
-            self.vitals, None, self.interactor)
-        return d.addCallback(self.assertFalse)
-
-    def test_ignores_no_buildqueue(self):
-        # If the builder has no buildqueue associated,
-        # make sure we return False.
-        buildqueue = self.builder.currentjob
-        buildqueue.reset()
-        d = self._getScanner().checkCancellation(
-            self.vitals, None, self.interactor)
-        return d.addCallback(self.assertFalse)
-
     def test_ignores_build_not_cancelling(self):
         # If the active build is not in a CANCELLING state, ignore it.
-        d = self._getScanner().checkCancellation(
-            self.vitals, None, self.interactor)
-        return d.addCallback(self.assertFalse)
-
-    @defer.inlineCallbacks
-    def test_cancelling_build_is_cancelled(self):
+        slave = BuildingSlave()
+        scanner = self._getScanner()
+        yield scanner.checkCancellation(self.vitals, slave)
+        self.assertEqual([], slave.call_log)
+
+    @defer.inlineCallbacks
+    def test_cancelling_build_is_aborted(self):
+        # The first time we see a CANCELLING build, we abort the slave.
+        slave = BuildingSlave()
+        self.builder.current_build.cancel()
+        scanner = self._getScanner()
+        yield scanner.checkCancellation(self.vitals, slave)
+        self.assertEqual(["abort"], slave.call_log)
+
+        # A further scan is a no-op, as we remember that we've already
+        # requested that the slave abort.
+        yield scanner.checkCancellation(self.vitals, slave)
+        self.assertEqual(["abort"], slave.call_log)
+
+    @defer.inlineCallbacks
+    def test_timed_out_cancel_errors(self):
         # If a BuildQueue is CANCELLING and the cancel timeout expires,
-        # make sure True is returned and the slave is dirty.
+        # an exception is raised so the normal scan error handler can
+        # finalise the build.
         slave = OkSlave()
-        self.builder.vm_host = "fake_vm_host"
-        buildqueue = self.builder.currentjob
-        build = getUtility(IBinaryPackageBuildSet).getByQueueEntry(buildqueue)
+        build = self.builder.current_build
         build.cancel()
         clock = task.Clock()
         scanner = self._getScanner(clock=clock)
 
-        result = yield scanner.checkCancellation(
-            self.vitals, slave, self.interactor)
-        self.assertNotIn("resume", slave.call_log)
-        self.assertFalse(result)
+        yield scanner.checkCancellation(self.vitals, slave)
+        self.assertEqual(["abort"], slave.call_log)
         self.assertEqual(BuildStatus.CANCELLING, build.status)
 
         clock.advance(SlaveScanner.CANCEL_TIMEOUT)
-        result = yield scanner.checkCancellation(
-            self.vitals, slave, self.interactor)
-        self.assertEqual(BuilderCleanStatus.DIRTY, self.builder.clean_status)
-        self.assertTrue(result)
-        self.assertEqual(BuildStatus.CANCELLED, build.status)
+        with ExpectedException(
+                BuildSlaveFailure, "Timeout waiting for .* to cancel"):
+            yield scanner.checkCancellation(self.vitals, slave)
 
     @defer.inlineCallbacks
-    def test_lost_build_is_cancelled(self):
-        # If the builder reports a fault while attempting to abort it, then
-        # immediately resume the slave as if the cancel timeout had expired.
+    def test_failed_abort_errors(self):
+        # If the builder reports a fault while attempting to abort it,
+        # an exception is raised so the build can be finalised.
         slave = LostBuildingBrokenSlave()
-        self.builder.vm_host = "fake_vm_host"
-        buildqueue = self.builder.currentjob
-        build = getUtility(IBinaryPackageBuildSet).getByQueueEntry(buildqueue)
-        build.cancel()
-        result = yield self._getScanner().checkCancellation(
-            self.vitals, slave, self.interactor)
-        self.assertEqual(BuilderCleanStatus.DIRTY, self.builder.clean_status)
-        self.assertTrue(result)
-        self.assertEqual(BuildStatus.CANCELLED, build.status)
+        self.builder.current_build.cancel()
+        with ExpectedException(
+                xmlrpclib.Fault, "<Fault 8002: 'Could not abort'>"):
+            yield self._getScanner().checkCancellation(self.vitals, slave)
 
 
 class TestBuilddManager(TestCase):
@@ -1118,6 +1107,19 @@
         self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
 
     @defer.inlineCallbacks
+    def test_reset_during_cancellation_cancels(self):
+        self.buildqueue.cancel()
+        self.assertEqual(BuildStatus.CANCELLING, self.build.status)
+
+        naked_build = removeSecurityProxy(self.build)
+        self.builder.failure_count = 1
+        naked_build.failure_count = 1
+
+        yield self._assessFailureCounts("failnotes")
+        self.assertIs(None, self.builder.currentjob)
+        self.assertEqual(BuildStatus.CANCELLED, self.build.status)
+
+    @defer.inlineCallbacks
     def test_job_failing_more_than_builder_fails_job(self):
         self.build.gotFailure()
         self.build.gotFailure()
@@ -1129,6 +1131,18 @@
         self.assertEqual(0, self.builder.failure_count)
 
     @defer.inlineCallbacks
+    def test_failure_during_cancellation_cancels(self):
+        self.buildqueue.cancel()
+        self.assertEqual(BuildStatus.CANCELLING, self.build.status)
+
+        self.build.gotFailure()
+        self.build.gotFailure()
+        self.builder.gotFailure()
+        yield self._assessFailureCounts("failnotes")
+        self.assertIs(None, self.builder.currentjob)
+        self.assertEqual(BuildStatus.CANCELLED, self.build.status)
+
+    @defer.inlineCallbacks
     def test_virtual_builder_reset_thresholds(self):
         self.builder.virtualized = True
         self.builder.vm_host = 'foo'


Follow ups