launchpad-reviewers team mailing list archive
-
launchpad-reviewers team
-
Mailing list archive
-
Message #16996
Re: [Merge] lp:~wgrant/launchpad/checkCancellation-nicify into lp:launchpad
Okay, that makes sense, so the assessFailureCounts() will handler the BuildSlaveFailure appropriately and reset slaves when necessary.
Just a minor test comment.
Diff comments:
> === modified file 'lib/lp/buildmaster/manager.py'
> --- lib/lp/buildmaster/manager.py 2014-06-23 14:52:15 +0000
> +++ lib/lp/buildmaster/manager.py 2014-06-24 08:25:56 +0000
> @@ -190,14 +190,25 @@
> """
> del get_property_cache(builder).currentjob
> job = builder.currentjob
> +
> + # If a job is being cancelled we won't bother retrying a failure.
> + # Just mark it as cancelled and clear the builder for normal cleanup.
> + cancelling = job is not None and job.status == BuildQueueStatus.CANCELLING
> builder_action, job_action = judge_failure(
> builder.failure_count, job.specific_build.failure_count if job else 0,
> - exception, retry=retry)
> + exception, retry=retry and not cancelling)
>
> - if job is not None:
> + if job is not None and job_action is not None:
> if job_action == False:
> + # We've decided the job is bad, so unblame the builder.
> + builder.resetFailureCount()
> +
> + if cancelling:
> + # We've previously been asked to cancel the job, so just set
> + # it to cancelled rather than retrying or failing.
> + job.markAsCancelled()
> + elif job_action == False:
> # Fail and dequeue the job.
> - builder.resetFailureCount()
> job.specific_build.updateStatus(BuildStatus.FAILEDTOBUILD)
> job.destroySelf()
> elif job_action == True:
> @@ -339,60 +350,38 @@
> transaction.abort()
>
> @defer.inlineCallbacks
> - def checkCancellation(self, vitals, slave, interactor):
> + def checkCancellation(self, vitals, slave):
> """See if there is a pending cancellation request.
>
> If the current build is in status CANCELLING then terminate it
> immediately.
>
> - :return: A deferred whose value is True if we recovered the builder
> - by resuming a slave host, so that there is no need to update its
> - status.
> + :return: A deferred which fires when this cancellation cycle is done.
> """
> - if vitals.build_queue is None:
> - self.date_cancel = None
> - defer.returnValue(False)
> if vitals.build_queue.status != BuildQueueStatus.CANCELLING:
> self.date_cancel = None
> - defer.returnValue(False)
> -
> - try:
> - if self.date_cancel is None:
> - self.logger.info(
> - "Cancelling BuildQueue %d (%s) on %s",
> - vitals.build_queue.id, self.getExpectedCookie(vitals),
> - vitals.name)
> - yield slave.abort()
> - self.date_cancel = self._clock.seconds() + self.CANCEL_TIMEOUT
> - defer.returnValue(False)
> - else:
> - # The BuildFarmJob will normally set the build's status to
> - # something other than CANCELLING once the builder responds to
> - # the cancel request. This timeout is in case it doesn't.
> - if self._clock.seconds() < self.date_cancel:
> - self.logger.info(
> - "Waiting for BuildQueue %d (%s) on %s to cancel",
> - vitals.build_queue.id, self.getExpectedCookie(vitals),
> - vitals.name)
> - defer.returnValue(False)
> - else:
> - raise BuildSlaveFailure(
> - "Timeout waiting for BuildQueue %d (%s) on %s to "
> - "cancel" % (
> - vitals.build_queue.id, self.getExpectedCookie(vitals),
> - vitals.name))
> - except Exception as e:
> + elif self.date_cancel is None:
> self.logger.info(
> - "Failure while cancelling BuildQueue %d (%s) on %s",
> + "Cancelling BuildQueue %d (%s) on %s",
> vitals.build_queue.id, self.getExpectedCookie(vitals),
> vitals.name)
> - self.date_cancel = None
> - vitals.build_queue.markAsCancelled()
> - transaction.commit()
> - resumed = yield interactor.resetOrFail(
> - vitals, slave, self.builder_factory[vitals.name], self.logger,
> - e)
> - defer.returnValue(resumed)
> + yield slave.abort()
> + self.date_cancel = self._clock.seconds() + self.CANCEL_TIMEOUT
> + else:
> + # The BuildFarmJob will normally set the build's status to
> + # something other than CANCELLING once the builder responds to
> + # the cancel request. This timeout is in case it doesn't.
> + if self._clock.seconds() < self.date_cancel:
> + self.logger.info(
> + "Waiting for BuildQueue %d (%s) on %s to cancel",
> + vitals.build_queue.id, self.getExpectedCookie(vitals),
> + vitals.name)
> + else:
> + raise BuildSlaveFailure(
> + "Timeout waiting for BuildQueue %d (%s) on %s to "
> + "cancel" % (
> + vitals.build_queue.id, self.getExpectedCookie(vitals),
> + vitals.name))
>
> def getExpectedCookie(self, vitals):
> """Return the build cookie expected to be held by the slave.
> @@ -461,10 +450,7 @@
> transaction.commit()
> return
>
> - cancelled = yield self.checkCancellation(
> - vitals, slave, interactor)
> - if cancelled:
> - return
> + yield self.checkCancellation(vitals, slave)
>
> # The slave and DB agree on the builder's state. Scan the
> # slave and get the logtail, or collect the build if it's
>
> === modified file 'lib/lp/buildmaster/tests/test_manager.py'
> --- lib/lp/buildmaster/tests/test_manager.py 2014-06-23 15:14:46 +0000
> +++ lib/lp/buildmaster/tests/test_manager.py 2014-06-24 08:25:56 +0000
> @@ -37,6 +37,7 @@
> )
> from lp.buildmaster.interfaces.builder import (
> BuildDaemonIsolationError,
> + BuildSlaveFailure,
> IBuilderSet,
> )
> from lp.buildmaster.interfaces.buildfarmjobbehaviour import (
> @@ -557,7 +558,7 @@
> # we should have also called the resume() method on the slave that
> # resets the virtual machine.
> clock.advance(SlaveScanner.CANCEL_TIMEOUT)
> - yield scanner.scan()
> + yield scanner.singleCycle()
> self.assertEqual(1, slave.call_log.count("abort"))
> self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
> self.assertEqual(BuildStatus.CANCELLED, build.status)
> @@ -960,7 +961,6 @@
> builder_name = BOB_THE_BUILDER_NAME
> self.builder = getUtility(IBuilderSet)[builder_name]
> self.builder.virtualized = True
> - self.interactor = BuilderInteractor()
>
> @property
> def vitals(self):
> @@ -972,67 +972,56 @@
> scanner.logger.name = 'slave-scanner'
> return scanner
>
> - def test_ignores_nonvirtual(self):
> - # If the builder is nonvirtual make sure we return False.
> - self.builder.virtualized = False
> - d = self._getScanner().checkCancellation(
> - self.vitals, None, self.interactor)
> - return d.addCallback(self.assertFalse)
> -
> - def test_ignores_no_buildqueue(self):
> - # If the builder has no buildqueue associated,
> - # make sure we return False.
> - buildqueue = self.builder.currentjob
> - buildqueue.reset()
> - d = self._getScanner().checkCancellation(
> - self.vitals, None, self.interactor)
> - return d.addCallback(self.assertFalse)
> -
> def test_ignores_build_not_cancelling(self):
> # If the active build is not in a CANCELLING state, ignore it.
> - d = self._getScanner().checkCancellation(
> - self.vitals, None, self.interactor)
> - return d.addCallback(self.assertFalse)
> -
> - @defer.inlineCallbacks
> - def test_cancelling_build_is_cancelled(self):
> + slave = BuildingSlave()
> + scanner = self._getScanner()
> + yield scanner.checkCancellation(self.vitals, slave)
> + self.assertEqual([], slave.call_log)
> +
> + @defer.inlineCallbacks
> + def test_cancelling_build_is_aborted(self):
> + # The first time we see a CANCELLING build, we abort the slave.
> + slave = BuildingSlave()
> + self.builder.current_build.cancel()
> + scanner = self._getScanner()
> + yield scanner.checkCancellation(self.vitals, slave)
> + self.assertEqual(["abort"], slave.call_log)
> +
> + # A further scan is a no-op, as we remember that we've already
> + # requested that the slave abort.
> + yield scanner.checkCancellation(self.vitals, slave)
> + self.assertEqual(["abort"], slave.call_log)
> +
> + @defer.inlineCallbacks
> + def test_timed_out_cancel_errors(self):
> # If a BuildQueue is CANCELLING and the cancel timeout expires,
> - # make sure True is returned and the slave is dirty.
> + # an exception is raised so the normal scan error handler can
> + # finalise the build.
> slave = OkSlave()
> - self.builder.vm_host = "fake_vm_host"
> - buildqueue = self.builder.currentjob
> - build = getUtility(IBinaryPackageBuildSet).getByQueueEntry(buildqueue)
> + build = self.builder.current_build
> build.cancel()
> clock = task.Clock()
> scanner = self._getScanner(clock=clock)
>
> - result = yield scanner.checkCancellation(
> - self.vitals, slave, self.interactor)
> - self.assertNotIn("resume", slave.call_log)
> - self.assertFalse(result)
> + yield scanner.checkCancellation(self.vitals, slave)
> + self.assertEqual(["abort"], slave.call_log)
> self.assertEqual(BuildStatus.CANCELLING, build.status)
>
> clock.advance(SlaveScanner.CANCEL_TIMEOUT)
> - result = yield scanner.checkCancellation(
> - self.vitals, slave, self.interactor)
> - self.assertEqual(BuilderCleanStatus.DIRTY, self.builder.clean_status)
> - self.assertTrue(result)
> - self.assertEqual(BuildStatus.CANCELLED, build.status)
> + with ExpectedException(
> + BuildSlaveFailure, "Timeout waiting for .* to cancel"):
> + yield scanner.checkCancellation(self.vitals, slave)
>
> @defer.inlineCallbacks
> - def test_lost_build_is_cancelled(self):
> - # If the builder reports a fault while attempting to abort it, then
> - # immediately resume the slave as if the cancel timeout had expired.
> + def test_failed_abort_errors(self):
> + # If the builder reports a fault while attempting to abort it,
> + # an exception is raised so the build can be finalised.
> slave = LostBuildingBrokenSlave()
> - self.builder.vm_host = "fake_vm_host"
> - buildqueue = self.builder.currentjob
> - build = getUtility(IBinaryPackageBuildSet).getByQueueEntry(buildqueue)
> - build.cancel()
> - result = yield self._getScanner().checkCancellation(
> - self.vitals, slave, self.interactor)
> - self.assertEqual(BuilderCleanStatus.DIRTY, self.builder.clean_status)
> - self.assertTrue(result)
> - self.assertEqual(BuildStatus.CANCELLED, build.status)
> + self.builder.current_build.cancel()
> + with ExpectedException(
> + xmlrpclib.Fault, "<Fault 8002: 'Could not abort'>"):
> + yield self._getScanner().checkCancellation(self.vitals, slave)
>
>
> class TestBuilddManager(TestCase):
> @@ -1118,6 +1107,19 @@
> self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
>
> @defer.inlineCallbacks
> + def test_reset_during_cancellation_cancels(self):
> + self.buildqueue.cancel()
> + self.assertEqual(BuildStatus.CANCELLING, self.build.status)
> +
> + naked_build = removeSecurityProxy(self.build)
> + self.builder.failure_count = 1
> + naked_build.failure_count = 1
> +
> + yield self._assessFailureCounts("failnotes")
> + self.assertIs(None, self.builder.currentjob)
> + self.assertEqual(BuildStatus.CANCELLED, self.build.status)
> +
> + @defer.inlineCallbacks
> def test_job_failing_more_than_builder_fails_job(self):
> self.build.gotFailure()
> self.build.gotFailure()
> @@ -1129,6 +1131,18 @@
> self.assertEqual(0, self.builder.failure_count)
>
> @defer.inlineCallbacks
> + def test_failure_during_cancellation_cancels(self):
> + self.buildqueue.cancel()
> + self.assertEqual(BuildStatus.CANCELLING, self.build.status)
> +
> + self.build.gotFailure()
> + self.build.gotFailure()
Why do you have to do it twice ?
> + self.builder.gotFailure()
> + yield self._assessFailureCounts("failnotes")
> + self.assertIs(None, self.builder.currentjob)
> + self.assertEqual(BuildStatus.CANCELLED, self.build.status)
> +
> + @defer.inlineCallbacks
> def test_virtual_builder_reset_thresholds(self):
> self.builder.virtualized = True
> self.builder.vm_host = 'foo'
>
--
https://code.launchpad.net/~wgrant/launchpad/checkCancellation-nicify/+merge/224245
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~wgrant/launchpad/checkCancellation-nicify into lp:launchpad.
References