← Back to team overview

launchpad-reviewers team mailing list archive

Re: [Merge] lp:~wgrant/launchpad/checkCancellation-nicify into lp:launchpad

 

Okay, that makes sense, so the assessFailureCounts() will handler the BuildSlaveFailure appropriately and reset slaves when necessary.

Just a minor test comment.

Diff comments:

> === modified file 'lib/lp/buildmaster/manager.py'
> --- lib/lp/buildmaster/manager.py	2014-06-23 14:52:15 +0000
> +++ lib/lp/buildmaster/manager.py	2014-06-24 08:25:56 +0000
> @@ -190,14 +190,25 @@
>      """
>      del get_property_cache(builder).currentjob
>      job = builder.currentjob
> +
> +    # If a job is being cancelled we won't bother retrying a failure.
> +    # Just mark it as cancelled and clear the builder for normal cleanup.
> +    cancelling = job is not None and job.status == BuildQueueStatus.CANCELLING
>      builder_action, job_action = judge_failure(
>          builder.failure_count, job.specific_build.failure_count if job else 0,
> -        exception, retry=retry)
> +        exception, retry=retry and not cancelling)
>  
> -    if job is not None:
> +    if job is not None and job_action is not None:
>          if job_action == False:
> +            # We've decided the job is bad, so unblame the builder.
> +            builder.resetFailureCount()
> +
> +        if cancelling:
> +            # We've previously been asked to cancel the job, so just set
> +            # it to cancelled rather than retrying or failing.
> +            job.markAsCancelled()
> +        elif job_action == False:
>              # Fail and dequeue the job.
> -            builder.resetFailureCount()
>              job.specific_build.updateStatus(BuildStatus.FAILEDTOBUILD)
>              job.destroySelf()
>          elif job_action == True:
> @@ -339,60 +350,38 @@
>              transaction.abort()
>  
>      @defer.inlineCallbacks
> -    def checkCancellation(self, vitals, slave, interactor):
> +    def checkCancellation(self, vitals, slave):
>          """See if there is a pending cancellation request.
>  
>          If the current build is in status CANCELLING then terminate it
>          immediately.
>  
> -        :return: A deferred whose value is True if we recovered the builder
> -            by resuming a slave host, so that there is no need to update its
> -            status.
> +        :return: A deferred which fires when this cancellation cycle is done.
>          """
> -        if vitals.build_queue is None:
> -            self.date_cancel = None
> -            defer.returnValue(False)
>          if vitals.build_queue.status != BuildQueueStatus.CANCELLING:
>              self.date_cancel = None
> -            defer.returnValue(False)
> -
> -        try:
> -            if self.date_cancel is None:
> -                self.logger.info(
> -                    "Cancelling BuildQueue %d (%s) on %s",
> -                    vitals.build_queue.id, self.getExpectedCookie(vitals),
> -                    vitals.name)
> -                yield slave.abort()
> -                self.date_cancel = self._clock.seconds() + self.CANCEL_TIMEOUT
> -                defer.returnValue(False)
> -            else:
> -                # The BuildFarmJob will normally set the build's status to
> -                # something other than CANCELLING once the builder responds to
> -                # the cancel request.  This timeout is in case it doesn't.
> -                if self._clock.seconds() < self.date_cancel:
> -                    self.logger.info(
> -                        "Waiting for BuildQueue %d (%s) on %s to cancel",
> -                        vitals.build_queue.id, self.getExpectedCookie(vitals),
> -                        vitals.name)
> -                    defer.returnValue(False)
> -                else:
> -                    raise BuildSlaveFailure(
> -                        "Timeout waiting for BuildQueue %d (%s) on %s to "
> -                        "cancel" % (
> -                        vitals.build_queue.id, self.getExpectedCookie(vitals),
> -                        vitals.name))
> -        except Exception as e:
> +        elif self.date_cancel is None:
>              self.logger.info(
> -                "Failure while cancelling BuildQueue %d (%s) on %s",
> +                "Cancelling BuildQueue %d (%s) on %s",
>                  vitals.build_queue.id, self.getExpectedCookie(vitals),
>                  vitals.name)
> -            self.date_cancel = None
> -            vitals.build_queue.markAsCancelled()
> -            transaction.commit()
> -            resumed = yield interactor.resetOrFail(
> -                vitals, slave, self.builder_factory[vitals.name], self.logger,
> -                e)
> -            defer.returnValue(resumed)
> +            yield slave.abort()
> +            self.date_cancel = self._clock.seconds() + self.CANCEL_TIMEOUT
> +        else:
> +            # The BuildFarmJob will normally set the build's status to
> +            # something other than CANCELLING once the builder responds to
> +            # the cancel request.  This timeout is in case it doesn't.
> +            if self._clock.seconds() < self.date_cancel:
> +                self.logger.info(
> +                    "Waiting for BuildQueue %d (%s) on %s to cancel",
> +                    vitals.build_queue.id, self.getExpectedCookie(vitals),
> +                    vitals.name)
> +            else:
> +                raise BuildSlaveFailure(
> +                    "Timeout waiting for BuildQueue %d (%s) on %s to "
> +                    "cancel" % (
> +                    vitals.build_queue.id, self.getExpectedCookie(vitals),
> +                    vitals.name))
>  
>      def getExpectedCookie(self, vitals):
>          """Return the build cookie expected to be held by the slave.
> @@ -461,10 +450,7 @@
>                  transaction.commit()
>                  return
>  
> -            cancelled = yield self.checkCancellation(
> -                vitals, slave, interactor)
> -            if cancelled:
> -                return
> +            yield self.checkCancellation(vitals, slave)
>  
>              # The slave and DB agree on the builder's state.  Scan the
>              # slave and get the logtail, or collect the build if it's
> 
> === modified file 'lib/lp/buildmaster/tests/test_manager.py'
> --- lib/lp/buildmaster/tests/test_manager.py	2014-06-23 15:14:46 +0000
> +++ lib/lp/buildmaster/tests/test_manager.py	2014-06-24 08:25:56 +0000
> @@ -37,6 +37,7 @@
>      )
>  from lp.buildmaster.interfaces.builder import (
>      BuildDaemonIsolationError,
> +    BuildSlaveFailure,
>      IBuilderSet,
>      )
>  from lp.buildmaster.interfaces.buildfarmjobbehaviour import (
> @@ -557,7 +558,7 @@
>          # we should have also called the resume() method on the slave that
>          # resets the virtual machine.
>          clock.advance(SlaveScanner.CANCEL_TIMEOUT)
> -        yield scanner.scan()
> +        yield scanner.singleCycle()
>          self.assertEqual(1, slave.call_log.count("abort"))
>          self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
>          self.assertEqual(BuildStatus.CANCELLED, build.status)
> @@ -960,7 +961,6 @@
>          builder_name = BOB_THE_BUILDER_NAME
>          self.builder = getUtility(IBuilderSet)[builder_name]
>          self.builder.virtualized = True
> -        self.interactor = BuilderInteractor()
>  
>      @property
>      def vitals(self):
> @@ -972,67 +972,56 @@
>          scanner.logger.name = 'slave-scanner'
>          return scanner
>  
> -    def test_ignores_nonvirtual(self):
> -        # If the builder is nonvirtual make sure we return False.
> -        self.builder.virtualized = False
> -        d = self._getScanner().checkCancellation(
> -            self.vitals, None, self.interactor)
> -        return d.addCallback(self.assertFalse)
> -
> -    def test_ignores_no_buildqueue(self):
> -        # If the builder has no buildqueue associated,
> -        # make sure we return False.
> -        buildqueue = self.builder.currentjob
> -        buildqueue.reset()
> -        d = self._getScanner().checkCancellation(
> -            self.vitals, None, self.interactor)
> -        return d.addCallback(self.assertFalse)
> -
>      def test_ignores_build_not_cancelling(self):
>          # If the active build is not in a CANCELLING state, ignore it.
> -        d = self._getScanner().checkCancellation(
> -            self.vitals, None, self.interactor)
> -        return d.addCallback(self.assertFalse)
> -
> -    @defer.inlineCallbacks
> -    def test_cancelling_build_is_cancelled(self):
> +        slave = BuildingSlave()
> +        scanner = self._getScanner()
> +        yield scanner.checkCancellation(self.vitals, slave)
> +        self.assertEqual([], slave.call_log)
> +
> +    @defer.inlineCallbacks
> +    def test_cancelling_build_is_aborted(self):
> +        # The first time we see a CANCELLING build, we abort the slave.
> +        slave = BuildingSlave()
> +        self.builder.current_build.cancel()
> +        scanner = self._getScanner()
> +        yield scanner.checkCancellation(self.vitals, slave)
> +        self.assertEqual(["abort"], slave.call_log)
> +
> +        # A further scan is a no-op, as we remember that we've already
> +        # requested that the slave abort.
> +        yield scanner.checkCancellation(self.vitals, slave)
> +        self.assertEqual(["abort"], slave.call_log)
> +
> +    @defer.inlineCallbacks
> +    def test_timed_out_cancel_errors(self):
>          # If a BuildQueue is CANCELLING and the cancel timeout expires,
> -        # make sure True is returned and the slave is dirty.
> +        # an exception is raised so the normal scan error handler can
> +        # finalise the build.
>          slave = OkSlave()
> -        self.builder.vm_host = "fake_vm_host"
> -        buildqueue = self.builder.currentjob
> -        build = getUtility(IBinaryPackageBuildSet).getByQueueEntry(buildqueue)
> +        build = self.builder.current_build
>          build.cancel()
>          clock = task.Clock()
>          scanner = self._getScanner(clock=clock)
>  
> -        result = yield scanner.checkCancellation(
> -            self.vitals, slave, self.interactor)
> -        self.assertNotIn("resume", slave.call_log)
> -        self.assertFalse(result)
> +        yield scanner.checkCancellation(self.vitals, slave)
> +        self.assertEqual(["abort"], slave.call_log)
>          self.assertEqual(BuildStatus.CANCELLING, build.status)
>  
>          clock.advance(SlaveScanner.CANCEL_TIMEOUT)
> -        result = yield scanner.checkCancellation(
> -            self.vitals, slave, self.interactor)
> -        self.assertEqual(BuilderCleanStatus.DIRTY, self.builder.clean_status)
> -        self.assertTrue(result)
> -        self.assertEqual(BuildStatus.CANCELLED, build.status)
> +        with ExpectedException(
> +                BuildSlaveFailure, "Timeout waiting for .* to cancel"):
> +            yield scanner.checkCancellation(self.vitals, slave)
>  
>      @defer.inlineCallbacks
> -    def test_lost_build_is_cancelled(self):
> -        # If the builder reports a fault while attempting to abort it, then
> -        # immediately resume the slave as if the cancel timeout had expired.
> +    def test_failed_abort_errors(self):
> +        # If the builder reports a fault while attempting to abort it,
> +        # an exception is raised so the build can be finalised.
>          slave = LostBuildingBrokenSlave()
> -        self.builder.vm_host = "fake_vm_host"
> -        buildqueue = self.builder.currentjob
> -        build = getUtility(IBinaryPackageBuildSet).getByQueueEntry(buildqueue)
> -        build.cancel()
> -        result = yield self._getScanner().checkCancellation(
> -            self.vitals, slave, self.interactor)
> -        self.assertEqual(BuilderCleanStatus.DIRTY, self.builder.clean_status)
> -        self.assertTrue(result)
> -        self.assertEqual(BuildStatus.CANCELLED, build.status)
> +        self.builder.current_build.cancel()
> +        with ExpectedException(
> +                xmlrpclib.Fault, "<Fault 8002: 'Could not abort'>"):
> +            yield self._getScanner().checkCancellation(self.vitals, slave)
>  
>  
>  class TestBuilddManager(TestCase):
> @@ -1118,6 +1107,19 @@
>          self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
>  
>      @defer.inlineCallbacks
> +    def test_reset_during_cancellation_cancels(self):
> +        self.buildqueue.cancel()
> +        self.assertEqual(BuildStatus.CANCELLING, self.build.status)
> +
> +        naked_build = removeSecurityProxy(self.build)
> +        self.builder.failure_count = 1
> +        naked_build.failure_count = 1
> +
> +        yield self._assessFailureCounts("failnotes")
> +        self.assertIs(None, self.builder.currentjob)
> +        self.assertEqual(BuildStatus.CANCELLED, self.build.status)
> +
> +    @defer.inlineCallbacks
>      def test_job_failing_more_than_builder_fails_job(self):
>          self.build.gotFailure()
>          self.build.gotFailure()
> @@ -1129,6 +1131,18 @@
>          self.assertEqual(0, self.builder.failure_count)
>  
>      @defer.inlineCallbacks
> +    def test_failure_during_cancellation_cancels(self):
> +        self.buildqueue.cancel()
> +        self.assertEqual(BuildStatus.CANCELLING, self.build.status)
> +
> +        self.build.gotFailure()
> +        self.build.gotFailure()

Why do you have to do it twice ?

> +        self.builder.gotFailure()
> +        yield self._assessFailureCounts("failnotes")
> +        self.assertIs(None, self.builder.currentjob)
> +        self.assertEqual(BuildStatus.CANCELLED, self.build.status)
> +
> +    @defer.inlineCallbacks
>      def test_virtual_builder_reset_thresholds(self):
>          self.builder.virtualized = True
>          self.builder.vm_host = 'foo'
> 


-- 
https://code.launchpad.net/~wgrant/launchpad/checkCancellation-nicify/+merge/224245
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~wgrant/launchpad/checkCancellation-nicify into lp:launchpad.


References