launchpad-reviewers team mailing list archive

Re: [Merge] lp:~wgrant/launchpad/ppa-reset-2.0-basics into lp:launchpad


Review: Approve

Looks good to me; just one minor suggestion.

Diff comments:

> === modified file 'lib/lp/buildmaster/interactor.py'
> --- lib/lp/buildmaster/interactor.py	2014-05-10 18:40:36 +0000
> +++ lib/lp/buildmaster/interactor.py	2014-06-17 13:03:35 +0000
> @@ -21,6 +21,7 @@
>      removeSecurityProxy,
>      )
>  
> +from lp.buildmaster.enums import BuilderCleanStatus
>  from lp.buildmaster.interfaces.builder import (
>      BuildDaemonError,
>      CannotFetchFile,
> @@ -216,7 +217,7 @@
>  BuilderVitals = namedtuple(
>      'BuilderVitals',
>      ('name', 'url', 'virtualized', 'vm_host', 'builderok', 'manual',
> -     'build_queue', 'version'))
> +     'build_queue', 'version', 'clean_status'))
>  
>  _BQ_UNSPECIFIED = object()
>  
> @@ -226,7 +227,8 @@
>          build_queue = builder.currentjob
>      return BuilderVitals(
>          builder.name, builder.url, builder.virtualized, builder.vm_host,
> -        builder.builderok, builder.manual, build_queue, builder.version)
> +        builder.builderok, builder.manual, build_queue, builder.version,
> +        builder.clean_status)
>  
>  
>  class BuilderInteractor(object):
> @@ -249,44 +251,6 @@
>          return behaviour
>  
>      @classmethod
> -    @defer.inlineCallbacks
> -    def rescueIfLost(cls, vitals, slave, slave_status, expected_cookie,
> -                     logger=None):
> -        """Reset the slave if its job information doesn't match the DB.
> -
> -        This checks the build ID reported in the slave status against
> -        the given cookie. If it isn't building what we think it should
> -        be, the current build will be aborted and the slave cleaned in
> -        preparation for a new task.
> -
> -        :return: A Deferred that fires when the dialog with the slave is
> -            finished.  Its return value is True if the slave is lost,
> -            False otherwise.
> -        """
> -        # Determine the slave's current build cookie.
> -        status = slave_status['builder_status']
> -        slave_cookie = slave_status.get('build_id')
> -
> -        if slave_cookie == expected_cookie:
> -            # The master and slave agree about the current job. Continue.
> -            defer.returnValue(False)
> -        else:
> -            # The master and slave disagree. The master is our master,
> -            # so try to rescue the slave.
> -            # An IDLE slave doesn't need rescuing (SlaveScanner.scan
> -            # will rescue the DB side instead), and we just have to wait
> -            # out an ABORTING one.
> -            if status == 'BuilderStatus.WAITING':
> -                yield slave.clean()
> -            elif status == 'BuilderStatus.BUILDING':
> -                yield slave.abort()
> -            if logger:
> -                logger.info(
> -                    "Builder slave '%s' rescued from %r (expected %r)." %
> -                    (vitals.name, slave_cookie, expected_cookie))
> -            defer.returnValue(True)
> -
> -    @classmethod
>      def resumeSlaveHost(cls, vitals, slave):
>          """Resume the slave host to a known good condition.
>  
> @@ -323,6 +287,49 @@
>  
>      @classmethod
>      @defer.inlineCallbacks
> +    def cleanSlave(cls, vitals, slave):
> +        """Prepare a slave for a new build.
> +
> +        :return: A Deferred that fires when this stage of the resume
> +            operations finishes. If the value is True, the slave is now clean.
> +            If it's False, the clean is still in progress and this must be
> +            called again later.
> +        """
> +        if vitals.virtualized:
> +            # If we are building a virtual build, resume the virtual
> +            # machine.  Before we try and contact the resumed slave,
> +            # we're going to send it a message.  This is to ensure
> +            # it's accepting packets from the outside world, because
> +            # testing has shown that the first packet will randomly
> +            # fail for no apparent reason.  This could be a quirk of
> +            # the Xen guest, we're not sure.  We also don't care
> +            # about the result from this message, just that it's
> +            # sent, hence the "addBoth".  See bug 586359.
> +            yield cls.resumeSlaveHost(vitals, slave)
> +            yield slave.echo("ping")
> +            defer.returnValue(True)
> +        else:
> +            slave_status = yield slave.status()
> +            status = slave_status.get('builder_status', None)
> +            if status == 'BuilderStatus.IDLE':
> +                # This is as clean as we can get it.
> +                defer.returnValue(True)
> +            elif status == 'BuilderStatus.BUILDING':
> +                # Asynchronously abort() the slave and wait until WAITING.
> +                yield slave.abort()
> +                defer.returnValue(False)
> +            elif status == 'BuilderStatus.ABORTING':
> +                # Wait it out until WAITING.
> +                defer.returnValue(False)
> +            elif status == 'BuilderStatus.WAITING':
> +                # Just a synchronous clean() call and we'll be idle.
> +                yield slave.clean()
> +                defer.returnValue(True)
> +            raise BuildDaemonError(
> +                "Invalid status during clean: %r" % status)
> +
> +    @classmethod
> +    @defer.inlineCallbacks
>      def _startBuild(cls, build_queue_item, vitals, builder, slave, behaviour,
>                      logger):
>          """Start a build on this builder.
> @@ -342,17 +349,12 @@
>              raise BuildDaemonError(
>                  "Attempted to start a build on a known-bad builder.")
>  
> -        # If we are building a virtual build, resume the virtual
> -        # machine.  Before we try and contact the resumed slave, we're
> -        # going to send it a message.  This is to ensure it's accepting
> -        # packets from the outside world, because testing has shown that
> -        # the first packet will randomly fail for no apparent reason.
> -        # This could be a quirk of the Xen guest, we're not sure.  We
> -        # also don't care about the result from this message, just that
> -        # it's sent, hence the "addBoth".  See bug 586359.
> -        if builder.virtualized:
> -            yield cls.resumeSlaveHost(vitals, slave)
> -            yield slave.echo("ping")
> +        if builder.clean_status != BuilderCleanStatus.CLEAN:
> +            raise BuildDaemonError(
> +                "Attempted to start build on a dirty slave.")
> +
> +        builder.setCleanStatus(BuilderCleanStatus.DIRTY)
> +        transaction.commit()
>  
>          yield behaviour.dispatchBuildToSlave(build_queue_item.id, logger)
>  
> @@ -448,9 +450,9 @@
>          :return: A Deferred that fires when the slave dialog is finished.
>          """
>          # IDLE is deliberately not handled here, because it should be
> -        # impossible to get past rescueIfLost unless the slave matches
> -        # the DB, and this method isn't called unless the DB says
> -        # there's a job.
> +        # impossible to get past the cookie check unless the slave
> +        # matches the DB, and this method isn't called unless the DB
> +        # says there's a job.
>          builder_status = slave_status['builder_status']
>          if builder_status == 'BuilderStatus.BUILDING':
>              # Build still building, collect the logtail.
> 
> === modified file 'lib/lp/buildmaster/manager.py'
> --- lib/lp/buildmaster/manager.py	2014-02-05 23:22:30 +0000
> +++ lib/lp/buildmaster/manager.py	2014-06-17 13:03:35 +0000
> @@ -26,6 +26,7 @@
>  from zope.component import getUtility
>  
>  from lp.buildmaster.enums import (
> +    BuilderCleanStatus,
>      BuildQueueStatus,
>      BuildStatus,
>      )
> @@ -413,70 +414,91 @@
>          vitals = self.builder_factory.getVitals(self.builder_name)
>          interactor = self.interactor_factory()
>          slave = self.slave_factory(vitals)
> -        if vitals.builderok:
> -            self.logger.debug("Scanning %s." % self.builder_name)
> -            slave_status = yield slave.status()
> -        else:
> -            slave_status = None
> -
> -        # Confirm that the DB and slave sides are in a valid, mutually
> -        # agreeable state.
> -        lost_reason = None
> -        if not vitals.builderok:
> -            lost_reason = '%s is disabled' % vitals.name
> -        else:
> -            self.updateVersion(vitals, slave_status)
> -            cancelled = yield self.checkCancellation(vitals, slave, interactor)
> -            if cancelled:
> -                return
> -            assert slave_status is not None
> -            lost = yield interactor.rescueIfLost(
> -                vitals, slave, slave_status, self.getExpectedCookie(vitals),
> -                self.logger)
> -            if lost:
> -                lost_reason = '%s is lost' % vitals.name
> -
> -        # The slave is lost or the builder is disabled. We can't
> -        # continue to update the job status or dispatch a new job, so
> -        # just rescue the assigned job, if any, so it can be dispatched
> -        # to another slave.
> -        if lost_reason is not None:
> -            if vitals.build_queue is not None:
> +
> +        if vitals.build_queue is not None:
> +            if vitals.clean_status != BuilderCleanStatus.DIRTY:
> +                # This is probably a grave bug with security implications,
> +                # as a slave that has a job must be cleaned afterwards.
> +                raise BuildDaemonError("Non-dirty builder allegedly building.")
> +
> +            lost_reason = None
> +            if not vitals.builderok:
> +                lost_reason = '%s is disabled' % vitals.name
> +            else:
> +                slave_status = yield slave.status()
> +                # Ensure that the slave has the job that we think it
> +                # should.
> +                slave_cookie = slave_status.get('build_id')
> +                expected_cookie = self.getExpectedCookie(vitals)
> +                if slave_cookie != expected_cookie:
> +                    lost_reason = (
> +                        '%s is lost (expected %r, got %r)' % (
> +                            vitals.name, expected_cookie, slave_cookie))
> +
> +            if lost_reason is not None:
> +                # The slave is either confused or disabled, so reset and
> +                # requeue the job. The next scan cycle will clean up the
> +                # slave if appropriate.
>                  self.logger.warn(
>                      "%s. Resetting BuildQueue %d.", lost_reason,
>                      vitals.build_queue.id)
>                  vitals.build_queue.reset()
>                  transaction.commit()
> -            return
> -
> -        # We've confirmed that the slave state matches the DB. Continue
> -        # with updating the job status, or dispatching a new job if the
> -        # builder is idle.
> -        if vitals.build_queue is not None:
> -            # Scan the slave and get the logtail, or collect the build
> -            # if it's ready.  Yes, "updateBuild" is a bad name.
> +                return
> +
> +            cancelled = yield self.checkCancellation(
> +                vitals, slave, interactor)
> +            if cancelled:
> +                return
> +
> +            # The slave and DB agree on the builder's state.  Scan the
> +            # slave and get the logtail, or collect the build if it's
> +            # ready.  Yes, "updateBuild" is a bad name.
>              assert slave_status is not None
>              yield interactor.updateBuild(
>                  vitals, slave, slave_status, self.builder_factory,
>                  self.behaviour_factory)
> -        elif vitals.manual:
> -            # If the builder is in manual mode, don't dispatch anything.
> -            self.logger.debug(
> -                '%s is in manual mode, not dispatching.' % vitals.name)
>          else:
> -            # See if there is a job we can dispatch to the builder slave.
> -            builder = self.builder_factory[self.builder_name]
> -            # Try to dispatch the job. If it fails, don't attempt to
> -            # just retry the scan; we need to reset the job so the
> -            # dispatch will be reattempted.
> -            d = interactor.findAndStartJob(vitals, builder, slave)
> -            d.addErrback(functools.partial(self._scanFailed, False))
> -            yield d
> -            if builder.currentjob is not None:
> -                # After a successful dispatch we can reset the
> -                # failure_count.
> -                builder.resetFailureCount()
> -                transaction.commit()
> +            if not vitals.builderok:
> +                return
> +            # We think the builder is idle. If it's clean, dispatch. If
> +            # it's dirty, clean.
> +            if vitals.clean_status == BuilderCleanStatus.CLEAN:
> +                slave_status = yield slave.status()
> +                if slave_status.get('builder_status') != 'BuilderStatus.IDLE':
> +                    raise BuildDaemonError(
> +                        'Allegedly clean slave not idle (%r instead)'
> +                        % slave_status.get('builder_status'))
> +                self.updateVersion(vitals, slave_status)
> +                if vitals.manual:
> +                    # If the builder is in manual mode, don't dispatch
> +                    # anything.
> +                    self.logger.debug(
> +                        '%s is in manual mode, not dispatching.', vitals.name)
> +                    return
> +                # Try to find and dispatch a job. If it fails, don't
> +                # attempt to just retry the scan; we need to reset
> +                # the job so the dispatch will be reattempted.
> +                builder = self.builder_factory[self.builder_name]
> +                d = interactor.findAndStartJob(vitals, builder, slave)
> +                d.addErrback(functools.partial(self._scanFailed, False))
> +                yield d
> +                if builder.currentjob is not None:
> +                    # After a successful dispatch we can reset the
> +                    # failure_count.
> +                    builder.resetFailureCount()
> +                    transaction.commit()
> +            else:

Worth an "assert vitals.clean_status == BuilderCleanStatus.DIRTY" or similar here? CLEANING is for the new protocol and we aren't expecting it yet, but an explicit assertion would have saved me a bit of thought while reading this logic.

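Something like this, purely as a sketch reusing the names already in the branch (exact form and message up to you):

    else:
        # CLEANING belongs to the new protocol and isn't expected from
        # this scanner yet, so make the invariant explicit before asking
        # the interactor to clean the slave.
        assert vitals.clean_status == BuilderCleanStatus.DIRTY, (
            "Unexpected clean_status: %r" % vitals.clean_status)
        done = yield interactor.cleanSlave(vitals, slave)
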
> +                # Ask the BuilderInteractor to clean the slave. It might
> +                # be immediately cleaned on return, in which case we go
> +                # straight back to CLEAN, or we might have to spin
> +                # through another few cycles.
> +                done = yield interactor.cleanSlave(vitals, slave)
> +                if done:
> +                    builder = self.builder_factory[self.builder_name]
> +                    builder.setCleanStatus(BuilderCleanStatus.CLEAN)
> +                    self.logger.debug('%s has been cleaned.', vitals.name)
> +                    transaction.commit()
>  
>  
>  class NewBuildersScanner:
> 
> === modified file 'lib/lp/buildmaster/tests/mock_slaves.py'
> --- lib/lp/buildmaster/tests/mock_slaves.py	2014-06-06 10:32:55 +0000
> +++ lib/lp/buildmaster/tests/mock_slaves.py	2014-06-17 13:03:35 +0000
> @@ -30,6 +30,7 @@
>  from twisted.internet import defer
>  from twisted.web import xmlrpc
>  
> +from lp.buildmaster.enums import BuilderCleanStatus
>  from lp.buildmaster.interactor import BuilderSlave
>  from lp.buildmaster.interfaces.builder import CannotFetchFile
>  from lp.services.config import config
> @@ -48,7 +49,7 @@
>  
>      def __init__(self, name='mock-builder', builderok=True, manual=False,
>                   virtualized=True, vm_host=None, url='http://fake:0000',
> -                 version=None):
> +                 version=None, clean_status=BuilderCleanStatus.DIRTY):
>          self.currentjob = None
>          self.builderok = builderok
>          self.manual = manual
> @@ -58,6 +59,10 @@
>          self.vm_host = vm_host
>          self.failnotes = None
>          self.version = version
> +        self.clean_status = clean_status
> +
> +    def setCleanStatus(self, clean_status):
> +        self.clean_status = clean_status
>  
>      def failBuilder(self, reason):
>          self.builderok = False
> 
> === modified file 'lib/lp/buildmaster/tests/test_interactor.py'
> --- lib/lp/buildmaster/tests/test_interactor.py	2014-06-06 11:15:29 +0000
> +++ lib/lp/buildmaster/tests/test_interactor.py	2014-06-17 13:03:35 +0000
> @@ -16,19 +16,24 @@
>      SynchronousDeferredRunTest,
>      )
>  from testtools.matchers import ContainsAll
> +from testtools.testcase import ExpectedException
>  from twisted.internet import defer
>  from twisted.internet.task import Clock
>  from twisted.python.failure import Failure
>  from twisted.web.client import getPage
>  from zope.security.proxy import removeSecurityProxy
>  
> -from lp.buildmaster.enums import BuildStatus
> +from lp.buildmaster.enums import (
> +    BuilderCleanStatus,
> +    BuildStatus,
> +    )
>  from lp.buildmaster.interactor import (
>      BuilderInteractor,
>      BuilderSlave,
>      extract_vitals_from_db,
>      )
>  from lp.buildmaster.interfaces.builder import (
> +    BuildDaemonError,
>      CannotFetchFile,
>      CannotResumeHost,
>      )
> @@ -151,44 +156,6 @@
>          self.assertEqual(5, slave.timeout)
>  
>      @defer.inlineCallbacks
> -    def test_rescueIfLost_aborts_lost_and_broken_slave(self):
> -        # A slave that's 'lost' should be aborted; when the slave is
> -        # broken then abort() should also throw a fault.
> -        slave = LostBuildingBrokenSlave()
> -        slave_status = yield slave.status()
> -        try:
> -            yield BuilderInteractor.rescueIfLost(
> -                extract_vitals_from_db(MockBuilder()), slave, slave_status,
> -                'trivial')
> -        except xmlrpclib.Fault:
> -            self.assertIn('abort', slave.call_log)
> -        else:
> -            self.fail("xmlrpclib.Fault not raised")
> -
> -    @defer.inlineCallbacks
> -    def test_recover_idle_slave(self):
> -        # An idle slave is not rescued, even if it's not meant to be
> -        # idle. SlaveScanner.scan() will clean up the DB side, because
> -        # we still report that it's lost.
> -        slave = OkSlave()
> -        slave_status = yield slave.status()
> -        lost = yield BuilderInteractor.rescueIfLost(
> -            extract_vitals_from_db(MockBuilder()), slave, slave_status,
> -            'trivial')
> -        self.assertTrue(lost)
> -        self.assertEqual(['status'], slave.call_log)
> -
> -    @defer.inlineCallbacks
> -    def test_recover_ok_slave(self):
> -        # An idle slave that's meant to be idle is not rescued.
> -        slave = OkSlave()
> -        slave_status = yield slave.status()
> -        lost = yield BuilderInteractor.rescueIfLost(
> -            extract_vitals_from_db(MockBuilder()), slave, slave_status, None)
> -        self.assertFalse(lost)
> -        self.assertEqual(['status'], slave.call_log)
> -
> -    @defer.inlineCallbacks
>      def test_recover_waiting_slave_with_good_id(self):
>          # rescueIfLost does not attempt to abort or clean a builder that is
>          # WAITING.
> @@ -240,6 +207,76 @@
>          self.assertEqual(['status', 'abort'], building_slave.call_log)
>  
>  
> +class TestBuilderInteractorCleanSlave(TestCase):
> +
> +    run_tests_with = AsynchronousDeferredRunTest
> +
> +    @defer.inlineCallbacks
> +    def assertCleanCalls(self, builder, slave, calls, done):
> +        actually_done = yield BuilderInteractor.cleanSlave(
> +            extract_vitals_from_db(builder), slave)
> +        self.assertEqual(done, actually_done)
> +        self.assertEqual(calls, slave.method_log)
> +
> +    @defer.inlineCallbacks
> +    def test_virtual(self):
> +        # We don't care what the status of a virtual builder is; we just
> +        # reset its VM and check that it's back.
> +        yield self.assertCleanCalls(
> +            MockBuilder(
> +                virtualized=True, vm_host='lol',
> +                clean_status=BuilderCleanStatus.DIRTY),
> +            OkSlave(), ['resume', 'echo'], True)
> +
> +    @defer.inlineCallbacks
> +    def test_nonvirtual_idle(self):
> +        # An IDLE non-virtual slave is already as clean as we can get it.
> +        yield self.assertCleanCalls(
> +            MockBuilder(
> +                virtualized=False, clean_status=BuilderCleanStatus.DIRTY),
> +            OkSlave(), ['status'], True)
> +
> +    @defer.inlineCallbacks
> +    def test_nonvirtual_building(self):
> +        # A BUILDING non-virtual slave needs to be aborted. It'll go
> +        # through ABORTING and eventually be picked up from WAITING.
> +        yield self.assertCleanCalls(
> +            MockBuilder(
> +                virtualized=False, clean_status=BuilderCleanStatus.DIRTY),
> +            BuildingSlave(), ['status', 'abort'], False)
> +
> +    @defer.inlineCallbacks
> +    def test_nonvirtual_aborting(self):
> +        # An ABORTING non-virtual slave must be waited out. It should
> +        # hit WAITING eventually.
> +        yield self.assertCleanCalls(
> +            MockBuilder(
> +                virtualized=False, clean_status=BuilderCleanStatus.DIRTY),
> +            AbortingSlave(), ['status'], False)
> +
> +    @defer.inlineCallbacks
> +    def test_nonvirtual_waiting(self):
> +        # A WAITING non-virtual slave just needs clean() called.
> +        yield self.assertCleanCalls(
> +            MockBuilder(
> +                virtualized=False, clean_status=BuilderCleanStatus.DIRTY),
> +            WaitingSlave(), ['status', 'clean'], True)
> +
> +    @defer.inlineCallbacks
> +    def test_nonvirtual_broken(self):
> +        # A broken non-virtual builder is probably unrecoverable, so the
> +        # method just crashes.
> +        vitals = extract_vitals_from_db(MockBuilder(
> +            virtualized=False, clean_status=BuilderCleanStatus.DIRTY))
> +        slave = LostBuildingBrokenSlave()
> +        try:
> +            yield BuilderInteractor.cleanSlave(vitals, slave)
> +        except xmlrpclib.Fault:
> +            self.assertEqual(['status', 'abort'], slave.call_log)
> +        else:
> +            self.fail("abort() should crash.")
> +
> +
>  class TestBuilderSlaveStatus(TestCase):
>      # Verify what BuilderSlave.status returns with slaves in different
>      # states.
> @@ -321,6 +358,7 @@
>          processor = self.factory.makeProcessor(name="i386")
>          builder = self.factory.makeBuilder(
>              processors=[processor], virtualized=True, vm_host="bladh")
> +        builder.setCleanStatus(BuilderCleanStatus.CLEAN)
>          self.patch(BuilderSlave, 'makeBuilderSlave', FakeMethod(OkSlave()))
>          distroseries = self.factory.makeDistroSeries()
>          das = self.factory.makeDistroArchSeries(
> @@ -377,21 +415,30 @@
>  
>          return d.addCallback(check_build_started)
>  
> -    def test_virtual_job_dispatch_pings_before_building(self):
> -        # We need to send a ping to the builder to work around a bug
> -        # where sometimes the first network packet sent is dropped.
> -        builder, build = self._setupBinaryBuildAndBuilder()
> -        candidate = build.queueBuild()
> -        removeSecurityProxy(builder)._findBuildCandidate = FakeMethod(
> -            result=candidate)
> -        vitals = extract_vitals_from_db(builder)
> -        slave = OkSlave()
> -        d = BuilderInteractor.findAndStartJob(vitals, builder, slave)
> -
> -        def check_build_started(candidate):
> -            self.assertIn(('echo', 'ping'), slave.call_log)
> -
> -        return d.addCallback(check_build_started)
> +    @defer.inlineCallbacks
> +    def test_findAndStartJob_requires_clean_slave(self):
> +        # findAndStartJob ensures that its slave starts CLEAN.
> +        builder, build = self._setupBinaryBuildAndBuilder()
> +        builder.setCleanStatus(BuilderCleanStatus.DIRTY)
> +        candidate = build.queueBuild()
> +        removeSecurityProxy(builder)._findBuildCandidate = FakeMethod(
> +            result=candidate)
> +        vitals = extract_vitals_from_db(builder)
> +        with ExpectedException(
> +                BuildDaemonError,
> +                "Attempted to start build on a dirty slave."):
> +            yield BuilderInteractor.findAndStartJob(vitals, builder, OkSlave())
> +
> +    @defer.inlineCallbacks
> +    def test_findAndStartJob_dirties_slave(self):
> +        # findAndStartJob marks its builder DIRTY before dispatching.
> +        builder, build = self._setupBinaryBuildAndBuilder()
> +        candidate = build.queueBuild()
> +        removeSecurityProxy(builder)._findBuildCandidate = FakeMethod(
> +            result=candidate)
> +        vitals = extract_vitals_from_db(builder)
> +        yield BuilderInteractor.findAndStartJob(vitals, builder, OkSlave())
> +        self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
>  
>  
>  class TestSlave(TestCase):
> 
> === modified file 'lib/lp/buildmaster/tests/test_manager.py'
> --- lib/lp/buildmaster/tests/test_manager.py	2014-06-06 10:32:55 +0000
> +++ lib/lp/buildmaster/tests/test_manager.py	2014-06-17 13:03:35 +0000
> @@ -13,6 +13,7 @@
>      AsynchronousDeferredRunTest,
>      )
>  from testtools.matchers import Equals
> +from testtools.testcase import ExpectedException
>  import transaction
>  from twisted.internet import (
>      defer,
> @@ -25,6 +26,7 @@
>  from zope.security.proxy import removeSecurityProxy
>  
>  from lp.buildmaster.enums import (
> +    BuilderCleanStatus,
>      BuildQueueStatus,
>      BuildStatus,
>      )
> @@ -33,7 +35,10 @@
>      BuilderSlave,
>      extract_vitals_from_db,
>      )
> -from lp.buildmaster.interfaces.builder import IBuilderSet
> +from lp.buildmaster.interfaces.builder import (
> +    BuildDaemonError,
> +    IBuilderSet,
> +    )
>  from lp.buildmaster.interfaces.buildfarmjobbehaviour import (
>      IBuildFarmJobBehaviour,
>      )
> @@ -113,6 +118,7 @@
>          job = builder.currentjob
>          if job is not None:
>              job.reset()
> +        builder.setCleanStatus(BuilderCleanStatus.CLEAN)
>  
>          transaction.commit()
>  
> @@ -155,6 +161,7 @@
>          # Set this to 1 here so that _checkDispatch can make sure it's
>          # reset to 0 after a successful dispatch.
>          builder.failure_count = 1
> +        builder.setCleanStatus(BuilderCleanStatus.CLEAN)
>  
>          # Run 'scan' and check its result.
>          switch_dbuser(config.builddmaster.dbuser)
> @@ -258,6 +265,7 @@
>      def test_scan_with_nothing_to_dispatch(self):
>          factory = LaunchpadObjectFactory()
>          builder = factory.makeBuilder()
> +        builder.setCleanStatus(BuilderCleanStatus.CLEAN)
>          self.patch(BuilderSlave, 'makeBuilderSlave', FakeMethod(OkSlave()))
>          transaction.commit()
>          scanner = self._getScanner(builder_name=builder.name)
> @@ -422,9 +430,9 @@
>          self.assertFalse(builder.builderok)
>  
>      @defer.inlineCallbacks
> -    def test_fail_to_resume_slave_resets_job(self):
> -        # If an attempt to resume and dispatch a slave fails, it should
> -        # reset the job via job.reset()
> +    def test_fail_to_resume_leaves_it_dirty(self):
> +        # If an attempt to resume a slave fails, its failure count is
> +        # incremented and it is left DIRTY.
>  
>          # Make a slave with a failing resume() method.
>          slave = OkSlave()
> @@ -435,26 +443,21 @@
>          builder = removeSecurityProxy(
>              getUtility(IBuilderSet)[BOB_THE_BUILDER_NAME])
>          self._resetBuilder(builder)
> +        builder.setCleanStatus(BuilderCleanStatus.DIRTY)
> +        builder.virtualized = True
>          self.assertEqual(0, builder.failure_count)
>          self.patch(BuilderSlave, 'makeBuilderSlave', FakeMethod(slave))
>          builder.vm_host = "fake_vm_host"
> -
> -        scanner = self._getScanner()
> -
> -        # Get the next job that will be dispatched.
> -        job = removeSecurityProxy(builder._findBuildCandidate())
> -        job.virtualized = True
> -        builder.virtualized = True
>          transaction.commit()
> -        yield scanner.singleCycle()
> -
> -        # The failure_count will have been incremented on the builder, we
> -        # can check that to see that a dispatch attempt did indeed occur.
> +
> +        # A spin of the scanner will see the DIRTY builder and reset it.
> +        # Our patched reset will fail.
> +        yield self._getScanner().singleCycle()
> +
> +        # The failure_count will have been incremented on the builder,
> +        # and it will be left DIRTY.
>          self.assertEqual(1, builder.failure_count)
> -        # There should also be no builder set on the job.
> -        self.assertIsNone(job.builder)
> -        build = getUtility(IBinaryPackageBuildSet).getByQueueEntry(job)
> -        self.assertEqual(build.status, BuildStatus.NEEDSBUILD)
> +        self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
>  
>      @defer.inlineCallbacks
>      def test_update_slave_version(self):
> @@ -569,15 +572,25 @@
>              builder.name, BuilderFactory(), BufferLogger(),
>              slave_factory=get_slave, clock=clock)
>  
> -        # The slave is idle and there's a build candidate, so the first
> -        # scan will reset the builder and dispatch the build.
> +        # The slave is idle and dirty, so the first scan will clean it
> +        # with a reset.
> +        self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
> +        yield scanner.scan()
> +        self.assertEqual(['resume', 'echo'], get_slave.result.method_log)
> +        self.assertEqual(BuilderCleanStatus.CLEAN, builder.clean_status)
> +        self.assertIs(None, builder.currentjob)
> +
> +        # The slave is idle and clean, and there's a build candidate, so
> +        # the next scan will dispatch the build.
> +        get_slave.result = OkSlave()
>          yield scanner.scan()
>          self.assertEqual(
> -            ['status', 'resume', 'echo', 'ensurepresent', 'build'],
> +            ['status', 'ensurepresent', 'build'],
>              get_slave.result.method_log)
>          self.assertEqual(bq, builder.currentjob)
>          self.assertEqual(BuildQueueStatus.RUNNING, bq.status)
>          self.assertEqual(BuildStatus.BUILDING, build.status)
> +        self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
>  
>          # build() has been called, so switch in a BUILDING slave.
>          # Scans will now just do a status() each, as the logtail is
> @@ -596,7 +609,8 @@
>          # When the build finishes, the scanner will notice, call
>          # handleStatus(), and then clean the builder.  Our fake
>          # _handleStatus_OK doesn't do anything special, but there'd
> -        # usually be file retrievals in the middle.
> +        # usually be file retrievals in the middle. The builder remains
> +        # dirty afterward.
>          get_slave.result = WaitingSlave(
>              build_id=IBuildFarmJobBehaviour(build).getBuildCookie())
>          yield scanner.scan()
> @@ -604,13 +618,15 @@
>          self.assertIs(None, builder.currentjob)
>          self.assertEqual(BuildStatus.UPLOADING, build.status)
>          self.assertEqual(builder, build.builder)
> +        self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
>  
> -        # We're clean, so let's flip back to an idle slave and
> -        # confirm that a scan does nothing special.
> +        # We're idle and dirty, so let's flip back to an idle slave and
> +        # confirm that the slave gets cleaned.
>          get_slave.result = OkSlave()
>          yield scanner.scan()
> -        self.assertEqual(['status'], get_slave.result.method_log)
> +        self.assertEqual(['resume', 'echo'], get_slave.result.method_log)
>          self.assertIs(None, builder.currentjob)
> +        self.assertEqual(BuilderCleanStatus.CLEAN, builder.clean_status)
>  
>  
>  class TestPrefetchedBuilderFactory(TestCaseWithFactory):
> @@ -740,74 +756,104 @@
>  
>      run_tests_with = AsynchronousDeferredRunTest
>  
> +    def getScanner(self, builder_factory=None, interactor=None, slave=None,
> +                   behaviour=None):
> +        if builder_factory is None:
> +            builder_factory = MockBuilderFactory(
> +                MockBuilder(virtualized=False), None)
> +        if interactor is None:
> +            interactor = BuilderInteractor()
> +            interactor.updateBuild = FakeMethod()
> +        if slave is None:
> +            slave = OkSlave()
> +        if behaviour is None:
> +            behaviour = TrivialBehaviour()
> +        return SlaveScanner(
> +            'mock', builder_factory, BufferLogger(),
> +            interactor_factory=FakeMethod(interactor),
> +            slave_factory=FakeMethod(slave),
> +            behaviour_factory=FakeMethod(behaviour))
> +
>      @defer.inlineCallbacks
>      def test_scan_with_job(self):
>          # SlaveScanner.scan calls updateBuild() when a job is building.
>          slave = BuildingSlave('trivial')
>          bq = FakeBuildQueue()
> -
> -        # Instrument updateBuild.
> -        interactor = BuilderInteractor()
> -        interactor.updateBuild = FakeMethod()
> -
> -        scanner = SlaveScanner(
> -            'mock', MockBuilderFactory(MockBuilder(), bq), BufferLogger(),
> -            interactor_factory=FakeMethod(interactor),
> -            slave_factory=FakeMethod(slave),
> -            behaviour_factory=FakeMethod(TrivialBehaviour()))
> +        scanner = self.getScanner(
> +            builder_factory=MockBuilderFactory(MockBuilder(), bq),
> +            slave=slave)
>  
>          yield scanner.scan()
>          self.assertEqual(['status'], slave.call_log)
> -        self.assertEqual(1, interactor.updateBuild.call_count)
> +        self.assertEqual(
> +            1, scanner.interactor_factory.result.updateBuild.call_count)
>          self.assertEqual(0, bq.reset.call_count)
>  
>      @defer.inlineCallbacks
> -    def test_scan_aborts_lost_slave_with_job(self):
> -        # SlaveScanner.scan uses BuilderInteractor.rescueIfLost to abort
> -        # slaves that don't have the expected job.
> +    def test_scan_recovers_lost_slave_with_job(self):
> +        # SlaveScanner.scan identifies slaves that aren't building what
> +        # they should be, resets the jobs, and then aborts the slaves.
>          slave = BuildingSlave('nontrivial')
>          bq = FakeBuildQueue()
> -
> -        # Instrument updateBuild.
> -        interactor = BuilderInteractor()
> -        interactor.updateBuild = FakeMethod()
> -
> -        scanner = SlaveScanner(
> -            'mock', MockBuilderFactory(MockBuilder(), bq), BufferLogger(),
> -            interactor_factory=FakeMethod(interactor),
> -            slave_factory=FakeMethod(slave),
> -            behaviour_factory=FakeMethod(TrivialBehaviour()))
> +        builder = MockBuilder(virtualized=False)
> +        scanner = self.getScanner(
> +            builder_factory=MockBuilderFactory(builder, bq),
> +            slave=slave)
>  
>          # A single scan will call status(), notice that the slave is lost,
> -        # abort() the slave, then reset() the job without calling
> -        # updateBuild().
> +        # and reset() the job without calling updateBuild().
>          yield scanner.scan()
> -        self.assertEqual(['status', 'abort'], slave.call_log)
> -        self.assertEqual(0, interactor.updateBuild.call_count)
> +        self.assertEqual(['status'], slave.call_log)
> +        self.assertEqual(
> +            0, scanner.interactor_factory.result.updateBuild.call_count)
>          self.assertEqual(1, bq.reset.call_count)
> +        # The reset would normally have unset build_queue.
> +        scanner.builder_factory.updateTestData(builder, None)
> +
> +        # The next scan will see a dirty idle builder with a BUILDING
> +        # slave, and abort() it.
> +        yield scanner.scan()
> +        self.assertEqual(['status', 'status', 'abort'], slave.call_log)
>  
>      @defer.inlineCallbacks
> -    def test_scan_aborts_lost_slave_when_idle(self):
> -        # SlaveScanner.scan uses BuilderInteractor.rescueIfLost to abort
> -        # slaves that aren't meant to have a job.
> +    def test_scan_recovers_lost_slave_when_idle(self):
> +        # SlaveScanner.scan identifies slaves that are building when
> +        # they shouldn't be and aborts them.
>          slave = BuildingSlave()
> -
> -        # Instrument updateBuild.
> -        interactor = BuilderInteractor()
> -        interactor.updateBuild = FakeMethod()
> -
> -        scanner = SlaveScanner(
> -            'mock', MockBuilderFactory(MockBuilder(), None), BufferLogger(),
> -            interactor_factory=FakeMethod(interactor),
> -            slave_factory=FakeMethod(slave),
> -            behaviour_factory=FakeMethod(None))
> -
> -        # A single scan will call status(), notice that the slave is lost,
> -        # abort() the slave, then reset() the job without calling
> -        # updateBuild().
> +        scanner = self.getScanner(slave=slave)
>          yield scanner.scan()
>          self.assertEqual(['status', 'abort'], slave.call_log)
> -        self.assertEqual(0, interactor.updateBuild.call_count)
> +
> +    @defer.inlineCallbacks
> +    def test_scan_building_but_not_dirty_builder_explodes(self):
> +        # Builders with a build assigned must be dirty for safety
> +        # reasons. If we run into one that's clean, we blow up.
> +        slave = BuildingSlave()
> +        builder = MockBuilder(clean_status=BuilderCleanStatus.CLEAN)
> +        bq = FakeBuildQueue()
> +        scanner = self.getScanner(
> +            slave=slave, builder_factory=MockBuilderFactory(builder, bq))
> +
> +        with ExpectedException(
> +                BuildDaemonError, "Non-dirty builder allegedly building."):
> +            yield scanner.scan()
> +        self.assertEqual([], slave.call_log)
> +
> +    @defer.inlineCallbacks
> +    def test_scan_clean_but_not_idle_slave_explodes(self):
> +        # Clean builders by definition have slaves that are idle. If
> +        # an ostensibly clean slave isn't idle, blow up.
> +        slave = BuildingSlave()
> +        builder = MockBuilder(clean_status=BuilderCleanStatus.CLEAN)
> +        scanner = self.getScanner(
> +            slave=slave, builder_factory=MockBuilderFactory(builder, None))
> +
> +        with ExpectedException(
> +                BuildDaemonError,
> +                "Allegedly clean slave not idle "
> +                "\('BuilderStatus.BUILDING' instead\)"):
> +            yield scanner.scan()
> +        self.assertEqual(['status'], slave.call_log)
>  
>      def test_getExpectedCookie_caches(self):
>          bf = MockBuilderFactory(MockBuilder(), FakeBuildQueue())
> 
> === modified file 'lib/lp/soyuz/tests/test_binarypackagebuildbehaviour.py'
> --- lib/lp/soyuz/tests/test_binarypackagebuildbehaviour.py	2014-06-06 10:32:55 +0000
> +++ lib/lp/soyuz/tests/test_binarypackagebuildbehaviour.py	2014-06-17 13:03:35 +0000
> @@ -19,6 +19,7 @@
>  from zope.security.proxy import removeSecurityProxy
>  
>  from lp.buildmaster.enums import (
> +    BuilderCleanStatus,
>      BuildQueueStatus,
>      BuildStatus,
>      )
> @@ -146,6 +147,7 @@
>          archive = self.factory.makeArchive(virtualized=False)
>          slave = OkSlave()
>          builder = self.factory.makeBuilder(virtualized=False)
> +        builder.setCleanStatus(BuilderCleanStatus.CLEAN)
>          vitals = extract_vitals_from_db(builder)
>          build = self.factory.makeBinaryPackageBuild(
>              builder=builder, archive=archive)
> @@ -170,6 +172,7 @@
>          slave = OkSlave()
>          builder = self.factory.makeBuilder(
>              virtualized=True, vm_host="foohost")
> +        builder.setCleanStatus(BuilderCleanStatus.CLEAN)
>          vitals = extract_vitals_from_db(builder)
>          build = self.factory.makeBinaryPackageBuild(
>              builder=builder, archive=archive)
> @@ -198,6 +201,7 @@
>              virtualized=False, purpose=ArchivePurpose.PARTNER)
>          slave = OkSlave()
>          builder = self.factory.makeBuilder(virtualized=False)
> +        builder.setCleanStatus(BuilderCleanStatus.CLEAN)
>          vitals = extract_vitals_from_db(builder)
>          build = self.factory.makeBinaryPackageBuild(
>              builder=builder, archive=archive)
> 


-- 
https://code.launchpad.net/~wgrant/launchpad/ppa-reset-2.0-basics/+merge/223397
Your team Launchpad code reviewers is subscribed to branch lp:launchpad.