[Merge] lp:~wgrant/launchpad/ppa-reset-2.0-basics into lp:launchpad

 

William Grant has proposed merging lp:~wgrant/launchpad/ppa-reset-2.0-basics into lp:launchpad with lp:~wgrant/launchpad/ppa-reset-2.0-model as a prerequisite.

Requested reviews:
  Colin Watson (cjwatson)

For more details, see:
https://code.launchpad.net/~wgrant/launchpad/ppa-reset-2.0-basics/+merge/223397
-- 
https://code.launchpad.net/~wgrant/launchpad/ppa-reset-2.0-basics/+merge/223397
Your team Launchpad code reviewers is subscribed to branch lp:launchpad.
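
This branch retires BuilderInteractor.rescueIfLost in favour of an explicit per-builder clean status (BuilderCleanStatus, presumably introduced by the prerequisite ppa-reset-2.0-model branch): a builder that holds a job must be DIRTY, an idle DIRTY builder gets cleaned, and only a CLEAN builder may be dispatched to, at which point it becomes DIRTY again. The following is a minimal illustrative sketch of that lifecycle, not Launchpad code; it collapses the Twisted and XML-RPC machinery into synchronous stand-ins and invents trivial helper names (Builder, clean_slave, scan).

    from enum import Enum

    class BuilderCleanStatus(Enum):
        # Mirrors the two states used in the diff.
        CLEAN = 'clean'   # slave verified idle; safe to dispatch a new job
        DIRTY = 'dirty'   # slave may hold residue from a previous job

    class Builder(object):
        def __init__(self):
            self.clean_status = BuilderCleanStatus.DIRTY
            self.current_job = None

    def clean_slave(builder):
        # Stand-in for BuilderInteractor.cleanSlave; pretend the reset
        # finishes in one step, as it does for virtual builders.
        return True

    def scan(builder):
        if builder.current_job is not None:
            # A builder holding a job must already be DIRTY; anything else
            # is treated as a grave bug in the real scanner.
            assert builder.clean_status is BuilderCleanStatus.DIRTY
            return 'update build'
        if builder.clean_status is BuilderCleanStatus.CLEAN:
            # Dispatching marks the builder DIRTY before the job starts.
            builder.clean_status = BuilderCleanStatus.DIRTY
            builder.current_job = 'a build'
            return 'dispatched'
        if clean_slave(builder):  # may take several scan cycles in reality
            builder.clean_status = BuilderCleanStatus.CLEAN
        return 'cleaning'

    b = Builder()
    assert scan(b) == 'cleaning'      # dirty and idle: clean it
    assert scan(b) == 'dispatched'    # clean and idle: dispatch, goes DIRTY
    assert scan(b) == 'update build'  # dirty with a job: just monitor it
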
=== modified file 'lib/lp/buildmaster/interactor.py'
--- lib/lp/buildmaster/interactor.py	2014-05-10 18:40:36 +0000
+++ lib/lp/buildmaster/interactor.py	2014-06-17 13:03:35 +0000
@@ -21,6 +21,7 @@
     removeSecurityProxy,
     )
 
+from lp.buildmaster.enums import BuilderCleanStatus
 from lp.buildmaster.interfaces.builder import (
     BuildDaemonError,
     CannotFetchFile,
@@ -216,7 +217,7 @@
 BuilderVitals = namedtuple(
     'BuilderVitals',
     ('name', 'url', 'virtualized', 'vm_host', 'builderok', 'manual',
-     'build_queue', 'version'))
+     'build_queue', 'version', 'clean_status'))
 
 _BQ_UNSPECIFIED = object()
 
@@ -226,7 +227,8 @@
         build_queue = builder.currentjob
     return BuilderVitals(
         builder.name, builder.url, builder.virtualized, builder.vm_host,
-        builder.builderok, builder.manual, build_queue, builder.version)
+        builder.builderok, builder.manual, build_queue, builder.version,
+        builder.clean_status)
 
 
 class BuilderInteractor(object):
@@ -249,44 +251,6 @@
         return behaviour
 
     @classmethod
-    @defer.inlineCallbacks
-    def rescueIfLost(cls, vitals, slave, slave_status, expected_cookie,
-                     logger=None):
-        """Reset the slave if its job information doesn't match the DB.
-
-        This checks the build ID reported in the slave status against
-        the given cookie. If it isn't building what we think it should
-        be, the current build will be aborted and the slave cleaned in
-        preparation for a new task.
-
-        :return: A Deferred that fires when the dialog with the slave is
-            finished.  Its return value is True if the slave is lost,
-            False otherwise.
-        """
-        # Determine the slave's current build cookie.
-        status = slave_status['builder_status']
-        slave_cookie = slave_status.get('build_id')
-
-        if slave_cookie == expected_cookie:
-            # The master and slave agree about the current job. Continue.
-            defer.returnValue(False)
-        else:
-            # The master and slave disagree. The master is our master,
-            # so try to rescue the slave.
-            # An IDLE slave doesn't need rescuing (SlaveScanner.scan
-            # will rescue the DB side instead), and we just have to wait
-            # out an ABORTING one.
-            if status == 'BuilderStatus.WAITING':
-                yield slave.clean()
-            elif status == 'BuilderStatus.BUILDING':
-                yield slave.abort()
-            if logger:
-                logger.info(
-                    "Builder slave '%s' rescued from %r (expected %r)." %
-                    (vitals.name, slave_cookie, expected_cookie))
-            defer.returnValue(True)
-
-    @classmethod
     def resumeSlaveHost(cls, vitals, slave):
         """Resume the slave host to a known good condition.
 
@@ -323,6 +287,49 @@
 
     @classmethod
     @defer.inlineCallbacks
+    def cleanSlave(cls, vitals, slave):
+        """Prepare a slave for a new build.
+
+        :return: A Deferred that fires when this stage of the resume
+            operations finishes. If the value is True, the slave is now clean.
+            If it's False, the clean is still in progress and this must be
+            called again later.
+        """
+        if vitals.virtualized:
+            # If we are building a virtual build, resume the virtual
+            # machine.  Before we try and contact the resumed slave,
+            # we're going to send it a message.  This is to ensure
+            # it's accepting packets from the outside world, because
+            # testing has shown that the first packet will randomly
+            # fail for no apparent reason.  This could be a quirk of
+            # the Xen guest, we're not sure.  We also don't care
+            # about the result from this message, just that it's
+            # sent, hence the "addBoth".  See bug 586359.
+            yield cls.resumeSlaveHost(vitals, slave)
+            yield slave.echo("ping")
+            defer.returnValue(True)
+        else:
+            slave_status = yield slave.status()
+            status = slave_status.get('builder_status', None)
+            if status == 'BuilderStatus.IDLE':
+                # This is as clean as we can get it.
+                defer.returnValue(True)
+            elif status == 'BuilderStatus.BUILDING':
+                # Asynchronously abort() the slave and wait until WAITING.
+                yield slave.abort()
+                defer.returnValue(False)
+            elif status == 'BuilderStatus.ABORTING':
+                # Wait it out until WAITING.
+                defer.returnValue(False)
+            elif status == 'BuilderStatus.WAITING':
+                # Just a synchronous clean() call and we'll be idle.
+                yield slave.clean()
+                defer.returnValue(True)
+            raise BuildDaemonError(
+                "Invalid status during clean: %r" % status)
+
+    @classmethod
+    @defer.inlineCallbacks
     def _startBuild(cls, build_queue_item, vitals, builder, slave, behaviour,
                     logger):
         """Start a build on this builder.
@@ -342,17 +349,12 @@
             raise BuildDaemonError(
                 "Attempted to start a build on a known-bad builder.")
 
-        # If we are building a virtual build, resume the virtual
-        # machine.  Before we try and contact the resumed slave, we're
-        # going to send it a message.  This is to ensure it's accepting
-        # packets from the outside world, because testing has shown that
-        # the first packet will randomly fail for no apparent reason.
-        # This could be a quirk of the Xen guest, we're not sure.  We
-        # also don't care about the result from this message, just that
-        # it's sent, hence the "addBoth".  See bug 586359.
-        if builder.virtualized:
-            yield cls.resumeSlaveHost(vitals, slave)
-            yield slave.echo("ping")
+        if builder.clean_status != BuilderCleanStatus.CLEAN:
+            raise BuildDaemonError(
+                "Attempted to start build on a dirty slave.")
+
+        builder.setCleanStatus(BuilderCleanStatus.DIRTY)
+        transaction.commit()
 
         yield behaviour.dispatchBuildToSlave(build_queue_item.id, logger)
 
@@ -448,9 +450,9 @@
         :return: A Deferred that fires when the slave dialog is finished.
         """
         # IDLE is deliberately not handled here, because it should be
-        # impossible to get past rescueIfLost unless the slave matches
-        # the DB, and this method isn't called unless the DB says
-        # there's a job.
+        # impossible to get past the cookie check unless the slave
+        # matches the DB, and this method isn't called unless the DB
+        # says there's a job.
         builder_status = slave_status['builder_status']
         if builder_status == 'BuilderStatus.BUILDING':
             # Build still building, collect the logtail.
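
The new cleanSlave is deliberately incremental: each call performs at most one step and returns whether the slave is clean yet, so the scanner keeps re-invoking it on later cycles until it reports True. A rough synchronous sketch of the non-virtual transitions follows (illustrative only, not the actual method; clean_step is a made-up name).

    def clean_step(builder_status):
        """Return (done, action) for one cleaning step of a non-virtual slave."""
        if builder_status == 'BuilderStatus.IDLE':
            return True, None          # already as clean as we can get it
        elif builder_status == 'BuilderStatus.BUILDING':
            return False, 'abort'      # asynchronous abort; wait for WAITING
        elif builder_status == 'BuilderStatus.ABORTING':
            return False, None         # nothing to do but wait it out
        elif builder_status == 'BuilderStatus.WAITING':
            return True, 'clean'       # synchronous clean() and we're idle
        raise ValueError("Invalid status during clean: %r" % builder_status)

    # A BUILDING slave typically takes several scan cycles to come clean:
    assert clean_step('BuilderStatus.BUILDING') == (False, 'abort')
    assert clean_step('BuilderStatus.ABORTING') == (False, None)
    assert clean_step('BuilderStatus.WAITING') == (True, 'clean')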

=== modified file 'lib/lp/buildmaster/manager.py'
--- lib/lp/buildmaster/manager.py	2014-02-05 23:22:30 +0000
+++ lib/lp/buildmaster/manager.py	2014-06-17 13:03:35 +0000
@@ -26,6 +26,7 @@
 from zope.component import getUtility
 
 from lp.buildmaster.enums import (
+    BuilderCleanStatus,
     BuildQueueStatus,
     BuildStatus,
     )
@@ -413,70 +414,91 @@
         vitals = self.builder_factory.getVitals(self.builder_name)
         interactor = self.interactor_factory()
         slave = self.slave_factory(vitals)
-        if vitals.builderok:
-            self.logger.debug("Scanning %s." % self.builder_name)
-            slave_status = yield slave.status()
-        else:
-            slave_status = None
-
-        # Confirm that the DB and slave sides are in a valid, mutually
-        # agreeable state.
-        lost_reason = None
-        if not vitals.builderok:
-            lost_reason = '%s is disabled' % vitals.name
-        else:
-            self.updateVersion(vitals, slave_status)
-            cancelled = yield self.checkCancellation(vitals, slave, interactor)
-            if cancelled:
-                return
-            assert slave_status is not None
-            lost = yield interactor.rescueIfLost(
-                vitals, slave, slave_status, self.getExpectedCookie(vitals),
-                self.logger)
-            if lost:
-                lost_reason = '%s is lost' % vitals.name
-
-        # The slave is lost or the builder is disabled. We can't
-        # continue to update the job status or dispatch a new job, so
-        # just rescue the assigned job, if any, so it can be dispatched
-        # to another slave.
-        if lost_reason is not None:
-            if vitals.build_queue is not None:
+
+        if vitals.build_queue is not None:
+            if vitals.clean_status != BuilderCleanStatus.DIRTY:
+                # This is probably a grave bug with security implications,
+                # as a slave that has a job must be cleaned afterwards.
+                raise BuildDaemonError("Non-dirty builder allegedly building.")
+
+            lost_reason = None
+            if not vitals.builderok:
+                lost_reason = '%s is disabled' % vitals.name
+            else:
+                slave_status = yield slave.status()
+                # Ensure that the slave has the job that we think it
+                # should.
+                slave_cookie = slave_status.get('build_id')
+                expected_cookie = self.getExpectedCookie(vitals)
+                if slave_cookie != expected_cookie:
+                    lost_reason = (
+                        '%s is lost (expected %r, got %r)' % (
+                            vitals.name, expected_cookie, slave_cookie))
+
+            if lost_reason is not None:
+                # The slave is either confused or disabled, so reset and
+                # requeue the job. The next scan cycle will clean up the
+                # slave if appropriate.
                 self.logger.warn(
                     "%s. Resetting BuildQueue %d.", lost_reason,
                     vitals.build_queue.id)
                 vitals.build_queue.reset()
                 transaction.commit()
-            return
-
-        # We've confirmed that the slave state matches the DB. Continue
-        # with updating the job status, or dispatching a new job if the
-        # builder is idle.
-        if vitals.build_queue is not None:
-            # Scan the slave and get the logtail, or collect the build
-            # if it's ready.  Yes, "updateBuild" is a bad name.
+                return
+
+            cancelled = yield self.checkCancellation(
+                vitals, slave, interactor)
+            if cancelled:
+                return
+
+            # The slave and DB agree on the builder's state.  Scan the
+            # slave and get the logtail, or collect the build if it's
+            # ready.  Yes, "updateBuild" is a bad name.
             assert slave_status is not None
             yield interactor.updateBuild(
                 vitals, slave, slave_status, self.builder_factory,
                 self.behaviour_factory)
-        elif vitals.manual:
-            # If the builder is in manual mode, don't dispatch anything.
-            self.logger.debug(
-                '%s is in manual mode, not dispatching.' % vitals.name)
         else:
-            # See if there is a job we can dispatch to the builder slave.
-            builder = self.builder_factory[self.builder_name]
-            # Try to dispatch the job. If it fails, don't attempt to
-            # just retry the scan; we need to reset the job so the
-            # dispatch will be reattempted.
-            d = interactor.findAndStartJob(vitals, builder, slave)
-            d.addErrback(functools.partial(self._scanFailed, False))
-            yield d
-            if builder.currentjob is not None:
-                # After a successful dispatch we can reset the
-                # failure_count.
-                builder.resetFailureCount()
-                transaction.commit()
+            if not vitals.builderok:
+                return
+            # We think the builder is idle. If it's clean, dispatch. If
+            # it's dirty, clean.
+            if vitals.clean_status == BuilderCleanStatus.CLEAN:
+                slave_status = yield slave.status()
+                if slave_status.get('builder_status') != 'BuilderStatus.IDLE':
+                    raise BuildDaemonError(
+                        'Allegedly clean slave not idle (%r instead)'
+                        % slave_status.get('builder_status'))
+                self.updateVersion(vitals, slave_status)
+                if vitals.manual:
+                    # If the builder is in manual mode, don't dispatch
+                    # anything.
+                    self.logger.debug(
+                        '%s is in manual mode, not dispatching.', vitals.name)
+                    return
+                # Try to find and dispatch a job. If it fails, don't
+                # attempt to just retry the scan; we need to reset
+                # the job so the dispatch will be reattempted.
+                builder = self.builder_factory[self.builder_name]
+                d = interactor.findAndStartJob(vitals, builder, slave)
+                d.addErrback(functools.partial(self._scanFailed, False))
+                yield d
+                if builder.currentjob is not None:
+                    # After a successful dispatch we can reset the
+                    # failure_count.
+                    builder.resetFailureCount()
+                    transaction.commit()
+            else:
+                # Ask the BuilderInteractor to clean the slave. It might
+                # be immediately cleaned on return, in which case we go
+                # straight back to CLEAN, or we might have to spin
+                # through another few cycles.
+                done = yield interactor.cleanSlave(vitals, slave)
+                if done:
+                    builder = self.builder_factory[self.builder_name]
+                    builder.setCleanStatus(BuilderCleanStatus.CLEAN)
+                    self.logger.debug('%s has been cleaned.', vitals.name)
+                    transaction.commit()
 
 
 class NewBuildersScanner:
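
On the busy side, the cookie comparison that used to live in rescueIfLost now happens inline in scan(): the slave's reported build_id is checked against the cookie the DB expects, and on a mismatch the job is reset immediately while the slave itself is only cleaned (and aborted if necessary) on a later cycle, once it shows up as a dirty idle builder. A tiny sketch of just that check (illustrative; job_is_lost is a made-up name).

    def job_is_lost(expected_cookie, slave_status):
        # The slave reports the cookie of the build it thinks it is running;
        # a mismatch means the DB and the slave disagree about the job.
        return slave_status.get('build_id') != expected_cookie

    assert not job_is_lost('cookie-1', {'build_id': 'cookie-1'})
    assert job_is_lost('cookie-1', {'build_id': 'cookie-2'})  # reset the BuildQueue
    assert job_is_lost('cookie-1', {})  # idle slave, but the DB expects a job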

=== modified file 'lib/lp/buildmaster/tests/mock_slaves.py'
--- lib/lp/buildmaster/tests/mock_slaves.py	2014-06-06 10:32:55 +0000
+++ lib/lp/buildmaster/tests/mock_slaves.py	2014-06-17 13:03:35 +0000
@@ -30,6 +30,7 @@
 from twisted.internet import defer
 from twisted.web import xmlrpc
 
+from lp.buildmaster.enums import BuilderCleanStatus
 from lp.buildmaster.interactor import BuilderSlave
 from lp.buildmaster.interfaces.builder import CannotFetchFile
 from lp.services.config import config
@@ -48,7 +49,7 @@
 
     def __init__(self, name='mock-builder', builderok=True, manual=False,
                  virtualized=True, vm_host=None, url='http://fake:0000',
-                 version=None):
+                 version=None, clean_status=BuilderCleanStatus.DIRTY):
         self.currentjob = None
         self.builderok = builderok
         self.manual = manual
@@ -58,6 +59,10 @@
         self.vm_host = vm_host
         self.failnotes = None
         self.version = version
+        self.clean_status = clean_status
+
+    def setCleanStatus(self, clean_status):
+        self.clean_status = clean_status
 
     def failBuilder(self, reason):
         self.builderok = False

=== modified file 'lib/lp/buildmaster/tests/test_interactor.py'
--- lib/lp/buildmaster/tests/test_interactor.py	2014-06-06 11:15:29 +0000
+++ lib/lp/buildmaster/tests/test_interactor.py	2014-06-17 13:03:35 +0000
@@ -16,19 +16,24 @@
     SynchronousDeferredRunTest,
     )
 from testtools.matchers import ContainsAll
+from testtools.testcase import ExpectedException
 from twisted.internet import defer
 from twisted.internet.task import Clock
 from twisted.python.failure import Failure
 from twisted.web.client import getPage
 from zope.security.proxy import removeSecurityProxy
 
-from lp.buildmaster.enums import BuildStatus
+from lp.buildmaster.enums import (
+    BuilderCleanStatus,
+    BuildStatus,
+    )
 from lp.buildmaster.interactor import (
     BuilderInteractor,
     BuilderSlave,
     extract_vitals_from_db,
     )
 from lp.buildmaster.interfaces.builder import (
+    BuildDaemonError,
     CannotFetchFile,
     CannotResumeHost,
     )
@@ -151,44 +156,6 @@
         self.assertEqual(5, slave.timeout)
 
     @defer.inlineCallbacks
-    def test_rescueIfLost_aborts_lost_and_broken_slave(self):
-        # A slave that's 'lost' should be aborted; when the slave is
-        # broken then abort() should also throw a fault.
-        slave = LostBuildingBrokenSlave()
-        slave_status = yield slave.status()
-        try:
-            yield BuilderInteractor.rescueIfLost(
-                extract_vitals_from_db(MockBuilder()), slave, slave_status,
-                'trivial')
-        except xmlrpclib.Fault:
-            self.assertIn('abort', slave.call_log)
-        else:
-            self.fail("xmlrpclib.Fault not raised")
-
-    @defer.inlineCallbacks
-    def test_recover_idle_slave(self):
-        # An idle slave is not rescued, even if it's not meant to be
-        # idle. SlaveScanner.scan() will clean up the DB side, because
-        # we still report that it's lost.
-        slave = OkSlave()
-        slave_status = yield slave.status()
-        lost = yield BuilderInteractor.rescueIfLost(
-            extract_vitals_from_db(MockBuilder()), slave, slave_status,
-            'trivial')
-        self.assertTrue(lost)
-        self.assertEqual(['status'], slave.call_log)
-
-    @defer.inlineCallbacks
-    def test_recover_ok_slave(self):
-        # An idle slave that's meant to be idle is not rescued.
-        slave = OkSlave()
-        slave_status = yield slave.status()
-        lost = yield BuilderInteractor.rescueIfLost(
-            extract_vitals_from_db(MockBuilder()), slave, slave_status, None)
-        self.assertFalse(lost)
-        self.assertEqual(['status'], slave.call_log)
-
-    @defer.inlineCallbacks
     def test_recover_waiting_slave_with_good_id(self):
         # rescueIfLost does not attempt to abort or clean a builder that is
         # WAITING.
@@ -240,6 +207,76 @@
         self.assertEqual(['status', 'abort'], building_slave.call_log)
 
 
+class TestBuilderInteractorCleanSlave(TestCase):
+
+    run_tests_with = AsynchronousDeferredRunTest
+
+    @defer.inlineCallbacks
+    def assertCleanCalls(self, builder, slave, calls, done):
+        actually_done = yield BuilderInteractor.cleanSlave(
+            extract_vitals_from_db(builder), slave)
+        self.assertEqual(done, actually_done)
+        self.assertEqual(calls, slave.method_log)
+
+    @defer.inlineCallbacks
+    def test_virtual(self):
+        # We don't care what the status of a virtual builder is; we just
+        # reset its VM and check that it's back.
+        yield self.assertCleanCalls(
+            MockBuilder(
+                virtualized=True, vm_host='lol',
+                clean_status=BuilderCleanStatus.DIRTY),
+            OkSlave(), ['resume', 'echo'], True)
+
+    @defer.inlineCallbacks
+    def test_nonvirtual_idle(self):
+        # An IDLE non-virtual slave is already as clean as we can get it.
+        yield self.assertCleanCalls(
+            MockBuilder(
+                virtualized=False, clean_status=BuilderCleanStatus.DIRTY),
+            OkSlave(), ['status'], True)
+
+    @defer.inlineCallbacks
+    def test_nonvirtual_building(self):
+        # A BUILDING non-virtual slave needs to be aborted. It'll go
+        # through ABORTING and eventually be picked up from WAITING.
+        yield self.assertCleanCalls(
+            MockBuilder(
+                virtualized=False, clean_status=BuilderCleanStatus.DIRTY),
+            BuildingSlave(), ['status', 'abort'], False)
+
+    @defer.inlineCallbacks
+    def test_nonvirtual_aborting(self):
+        # An ABORTING non-virtual slave must be waited out. It should
+        # hit WAITING eventually.
+        yield self.assertCleanCalls(
+            MockBuilder(
+                virtualized=False, clean_status=BuilderCleanStatus.DIRTY),
+            AbortingSlave(), ['status'], False)
+
+    @defer.inlineCallbacks
+    def test_nonvirtual_waiting(self):
+        # A WAITING non-virtual slave just needs clean() called.
+        yield self.assertCleanCalls(
+            MockBuilder(
+                virtualized=False, clean_status=BuilderCleanStatus.DIRTY),
+            WaitingSlave(), ['status', 'clean'], True)
+
+    @defer.inlineCallbacks
+    def test_nonvirtual_broken(self):
+        # A broken non-virtual builder is probably unrecoverable, so the
+        # method just crashes.
+        vitals = extract_vitals_from_db(MockBuilder(
+            virtualized=False, clean_status=BuilderCleanStatus.DIRTY))
+        slave = LostBuildingBrokenSlave()
+        try:
+            yield BuilderInteractor.cleanSlave(vitals, slave)
+        except xmlrpclib.Fault:
+            self.assertEqual(['status', 'abort'], slave.call_log)
+        else:
+            self.fail("abort() should crash.")
+
+
 class TestBuilderSlaveStatus(TestCase):
     # Verify what BuilderSlave.status returns with slaves in different
     # states.
@@ -321,6 +358,7 @@
         processor = self.factory.makeProcessor(name="i386")
         builder = self.factory.makeBuilder(
             processors=[processor], virtualized=True, vm_host="bladh")
+        builder.setCleanStatus(BuilderCleanStatus.CLEAN)
         self.patch(BuilderSlave, 'makeBuilderSlave', FakeMethod(OkSlave()))
         distroseries = self.factory.makeDistroSeries()
         das = self.factory.makeDistroArchSeries(
@@ -377,21 +415,30 @@
 
         return d.addCallback(check_build_started)
 
-    def test_virtual_job_dispatch_pings_before_building(self):
-        # We need to send a ping to the builder to work around a bug
-        # where sometimes the first network packet sent is dropped.
-        builder, build = self._setupBinaryBuildAndBuilder()
-        candidate = build.queueBuild()
-        removeSecurityProxy(builder)._findBuildCandidate = FakeMethod(
-            result=candidate)
-        vitals = extract_vitals_from_db(builder)
-        slave = OkSlave()
-        d = BuilderInteractor.findAndStartJob(vitals, builder, slave)
-
-        def check_build_started(candidate):
-            self.assertIn(('echo', 'ping'), slave.call_log)
-
-        return d.addCallback(check_build_started)
+    @defer.inlineCallbacks
+    def test_findAndStartJob_requires_clean_slave(self):
+        # findAndStartJob ensures that its slave starts CLEAN.
+        builder, build = self._setupBinaryBuildAndBuilder()
+        builder.setCleanStatus(BuilderCleanStatus.DIRTY)
+        candidate = build.queueBuild()
+        removeSecurityProxy(builder)._findBuildCandidate = FakeMethod(
+            result=candidate)
+        vitals = extract_vitals_from_db(builder)
+        with ExpectedException(
+                BuildDaemonError,
+                "Attempted to start build on a dirty slave."):
+            yield BuilderInteractor.findAndStartJob(vitals, builder, OkSlave())
+
+    @defer.inlineCallbacks
+    def test_findAndStartJob_dirties_slave(self):
+        # findAndStartJob marks its builder DIRTY before dispatching.
+        builder, build = self._setupBinaryBuildAndBuilder()
+        candidate = build.queueBuild()
+        removeSecurityProxy(builder)._findBuildCandidate = FakeMethod(
+            result=candidate)
+        vitals = extract_vitals_from_db(builder)
+        yield BuilderInteractor.findAndStartJob(vitals, builder, OkSlave())
+        self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
 
 
 class TestSlave(TestCase):

=== modified file 'lib/lp/buildmaster/tests/test_manager.py'
--- lib/lp/buildmaster/tests/test_manager.py	2014-06-06 10:32:55 +0000
+++ lib/lp/buildmaster/tests/test_manager.py	2014-06-17 13:03:35 +0000
@@ -13,6 +13,7 @@
     AsynchronousDeferredRunTest,
     )
 from testtools.matchers import Equals
+from testtools.testcase import ExpectedException
 import transaction
 from twisted.internet import (
     defer,
@@ -25,6 +26,7 @@
 from zope.security.proxy import removeSecurityProxy
 
 from lp.buildmaster.enums import (
+    BuilderCleanStatus,
     BuildQueueStatus,
     BuildStatus,
     )
@@ -33,7 +35,10 @@
     BuilderSlave,
     extract_vitals_from_db,
     )
-from lp.buildmaster.interfaces.builder import IBuilderSet
+from lp.buildmaster.interfaces.builder import (
+    BuildDaemonError,
+    IBuilderSet,
+    )
 from lp.buildmaster.interfaces.buildfarmjobbehaviour import (
     IBuildFarmJobBehaviour,
     )
@@ -113,6 +118,7 @@
         job = builder.currentjob
         if job is not None:
             job.reset()
+        builder.setCleanStatus(BuilderCleanStatus.CLEAN)
 
         transaction.commit()
 
@@ -155,6 +161,7 @@
         # Set this to 1 here so that _checkDispatch can make sure it's
         # reset to 0 after a successful dispatch.
         builder.failure_count = 1
+        builder.setCleanStatus(BuilderCleanStatus.CLEAN)
 
         # Run 'scan' and check its result.
         switch_dbuser(config.builddmaster.dbuser)
@@ -258,6 +265,7 @@
     def test_scan_with_nothing_to_dispatch(self):
         factory = LaunchpadObjectFactory()
         builder = factory.makeBuilder()
+        builder.setCleanStatus(BuilderCleanStatus.CLEAN)
         self.patch(BuilderSlave, 'makeBuilderSlave', FakeMethod(OkSlave()))
         transaction.commit()
         scanner = self._getScanner(builder_name=builder.name)
@@ -422,9 +430,9 @@
         self.assertFalse(builder.builderok)
 
     @defer.inlineCallbacks
-    def test_fail_to_resume_slave_resets_job(self):
-        # If an attempt to resume and dispatch a slave fails, it should
-        # reset the job via job.reset()
+    def test_fail_to_resume_leaves_it_dirty(self):
+        # If an attempt to resume a slave fails, its failure count is
+        # incremented and it is left DIRTY.
 
         # Make a slave with a failing resume() method.
         slave = OkSlave()
@@ -435,26 +443,21 @@
         builder = removeSecurityProxy(
             getUtility(IBuilderSet)[BOB_THE_BUILDER_NAME])
         self._resetBuilder(builder)
+        builder.setCleanStatus(BuilderCleanStatus.DIRTY)
+        builder.virtualized = True
         self.assertEqual(0, builder.failure_count)
         self.patch(BuilderSlave, 'makeBuilderSlave', FakeMethod(slave))
         builder.vm_host = "fake_vm_host"
-
-        scanner = self._getScanner()
-
-        # Get the next job that will be dispatched.
-        job = removeSecurityProxy(builder._findBuildCandidate())
-        job.virtualized = True
-        builder.virtualized = True
         transaction.commit()
-        yield scanner.singleCycle()
-
-        # The failure_count will have been incremented on the builder, we
-        # can check that to see that a dispatch attempt did indeed occur.
+
+        # A spin of the scanner will see the DIRTY builder and reset it.
+        # Our patched reset will fail.
+        yield self._getScanner().singleCycle()
+
+        # The failure_count will have been incremented on the builder,
+        # and it will be left DIRTY.
         self.assertEqual(1, builder.failure_count)
-        # There should also be no builder set on the job.
-        self.assertIsNone(job.builder)
-        build = getUtility(IBinaryPackageBuildSet).getByQueueEntry(job)
-        self.assertEqual(build.status, BuildStatus.NEEDSBUILD)
+        self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
 
     @defer.inlineCallbacks
     def test_update_slave_version(self):
@@ -569,15 +572,25 @@
             builder.name, BuilderFactory(), BufferLogger(),
             slave_factory=get_slave, clock=clock)
 
-        # The slave is idle and there's a build candidate, so the first
-        # scan will reset the builder and dispatch the build.
+        # The slave is idle and dirty, so the first scan will clean it
+        # with a reset.
+        self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
+        yield scanner.scan()
+        self.assertEqual(['resume', 'echo'], get_slave.result.method_log)
+        self.assertEqual(BuilderCleanStatus.CLEAN, builder.clean_status)
+        self.assertIs(None, builder.currentjob)
+
+        # The slave is idle and clean, and there's a build candidate, so
+        # the next scan will dispatch the build.
+        get_slave.result = OkSlave()
         yield scanner.scan()
         self.assertEqual(
-            ['status', 'resume', 'echo', 'ensurepresent', 'build'],
+            ['status', 'ensurepresent', 'build'],
             get_slave.result.method_log)
         self.assertEqual(bq, builder.currentjob)
         self.assertEqual(BuildQueueStatus.RUNNING, bq.status)
         self.assertEqual(BuildStatus.BUILDING, build.status)
+        self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
 
         # build() has been called, so switch in a BUILDING slave.
         # Scans will now just do a status() each, as the logtail is
@@ -596,7 +609,8 @@
         # When the build finishes, the scanner will notice, call
         # handleStatus(), and then clean the builder.  Our fake
         # _handleStatus_OK doesn't do anything special, but there'd
-        # usually be file retrievals in the middle.
+        # usually be file retrievals in the middle. The builder remains
+        # dirty afterward.
         get_slave.result = WaitingSlave(
             build_id=IBuildFarmJobBehaviour(build).getBuildCookie())
         yield scanner.scan()
@@ -604,13 +618,15 @@
         self.assertIs(None, builder.currentjob)
         self.assertEqual(BuildStatus.UPLOADING, build.status)
         self.assertEqual(builder, build.builder)
+        self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
 
-        # We're clean, so let's flip back to an idle slave and
-        # confirm that a scan does nothing special.
+        # We're idle and dirty, so let's flip back to an idle slave and
+        # confirm that the slave gets cleaned.
         get_slave.result = OkSlave()
         yield scanner.scan()
-        self.assertEqual(['status'], get_slave.result.method_log)
+        self.assertEqual(['resume', 'echo'], get_slave.result.method_log)
         self.assertIs(None, builder.currentjob)
+        self.assertEqual(BuilderCleanStatus.CLEAN, builder.clean_status)
 
 
 class TestPrefetchedBuilderFactory(TestCaseWithFactory):
@@ -740,74 +756,104 @@
 
     run_tests_with = AsynchronousDeferredRunTest
 
+    def getScanner(self, builder_factory=None, interactor=None, slave=None,
+                   behaviour=None):
+        if builder_factory is None:
+            builder_factory = MockBuilderFactory(
+                MockBuilder(virtualized=False), None)
+        if interactor is None:
+            interactor = BuilderInteractor()
+            interactor.updateBuild = FakeMethod()
+        if slave is None:
+            slave = OkSlave()
+        if behaviour is None:
+            behaviour = TrivialBehaviour()
+        return SlaveScanner(
+            'mock', builder_factory, BufferLogger(),
+            interactor_factory=FakeMethod(interactor),
+            slave_factory=FakeMethod(slave),
+            behaviour_factory=FakeMethod(behaviour))
+
     @defer.inlineCallbacks
     def test_scan_with_job(self):
         # SlaveScanner.scan calls updateBuild() when a job is building.
         slave = BuildingSlave('trivial')
         bq = FakeBuildQueue()
-
-        # Instrument updateBuild.
-        interactor = BuilderInteractor()
-        interactor.updateBuild = FakeMethod()
-
-        scanner = SlaveScanner(
-            'mock', MockBuilderFactory(MockBuilder(), bq), BufferLogger(),
-            interactor_factory=FakeMethod(interactor),
-            slave_factory=FakeMethod(slave),
-            behaviour_factory=FakeMethod(TrivialBehaviour()))
+        scanner = self.getScanner(
+            builder_factory=MockBuilderFactory(MockBuilder(), bq),
+            slave=slave)
 
         yield scanner.scan()
         self.assertEqual(['status'], slave.call_log)
-        self.assertEqual(1, interactor.updateBuild.call_count)
+        self.assertEqual(
+            1, scanner.interactor_factory.result.updateBuild.call_count)
         self.assertEqual(0, bq.reset.call_count)
 
     @defer.inlineCallbacks
-    def test_scan_aborts_lost_slave_with_job(self):
-        # SlaveScanner.scan uses BuilderInteractor.rescueIfLost to abort
-        # slaves that don't have the expected job.
+    def test_scan_recovers_lost_slave_with_job(self):
+        # SlaveScanner.scan identifies slaves that aren't building what
+        # they should be, resets the jobs, and then aborts the slaves.
         slave = BuildingSlave('nontrivial')
         bq = FakeBuildQueue()
-
-        # Instrument updateBuild.
-        interactor = BuilderInteractor()
-        interactor.updateBuild = FakeMethod()
-
-        scanner = SlaveScanner(
-            'mock', MockBuilderFactory(MockBuilder(), bq), BufferLogger(),
-            interactor_factory=FakeMethod(interactor),
-            slave_factory=FakeMethod(slave),
-            behaviour_factory=FakeMethod(TrivialBehaviour()))
+        builder = MockBuilder(virtualized=False)
+        scanner = self.getScanner(
+            builder_factory=MockBuilderFactory(builder, bq),
+            slave=slave)
 
         # A single scan will call status(), notice that the slave is lost,
-        # abort() the slave, then reset() the job without calling
-        # updateBuild().
+        # and reset() the job without calling updateBuild().
         yield scanner.scan()
-        self.assertEqual(['status', 'abort'], slave.call_log)
-        self.assertEqual(0, interactor.updateBuild.call_count)
+        self.assertEqual(['status'], slave.call_log)
+        self.assertEqual(
+            0, scanner.interactor_factory.result.updateBuild.call_count)
         self.assertEqual(1, bq.reset.call_count)
+        # The reset would normally have unset build_queue.
+        scanner.builder_factory.updateTestData(builder, None)
+
+        # The next scan will see a dirty idle builder with a BUILDING
+        # slave, and abort() it.
+        yield scanner.scan()
+        self.assertEqual(['status', 'status', 'abort'], slave.call_log)
 
     @defer.inlineCallbacks
-    def test_scan_aborts_lost_slave_when_idle(self):
-        # SlaveScanner.scan uses BuilderInteractor.rescueIfLost to abort
-        # slaves that aren't meant to have a job.
+    def test_scan_recovers_lost_slave_when_idle(self):
+        # SlaveScanner.scan identifies slaves that are building when
+        # they shouldn't be and aborts them.
         slave = BuildingSlave()
-
-        # Instrument updateBuild.
-        interactor = BuilderInteractor()
-        interactor.updateBuild = FakeMethod()
-
-        scanner = SlaveScanner(
-            'mock', MockBuilderFactory(MockBuilder(), None), BufferLogger(),
-            interactor_factory=FakeMethod(interactor),
-            slave_factory=FakeMethod(slave),
-            behaviour_factory=FakeMethod(None))
-
-        # A single scan will call status(), notice that the slave is lost,
-        # abort() the slave, then reset() the job without calling
-        # updateBuild().
+        scanner = self.getScanner(slave=slave)
         yield scanner.scan()
         self.assertEqual(['status', 'abort'], slave.call_log)
-        self.assertEqual(0, interactor.updateBuild.call_count)
+
+    @defer.inlineCallbacks
+    def test_scan_building_but_not_dirty_builder_explodes(self):
+        # Builders with a build assigned must be dirty for safety
+        # reasons. If we run into one that's clean, we blow up.
+        slave = BuildingSlave()
+        builder = MockBuilder(clean_status=BuilderCleanStatus.CLEAN)
+        bq = FakeBuildQueue()
+        scanner = self.getScanner(
+            slave=slave, builder_factory=MockBuilderFactory(builder, bq))
+
+        with ExpectedException(
+                BuildDaemonError, "Non-dirty builder allegedly building."):
+            yield scanner.scan()
+        self.assertEqual([], slave.call_log)
+
+    @defer.inlineCallbacks
+    def test_scan_clean_but_not_idle_slave_explodes(self):
+        # Clean builders by definition have slaves that are idle. If
+        # an ostensibly clean slave isn't idle, blow up.
+        slave = BuildingSlave()
+        builder = MockBuilder(clean_status=BuilderCleanStatus.CLEAN)
+        scanner = self.getScanner(
+            slave=slave, builder_factory=MockBuilderFactory(builder, None))
+
+        with ExpectedException(
+                BuildDaemonError,
+                "Allegedly clean slave not idle "
+                "\('BuilderStatus.BUILDING' instead\)"):
+            yield scanner.scan()
+        self.assertEqual(['status'], slave.call_log)
 
     def test_getExpectedCookie_caches(self):
         bf = MockBuilderFactory(MockBuilder(), FakeBuildQueue())

=== modified file 'lib/lp/soyuz/tests/test_binarypackagebuildbehaviour.py'
--- lib/lp/soyuz/tests/test_binarypackagebuildbehaviour.py	2014-06-06 10:32:55 +0000
+++ lib/lp/soyuz/tests/test_binarypackagebuildbehaviour.py	2014-06-17 13:03:35 +0000
@@ -19,6 +19,7 @@
 from zope.security.proxy import removeSecurityProxy
 
 from lp.buildmaster.enums import (
+    BuilderCleanStatus,
     BuildQueueStatus,
     BuildStatus,
     )
@@ -146,6 +147,7 @@
         archive = self.factory.makeArchive(virtualized=False)
         slave = OkSlave()
         builder = self.factory.makeBuilder(virtualized=False)
+        builder.setCleanStatus(BuilderCleanStatus.CLEAN)
         vitals = extract_vitals_from_db(builder)
         build = self.factory.makeBinaryPackageBuild(
             builder=builder, archive=archive)
@@ -170,6 +172,7 @@
         slave = OkSlave()
         builder = self.factory.makeBuilder(
             virtualized=True, vm_host="foohost")
+        builder.setCleanStatus(BuilderCleanStatus.CLEAN)
         vitals = extract_vitals_from_db(builder)
         build = self.factory.makeBinaryPackageBuild(
             builder=builder, archive=archive)
@@ -198,6 +201,7 @@
             virtualized=False, purpose=ArchivePurpose.PARTNER)
         slave = OkSlave()
         builder = self.factory.makeBuilder(virtualized=False)
+        builder.setCleanStatus(BuilderCleanStatus.CLEAN)
         vitals = extract_vitals_from_db(builder)
         build = self.factory.makeBinaryPackageBuild(
             builder=builder, archive=archive)

