← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] lp:~wgrant/launchpad/vm-reset-proto-2.0 into lp:launchpad

 

William Grant has proposed merging lp:~wgrant/launchpad/vm-reset-proto-2.0 into lp:launchpad with lp:~wgrant/launchpad/builder-protocol-ui-api as a prerequisite.

Commit message:
Implement Launchpad virtual builder reset protocol 2.0. The reset trigger in this version is asynchronous, and the non-LP slave manager will set the builder back to CLEAN through the webservice later.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~wgrant/launchpad/vm-reset-proto-2.0/+merge/223898

Implement Launchpad virtual builder reset protocol 2.0, as defined in https://docs.google.com/a/canonical.com/document/d/1SfJ9jA0_l_8iryANEkhe2HE5mMTRy-zcz6t_UBce7Vw/edit

The key difference is that the reset trigger is asynchronous, and its exit indicates that a reset is queued, but not yet complete. We set clean_status to CLEANING once the trigger has returned, so we don't queue up multiple resets in a hilarious chain of doom. The slave manager at the other end of the reset queue will use the webservice API to set Builder.clean_status back to CLEAN when the builder is back, and buildd-manager will stop ignoring the slave.

I also removed all the slave clean() calls from the BuildFarmJobBehaviours. They were pointless, as virtualised builders are immediately reset anyway, and BuilderInteractor.cleanSlave will clean() non-virtualised builders when they need it. This also provides us an additional layer of safety around the reset trigger, as SlaveScanner.scan blows up if the reset trigger has returned but the slave doesn't report BuilderStatus.IDLE.
-- 
https://code.launchpad.net/~wgrant/launchpad/vm-reset-proto-2.0/+merge/223898
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~wgrant/launchpad/vm-reset-proto-2.0 into lp:launchpad.
=== modified file 'lib/lp/buildmaster/enums.py'
--- lib/lp/buildmaster/enums.py	2014-06-19 11:00:57 +0000
+++ lib/lp/buildmaster/enums.py	2014-06-20 11:34:14 +0000
@@ -219,5 +219,16 @@
     PROTO_1_1 = DBItem(11, """
         1.1
 
-        Original synchronous protocol with vm_host and buildd_name.
+        Original synchronous protocol with vm_host and buildd_name. The
+        reset trigger must exit cleanly once the slave is reset and
+        accepting requests.
+        """)
+
+    PROTO_2_0 = DBItem(20, """
+        2.0
+
+        Asynchronous protocol with vm_host and buildd_name. The reset
+        trigger must exit cleanly once the request is accepted, and use
+        the webservice to set Builder.clean_status back to 'Clean' when
+        the slave is reset and accepting requests.
         """)

=== modified file 'lib/lp/buildmaster/interactor.py'
--- lib/lp/buildmaster/interactor.py	2014-06-17 12:59:03 +0000
+++ lib/lp/buildmaster/interactor.py	2014-06-20 11:34:14 +0000
@@ -21,7 +21,10 @@
     removeSecurityProxy,
     )
 
-from lp.buildmaster.enums import BuilderCleanStatus
+from lp.buildmaster.enums import (
+    BuilderCleanStatus,
+    BuilderResetProtocol,
+    )
 from lp.buildmaster.interfaces.builder import (
     BuildDaemonError,
     CannotFetchFile,
@@ -216,8 +219,8 @@
 
 BuilderVitals = namedtuple(
     'BuilderVitals',
-    ('name', 'url', 'virtualized', 'vm_host', 'builderok', 'manual',
-     'build_queue', 'version', 'clean_status'))
+    ('name', 'url', 'virtualized', 'vm_host', 'vm_reset_protocol',
+     'builderok', 'manual', 'build_queue', 'version', 'clean_status'))
 
 _BQ_UNSPECIFIED = object()
 
@@ -227,8 +230,8 @@
         build_queue = builder.currentjob
     return BuilderVitals(
         builder.name, builder.url, builder.virtualized, builder.vm_host,
-        builder.builderok, builder.manual, build_queue, builder.version,
-        builder.clean_status)
+        builder.vm_reset_protocol, builder.builderok, builder.manual,
+        build_queue, builder.version, builder.clean_status)
 
 
 class BuilderInteractor(object):
@@ -287,7 +290,7 @@
 
     @classmethod
     @defer.inlineCallbacks
-    def cleanSlave(cls, vitals, slave):
+    def cleanSlave(cls, vitals, slave, builder_factory):
         """Prepare a slave for a new build.
 
         :return: A Deferred that fires when this stage of the resume
@@ -296,18 +299,35 @@
             called again later.
         """
         if vitals.virtualized:
-            # If we are building a virtual build, resume the virtual
-            # machine.  Before we try and contact the resumed slave,
-            # we're going to send it a message.  This is to ensure
-            # it's accepting packets from the outside world, because
-            # testing has shown that the first packet will randomly
-            # fail for no apparent reason.  This could be a quirk of
-            # the Xen guest, we're not sure.  We also don't care
-            # about the result from this message, just that it's
-            # sent, hence the "addBoth".  See bug 586359.
-            yield cls.resumeSlaveHost(vitals, slave)
-            yield slave.echo("ping")
-            defer.returnValue(True)
+            if vitals.vm_reset_protocol == BuilderResetProtocol.PROTO_1_1:
+                # In protocol 1.1 the reset trigger is synchronous, so
+                # once resumeSlaveHost returns the slave should be
+                # running.
+                builder_factory[vitals.name].setCleanStatus(
+                    BuilderCleanStatus.CLEANING)
+                transaction.commit()
+                yield cls.resumeSlaveHost(vitals, slave)
+                # We ping the resumed slave before we try to do anything
+                # useful with it. This is to ensure it's accepting
+                # packets from the outside world, because testing has
+                # shown that the first packet will randomly fail for no
+                # apparent reason.  This could be a quirk of the Xen
+                # guest, we're not sure. See bug 586359.
+                yield slave.echo("ping")
+                defer.returnValue(True)
+            elif vitals.vm_reset_protocol == BuilderResetProtocol.PROTO_2_0:
+                # In protocol 2.0 the reset trigger is asynchronous.
+                # If the trigger succeeds we'll leave the slave in
+                # CLEANING, and the non-LP slave management code will
+                # set it back to CLEAN later using the webservice.
+                if vitals.clean_status == BuilderCleanStatus.DIRTY:
+                    yield cls.resumeSlaveHost(vitals, slave)
+                    builder_factory[vitals.name].setCleanStatus(
+                        BuilderCleanStatus.CLEANING)
+                    transaction.commit()
+                defer.returnValue(False)
+            raise CannotResumeHost(
+                "Invalid vm_reset_protocol: %r" % vitals.vm_reset_protocol)
         else:
             slave_status = yield slave.status()
             status = slave_status.get('builder_status', None)

=== modified file 'lib/lp/buildmaster/manager.py'
--- lib/lp/buildmaster/manager.py	2014-06-17 12:59:03 +0000
+++ lib/lp/buildmaster/manager.py	2014-06-20 11:34:14 +0000
@@ -493,7 +493,8 @@
                 # be immediately cleaned on return, in which case we go
                 # straight back to CLEAN, or we might have to spin
                 # through another few cycles.
-                done = yield interactor.cleanSlave(vitals, slave)
+                done = yield interactor.cleanSlave(
+                    vitals, slave, self.builder_factory)
                 if done:
                     builder = self.builder_factory[self.builder_name]
                     builder.setCleanStatus(BuilderCleanStatus.CLEAN)

=== modified file 'lib/lp/buildmaster/model/buildfarmjobbehaviour.py'
--- lib/lp/buildmaster/model/buildfarmjobbehaviour.py	2014-05-14 02:32:31 +0000
+++ lib/lp/buildmaster/model/buildfarmjobbehaviour.py	2014-06-20 11:34:14 +0000
@@ -184,7 +184,6 @@
         if build.job_type == BuildFarmJobType.PACKAGEBUILD:
             build = build.buildqueue_record.specific_build
             if not build.current_source_publication:
-                yield self._slave.clean()
                 build.updateStatus(BuildStatus.SUPERSEDED)
                 self.build.buildqueue_record.destroySelf()
                 return
@@ -255,7 +254,6 @@
         if not os.path.exists(target_dir):
             os.mkdir(target_dir)
 
-        yield self._slave.clean()
         self.build.buildqueue_record.destroySelf()
         transaction.commit()
 
@@ -280,7 +278,6 @@
         yield self.storeLogFromSlave()
         if notify:
             self.build.notify()
-        yield self._slave.clean()
         self.build.buildqueue_record.destroySelf()
         transaction.commit()
 
@@ -324,9 +321,7 @@
             self._builder.handleFailure(logger)
             self.build.buildqueue_record.reset()
         transaction.commit()
-        yield self._slave.clean()
 
-    @defer.inlineCallbacks
     def _handleStatus_GIVENBACK(self, slave_status, logger, notify):
         """Handle automatic retry requested by builder.
 
@@ -334,6 +329,5 @@
         later, the build records is delayed by reducing the lastscore to
         ZERO.
         """
-        yield self._slave.clean()
         self.build.buildqueue_record.reset()
         transaction.commit()

=== modified file 'lib/lp/buildmaster/tests/mock_slaves.py'
--- lib/lp/buildmaster/tests/mock_slaves.py	2014-06-17 12:59:03 +0000
+++ lib/lp/buildmaster/tests/mock_slaves.py	2014-06-20 11:34:14 +0000
@@ -30,7 +30,10 @@
 from twisted.internet import defer
 from twisted.web import xmlrpc
 
-from lp.buildmaster.enums import BuilderCleanStatus
+from lp.buildmaster.enums import (
+    BuilderCleanStatus,
+    BuilderResetProtocol,
+    )
 from lp.buildmaster.interactor import BuilderSlave
 from lp.buildmaster.interfaces.builder import CannotFetchFile
 from lp.services.config import config
@@ -49,7 +52,8 @@
 
     def __init__(self, name='mock-builder', builderok=True, manual=False,
                  virtualized=True, vm_host=None, url='http://fake:0000',
-                 version=None, clean_status=BuilderCleanStatus.DIRTY):
+                 version=None, clean_status=BuilderCleanStatus.DIRTY,
+                 vm_reset_protocol=BuilderResetProtocol.PROTO_1_1):
         self.currentjob = None
         self.builderok = builderok
         self.manual = manual
@@ -57,6 +61,7 @@
         self.name = name
         self.virtualized = virtualized
         self.vm_host = vm_host
+        self.vm_reset_protocol = vm_reset_protocol
         self.failnotes = None
         self.version = version
         self.clean_status = clean_status

=== modified file 'lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py'
--- lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py	2014-05-08 23:52:43 +0000
+++ lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py	2014-06-20 11:34:14 +0000
@@ -298,7 +298,6 @@
             self.assertEqual(BuildStatus.NEEDSBUILD, self.build.status)
             self.assertEqual(1, self.builder.failure_count)
             self.assertEqual(1, self.build.failure_count)
-            self.assertIn("clean", self.slave.call_log)
 
         d = self.behaviour.handleStatus(
             self.build.buildqueue_record, "ABORTED", {})

=== modified file 'lib/lp/buildmaster/tests/test_interactor.py'
--- lib/lp/buildmaster/tests/test_interactor.py	2014-06-18 03:53:59 +0000
+++ lib/lp/buildmaster/tests/test_interactor.py	2014-06-20 11:34:14 +0000
@@ -3,6 +3,11 @@
 
 """Test BuilderInteractor features."""
 
+__all__ = [
+    'FakeBuildQueue',
+    'MockBuilderFactory',
+    ]
+
 import os
 import signal
 import tempfile
@@ -25,6 +30,7 @@
 
 from lp.buildmaster.enums import (
     BuilderCleanStatus,
+    BuilderResetProtocol,
     BuildStatus,
     )
 from lp.buildmaster.interactor import (
@@ -64,6 +70,43 @@
     )
 
 
+class FakeBuildQueue:
+
+    def __init__(self):
+        self.id = 1
+        self.reset = FakeMethod()
+        self.status = BuildQueueStatus.RUNNING
+
+
+class MockBuilderFactory:
+    """A mock builder factory which uses a preset Builder and BuildQueue."""
+
+    def __init__(self, builder, build_queue):
+        self.updateTestData(builder, build_queue)
+        self.get_call_count = 0
+        self.getVitals_call_count = 0
+
+    def update(self):
+        return
+
+    def prescanUpdate(self):
+        return
+
+    def updateTestData(self, builder, build_queue):
+        self._builder = builder
+        self._build_queue = build_queue
+
+    def __getitem__(self, name):
+        self.get_call_count += 1
+        return self._builder
+
+    def getVitals(self, name):
+        self.getVitals_call_count += 1
+        return extract_vitals_from_db(self._builder, self._build_queue)
+
+
+
+
 class TestBuilderInteractor(TestCase):
 
     run_tests_with = AsynchronousDeferredRunTest.make_factory(timeout=10)
@@ -163,19 +206,56 @@
     @defer.inlineCallbacks
     def assertCleanCalls(self, builder, slave, calls, done):
         actually_done = yield BuilderInteractor.cleanSlave(
-            extract_vitals_from_db(builder), slave)
+            extract_vitals_from_db(builder), slave,
+            MockBuilderFactory(builder, None))
         self.assertEqual(done, actually_done)
         self.assertEqual(calls, slave.method_log)
 
     @defer.inlineCallbacks
-    def test_virtual(self):
-        # We don't care what the status of a virtual builder is; we just
-        # reset its VM and check that it's back.
+    def test_virtual_1_1(self):
+        # Virtual builders using protocol 1.1 get reset, and once the
+        # trigger completes we're happy that it's clean.
+        builder = MockBuilder(
+            virtualized=True, clean_status=BuilderCleanStatus.DIRTY,
+            vm_host='lol', vm_reset_protocol=BuilderResetProtocol.PROTO_1_1)
         yield self.assertCleanCalls(
-            MockBuilder(
-                virtualized=True, vm_host='lol',
-                clean_status=BuilderCleanStatus.DIRTY),
-            OkSlave(), ['resume', 'echo'], True)
+            builder, OkSlave(), ['resume', 'echo'], True)
+
+    @defer.inlineCallbacks
+    def test_virtual_2_0_dirty(self):
+        # Virtual builders using protocol 2.0 get reset and set to
+        # CLEANING. It's then up to the non-Launchpad reset code to set
+        # the builder back to CLEAN using the webservice.
+        builder = MockBuilder(
+            virtualized=True, clean_status=BuilderCleanStatus.DIRTY,
+            vm_host='lol', vm_reset_protocol=BuilderResetProtocol.PROTO_2_0)
+        yield self.assertCleanCalls(builder, OkSlave(), ['resume'], False)
+        self.assertEqual(BuilderCleanStatus.CLEANING, builder.clean_status)
+
+    @defer.inlineCallbacks
+    def test_virtual_2_0_cleaning(self):
+        # Virtual builders using protocol 2.0 only get touched when
+        # they're DIRTY. Once they're cleaning, they're not our problem
+        # until they return to CLEAN, so we ignore them.
+        builder = MockBuilder(
+            virtualized=True, clean_status=BuilderCleanStatus.CLEANING,
+            vm_host='lol', vm_reset_protocol=BuilderResetProtocol.PROTO_2_0)
+        yield self.assertCleanCalls(builder, OkSlave(), [], False)
+        self.assertEqual(BuilderCleanStatus.CLEANING, builder.clean_status)
+
+    @defer.inlineCallbacks
+    def test_virtual_no_protocol(self):
+        # Virtual builders fail to clean unless vm_reset_protocol is
+        # set.
+        builder = MockBuilder(
+            virtualized=True, clean_status=BuilderCleanStatus.DIRTY,
+            vm_host='lol')
+        builder.vm_reset_protocol = None
+        with ExpectedException(
+                CannotResumeHost, "Invalid vm_reset_protocol: None"):
+            yield BuilderInteractor.cleanSlave(
+                extract_vitals_from_db(builder), OkSlave(),
+                MockBuilderFactory(builder, None))
 
     @defer.inlineCallbacks
     def test_nonvirtual_idle(self):
@@ -215,11 +295,13 @@
     def test_nonvirtual_broken(self):
         # A broken non-virtual builder is probably unrecoverable, so the
         # method just crashes.
-        vitals = extract_vitals_from_db(MockBuilder(
-            virtualized=False, clean_status=BuilderCleanStatus.DIRTY))
+        builder = MockBuilder(
+            virtualized=False, clean_status=BuilderCleanStatus.DIRTY)
+        vitals = extract_vitals_from_db(builder)
         slave = LostBuildingBrokenSlave()
         try:
-            yield BuilderInteractor.cleanSlave(vitals, slave)
+            yield BuilderInteractor.cleanSlave(
+                vitals, slave, MockBuilderFactory(builder, None))
         except xmlrpclib.Fault:
             self.assertEqual(['status', 'abort'], slave.call_log)
         else:

=== modified file 'lib/lp/buildmaster/tests/test_manager.py'
--- lib/lp/buildmaster/tests/test_manager.py	2014-06-17 12:59:03 +0000
+++ lib/lp/buildmaster/tests/test_manager.py	2014-06-20 11:34:14 +0000
@@ -63,6 +63,10 @@
     TrivialBehaviour,
     WaitingSlave,
     )
+from lp.buildmaster.tests.test_interactor import (
+    FakeBuildQueue,
+    MockBuilderFactory,
+    )
 from lp.registry.interfaces.distribution import IDistributionSet
 from lp.services.config import config
 from lp.services.log.logger import BufferLogger
@@ -717,41 +721,6 @@
         self.assertContentEqual(BuilderFactory().iterVitals(), all_vitals)
 
 
-class FakeBuildQueue:
-
-    def __init__(self):
-        self.id = 1
-        self.reset = FakeMethod()
-        self.status = BuildQueueStatus.RUNNING
-
-
-class MockBuilderFactory:
-    """A mock builder factory which uses a preset Builder and BuildQueue."""
-
-    def __init__(self, builder, build_queue):
-        self.updateTestData(builder, build_queue)
-        self.get_call_count = 0
-        self.getVitals_call_count = 0
-
-    def update(self):
-        return
-
-    def prescanUpdate(self):
-        return
-
-    def updateTestData(self, builder, build_queue):
-        self._builder = builder
-        self._build_queue = build_queue
-
-    def __getitem__(self, name):
-        self.get_call_count += 1
-        return self._builder
-
-    def getVitals(self, name):
-        self.getVitals_call_count += 1
-        return extract_vitals_from_db(self._builder, self._build_queue)
-
-
 class TestSlaveScannerWithoutDB(TestCase):
 
     run_tests_with = AsynchronousDeferredRunTest

=== modified file 'lib/lp/testing/factory.py'
--- lib/lp/testing/factory.py	2014-06-19 11:00:57 +0000
+++ lib/lp/testing/factory.py	2014-06-20 11:34:14 +0000
@@ -96,7 +96,10 @@
     ICveSet,
     )
 from lp.bugs.model.bug import FileBugData
-from lp.buildmaster.enums import BuildStatus
+from lp.buildmaster.enums import (
+    BuildStatus,
+    BuilderResetProtocol,
+    )
 from lp.buildmaster.interfaces.builder import IBuilderSet
 from lp.code.enums import (
     BranchMergeProposalStatus,
@@ -2748,7 +2751,7 @@
 
     def makeBuilder(self, processors=None, url=None, name=None, title=None,
                     owner=None, active=True, virtualized=True, vm_host=None,
-                    manual=False):
+                    vm_reset_protocol=None, manual=False):
         """Make a new builder for i386 virtualized builds by default.
 
         Note: the builder returned will not be able to actually build -
@@ -2765,11 +2768,14 @@
             title = self.getUniqueString('builder-title')
         if owner is None:
             owner = self.makePerson()
+        if virtualized and vm_reset_protocol is None:
+            vm_reset_protocol = BuilderResetProtocol.PROTO_1_1
 
         with admin_logged_in():
             return getUtility(IBuilderSet).new(
                 processors, url, name, title, owner, active,
-                virtualized, vm_host, manual=manual)
+                virtualized, vm_host, manual=manual,
+                vm_reset_protocol=vm_reset_protocol)
 
     def makeRecipeText(self, *branches):
         if len(branches) == 0:

=== modified file 'lib/lp/translations/model/translationtemplatesbuildbehaviour.py'
--- lib/lp/translations/model/translationtemplatesbuildbehaviour.py	2014-01-30 15:04:06 +0000
+++ lib/lp/translations/model/translationtemplatesbuildbehaviour.py	2014-06-20 11:34:14 +0000
@@ -175,7 +175,5 @@
         transaction.commit()
 
         yield self.storeLogFromSlave(build_queue=queue_item)
-
-        yield self._slave.clean()
         queue_item.destroySelf()
         transaction.commit()

=== modified file 'lib/lp/translations/tests/test_translationtemplatesbuildbehaviour.py'
--- lib/lp/translations/tests/test_translationtemplatesbuildbehaviour.py	2014-02-03 14:43:08 +0000
+++ lib/lp/translations/tests/test_translationtemplatesbuildbehaviour.py	2014-06-20 11:34:14 +0000
@@ -179,8 +179,6 @@
 
         def got_dispatch((status, info)):
             self.assertEqual(0, queue_item.destroySelf.call_count)
-            slave_call_log = slave.call_log
-            self.assertNotIn('clean', slave_call_log)
             self.assertEqual(0, behaviour._uploadTarball.call_count)
 
             return slave.status()
@@ -196,9 +194,7 @@
             self.assertEqual(BuildStatus.FULLYBUILT, behaviour.build.status)
             # Log file is stored.
             self.assertIsNotNone(behaviour.build.log)
-            slave_call_log = slave.call_log
             self.assertEqual(1, queue_item.destroySelf.call_count)
-            self.assertIn('clean', slave_call_log)
             self.assertEqual(1, behaviour._uploadTarball.call_count)
 
         d.addCallback(got_dispatch)
@@ -230,7 +226,6 @@
             # Log file is stored.
             self.assertIsNotNone(behaviour.build.log)
             self.assertEqual(1, queue_item.destroySelf.call_count)
-            self.assertIn('clean', slave.call_log)
             self.assertEqual(0, behaviour._uploadTarball.call_count)
 
         d.addCallback(got_dispatch)
@@ -260,7 +255,6 @@
         def build_updated(ignored):
             self.assertEqual(BuildStatus.FULLYBUILT, behaviour.build.status)
             self.assertEqual(1, queue_item.destroySelf.call_count)
-            self.assertIn('clean', slave.call_log)
             self.assertEqual(0, behaviour._uploadTarball.call_count)
 
         d.addCallback(got_dispatch)


Follow ups