← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] lp:~julian-edwards/launchpad/disable-failures-bm-bug-676479 into lp:launchpad/devel

 

Julian Edwards has proposed merging lp:~julian-edwards/launchpad/disable-failures-bm-bug-676479 into lp:launchpad/devel.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)
Related bugs:
  #676479 The buildd-manager times out too quickly when connecting to builders
  https://bugs.launchpad.net/bugs/676479


This branch tries to fix an issue with the buildd-manager where it's disabling a lot of builders en-masse.

It appears to be a problem when dispatching a lot of recipe builds at once.  These are known to take the builder into swap as the bzr checkout process is quite heavyweight.  This makes the builder quite unresponsive.

The default Twisted TCP connection timeout is 30 seconds, which appears to not be enough to allow a heavily loaded builder slave to respond to an xmlrpc request.  Twisted also does not allow you to override this timeout, so to fix this I've basically copied some chunks from the Proxy class and altered parts to supply the timeout value to reactor.connectTCP().

The new test relies on the fact that connectTCP() is never actually called, so we don't need to worry about getting other types of connection errors.


-- 
https://code.launchpad.net/~julian-edwards/launchpad/disable-failures-bm-bug-676479/+merge/41068
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~julian-edwards/launchpad/disable-failures-bm-bug-676479 into lp:launchpad/devel.
=== modified file 'lib/lp/buildmaster/model/builder.py'
--- lib/lp/buildmaster/model/builder.py	2010-11-10 13:06:05 +0000
+++ lib/lp/buildmaster/model/builder.py	2010-11-17 16:37:07 +0000
@@ -8,6 +8,7 @@
 __all__ = [
     'Builder',
     'BuilderSet',
+    'ProxyWithConnectionTimeout',
     'rescueBuilderIfLost',
     'updateBuilderStatus',
     ]
@@ -99,6 +100,41 @@
     noisy = False
 
 
+class ProxyWithConnectionTimeout(xmlrpc.Proxy):
+    """Extend Twisted's Proxy to provide a configurable connection timeout."""
+
+    def __init__(self, url, user=None, password=None, allowNone=False,
+                 useDateTime=False, timeout=None):
+        xmlrpc.Proxy.__init__(
+            self, url, user, password, allowNone, useDateTime)
+        if timeout is None:
+            self.timeout = config.builddmaster.socket_timeout
+        else:
+            self.timeout = timeout
+
+    def callRemote(self, method, *args):
+        """Basically a carbon copy of the parent but passes the timeout
+        to connectTCP."""
+
+        def cancel(d):
+            factory.deferred = None
+            connector.disconnect()
+        factory = self.queryFactory(
+            self.path, self.host, method, self.user,
+            self.password, self.allowNone, args, cancel, self.useDateTime)
+        if self.secure:
+            from twisted.internet import ssl
+            connector = default_reactor.connectSSL(
+                self.host, self.port or 443, factory,
+                ssl.ClientContextFactory(),
+                timeout=self.timeout)
+        else:
+            connector = default_reactor.connectTCP(
+                self.host, self.port or 80, factory,
+                timeout=self.timeout)
+        return factory.deferred
+
+
 class BuilderSlave(object):
     """Add in a few useful methods for the XMLRPC slave.
 
@@ -141,7 +177,7 @@
         """
         rpc_url = urlappend(builder_url.encode('utf-8'), 'rpc')
         if proxy is None:
-            server_proxy = xmlrpc.Proxy(rpc_url, allowNone=True)
+            server_proxy = ProxyWithConnectionTimeout(rpc_url, allowNone=True)
             server_proxy.queryFactory = QuietQueryFactory
         else:
             server_proxy = proxy
@@ -213,7 +249,7 @@
         :param libraryfilealias: An `ILibraryFileAlias`.
         """
         url = libraryfilealias.http_url
-        logger.debug(
+        logger.info(
             "Asking builder on %s to ensure it has file %s (%s, %s)" % (
                 self._file_cache_url, libraryfilealias.filename, url,
                 libraryfilealias.content.sha1))
@@ -432,7 +468,7 @@
             return defer.fail(CannotResumeHost('Undefined vm_host.'))
 
         logger = self._getSlaveScannerLogger()
-        logger.debug("Resuming %s (%s)" % (self.name, self.url))
+        logger.info("Resuming %s (%s)" % (self.name, self.url))
 
         d = self.slave.resume()
         def got_resume_ok((stdout, stderr, returncode)):

=== modified file 'lib/lp/buildmaster/tests/test_builder.py'
--- lib/lp/buildmaster/tests/test_builder.py	2010-11-10 22:40:05 +0000
+++ lib/lp/buildmaster/tests/test_builder.py	2010-11-17 16:37:07 +0000
@@ -43,6 +43,10 @@
     )
 from lp.buildmaster.interfaces.buildqueue import IBuildQueueSet
 from lp.buildmaster.interfaces.builder import CannotResumeHost
+from lp.buildmaster.model.builder import (
+    BuilderSlave,
+    ProxyWithConnectionTimeout,
+    )
 from lp.buildmaster.model.buildfarmjobbehavior import IdleBuildBehavior
 from lp.buildmaster.model.buildqueue import BuildQueue
 from lp.buildmaster.tests.mock_slaves import (
@@ -1059,6 +1063,53 @@
             self.slave.build(None, None, None, None, None))
 
 
+class TestSlaveConnectionTimeouts(TrialTestCase):
+    # Testing that we can override the default 30 second connection
+    # timeout.
+
+    layer = TwistedLayer
+
+    def setUp(self):
+        super(TestSlaveConnectionTimeouts, self).setUp()
+        self.slave_helper = SlaveTestHelpers()
+        self.slave_helper.setUp()
+        self.addCleanup(self.slave_helper.cleanUp)
+        self.clock = Clock()
+        self.proxy = ProxyWithConnectionTimeout("http://10.255.255.1/";)
+        self.slave = self.slave_helper.getClientSlave(
+            reactor=self.clock, proxy=self.proxy)
+
+    def test_connection_timeout(self):
+        # The default timeout of 30 seconds should not cause a timeout,
+        # only the config value should.
+        timeout_config = """
+        [builddmaster]
+        socket_timeout: 180
+        """
+        config.push('timeout', timeout_config)
+        self.addCleanup(config.pop, 'timeout')
+
+        d = self.slave.echo()
+        # Advance past the 30 second timeout.
+        self.clock.advance(31)
+        self.assertFalse(d.called)
+
+        # Now advance past the real socket timeout and expect a
+        # Failure.
+
+        def got_timeout(failure):
+            self.assertIsInstance(failure.value, CancelledError)
+
+        d.addBoth(got_timeout)
+        self.clock.advance(config.builddmaster.socket_timeout + 1)
+        self.assertTrue(d.called)
+
+    def test_BuilderSlave_uses_ProxyWithConnectionTimeout(self):
+        # Make sure that BuilderSlaves use the custom proxy class.
+        slave = BuilderSlave.makeBuilderSlave("url", "host")
+        self.assertIsInstance(slave._server, ProxyWithConnectionTimeout)
+
+
 class TestSlaveWithLibrarian(TrialTestCase):
     """Tests that need more of Launchpad to run."""
 

=== modified file 'lib/lp/code/model/recipebuilder.py'
--- lib/lp/code/model/recipebuilder.py	2010-10-27 14:25:19 +0000
+++ lib/lp/code/model/recipebuilder.py	2010-11-17 16:37:07 +0000
@@ -122,6 +122,8 @@
         if chroot is None:
             raise CannotBuild("Unable to find a chroot for %s" %
                               distroarchseries.displayname)
+        logger.info(
+            "Sending chroot file for recipe build to %s" % self._builder.name)
         d = self._builder.slave.cacheFile(logger, chroot)
 
         def got_cache_file(ignored):
@@ -131,7 +133,7 @@
             buildid = "%s-%s" % (self.build.id, build_queue_id)
             cookie = self.buildfarmjob.generateSlaveBuildCookie()
             chroot_sha1 = chroot.content.sha1
-            logger.debug(
+            logger.info(
                 "Initiating build %s on %s" % (buildid, self._builder.url))
 
             return self._builder.slave.build(