← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] ~cjwatson/launchpad:buildd-manager-get-file-retry into launchpad:master

 

Colin Watson has proposed merging ~cjwatson/launchpad:buildd-manager-get-file-retry into launchpad:master.

Commit message:
Retry file downloads from builders

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~cjwatson/launchpad/+git/launchpad/+merge/427333

With unreliable networking, attempts to download files (especially larger ones) from builders often fail.  Builds typically take much longer than file downloads, so it's in our interests to try a bit harder before failing the build.

I switched `getFiles` from using `twisted.internet.defer.gatherResults` to using `lp.services.twistedsupport.gatherResults` since that has more useful semantics for us: firing the actual first error rather than wrapping it in `defer.FirstError` makes error tests much less annoying to write, and the change in error-consuming behaviour avoids an otherwise inevitable `unhandled-error-in-deferred` error in tests, as well as silencing some annoying log messages (compare https://docs.twistedmatrix.com/en/stable/api/twisted.internet.defer.html#gatherResults).
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of ~cjwatson/launchpad:buildd-manager-get-file-retry into launchpad:master.
diff --git a/lib/lp/buildmaster/interactor.py b/lib/lp/buildmaster/interactor.py
index 70b738f..3fe83c6 100644
--- a/lib/lp/buildmaster/interactor.py
+++ b/lib/lp/buildmaster/interactor.py
@@ -37,7 +37,7 @@ from lp.buildmaster.interfaces.buildfarmjobbehaviour import (
 )
 from lp.services.config import config
 from lp.services.job.runner import QuietAMPConnector, VirtualEnvProcessStarter
-from lp.services.twistedsupport import cancel_on_timeout
+from lp.services.twistedsupport import cancel_on_timeout, gatherResults
 from lp.services.twistedsupport.processmonitor import ProcessWithTimeout
 from lp.services.webapp import urlappend
 
@@ -244,32 +244,37 @@ class BuilderWorker:
             errback with the error string.
         """
         file_url = self.getURL(sha_sum)
-        try:
-            # Download the file in a subprocess.  We used to download it
-            # asynchronously in Twisted, but in practice this only worked well
-            # up to a bit over a hundred builders; beyond that it struggled to
-            # keep up with incoming packets in time to avoid TCP timeouts
-            # (perhaps because of too much synchronous work being done on the
-            # reactor thread).
-            yield self.process_pool.doWork(
-                DownloadCommand,
-                file_url=file_url,
-                path_to_write=path_to_write,
-                timeout=self.timeout,
-            )
-            if logger is not None:
-                logger.info("Grabbed %s" % file_url)
-        except Exception as e:
-            if logger is not None:
-                logger.info(
-                    "Failed to grab %s: %s\n%s"
-                    % (
-                        file_url,
-                        e,
-                        " ".join(traceback.format_exception(*sys.exc_info())),
-                    )
+        for attempt in range(config.builddmaster.download_attempts):
+            try:
+                # Download the file in a subprocess.  We used to download it
+                # asynchronously in Twisted, but in practice this only
+                # worked well up to a bit over a hundred builders; beyond
+                # that it struggled to keep up with incoming packets in time
+                # to avoid TCP timeouts (perhaps because of too much
+                # synchronous work being done on the reactor thread).
+                yield self.process_pool.doWork(
+                    DownloadCommand,
+                    file_url=file_url,
+                    path_to_write=path_to_write,
+                    timeout=self.timeout,
                 )
-            raise
+                if logger is not None:
+                    logger.info("Grabbed %s" % file_url)
+                break
+            except Exception as e:
+                if logger is not None:
+                    logger.info(
+                        "Failed to grab %s: %s\n%s"
+                        % (
+                            file_url,
+                            e,
+                            " ".join(
+                                traceback.format_exception(*sys.exc_info())
+                            ),
+                        )
+                    )
+                if attempt == config.builddmaster.download_attempts - 1:
+                    raise
 
     def getFiles(self, files, logger=None):
         """Fetch many files from the builder.
@@ -280,7 +285,7 @@ class BuilderWorker:
 
         :return: A DeferredList that calls back when the download is done.
         """
-        dl = defer.gatherResults(
+        dl = gatherResults(
             [
                 self.getFile(builder_file, local_file, logger=logger)
                 for builder_file, local_file in files
diff --git a/lib/lp/buildmaster/tests/test_interactor.py b/lib/lp/buildmaster/tests/test_interactor.py
index bfd4c48..c26e64d 100644
--- a/lib/lp/buildmaster/tests/test_interactor.py
+++ b/lib/lp/buildmaster/tests/test_interactor.py
@@ -13,9 +13,11 @@ import os
 import signal
 import tempfile
 import xmlrpc.client
+from functools import partial
 
 import six
 import treq
+from fixtures import MockPatchObject
 from lpbuildd.builder import BuilderStatus
 from testtools.matchers import ContainsAll
 from testtools.testcase import ExpectedException
@@ -1003,3 +1005,54 @@ class TestWorkerWithLibrarian(TestCaseWithFactory):
         yield worker.getFiles([(empty_sha1, temp_name)])
         with open(temp_name, "rb") as f:
             self.assertEqual(b"", f.read())
+
+    @defer.inlineCallbacks
+    def test_getFiles_retries(self):
+        # getFiles retries failed download attempts rather than giving up on
+        # the first failure.
+        self.pushConfig("builddmaster", download_attempts=3)
+        tachandler = self.worker_helper.getServerWorker()
+        worker = self.worker_helper.getClientWorker()
+        count = 0
+
+        def fail_twice(original, *args, **kwargs):
+            nonlocal count
+            count += 1
+            if count < 3:
+                raise RuntimeError("Boom")
+            return original(*args, **kwargs)
+
+        self.useFixture(
+            MockPatchObject(
+                worker.process_pool,
+                "doWork",
+                side_effect=partial(fail_twice, worker.process_pool.doWork),
+            )
+        )
+        temp_dir = self.makeTemporaryDirectory()
+        temp_name = os.path.join(temp_dir, "log")
+        sha1 = hashlib.sha1(b"log").hexdigest()
+        self.worker_helper.makeCacheFile(tachandler, sha1, contents=b"log")
+        yield worker.getFiles([(sha1, temp_name)])
+        with open(temp_name, "rb") as f:
+            self.assertEqual(b"log", f.read())
+
+    @defer.inlineCallbacks
+    def test_getFiles_limited_retries(self):
+        # getFiles gives up on retrying downloads after the configured
+        # number of attempts.
+        self.pushConfig("builddmaster", download_attempts=3)
+        tachandler = self.worker_helper.getServerWorker()
+        worker = self.worker_helper.getClientWorker()
+
+        self.useFixture(
+            MockPatchObject(
+                worker.process_pool, "doWork", side_effect=RuntimeError("Boom")
+            )
+        )
+        temp_dir = self.makeTemporaryDirectory()
+        temp_name = os.path.join(temp_dir, "log")
+        sha1 = hashlib.sha1(b"log").hexdigest()
+        self.worker_helper.makeCacheFile(tachandler, sha1, contents=b"log")
+        with ExpectedException(RuntimeError, r"^Boom$"):
+            yield worker.getFiles([(sha1, temp_name)])
diff --git a/lib/lp/services/config/schema-lazr.conf b/lib/lp/services/config/schema-lazr.conf
index 4303e1c..640da63 100644
--- a/lib/lp/services/config/schema-lazr.conf
+++ b/lib/lp/services/config/schema-lazr.conf
@@ -92,6 +92,9 @@ virtualized_socket_timeout: 30
 # builders.
 download_connections: 128
 
+# How many times to attempt downloading each file from builders.
+download_attempts: 3
+
 # Activate the Build Notification system.
 # datatype: boolean
 send_build_notification: True