launchpad-reviewers team mailing list archive
-
launchpad-reviewers team
-
Mailing list archive
-
Message #28879
[Merge] ~cjwatson/launchpad:buildd-manager-get-file-retry into launchpad:master
Colin Watson has proposed merging ~cjwatson/launchpad:buildd-manager-get-file-retry into launchpad:master.
Commit message:
Retry file downloads from builders
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
For more details, see:
https://code.launchpad.net/~cjwatson/launchpad/+git/launchpad/+merge/427333
With unreliable networking, attempts to download files (especially larger ones) from builders often fail. Builds typically take much longer than file downloads, so it's in our interests to try a bit harder before failing the build.
I switched `getFiles` from using `twisted.internet.defer.gatherResults` to using `lp.services.twistedsupport.gatherResults` since that has more useful semantics for us: firing the actual first error rather than wrapping it in `defer.FirstError` makes error tests much less annoying to write, and the change in error-consuming behaviour avoids an otherwise inevitable `unhandled-error-in-deferred` error in tests, as well as silencing some annoying log messages (compare https://docs.twistedmatrix.com/en/stable/api/twisted.internet.defer.html#gatherResults).
--
Your team Launchpad code reviewers is requested to review the proposed merge of ~cjwatson/launchpad:buildd-manager-get-file-retry into launchpad:master.
diff --git a/lib/lp/buildmaster/interactor.py b/lib/lp/buildmaster/interactor.py
index 70b738f..3fe83c6 100644
--- a/lib/lp/buildmaster/interactor.py
+++ b/lib/lp/buildmaster/interactor.py
@@ -37,7 +37,7 @@ from lp.buildmaster.interfaces.buildfarmjobbehaviour import (
)
from lp.services.config import config
from lp.services.job.runner import QuietAMPConnector, VirtualEnvProcessStarter
-from lp.services.twistedsupport import cancel_on_timeout
+from lp.services.twistedsupport import cancel_on_timeout, gatherResults
from lp.services.twistedsupport.processmonitor import ProcessWithTimeout
from lp.services.webapp import urlappend
@@ -244,32 +244,37 @@ class BuilderWorker:
errback with the error string.
"""
file_url = self.getURL(sha_sum)
- try:
- # Download the file in a subprocess. We used to download it
- # asynchronously in Twisted, but in practice this only worked well
- # up to a bit over a hundred builders; beyond that it struggled to
- # keep up with incoming packets in time to avoid TCP timeouts
- # (perhaps because of too much synchronous work being done on the
- # reactor thread).
- yield self.process_pool.doWork(
- DownloadCommand,
- file_url=file_url,
- path_to_write=path_to_write,
- timeout=self.timeout,
- )
- if logger is not None:
- logger.info("Grabbed %s" % file_url)
- except Exception as e:
- if logger is not None:
- logger.info(
- "Failed to grab %s: %s\n%s"
- % (
- file_url,
- e,
- " ".join(traceback.format_exception(*sys.exc_info())),
- )
+ for attempt in range(config.builddmaster.download_attempts):
+ try:
+ # Download the file in a subprocess. We used to download it
+ # asynchronously in Twisted, but in practice this only
+ # worked well up to a bit over a hundred builders; beyond
+ # that it struggled to keep up with incoming packets in time
+ # to avoid TCP timeouts (perhaps because of too much
+ # synchronous work being done on the reactor thread).
+ yield self.process_pool.doWork(
+ DownloadCommand,
+ file_url=file_url,
+ path_to_write=path_to_write,
+ timeout=self.timeout,
)
- raise
+ if logger is not None:
+ logger.info("Grabbed %s" % file_url)
+ break
+ except Exception as e:
+ if logger is not None:
+ logger.info(
+ "Failed to grab %s: %s\n%s"
+ % (
+ file_url,
+ e,
+ " ".join(
+ traceback.format_exception(*sys.exc_info())
+ ),
+ )
+ )
+ if attempt == config.builddmaster.download_attempts - 1:
+ raise
def getFiles(self, files, logger=None):
"""Fetch many files from the builder.
@@ -280,7 +285,7 @@ class BuilderWorker:
:return: A DeferredList that calls back when the download is done.
"""
- dl = defer.gatherResults(
+ dl = gatherResults(
[
self.getFile(builder_file, local_file, logger=logger)
for builder_file, local_file in files
diff --git a/lib/lp/buildmaster/tests/test_interactor.py b/lib/lp/buildmaster/tests/test_interactor.py
index bfd4c48..c26e64d 100644
--- a/lib/lp/buildmaster/tests/test_interactor.py
+++ b/lib/lp/buildmaster/tests/test_interactor.py
@@ -13,9 +13,11 @@ import os
import signal
import tempfile
import xmlrpc.client
+from functools import partial
import six
import treq
+from fixtures import MockPatchObject
from lpbuildd.builder import BuilderStatus
from testtools.matchers import ContainsAll
from testtools.testcase import ExpectedException
@@ -1003,3 +1005,54 @@ class TestWorkerWithLibrarian(TestCaseWithFactory):
yield worker.getFiles([(empty_sha1, temp_name)])
with open(temp_name, "rb") as f:
self.assertEqual(b"", f.read())
+
+ @defer.inlineCallbacks
+ def test_getFiles_retries(self):
+ # getFiles retries failed download attempts rather than giving up on
+ # the first failure.
+ self.pushConfig("builddmaster", download_attempts=3)
+ tachandler = self.worker_helper.getServerWorker()
+ worker = self.worker_helper.getClientWorker()
+ count = 0
+
+ def fail_twice(original, *args, **kwargs):
+ nonlocal count
+ count += 1
+ if count < 3:
+ raise RuntimeError("Boom")
+ return original(*args, **kwargs)
+
+ self.useFixture(
+ MockPatchObject(
+ worker.process_pool,
+ "doWork",
+ side_effect=partial(fail_twice, worker.process_pool.doWork),
+ )
+ )
+ temp_dir = self.makeTemporaryDirectory()
+ temp_name = os.path.join(temp_dir, "log")
+ sha1 = hashlib.sha1(b"log").hexdigest()
+ self.worker_helper.makeCacheFile(tachandler, sha1, contents=b"log")
+ yield worker.getFiles([(sha1, temp_name)])
+ with open(temp_name, "rb") as f:
+ self.assertEqual(b"log", f.read())
+
+ @defer.inlineCallbacks
+ def test_getFiles_limited_retries(self):
+ # getFiles gives up on retrying downloads after the configured
+ # number of attempts.
+ self.pushConfig("builddmaster", download_attempts=3)
+ tachandler = self.worker_helper.getServerWorker()
+ worker = self.worker_helper.getClientWorker()
+
+ self.useFixture(
+ MockPatchObject(
+ worker.process_pool, "doWork", side_effect=RuntimeError("Boom")
+ )
+ )
+ temp_dir = self.makeTemporaryDirectory()
+ temp_name = os.path.join(temp_dir, "log")
+ sha1 = hashlib.sha1(b"log").hexdigest()
+ self.worker_helper.makeCacheFile(tachandler, sha1, contents=b"log")
+ with ExpectedException(RuntimeError, r"^Boom$"):
+ yield worker.getFiles([(sha1, temp_name)])
diff --git a/lib/lp/services/config/schema-lazr.conf b/lib/lp/services/config/schema-lazr.conf
index 4303e1c..640da63 100644
--- a/lib/lp/services/config/schema-lazr.conf
+++ b/lib/lp/services/config/schema-lazr.conf
@@ -92,6 +92,9 @@ virtualized_socket_timeout: 30
# builders.
download_connections: 128
+# How many times to attempt downloading each file from builders.
+download_attempts: 3
+
# Activate the Build Notification system.
# datatype: boolean
send_build_notification: True