← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] ~cjwatson/launchpad:librarian-timeouts into launchpad:master

 

Colin Watson has proposed merging ~cjwatson/launchpad:librarian-timeouts into launchpad:master.

Commit message:
Add a socket timeout for librarian client connections

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~cjwatson/launchpad/+git/launchpad/+merge/433219

The internal librarian client creates a socket with no timeout, so if anything goes wrong with the connection then it only recovers by being killed manually.  Add a socket timeout so that it gives up eventually.  (Note that this timeout applies to individual operations on the socket, not to uploading or downloading entire files.)

The default timeout is deliberately quite generous to minimize the chance of causing failures due to connections being merely slow rather than stuck, although we may still find we need to tune it.  This isn't intended to ensure that librarian requests fit within timeout budgets set for web requests or similar, but just as a backstop to ensure that we don't get stuck waiting forever.
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of ~cjwatson/launchpad:librarian-timeouts into launchpad:master.
diff --git a/lib/lp/services/config/schema-lazr.conf b/lib/lp/services/config/schema-lazr.conf
index 640865d..ca906a6 100644
--- a/lib/lp/services/config/schema-lazr.conf
+++ b/lib/lp/services/config/schema-lazr.conf
@@ -1269,6 +1269,10 @@ authentication_endpoint: none
 # authserver.
 authentication_timeout: 5
 
+# The timeout in seconds for librarian client socket operations.
+# datatype: integer
+client_socket_timeout: 60
+
 
 [librarian_gc]
 # The database user which will be used by this process.
diff --git a/lib/lp/services/librarian/client.py b/lib/lp/services/librarian/client.py
index 7be460f..083074a 100644
--- a/lib/lp/services/librarian/client.py
+++ b/lib/lp/services/librarian/client.py
@@ -83,6 +83,7 @@ class FileUploadClient:
         """
         try:
             self.state.s = socket.socket(AF_INET, SOCK_STREAM)
+            self.state.s.settimeout(config.librarian.client_socket_timeout)
             self.state.s.connect((self.upload_host, self.upload_port))
             self.state.f = self.state.s.makefile("rwb", 0)
 
@@ -96,13 +97,16 @@ class FileUploadClient:
 
     def _close(self):
         """Close connection"""
-        self.state.s_poll.unregister(self.state.s.fileno())
-        self.state.s_poll.close()
-        del self.state.s_poll
-        self.state.s.close()
-        del self.state.s
-        self.state.f.close()
-        del self.state.f
+        if hasattr(self.state, "s_poll"):
+            self.state.s_poll.unregister(self.state.s.fileno())
+            self.state.s_poll.close()
+            del self.state.s_poll
+        if hasattr(self.state, "s"):
+            self.state.s.close()
+            del self.state.s
+        if hasattr(self.state, "f"):
+            self.state.f.close()
+            del self.state.f
 
     def _checkError(self):
         poll_result = self.state.s_poll.poll(self.s_poll_timeout)
@@ -171,8 +175,9 @@ class FileUploadClient:
             LibraryFileContent,
         )
 
-        self._connect()
         try:
+            self._connect()
+
             # Get the name of the database the client is using, so that
             # the server can check that the client is using the same
             # database as the server.
@@ -259,6 +264,12 @@ class FileUploadClient:
                 aliasID,
             )
             return aliasID
+        except socket.timeout:
+            timeout = config.librarian.client_socket_timeout
+            raise UploadFailed(
+                "Server timed out after %s %s"
+                % (timeout, "second" if timeout == 1 else "seconds")
+            )
         finally:
             self._close()
 
diff --git a/lib/lp/services/librarian/tests/test_client.py b/lib/lp/services/librarian/tests/test_client.py
index aec3fb9..b5d3984 100644
--- a/lib/lp/services/librarian/tests/test_client.py
+++ b/lib/lp/services/librarian/tests/test_client.py
@@ -32,6 +32,7 @@ from lp.services.librarian.client import (
 from lp.services.librarian.interfaces.client import UploadFailed
 from lp.services.librarian.model import LibraryFileAlias
 from lp.services.propertycache import cachedproperty
+from lp.services.timeout import override_timeout, with_timeout
 from lp.testing import TestCase
 from lp.testing.layers import (
     DatabaseLayer,
@@ -231,6 +232,22 @@ class EchoServer(threading.Thread):
         self.socket.close()
 
 
+class TimingOutServer(EchoServer):
+    """Fake TCP server that never sends a reply."""
+
+    def run(self):
+        while not self.should_stop:
+            try:
+                conn, addr = self.socket.accept()
+                self.connections.append(conn)
+            except socket.timeout:
+                pass
+        for conn in self.connections:
+            conn.close()
+        self.connections = []
+        self.socket.close()
+
+
 class LibrarianClientTestCase(TestCase):
     layer = LaunchpadFunctionalLayer
 
@@ -307,6 +324,36 @@ class LibrarianClientTestCase(TestCase):
             "text/plain",
         )
 
+    def test_addFile_fails_when_server_times_out(self):
+        server = TimingOutServer()
+        server.start()
+        self.addCleanup(server.join)
+
+        upload_host, upload_port = server.socket.getsockname()
+        self.pushConfig(
+            "librarian",
+            upload_host=upload_host,
+            upload_port=upload_port,
+            client_socket_timeout=1,
+        )
+
+        client = LibrarianClient()
+
+        @with_timeout()
+        def add_file():
+            self.assertRaisesWithContent(
+                UploadFailed,
+                "Server timed out after 1 second",
+                client.addFile,
+                "sample.txt",
+                6,
+                io.BytesIO(b"sample"),
+                "text/plain",
+            )
+
+        with override_timeout(3600):
+            add_file()
+
     def test_addFile_uses_primary(self):
         # addFile is a write operation, so it should always use the
         # primary store, even if the standby is the default. Close the