← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] lp:~julian-edwards/launchpad/catch-eintr-bug-616693 into lp:launchpad/devel

 

Julian Edwards has proposed merging lp:~julian-edwards/launchpad/catch-eintr-bug-616693 into lp:launchpad/devel.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)
Related bugs:
  #616693 IBuilder.updateBuild() doesn't deal with EINTR
  https://bugs.launchpad.net/bugs/616693


Fixes the linked bug by checking for EINTR and re-trying a few times.
-- 
https://code.launchpad.net/~julian-edwards/launchpad/catch-eintr-bug-616693/+merge/32517
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~julian-edwards/launchpad/catch-eintr-bug-616693 into lp:launchpad/devel.
=== modified file 'lib/lp/buildmaster/model/builder.py'
--- lib/lp/buildmaster/model/builder.py	2010-08-06 10:48:49 +0000
+++ lib/lp/buildmaster/model/builder.py	2010-08-12 20:44:20 +0000
@@ -12,6 +12,7 @@
     'updateBuilderStatus',
     ]
 
+import errno
 import httplib
 import gzip
 import logging
@@ -200,24 +201,36 @@
     if logger:
         logger.debug('Checking %s' % builder.name)
 
-    try:
-        builder.checkSlaveAlive()
-        builder.rescueIfLost(logger)
-    # Catch only known exceptions.
-    # XXX cprov 2007-06-15 bug=120571: ValueError & TypeError catching is
-    # disturbing in this context. We should spend sometime sanitizing the
-    # exceptions raised in the Builder API since we already started the
-    # main refactoring of this area.
-    except (ValueError, TypeError, xmlrpclib.Fault,
-            BuildDaemonError), reason:
-        builder.failBuilder(str(reason))
-        if logger:
-            logger.warn(
-                "%s (%s) marked as failed due to: %s",
-                builder.name, builder.url, builder.failnotes, exc_info=True)
-    except socket.error, reason:
-        error_message = str(reason)
-        builder.handleTimeout(logger, error_message)
+    MAX_EINTR_RETRIES = 5 # pulling a number out of my a$$ here
+    eintr_retry_count = 0
+
+    while True:
+        try:
+            builder.checkSlaveAlive()
+            builder.rescueIfLost(logger)
+        # Catch only known exceptions.
+        # XXX cprov 2007-06-15 bug=120571: ValueError & TypeError catching is
+        # disturbing in this context. We should spend sometime sanitizing the
+        # exceptions raised in the Builder API since we already started the
+        # main refactoring of this area.
+        except (ValueError, TypeError, xmlrpclib.Fault,
+                BuildDaemonError), reason:
+            builder.failBuilder(str(reason))
+            if logger:
+                logger.warn(
+                    "%s (%s) marked as failed due to: %s",
+                    builder.name, builder.url, builder.failnotes, exc_info=True)
+        except socket.error, reason:
+            # In Python 2.6 we can use IOError instead.
+            if reason.errno == errno.EINTR:
+                eintr_retry_count += 1
+                if eintr_retry_count != MAX_EINTR_RETRIES:
+                    continue
+            error_message = str(reason)
+            builder.handleTimeout(logger, error_message)
+            return
+        else:
+            return
 
 
 class Builder(SQLBase):

=== modified file 'lib/lp/buildmaster/tests/test_builder.py'
--- lib/lp/buildmaster/tests/test_builder.py	2010-08-02 16:00:50 +0000
+++ lib/lp/buildmaster/tests/test_builder.py	2010-08-12 20:44:20 +0000
@@ -3,6 +3,9 @@
 
 """Test Builder features."""
 
+import errno
+import socket
+
 from zope.component import getUtility
 from zope.security.proxy import removeSecurityProxy
 
@@ -23,6 +26,7 @@
     BinaryPackageBuildBehavior)
 from lp.soyuz.tests.test_publishing import SoyuzTestPublisher
 from lp.testing import TestCaseWithFactory
+from lp.testing.fakemethod import FakeMethod
 
 
 class TestBuilder(TestCaseWithFactory):
@@ -43,6 +47,43 @@
         bq = builder.getBuildQueue()
         self.assertIs(None, bq)
 
+    def test_updateBuilderStatus_catches_repeated_EINTR(self):
+        # A single EINTR return from a socket operation should cause the
+        # operation to be retried, not fail/reset the builder.
+        builder = removeSecurityProxy(self.factory.makeBuilder())
+        builder.handleTimeout = FakeMethod()
+        builder.rescueIfLost = FakeMethod()
+
+        def _fake_checkSlaveAlive():
+            # Raise an EINTR error for all invocations.
+            raise socket.error(errno.EINTR, "fake eintr")
+
+        builder.checkSlaveAlive = _fake_checkSlaveAlive
+        builder.updateStatus()
+
+        # builder.updateStatus should eventually have called
+        # handleTimeout()
+        self.assertEqual(1, builder.handleTimeout.call_count)
+
+    def test_updateBuilderStatus_catches_single_EINTR(self):
+        builder = removeSecurityProxy(self.factory.makeBuilder())
+        builder.handleTimeout = FakeMethod()
+        builder.rescueIfLost = FakeMethod()
+        self.eintr_returned = False
+
+        def _fake_checkSlaveAlive():
+            # raise an EINTR error for the first invocation only.
+            if not self.eintr_returned:
+                self.eintr_returned = True
+                raise socket.error(errno.EINTR, "fake eintr")
+
+        builder.checkSlaveAlive = _fake_checkSlaveAlive
+        builder.updateStatus()
+
+        # builder.updateStatus should never call handleTimeout() for a
+        # single EINTR.
+        self.assertEqual(0, builder.handleTimeout.call_count)
+
 
 class TestFindBuildCandidateBase(TestCaseWithFactory):
     """Setup the test publisher and some builders."""