← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] lp:~julian-edwards/launchpad/buildd-failure-counting into lp:launchpad

 

Julian Edwards has proposed merging lp:~julian-edwards/launchpad/buildd-failure-counting into lp:launchpad with lp:~julian-edwards/launchpad/buildd-manager-parallel-scan as a prerequisite.

Requested reviews:
  Robert Collins (lifeless): db
  Stuart Bishop (stub): db
  Launchpad code reviewers (launchpad-reviewers)
Related bugs:
  #369109 Buildd-manager should deal with transient communication failures with builders
  https://bugs.launchpad.net/bugs/369109
  #586362 The buildd-manager shares failure modes for connection failures and reset failures
  https://bugs.launchpad.net/bugs/586362


Add failure counting on builder and jobs in the build farm, to work out which one is the nasty one when Stuff Goes Wrong.

WIP.
-- 
https://code.launchpad.net/~julian-edwards/launchpad/buildd-failure-counting/+merge/31556
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~julian-edwards/launchpad/buildd-failure-counting into lp:launchpad.
=== added file 'database/schema/patch-2207-99-0.sql'
--- database/schema/patch-2207-99-0.sql	1970-01-01 00:00:00 +0000
+++ database/schema/patch-2207-99-0.sql	2010-08-02 17:18:48 +0000
@@ -0,0 +1,12 @@
+-- Copyright 2010 Canonical Ltd.  This software is licensed under the
+-- GNU Affero General Public License version 3 (see the file LICENSE).
+
+ALTER TABLE Builder
+    ADD COLUMN failure_count integer not null default 0;
+
+ALTER TABLE BuildFarmJob
+    ADD COLUMN failure_count integer not null default 0;
+
+
+
+INSERT INTO LaunchpadDatabaseRevision VALUES (2207, 99, 0);

=== modified file 'lib/lp/buildmaster/configure.zcml'
--- lib/lp/buildmaster/configure.zcml	2010-06-15 14:34:50 +0000
+++ lib/lp/buildmaster/configure.zcml	2010-08-02 17:18:48 +0000
@@ -50,6 +50,9 @@
         <require
             permission="launchpad.Edit"
             set_attributes="status"/>
+        <require
+            permission="launchpad.Admin"
+            set_attributes="failure_count"/>
     </class>
     <securedutility
         component="lp.buildmaster.model.buildfarmjob.BuildFarmJob"

=== modified file 'lib/lp/buildmaster/interfaces/builder.py'
--- lib/lp/buildmaster/interfaces/builder.py	2010-08-02 17:18:46 +0000
+++ lib/lp/buildmaster/interfaces/builder.py	2010-08-02 17:18:48 +0000
@@ -20,7 +20,7 @@
     ]
 
 from zope.interface import Interface, Attribute
-from zope.schema import Bool, Choice, Field, Text, TextLine
+from zope.schema import Bool, Choice, Field, Int, Text, TextLine
 
 from canonical.launchpad import _
 from canonical.launchpad.fields import Title, Description
@@ -146,6 +146,10 @@
                 "new jobs. "),
         required=False)
 
+    failure_count = Int(
+        title=_('Failure Count'), required=False, default=0,
+        description=_("Number of consecutive failures for this builder."))
+
     current_build_behavior = Field(
         title=u"The current behavior of the builder for the current job.",
         required=False)

=== modified file 'lib/lp/buildmaster/interfaces/buildfarmjob.py'
--- lib/lp/buildmaster/interfaces/buildfarmjob.py	2010-06-16 14:12:20 +0000
+++ lib/lp/buildmaster/interfaces/buildfarmjob.py	2010-08-02 17:18:48 +0000
@@ -18,7 +18,7 @@
     ]
 
 from zope.interface import Interface, Attribute
-from zope.schema import Bool, Choice, Datetime, TextLine, Timedelta
+from zope.schema import Bool, Choice, Datetime, Int, TextLine, Timedelta
 from lazr.enum import DBEnumeratedType, DBItem
 from lazr.restful.declarations import exported
 from lazr.restful.fields import Reference
@@ -258,6 +258,11 @@
         vocabulary=BuildFarmJobType,
         description=_("The specific type of job."))
 
+    failure_count = Int(
+        title=_("Failure Count"), required=False, readonly=True,
+        default=0,
+        description=_("Number of consecutive failures for this job."))
+
     def getSpecificJob():
         """Return the specific build job associated with this record.
 

=== modified file 'lib/lp/buildmaster/manager.py'
--- lib/lp/buildmaster/manager.py	2010-08-02 17:18:46 +0000
+++ lib/lp/buildmaster/manager.py	2010-08-02 17:18:48 +0000
@@ -30,7 +30,8 @@
 from canonical.config import config
 from canonical.launchpad.webapp import urlappend
 from canonical.librarian.db import write_transaction
-from lp.buildmaster.interfaces.buildbase import BUILDD_MANAGER_LOG_NAME
+from lp.buildmaster.interfaces.buildbase import (
+    BuildStatus, BUILDD_MANAGER_LOG_NAME)
 from lp.services.twistedsupport.processmonitor import ProcessWithTimeout
 
 
@@ -157,6 +158,51 @@
         if job is not None:
             job.reset()
 
+    def _getBuilder(self):
+        # Helper to return the builder given the slave for this request.
+        # Avoiding circular imports.
+        from lp.buildmaster.interfaces.builder import IBuilderSet
+        return getUtility(IBuilderSet)[self.slave.name]
+
+    def assessFailureCounts(self):
+        """View builder/job failure_count and work out which needs to die.
+        
+        :return: True if we disabled something, False if we did not.
+        """
+        # Avoiding circular imports.
+        builder = self._getBuilder()
+        build_job = builder.currentjob.specific_job.build
+
+        if builder.failure_count == build_job.failure_count:
+            # This is either the first failure for this job on this
+            # builder, or by some chance the job was re-dispatched to
+            # the same builder.  This make it impossible to determine
+            # whether the job or the builder is at fault, so don't fail
+            # either.  We reset the builder and job to try again.
+            self._cleanJob(builder.currentjob)
+            return False
+
+        if builder.failure_count > build_job.failure_count:
+            # The builder has failed more than the jobs it's been
+            # running, so let's disable it and re-schedule the build.
+            builder.failBuilder(self.info)
+            self._cleanJob(builder.currentjob)
+            return True
+        else:
+            # The job is the culprit!  Override its status to 'failed'
+            # to make sure it won't get automatically dispatched again,
+            # and remove the buildqueue request.  The failure should
+            # have already caused any relevant slave data to be stored
+            # on the build record so don't worry about that here.
+            build_job.status = BuildStatus.FAILEDTOBUILD
+            builder.currentjob.destroySelf()
+
+            # N.B. We could try and call _handleStatus_PACKAGEFAIL here
+            # but that would cause us to query the slave for its status
+            # again, and if the slave is non-responsive it holds up the
+            # next buildd scan.
+            return True
+
     def ___call__(self):
         raise NotImplementedError(
             "Call sites must define an evaluation method.")
@@ -175,12 +221,7 @@
 
     @write_transaction
     def __call__(self):
-        # Avoiding circular imports.
-        from lp.buildmaster.interfaces.builder import IBuilderSet
-
-        builder = getUtility(IBuilderSet)[self.slave.name]
-        builder.failBuilder(self.info)
-        self._cleanJob(builder.currentjob)
+        self.assessFailureCounts()
 
 
 class ResetDispatchResult(BaseDispatchResult):
@@ -195,10 +236,7 @@
 
     @write_transaction
     def __call__(self):
-        # Avoiding circular imports.
-        from lp.buildmaster.interfaces.builder import IBuilderSet
-
-        builder = getUtility(IBuilderSet)[self.slave.name]
+        builder = self._getBuilder()
         # Builders that fail to reset should be disabled as per bug
         # 563353.
         # XXX Julian bug=586362
@@ -440,6 +478,13 @@
         self.logger.error('%s resume failure: %s' % (slave, error_text))
         return self.reset_result(slave, error_text)
 
+    def _incrementFailureCounts(self, slave):
+        # Avoid circular import.
+        from lp.buildmaster.interfaces.builder import IBuilderSet
+        builder = getUtility(IBuilderSet)[slave.name]
+        builder.failure_count += 1
+        builder.currentjob.specific_job.build.failure_count += 1
+
     def checkDispatch(self, response, method, slave):
         """Verify the results of a slave xmlrpc call.
 
@@ -457,7 +502,8 @@
                 '%s communication failed (%s)' %
                 (slave, response.getErrorMessage()))
             self.slaveConversationEnded()
-            return self.reset_result(slave)
+            self._incrementFailureCounts(slave)
+            return self.fail_result(slave)
 
         if isinstance(response, list) and len(response) == 2:
             if method in buildd_success_result_map:
@@ -475,6 +521,7 @@
             '%s failed to dispatch (%s)' % (slave, info))
 
         self.slaveConversationEnded()
+        self._incrementFailureCounts(slave)
         return self.fail_result(slave, info)
 
 

=== modified file 'lib/lp/buildmaster/model/builder.py'
--- lib/lp/buildmaster/model/builder.py	2010-08-02 17:18:46 +0000
+++ lib/lp/buildmaster/model/builder.py	2010-08-02 17:18:48 +0000
@@ -246,6 +246,7 @@
     manual = BoolCol(dbName='manual', default=False)
     vm_host = StringCol(dbName='vm_host')
     active = BoolCol(dbName='active', notNull=True, default=True)
+    failure_count = IntCol(dbName='failure_count', default=0, notNull=True)
 
     def _getCurrentBuildBehavior(self):
         """Return the current build behavior."""

=== modified file 'lib/lp/buildmaster/model/buildfarmjob.py'
--- lib/lp/buildmaster/model/buildfarmjob.py	2010-06-16 16:01:08 +0000
+++ lib/lp/buildmaster/model/buildfarmjob.py	2010-08-02 17:18:48 +0000
@@ -209,6 +209,8 @@
     job_type = DBEnum(
         name='job_type', allow_none=False, enum=BuildFarmJobType)
 
+    failure_count = Int(name='failure_count', allow_none=False)
+
     def __init__(self, job_type, status=BuildStatus.NEEDSBUILD,
                  processor=None, virtualized=None, date_created=None):
         super(BuildFarmJob, self).__init__()

=== modified file 'lib/lp/buildmaster/model/buildqueue.py'
--- lib/lp/buildmaster/model/buildqueue.py	2010-05-27 18:20:21 +0000
+++ lib/lp/buildmaster/model/buildqueue.py	2010-08-02 17:18:48 +0000
@@ -170,6 +170,7 @@
     def markAsBuilding(self, builder):
         """See `IBuildQueue`."""
         self.builder = builder
+        builder.failure_count = 0
         if self.job.status != JobStatus.RUNNING:
             self.job.start()
         self.specific_job.jobStarted()

=== modified file 'lib/lp/buildmaster/tests/test_builder.py'
--- lib/lp/buildmaster/tests/test_builder.py	2010-08-02 17:18:46 +0000
+++ lib/lp/buildmaster/tests/test_builder.py	2010-08-02 17:18:48 +0000
@@ -6,11 +6,13 @@
 from zope.component import getUtility
 from zope.security.proxy import removeSecurityProxy
 
+from canonical.database.sqlbase import flush_database_updates
 from canonical.launchpad.webapp.interfaces import (
     IStoreSelector, MAIN_STORE, DEFAULT_FLAVOR)
-from canonical.testing import LaunchpadZopelessLayer
+from canonical.testing.layers import (
+    DatabaseFunctionalLayer, LaunchpadZopelessLayer)
 from lp.buildmaster.interfaces.buildbase import BuildStatus
-from lp.buildmaster.interfaces.builder import IBuilderSet
+from lp.buildmaster.interfaces.builder import IBuilder, IBuilderSet
 from lp.buildmaster.interfaces.buildfarmjobbehavior import (
     IBuildFarmJobBehavior)
 from lp.buildmaster.interfaces.buildqueue import IBuildQueueSet
@@ -28,7 +30,17 @@
 class TestBuilder(TestCaseWithFactory):
     """Basic unit tests for `Builder`."""
 
-    layer = LaunchpadZopelessLayer
+    layer = DatabaseFunctionalLayer
+
+    def test_providesInterface(self):
+        # Builder provides IBuilder
+        builder = self.factory.makeBuilder()
+        self.assertProvides(builder, IBuilder)
+
+    def test_default_values(self):
+        builder = self.factory.makeBuilder()
+        flush_database_updates()
+        self.assertEqual(0, builder.failure_count)
 
     def test_getBuildQueue(self):
         buildqueueset = getUtility(IBuildQueueSet)

=== modified file 'lib/lp/buildmaster/tests/test_buildfarmjob.py'
--- lib/lp/buildmaster/tests/test_buildfarmjob.py	2010-07-18 17:43:23 +0000
+++ lib/lp/buildmaster/tests/test_buildfarmjob.py	2010-08-02 17:18:48 +0000
@@ -80,6 +80,8 @@
         # The job type is required to create a build farm job.
         self.assertEqual(
             BuildFarmJobType.PACKAGEBUILD, self.build_farm_job.job_type)
+        # Failure count defaults to zero.
+        self.assertEqual(0, self.build_farm_job.failure_count)
         # Other attributes are unset by default.
         self.assertEqual(None, self.build_farm_job.processor)
         self.assertEqual(None, self.build_farm_job.virtualized)

=== modified file 'lib/lp/buildmaster/tests/test_buildqueue.py'
--- lib/lp/buildmaster/tests/test_buildqueue.py	2010-07-18 17:43:23 +0000
+++ lib/lp/buildmaster/tests/test_buildqueue.py	2010-08-02 17:18:48 +0000
@@ -220,6 +220,22 @@
             "An active build job must have a builder.")
 
 
+class TestBuildQueue(TestCaseWithFactory):
+    """Unit tests for `BuildQueue`."""
+
+    layer = LaunchpadZopelessLayer
+
+    def setUp(self):
+        super(TestBuildQueue, self).setUp()
+        self.buildqueue = self.factory.makeSourcePackageRecipeBuildJob()
+
+    def test_markAsBuilding_resets_failure_count(self):
+        builder = self.factory.makeBuilder()
+        builder.failure_count = 2
+        self.buildqueue.markAsBuilding(builder)
+        self.assertEqual(0, builder.failure_count)
+
+
 class TestBuildQueueBase(TestCaseWithFactory):
     """Setup the test publisher and some builders."""
 

=== modified file 'lib/lp/buildmaster/tests/test_manager.py'
--- lib/lp/buildmaster/tests/test_manager.py	2010-08-02 17:18:46 +0000
+++ lib/lp/buildmaster/tests/test_manager.py	2010-08-02 17:18:48 +0000
@@ -221,7 +221,7 @@
 
 class TestSlaveScanner(TrialTestCase):
     """Tests for the actual build slave manager."""
-    layer = TwistedLayer
+    layer = LaunchpadZopelessLayer
 
     def setUp(self):
         TrialTestCase.setUp(self)
@@ -360,9 +360,17 @@
             self.assertFalse(result.processed)
         return d.addCallback(check_result)
 
-    def testCheckDispatch(self):
-        """`SlaveScanner.checkDispatch` is chained after dispatch requests.
-
+    def _setUpSlaveAndBuilder(self):
+        # Helper function to set up a builder and its recording slave.
+        slave = RecordingSlave('bob', 'http://foo.buildd:8221/', 'foo.host')
+        bob_builder = getUtility(IBuilderSet)[slave.name]
+        return slave, bob_builder
+
+    def test_checkDispatch_success(self):
+        # SlaveScanner.checkDispatch returns None for a successful
+        # dispatch.
+
+        """
         If the dispatch request fails or a unknown method is given, it
         returns a `FailDispatchResult` (in the test context) that will
         be evaluated later.
@@ -385,7 +393,9 @@
 
         On success dispatching it returns None.
         """
-        slave = RecordingSlave('foo', 'http://foo.buildd:8221/', 'foo.host')
+        slave, bob_builder = self._setUpSlaveAndBuilder()
+        bob_builder.failure_count = 0
+        bob_builder.currentjob.specific_job.build.failure_count = 0
 
         # Successful legitimate response, None is returned.
         successful_response = [
@@ -395,39 +405,75 @@
         self.assertEqual(
             None, result, 'Successful dispatch checks should return None')
 
-        # Failed legitimate response, results in a `FailDispatchResult`.
+    def test_checkDispatch_first_fail(self):
+        # Failed legitimate response, results in FailDispatchResult and
+        # failure_count on the job and the builder are both incremented.
+        slave, bob_builder = self._setUpSlaveAndBuilder()
+        bob_builder.failure_count = 0
+        bob_builder.currentjob.specific_job.build.failure_count = 0
+
         failed_response = [False, 'uncool builder']
         result = self.manager.checkDispatch(
             failed_response, 'ensurepresent', slave)
         self.assertIsDispatchFail(result)
         self.assertEqual(
-            '<foo:http://foo.buildd:8221/> failure (uncool builder)',
-            repr(result))
-
-        # Twisted Failure response, results in a `ResetDispatchResult`.
+            repr(result),
+            '<bob:http://foo.buildd:8221/> failure (uncool builder)')
+        self.assertEqual(1, bob_builder.failure_count)
+        self.assertEqual(
+            1, bob_builder.currentjob.specific_job.build.failure_count)
+
+    def test_checkDispatch_second_reset_fail_by_builder(self):
+        # Twisted Failure response, results in a `FailDispatchResult`.
+        slave, bob_builder = self._setUpSlaveAndBuilder()
+        bob_builder.failure_count = 1
+        bob_builder.currentjob.specific_job.build.failure_count = 0
+
         twisted_failure = Failure(ConnectionClosed('Boom!'))
         result = self.manager.checkDispatch(
             twisted_failure, 'ensurepresent', slave)
-        self.assertIsDispatchReset(result)
-        self.assertEqual(
-            '<foo:http://foo.buildd:8221/> reset failure', repr(result))
+        self.assertIsDispatchFail(result)
+        self.assertEqual(
+            '<bob:http://foo.buildd:8221/> failure (None)', repr(result))
+        self.assertEqual(2, bob_builder.failure_count)
+        self.assertEqual(
+            1, bob_builder.currentjob.specific_job.build.failure_count)
 
+    def test_checkDispatch_second_comms_fail_by_builder(self):
         # Unexpected response, results in a `FailDispatchResult`.
+        slave, bob_builder = self._setUpSlaveAndBuilder()
+        bob_builder.failure_count = 1
+        bob_builder.currentjob.specific_job.build.failure_count = 0
+
         unexpected_response = [1, 2, 3]
         result = self.manager.checkDispatch(
             unexpected_response, 'build', slave)
         self.assertIsDispatchFail(result)
         self.assertEqual(
-            '<foo:http://foo.buildd:8221/> failure '
+            '<bob:http://foo.buildd:8221/> failure '
             '(Unexpected response: [1, 2, 3])', repr(result))
-
-        # Unknown method was given, results in a `FailDispatchResult`
+        self.assertEqual(2, bob_builder.failure_count)
+        self.assertEqual(
+            1, bob_builder.currentjob.specific_job.build.failure_count)
+
+    def test_checkDispatch_second_comms_fail_by_job(self):
+        # Unknown method was given, results in a `FailDispatchResult`.
+        # This could be caused by a faulty job which would fail the job.
+        slave, bob_builder = self._setUpSlaveAndBuilder()
+        bob_builder.failure_count = 0
+        bob_builder.currentjob.specific_job.build.failure_count = 1
+
+        successful_response = [
+            buildd_success_result_map.get('ensurepresent'), 'cool builder']
         result = self.manager.checkDispatch(
             successful_response, 'unknown-method', slave)
         self.assertIsDispatchFail(result)
         self.assertEqual(
-            '<foo:http://foo.buildd:8221/> failure '
+            '<bob:http://foo.buildd:8221/> failure '
             '(Unknown slave method: unknown-method)', repr(result))
+        self.assertEqual(1, bob_builder.failure_count)
+        self.assertEqual(
+            2, bob_builder.currentjob.specific_job.build.failure_count)
 
     def test_initiateDispatch(self):
         """Check `dispatchBuild` in various scenarios.
@@ -450,7 +496,7 @@
         def check_events(results):
             [error] = [r for s, r in results if r is not None]
             self.assertEqual(
-                '<foo:http://foo.buildd:8221/> failure (very broken slave)',
+                '<bob:http://bob.buildd:8221/> failure (very broken slave)',
                 repr(error))
             self.assertTrue(error.processed)
 
@@ -463,7 +509,7 @@
             dl.addCallback(check_events)
 
         # A functional slave charged with some interactions.
-        slave = RecordingSlave('foo', 'http://foo.buildd:8221/', 'foo.host')
+        slave = RecordingSlave('bob', 'http://bob.buildd:8221/', 'bob.host')
         slave.ensurepresent('arg1', 'arg2', 'arg3')
         slave.build('arg1', 'arg2', 'arg3')
 
@@ -495,7 +541,7 @@
         # Create a broken slave and insert interaction that will
         # cause the builder to be marked as fail.
         self.test_proxy = TestingXMLRPCProxy('very broken slave')
-        slave = RecordingSlave('foo', 'http://foo.buildd:8221/', 'foo.host')
+        slave = RecordingSlave('bob', 'http://bob.buildd:8221/', 'bob.host')
         slave.ensurepresent('arg1', 'arg2', 'arg3')
         slave.build('arg1', 'arg2', 'arg3')
 
@@ -784,22 +830,24 @@
 
         return builder, job.id
 
-    def assertJobIsClean(self, job_id):
-        """Re-fetch the `IBuildQueue` record and check if it's clean."""
-        job = getUtility(IBuildQueueSet).get(job_id)
-        build = getUtility(IBinaryPackageBuildSet).getByQueueEntry(job)
-        self.assertEqual('NEEDSBUILD', build.status.name)
-        self.assertEqual(None, job.builder)
-        self.assertEqual(None, job.date_started)
-        self.assertEqual(None, job.logtail)
+    def assertBuildqueueIsClean(self, buildqueue):
+        # Check that the buildqueue is reset.
+        self.assertEqual(None, buildqueue.builder)
+        self.assertEqual(None, buildqueue.date_started)
+        self.assertEqual(None, buildqueue.logtail)
+
+    def assertBuilderIsClean(self, builder):
+        # Check that the builder is ready for a new build.
+        self.assertTrue(builder.builderok)
+        self.assertTrue(builder.failnotes is None)
+        self.assertTrue(builder.currentjob is None)
 
     def testResetDispatchResult(self):
-        """`ResetDispatchResult` clean any existing jobs.
-
-        Although it keeps the builder active in pool.
-        """
+        # Test that `ResetDispatchResult` resets the builder and job.
         builder, job_id = self._getBuilder('bob')
+        buildqueue_id = builder.currentjob.id
         builder.builderok = True
+        builder.failure_count = 1
 
         # Setup a interaction to satisfy 'write_transaction' decorator.
         login(ANONYMOUS)
@@ -807,32 +855,82 @@
         result = ResetDispatchResult(slave)
         result()
 
-        self.assertJobIsClean(job_id)
+        buildqueue = getUtility(IBuildQueueSet).get(buildqueue_id)
+        self.assertBuildqueueIsClean(buildqueue)
 
         # XXX Julian
         # Disabled test until bug 586362 is fixed.
         #self.assertFalse(builder.builderok)
-        self.assertEqual(None, builder.currentjob)
+        self.assertBuilderIsClean(builder)
 
     def testFailDispatchResult(self):
-        """`FailDispatchResult` excludes the builder from pool.
-
-        It marks the build as failed (builderok=False) and clean any
-        existing jobs.
-        """
+        # Test that `FailDispatchResult` calls assessFailureCounts().
         builder, job_id = self._getBuilder('bob')
 
         # Setup a interaction to satisfy 'write_transaction' decorator.
         login(ANONYMOUS)
         slave = RecordingSlave(builder.name, builder.url, builder.vm_host)
         result = FailDispatchResult(slave, 'does not work!')
+        result.assessFailureCounts = FakeMethod()
+        self.assertEqual(0, result.assessFailureCounts.call_count)
         result()
-
-        self.assertJobIsClean(job_id)
-
+        self.assertEqual(1, result.assessFailureCounts.call_count)
+
+    def _setup_failing_dispatch_result(self):
+        # assessFailureCounts should fail jobs or builders depending on
+        # whether it sees the failure_counts on each increasing.
+        builder, job_id = self._getBuilder('bob')
+        slave = RecordingSlave(builder.name, builder.url, builder.vm_host)
+        result = FailDispatchResult(slave, 'does not work!')
+        return builder, result
+
+    def test_assessFailureCounts_equal_failures(self):
+        # Basic case where the failure counts are equal and the job is
+        # reset to try again & the builder is not failed.
+        builder, result = self._setup_failing_dispatch_result()
+        buildqueue = builder.currentjob
+        build = buildqueue.specific_job.build
+        builder.failure_count = 2
+        build.failure_count = 2
+        disabled = result.assessFailureCounts()
+
+        self.assertFalse(disabled)
+        self.assertBuilderIsClean(builder)
+        self.assertEqual('NEEDSBUILD', build.status.name)
+        self.assertBuildqueueIsClean(buildqueue)
+
+    def test_assessFailureCounts_job_failed(self):
+        # Case where the job has failed more than the builder.
+        builder, result = self._setup_failing_dispatch_result()
+        buildqueue = builder.currentjob
+        build = buildqueue.specific_job.build
+        build.failure_count = 2
+        builder.failure_count = 1
+        disabled = result.assessFailureCounts()
+
+        self.assertTrue(disabled)
+        self.assertBuilderIsClean(builder)
+        self.assertEqual('FAILEDTOBUILD', build.status.name)
+        # The buildqueue should have been removed entirely.
+        self.assertEqual(
+            None, getUtility(IBuildQueueSet).getByBuilder(builder),
+            "Buildqueue was not removed when it should be.")
+
+    def test_assessFailureCounts_builder_failed(self):
+        # Case where the builder has failed more than the job.
+        builder, result = self._setup_failing_dispatch_result()
+        buildqueue = builder.currentjob
+        build = buildqueue.specific_job.build
+        build.failure_count = 2
+        builder.failure_count = 3
+        disabled = result.assessFailureCounts()
+
+        self.assertTrue(disabled)
         self.assertFalse(builder.builderok)
-        self.assertEqual(None, builder.currentjob)
         self.assertEqual('does not work!', builder.failnotes)
+        self.assertTrue(builder.currentjob is None)
+        self.assertEqual('NEEDSBUILD', build.status.name)
+        self.assertBuildqueueIsClean(buildqueue)
 
 
 class TestBuilddManager(TrialTestCase):

=== modified file 'lib/lp/soyuz/configure.zcml'
--- lib/lp/soyuz/configure.zcml	2010-06-30 17:35:36 +0000
+++ lib/lp/soyuz/configure.zcml	2010-08-02 17:18:48 +0000
@@ -510,6 +510,15 @@
             permission="launchpad.Edit"
             set_attributes="log date_finished date_started builder
                             status dependencies upload_log"/>
+
+        <!-- XXX bigjools 2010-07-27 bug=570939
+             This should not be required once the old BuildFarmJob stuff is
+             removed when the Translation Template jobs and Recipe jobs
+             use the new infrastructure -->
+        <require
+            permission="launchpad.Admin"
+            set_attributes="failure_count"/>
+
     </class>
     <adapter
         provides="canonical.launchpad.webapp.interfaces.IBreadcrumb"


Follow ups