← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] lp:~wgrant/launchpad/recover_failure-logging into lp:launchpad

 

William Grant has proposed merging lp:~wgrant/launchpad/recover_failure-logging into lp:launchpad with lp:~wgrant/launchpad/no-getBuildCookie as a prerequisite.

Commit message:
Log builder and job judgement results and actions in recover_failure.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~wgrant/launchpad/recover_failure-logging/+merge/224568

Log builder and job judgement results and actions in recover_failure.
-- 
https://code.launchpad.net/~wgrant/launchpad/recover_failure-logging/+merge/224568
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~wgrant/launchpad/recover_failure-logging into lp:launchpad.
=== modified file 'lib/lp/buildmaster/manager.py'
--- lib/lp/buildmaster/manager.py	2014-06-26 08:48:50 +0000
+++ lib/lp/buildmaster/manager.py	2014-06-26 08:48:50 +0000
@@ -200,32 +200,47 @@
     builder_action, job_action = judge_failure(
         builder.failure_count, job.specific_build.failure_count if job else 0,
         exception, retry=retry and not cancelling)
+    if job is not None:
+        logger.info(
+            "Judged builder %s (%d failures) with job %s (%d failures): "
+            "%r, %r", builder.name, builder.failure_count, job.build_cookie,
+            job.specific_build.failure_count, builder_action, job_action)
+    else:
+        logger.info(
+            "Judged builder %s (%d failures) with no job: %r, %r",
+            builder.name, builder.failure_count, builder_action, job_action)
 
     if job is not None and job_action is not None:
-        if job_action == False:
-            # We've decided the job is bad, so unblame the builder.
-            builder.resetFailureCount()
-
         if cancelling:
             # We've previously been asked to cancel the job, so just set
             # it to cancelled rather than retrying or failing.
+            logger.info("Cancelling job %s.", job.build_cookie)
             job.markAsCancelled()
         elif job_action == False:
             # Fail and dequeue the job.
+            logger.info("Failing job %s.", job.build_cookie)
             job.specific_build.updateStatus(BuildStatus.FAILEDTOBUILD)
             job.destroySelf()
         elif job_action == True:
             # Reset the job so it will be retried elsewhere.
+            logger.info("Requeueing job %s.", job.build_cookie)
             job.reset()
 
+        if job_action == False:
+            # We've decided the job is bad, so unblame the builder.
+            logger.info("Resetting failure count of builder %s.", builder.name)
+            builder.resetFailureCount()
+
     if builder_action == False:
         # We've already tried resetting it enough times, so we have
         # little choice but to give up.
+        logger.info("Failing builder %s.", builder.name)
         builder.failBuilder(str(exception))
     elif builder_action == True:
         # Dirty the builder to attempt recovery. In the virtual case,
         # the dirty idleness will cause a reset, giving us a good chance
         # of recovery.
+        logger.info("Dirtying builder %s to attempt recovery.", builder.name)
         builder.setCleanStatus(BuilderCleanStatus.DIRTY)
 
 

=== modified file 'lib/lp/buildmaster/model/builder.py'
--- lib/lp/buildmaster/model/builder.py	2014-06-24 09:41:32 +0000
+++ lib/lp/buildmaster/model/builder.py	2014-06-26 08:48:50 +0000
@@ -296,17 +296,8 @@
     def handleFailure(self, logger):
         """See IBuilder."""
         self.gotFailure()
-        if self.currentjob is not None:
-            build_farm_job = self.current_build
-            build_farm_job.gotFailure()
-            logger.info(
-                "Builder %s failure count: %s, job '%s' failure count: %s" % (
-                    self.name, self.failure_count,
-                    build_farm_job.title, build_farm_job.failure_count))
-        else:
-            logger.info(
-                "Builder %s failure count: %s" % (
-                    self.name, self.failure_count))
+        if self.current_build is not None:
+            self.current_build.gotFailure()
 
 
 class BuilderProcessor(StormBase):

=== modified file 'lib/lp/buildmaster/tests/test_manager.py'
--- lib/lp/buildmaster/tests/test_manager.py	2014-06-26 08:48:50 +0000
+++ lib/lp/buildmaster/tests/test_manager.py	2014-06-26 08:48:50 +0000
@@ -1046,23 +1046,27 @@
 
     def _recover_failure(self, fail_notes, retry=True):
         # Helper for recover_failure boilerplate.
+        logger = BufferLogger()
         recover_failure(
-            BufferLogger(), extract_vitals_from_db(self.builder), self.builder,
+            logger, extract_vitals_from_db(self.builder), self.builder,
             retry, Exception(fail_notes))
+        return logger.getLogBuffer()
 
     def test_job_reset_threshold_with_retry(self):
         naked_build = removeSecurityProxy(self.build)
         self.builder.failure_count = JOB_RESET_THRESHOLD - 1
         naked_build.failure_count = JOB_RESET_THRESHOLD - 1
 
-        self._recover_failure("failnotes")
+        log = self._recover_failure("failnotes")
+        self.assertNotIn("Requeueing job", log)
         self.assertIsNot(None, self.builder.currentjob)
         self.assertEqual(self.build.status, BuildStatus.BUILDING)
 
         self.builder.failure_count += 1
         naked_build.failure_count += 1
 
-        self._recover_failure("failnotes")
+        log = self._recover_failure("failnotes")
+        self.assertIn("Requeueing job", log)
         self.assertIs(None, self.builder.currentjob)
         self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
 
@@ -1071,7 +1075,8 @@
         self.builder.failure_count = 1
         naked_build.failure_count = 1
 
-        self._recover_failure("failnotes", retry=False)
+        log = self._recover_failure("failnotes", retry=False)
+        self.assertIn("Requeueing job", log)
         self.assertIs(None, self.builder.currentjob)
         self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
 
@@ -1083,7 +1088,8 @@
         self.builder.failure_count = 1
         naked_build.failure_count = 1
 
-        self._recover_failure("failnotes")
+        log = self._recover_failure("failnotes")
+        self.assertIn("Cancelling job", log)
         self.assertIs(None, self.builder.currentjob)
         self.assertEqual(BuildStatus.CANCELLED, self.build.status)
 
@@ -1092,7 +1098,9 @@
         self.build.gotFailure()
         self.builder.gotFailure()
 
-        self._recover_failure("failnotes")
+        log = self._recover_failure("failnotes")
+        self.assertIn("Failing job", log)
+        self.assertIn("Resetting failure count of builder", log)
         self.assertIs(None, self.builder.currentjob)
         self.assertEqual(self.build.status, BuildStatus.FAILEDTOBUILD)
         self.assertEqual(0, self.builder.failure_count)
@@ -1104,7 +1112,9 @@
         self.build.gotFailure()
         self.build.gotFailure()
         self.builder.gotFailure()
-        self._recover_failure("failnotes")
+        log = self._recover_failure("failnotes")
+        self.assertIn("Cancelling job", log)
+        self.assertIn("Resetting failure count of builder", log)
         self.assertIs(None, self.builder.currentjob)
         self.assertEqual(BuildStatus.CANCELLED, self.build.status)
 
@@ -1116,7 +1126,9 @@
         # builder or attempt to clean up a non-virtual builder.
         self.builder.failure_count = BUILDER_FAILURE_THRESHOLD - 1
         self.assertIsNot(None, self.builder.currentjob)
-        self._recover_failure("failnotes")
+        log = self._recover_failure("failnotes")
+        self.assertIn("Requeueing job RECIPEBRANCHBUILD-1", log)
+        self.assertIn("Dirtying builder %s" % self.builder.name, log)
         self.assertIs(None, self.builder.currentjob)
         self.assertEqual(BuilderCleanStatus.DIRTY, self.builder.clean_status)
         self.assertTrue(self.builder.builderok)
@@ -1125,7 +1137,9 @@
         # disabled.
         self.builder.failure_count = BUILDER_FAILURE_THRESHOLD
         self.buildqueue.markAsBuilding(self.builder)
-        self._recover_failure("failnotes")
+        log = self._recover_failure("failnotes")
+        self.assertIn("Requeueing job", log)
+        self.assertIn("Failing builder", log)
         self.assertIs(None, self.builder.currentjob)
         self.assertEqual(BuilderCleanStatus.DIRTY, self.builder.clean_status)
         self.assertFalse(self.builder.builderok)
@@ -1135,7 +1149,9 @@
         self.buildqueue.reset()
         self.builder.failure_count = BUILDER_FAILURE_THRESHOLD
 
-        self._recover_failure("failnotes")
+        log = self._recover_failure("failnotes")
+        self.assertIn("with no job", log)
+        self.assertIn("Failing builder", log)
         self.assertFalse(self.builder.builderok)
         self.assertEqual("failnotes", self.builder.failnotes)
 


Follow ups