[Merge] lp:~wgrant/launchpad/recover_failure-logging into lp:launchpad
William Grant has proposed merging lp:~wgrant/launchpad/recover_failure-logging into lp:launchpad with lp:~wgrant/launchpad/no-getBuildCookie as a prerequisite.
Commit message:
Log builder and job judgement results and actions in recover_failure.
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
For more details, see:
https://code.launchpad.net/~wgrant/launchpad/recover_failure-logging/+merge/224568
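The diff below hinges on judge_failure's verdicts: it returns a
(builder_action, job_action) pair in which each element is three-valued.
None means "leave it alone", False means "give up on it", and True means
"reset it and retry". Because None and False are both falsy in Python, the
code compares with == False / == True rather than plain truthiness. A
minimal, self-contained sketch of that convention (illustrative only, not
Launchpad code):

    def describe_job_action(job_action):
        # Dispatch on the three-valued verdict, as recover_failure does.
        if job_action is None:
            return "leave the job alone"
        elif job_action == False:  # "not job_action" would also match None
            return "fail and dequeue the job"
        else:
            return "requeue the job for retry"

    for action in (None, False, True):
        print("%r -> %s" % (action, describe_job_action(action)))
    # None -> leave the job alone
    # False -> fail and dequeue the job
    # True -> requeue the job for retry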
=== modified file 'lib/lp/buildmaster/manager.py'
--- lib/lp/buildmaster/manager.py 2014-06-26 08:48:50 +0000
+++ lib/lp/buildmaster/manager.py 2014-06-26 08:48:50 +0000
@@ -200,32 +200,47 @@
     builder_action, job_action = judge_failure(
         builder.failure_count, job.specific_build.failure_count if job else 0,
         exception, retry=retry and not cancelling)
+    if job is not None:
+        logger.info(
+            "Judged builder %s (%d failures) with job %s (%d failures): "
+            "%r, %r", builder.name, builder.failure_count, job.build_cookie,
+            job.specific_build.failure_count, builder_action, job_action)
+    else:
+        logger.info(
+            "Judged builder %s (%d failures) with no job: %r, %r",
+            builder.name, builder.failure_count, builder_action, job_action)
     if job is not None and job_action is not None:
-        if job_action == False:
-            # We've decided the job is bad, so unblame the builder.
-            builder.resetFailureCount()
-
         if cancelling:
             # We've previously been asked to cancel the job, so just set
             # it to cancelled rather than retrying or failing.
+            logger.info("Cancelling job %s.", job.build_cookie)
             job.markAsCancelled()
         elif job_action == False:
             # Fail and dequeue the job.
+            logger.info("Failing job %s.", job.build_cookie)
             job.specific_build.updateStatus(BuildStatus.FAILEDTOBUILD)
             job.destroySelf()
         elif job_action == True:
             # Reset the job so it will be retried elsewhere.
+            logger.info("Requeueing job %s.", job.build_cookie)
             job.reset()
 
+        if job_action == False:
+            # We've decided the job is bad, so unblame the builder.
+            logger.info("Resetting failure count of builder %s.", builder.name)
+            builder.resetFailureCount()
+
     if builder_action == False:
         # We've already tried resetting it enough times, so we have
         # little choice but to give up.
+        logger.info("Failing builder %s.", builder.name)
         builder.failBuilder(str(exception))
     elif builder_action == True:
         # Dirty the builder to attempt recovery. In the virtual case,
         # the dirty idleness will cause a reset, giving us a good chance
         # of recovery.
+        logger.info("Dirtying builder %s to attempt recovery.", builder.name)
         builder.setCleanStatus(BuilderCleanStatus.DIRTY)
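For reference, a standalone sketch of the log lines this hunk produces,
using the stdlib logging module and the format strings above (the builder
name "bob" and the cookie "PACKAGEBUILD-1" are invented for illustration):

    import logging
    from io import StringIO

    buf = StringIO()
    logger = logging.getLogger("recover-failure-demo")
    logger.setLevel(logging.INFO)
    logger.addHandler(logging.StreamHandler(buf))

    # Suppose judge_failure said: leave the builder alone, requeue the job.
    builder_action, job_action = None, True
    logger.info(
        "Judged builder %s (%d failures) with job %s (%d failures): "
        "%r, %r", "bob", 2, "PACKAGEBUILD-1", 2, builder_action, job_action)
    logger.info("Requeueing job %s.", "PACKAGEBUILD-1")

    print(buf.getvalue())
    # Judged builder bob (2 failures) with job PACKAGEBUILD-1 (2 failures): None, True
    # Requeueing job PACKAGEBUILD-1.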
=== modified file 'lib/lp/buildmaster/model/builder.py'
--- lib/lp/buildmaster/model/builder.py 2014-06-24 09:41:32 +0000
+++ lib/lp/buildmaster/model/builder.py 2014-06-26 08:48:50 +0000
@@ -296,17 +296,8 @@
     def handleFailure(self, logger):
         """See IBuilder."""
         self.gotFailure()
-        if self.currentjob is not None:
-            build_farm_job = self.current_build
-            build_farm_job.gotFailure()
-            logger.info(
-                "Builder %s failure count: %s, job '%s' failure count: %s" % (
-                    self.name, self.failure_count,
-                    build_farm_job.title, build_farm_job.failure_count))
-        else:
-            logger.info(
-                "Builder %s failure count: %s" % (
-                    self.name, self.failure_count))
+        if self.current_build is not None:
+            self.current_build.gotFailure()
 
 
 class BuilderProcessor(StormBase):
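With the judgement logging now emitted in recover_failure, handleFailure is
reduced to pure bookkeeping: bump the builder's failure count and, if a
build is in flight, the build's as well. A toy sketch of that counting
pattern (assumed shapes, not the real IBuilder interface):

    class FailureCounter:
        # Stand-in for the gotFailure()/resetFailureCount() pattern.
        def __init__(self):
            self.failure_count = 0

        def gotFailure(self):
            self.failure_count += 1

        def resetFailureCount(self):
            self.failure_count = 0

    def handle_failure(builder, current_build):
        # Mirrors the slimmed-down handleFailure: count, don't log.
        builder.gotFailure()
        if current_build is not None:
            current_build.gotFailure()

    builder, build = FailureCounter(), FailureCounter()
    handle_failure(builder, build)
    assert (builder.failure_count, build.failure_count) == (1, 1)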
=== modified file 'lib/lp/buildmaster/tests/test_manager.py'
--- lib/lp/buildmaster/tests/test_manager.py 2014-06-26 08:48:50 +0000
+++ lib/lp/buildmaster/tests/test_manager.py 2014-06-26 08:48:50 +0000
@@ -1046,23 +1046,27 @@
     def _recover_failure(self, fail_notes, retry=True):
         # Helper for recover_failure boilerplate.
+        logger = BufferLogger()
         recover_failure(
-            BufferLogger(), extract_vitals_from_db(self.builder), self.builder,
+            logger, extract_vitals_from_db(self.builder), self.builder,
             retry, Exception(fail_notes))
+        return logger.getLogBuffer()
 
     def test_job_reset_threshold_with_retry(self):
         naked_build = removeSecurityProxy(self.build)
         self.builder.failure_count = JOB_RESET_THRESHOLD - 1
         naked_build.failure_count = JOB_RESET_THRESHOLD - 1
 
-        self._recover_failure("failnotes")
+        log = self._recover_failure("failnotes")
+        self.assertNotIn("Requeueing job", log)
         self.assertIsNot(None, self.builder.currentjob)
         self.assertEqual(self.build.status, BuildStatus.BUILDING)
 
         self.builder.failure_count += 1
         naked_build.failure_count += 1
 
-        self._recover_failure("failnotes")
+        log = self._recover_failure("failnotes")
+        self.assertIn("Requeueing job", log)
         self.assertIs(None, self.builder.currentjob)
         self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
@@ -1071,7 +1075,8 @@
         self.builder.failure_count = 1
         naked_build.failure_count = 1
 
-        self._recover_failure("failnotes", retry=False)
+        log = self._recover_failure("failnotes", retry=False)
+        self.assertIn("Requeueing job", log)
         self.assertIs(None, self.builder.currentjob)
         self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
@@ -1083,7 +1088,8 @@
         self.builder.failure_count = 1
         naked_build.failure_count = 1
 
-        self._recover_failure("failnotes")
+        log = self._recover_failure("failnotes")
+        self.assertIn("Cancelling job", log)
         self.assertIs(None, self.builder.currentjob)
         self.assertEqual(BuildStatus.CANCELLED, self.build.status)
@@ -1092,7 +1098,9 @@
         self.build.gotFailure()
         self.builder.gotFailure()
 
-        self._recover_failure("failnotes")
+        log = self._recover_failure("failnotes")
+        self.assertIn("Failing job", log)
+        self.assertIn("Resetting failure count of builder", log)
         self.assertIs(None, self.builder.currentjob)
         self.assertEqual(self.build.status, BuildStatus.FAILEDTOBUILD)
         self.assertEqual(0, self.builder.failure_count)
@@ -1104,7 +1112,9 @@
         self.build.gotFailure()
         self.build.gotFailure()
         self.builder.gotFailure()
 
-        self._recover_failure("failnotes")
+        log = self._recover_failure("failnotes")
+        self.assertIn("Cancelling job", log)
+        self.assertIn("Resetting failure count of builder", log)
         self.assertIs(None, self.builder.currentjob)
         self.assertEqual(BuildStatus.CANCELLED, self.build.status)
@@ -1116,7 +1126,9 @@
         # builder or attempt to clean up a non-virtual builder.
         self.builder.failure_count = BUILDER_FAILURE_THRESHOLD - 1
         self.assertIsNot(None, self.builder.currentjob)
-        self._recover_failure("failnotes")
+        log = self._recover_failure("failnotes")
+        self.assertIn("Requeueing job RECIPEBRANCHBUILD-1", log)
+        self.assertIn("Dirtying builder %s" % self.builder.name, log)
         self.assertIs(None, self.builder.currentjob)
         self.assertEqual(BuilderCleanStatus.DIRTY, self.builder.clean_status)
         self.assertTrue(self.builder.builderok)
@@ -1125,7 +1137,9 @@
         # disabled.
         self.builder.failure_count = BUILDER_FAILURE_THRESHOLD
         self.buildqueue.markAsBuilding(self.builder)
-        self._recover_failure("failnotes")
+        log = self._recover_failure("failnotes")
+        self.assertIn("Requeueing job", log)
+        self.assertIn("Failing builder", log)
         self.assertIs(None, self.builder.currentjob)
         self.assertEqual(BuilderCleanStatus.DIRTY, self.builder.clean_status)
         self.assertFalse(self.builder.builderok)
@@ -1135,7 +1149,9 @@
         self.buildqueue.reset()
         self.builder.failure_count = BUILDER_FAILURE_THRESHOLD
 
-        self._recover_failure("failnotes")
+        log = self._recover_failure("failnotes")
+        self.assertIn("with no job", log)
+        self.assertIn("Failing builder", log)
         self.assertFalse(self.builder.builderok)
         self.assertEqual("failnotes", self.builder.failnotes)