launchpad-reviewers team mailing list archive
Re: [Merge] lp:~wgrant/launchpad/recover_failure-logging into lp:launchpad
Review: Approve
Just a minor nitpick in the diff re. dropping the unneeded 'logger' argument from IBuilder.handleFailure() and updating all the existing callsites. Feel free to do it later if it messes with a lot of tests.
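
For concreteness, a rough sketch of what I mean (illustrative only; the IBuilder declaration and the exact callsites aren't in this diff, so treat the names here as approximate):

    # In lib/lp/buildmaster/model/builder.py, handleFailure no longer
    # needs the logger now that it does no logging of its own.
    class Builder:
        def handleFailure(self):
            """See `IBuilder`."""
            self.gotFailure()
            if self.current_build is not None:
                self.current_build.gotFailure()

    # The IBuilder declaration would drop the parameter to match, and
    # callers in manager.py would then call builder.handleFailure()
    # with no logger argument.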
Diff comments:
> === modified file 'lib/lp/buildmaster/manager.py'
> --- lib/lp/buildmaster/manager.py 2014-06-26 08:48:50 +0000
> +++ lib/lp/buildmaster/manager.py 2014-06-26 08:48:50 +0000
> @@ -200,32 +200,47 @@
> builder_action, job_action = judge_failure(
> builder.failure_count, job.specific_build.failure_count if job else 0,
> exception, retry=retry and not cancelling)
> + if job is not None:
> + logger.info(
> + "Judged builder %s (%d failures) with job %s (%d failures): "
> + "%r, %r", builder.name, builder.failure_count, job.build_cookie,
> + job.specific_build.failure_count, builder_action, job_action)
> + else:
> + logger.info(
> + "Judged builder %s (%d failures) with no job: %r, %r",
> + builder.name, builder.failure_count, builder_action, job_action)
>
> if job is not None and job_action is not None:
> - if job_action == False:
> - # We've decided the job is bad, so unblame the builder.
> - builder.resetFailureCount()
> -
> if cancelling:
> # We've previously been asked to cancel the job, so just set
> # it to cancelled rather than retrying or failing.
> + logger.info("Cancelling job %s.", job.build_cookie)
> job.markAsCancelled()
> elif job_action == False:
> # Fail and dequeue the job.
> + logger.info("Failing job %s.", job.build_cookie)
> job.specific_build.updateStatus(BuildStatus.FAILEDTOBUILD)
> job.destroySelf()
> elif job_action == True:
> # Reset the job so it will be retried elsewhere.
> + logger.info("Requeueing job %s.", job.build_cookie)
> job.reset()
>
> + if job_action == False:
> + # We've decided the job is bad, so unblame the builder.
> + logger.info("Resetting failure count of builder %s.", builder.name)
> + builder.resetFailureCount()
> +
> if builder_action == False:
> # We've already tried resetting it enough times, so we have
> # little choice but to give up.
> + logger.info("Failing builder %s.", builder.name)
> builder.failBuilder(str(exception))
> elif builder_action == True:
> # Dirty the builder to attempt recovery. In the virtual case,
> # the dirty idleness will cause a reset, giving us a good chance
> # of recovery.
> + logger.info("Dirtying builder %s to attempt recovery.", builder.name)
> builder.setCleanStatus(BuilderCleanStatus.DIRTY)
>
>
>
> === modified file 'lib/lp/buildmaster/model/builder.py'
> --- lib/lp/buildmaster/model/builder.py 2014-06-24 09:41:32 +0000
> +++ lib/lp/buildmaster/model/builder.py 2014-06-26 08:48:50 +0000
> @@ -296,17 +296,8 @@
> def handleFailure(self, logger):
You don't need 'logger' any more; is it worth tweaking IBuilder and the callsites?
> """See IBuilder."""
> self.gotFailure()
> - if self.currentjob is not None:
> - build_farm_job = self.current_build
> - build_farm_job.gotFailure()
> - logger.info(
> - "Builder %s failure count: %s, job '%s' failure count: %s" % (
> - self.name, self.failure_count,
> - build_farm_job.title, build_farm_job.failure_count))
> - else:
> - logger.info(
> - "Builder %s failure count: %s" % (
> - self.name, self.failure_count))
> + if self.current_build is not None:
> + self.current_build.gotFailure()
>
>
> class BuilderProcessor(StormBase):
>
> === modified file 'lib/lp/buildmaster/tests/test_manager.py'
> --- lib/lp/buildmaster/tests/test_manager.py 2014-06-26 08:48:50 +0000
> +++ lib/lp/buildmaster/tests/test_manager.py 2014-06-26 08:48:50 +0000
> @@ -1046,23 +1046,27 @@
>
> def _recover_failure(self, fail_notes, retry=True):
> # Helper for recover_failure boilerplate.
> + logger = BufferLogger()
> recover_failure(
> - BufferLogger(), extract_vitals_from_db(self.builder), self.builder,
> + logger, extract_vitals_from_db(self.builder), self.builder,
> retry, Exception(fail_notes))
> + return logger.getLogBuffer()
>
> def test_job_reset_threshold_with_retry(self):
> naked_build = removeSecurityProxy(self.build)
> self.builder.failure_count = JOB_RESET_THRESHOLD - 1
> naked_build.failure_count = JOB_RESET_THRESHOLD - 1
>
> - self._recover_failure("failnotes")
> + log = self._recover_failure("failnotes")
> + self.assertNotIn("Requeueing job", log)
> self.assertIsNot(None, self.builder.currentjob)
> self.assertEqual(self.build.status, BuildStatus.BUILDING)
>
> self.builder.failure_count += 1
> naked_build.failure_count += 1
>
> - self._recover_failure("failnotes")
> + log = self._recover_failure("failnotes")
> + self.assertIn("Requeueing job", log)
> self.assertIs(None, self.builder.currentjob)
> self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
>
> @@ -1071,7 +1075,8 @@
> self.builder.failure_count = 1
> naked_build.failure_count = 1
>
> - self._recover_failure("failnotes", retry=False)
> + log = self._recover_failure("failnotes", retry=False)
> + self.assertIn("Requeueing job", log)
> self.assertIs(None, self.builder.currentjob)
> self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
>
> @@ -1083,7 +1088,8 @@
> self.builder.failure_count = 1
> naked_build.failure_count = 1
>
> - self._recover_failure("failnotes")
> + log = self._recover_failure("failnotes")
> + self.assertIn("Cancelling job", log)
> self.assertIs(None, self.builder.currentjob)
> self.assertEqual(BuildStatus.CANCELLED, self.build.status)
>
> @@ -1092,7 +1098,9 @@
> self.build.gotFailure()
> self.builder.gotFailure()
>
> - self._recover_failure("failnotes")
> + log = self._recover_failure("failnotes")
> + self.assertIn("Failing job", log)
> + self.assertIn("Resetting failure count of builder", log)
> self.assertIs(None, self.builder.currentjob)
> self.assertEqual(self.build.status, BuildStatus.FAILEDTOBUILD)
> self.assertEqual(0, self.builder.failure_count)
> @@ -1104,7 +1112,9 @@
> self.build.gotFailure()
> self.build.gotFailure()
> self.builder.gotFailure()
> - self._recover_failure("failnotes")
> + log = self._recover_failure("failnotes")
> + self.assertIn("Cancelling job", log)
> + self.assertIn("Resetting failure count of builder", log)
> self.assertIs(None, self.builder.currentjob)
> self.assertEqual(BuildStatus.CANCELLED, self.build.status)
>
> @@ -1116,7 +1126,9 @@
> # builder or attempt to clean up a non-virtual builder.
> self.builder.failure_count = BUILDER_FAILURE_THRESHOLD - 1
> self.assertIsNot(None, self.builder.currentjob)
> - self._recover_failure("failnotes")
> + log = self._recover_failure("failnotes")
> + self.assertIn("Requeueing job RECIPEBRANCHBUILD-1", log)
> + self.assertIn("Dirtying builder %s" % self.builder.name, log)
> self.assertIs(None, self.builder.currentjob)
> self.assertEqual(BuilderCleanStatus.DIRTY, self.builder.clean_status)
> self.assertTrue(self.builder.builderok)
> @@ -1125,7 +1137,9 @@
> # disabled.
> self.builder.failure_count = BUILDER_FAILURE_THRESHOLD
> self.buildqueue.markAsBuilding(self.builder)
> - self._recover_failure("failnotes")
> + log = self._recover_failure("failnotes")
> + self.assertIn("Requeueing job", log)
> + self.assertIn("Failing builder", log)
> self.assertIs(None, self.builder.currentjob)
> self.assertEqual(BuilderCleanStatus.DIRTY, self.builder.clean_status)
> self.assertFalse(self.builder.builderok)
> @@ -1135,7 +1149,9 @@
> self.buildqueue.reset()
> self.builder.failure_count = BUILDER_FAILURE_THRESHOLD
>
> - self._recover_failure("failnotes")
> + log = self._recover_failure("failnotes")
> + self.assertIn("with no job", log)
> + self.assertIn("Failing builder", log)
> self.assertFalse(self.builder.builderok)
> self.assertEqual("failnotes", self.builder.failnotes)
>
>
--
https://code.launchpad.net/~wgrant/launchpad/recover_failure-logging/+merge/224568
Your team Launchpad code reviewers is subscribed to branch lp:launchpad.