launchpad-reviewers team mailing list archive
-
launchpad-reviewers team
-
Mailing list archive
-
Message #29896
[Merge] ~lgp171188/launchpad:emit-statsd-metrics-build-failures into launchpad:master
Guruprasad has proposed merging ~lgp171188/launchpad:emit-statsd-metrics-build-failures into launchpad:master.
Commit message:
Emit statsd metrics when recovering from build job failure
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
For more details, see:
https://code.launchpad.net/~lgp171188/launchpad/+git/launchpad/+merge/440721
--
Your team Launchpad code reviewers is requested to review the proposed merge of ~lgp171188/launchpad:emit-statsd-metrics-build-failures into launchpad:master.
diff --git a/lib/lp/buildmaster/manager.py b/lib/lp/buildmaster/manager.py
index ae46630..31eed3f 100644
--- a/lib/lp/buildmaster/manager.py
+++ b/lib/lp/buildmaster/manager.py
@@ -397,10 +397,19 @@ def recover_failure(logger, vitals, builder, retry, exception):
)
if job is not None and job_action is not None:
+ statsd_client = getUtility(IStatsdClient)
+ labels = {
+ "build": True,
+ "arch": builder.current_build.processor.name,
+ "builder_name": builder.name,
+ "virtualized": str(builder.virtualized),
+ }
+
if cancelling:
# We've previously been asked to cancel the job, so just set
# it to cancelled rather than retrying or failing.
logger.info("Cancelling job %s.", job.build_cookie)
+ statsd_client.incr("builders.job_cancelled", labels=labels)
job.markAsCancelled()
elif job_action == False:
# Fail and dequeue the job.
@@ -421,15 +430,18 @@ def recover_failure(logger, vitals, builder, retry, exception):
job.specific_build.updateStatus(
BuildStatus.FAILEDTOBUILD, force_invalid_transition=True
)
+ statsd_client.incr("builders.job_failed", labels=labels)
job.destroySelf()
elif job_action == True:
# Reset the job so it will be retried elsewhere.
logger.info("Requeueing job %s.", job.build_cookie)
+ statsd_client.incr("builders.job_reset", labels=labels)
job.reset()
if job_action == False:
# We've decided the job is bad, so unblame the builder.
logger.info("Resetting failure count of builder %s.", builder.name)
+ statsd_client.incr("builders.job_bad", labels=labels)
builder.resetFailureCount()
if builder_action == False:
diff --git a/lib/lp/buildmaster/tests/test_manager.py b/lib/lp/buildmaster/tests/test_manager.py
index a568d64..28546e5 100644
--- a/lib/lp/buildmaster/tests/test_manager.py
+++ b/lib/lp/buildmaster/tests/test_manager.py
@@ -1385,7 +1385,7 @@ class TestBuilddManager(TestCase):
self.assertNotEqual(0, manager.flushLogTails.call_count)
-class TestFailureAssessments(TestCaseWithFactory):
+class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
layer = ZopelessDatabaseLayer
@@ -1396,6 +1396,7 @@ class TestFailureAssessments(TestCaseWithFactory):
self.buildqueue = self.build.queueBuild()
self.buildqueue.markAsBuilding(self.builder)
self.worker = OkWorker()
+ self.setUpStats()
def _recover_failure(self, fail_notes, retry=True):
# Helper for recover_failure boilerplate.
@@ -1409,6 +1410,28 @@ class TestFailureAssessments(TestCaseWithFactory):
)
return logger.getLogBuffer()
+ def assert_statsd_metrics_requeue(self):
+ build = removeSecurityProxy(self.build)
+ self.assertEqual(2, self.stats_client.incr.call_count)
+ self.stats_client.incr.assert_has_calls(
+ [
+ mock.call(
+ "builders.job_reset,arch={},build=True,builder_name={},"
+ "env=test,virtualized=True".format(
+ build.processor.name,
+ self.builder.name,
+ )
+ ),
+ mock.call(
+ "build.reset,arch={},builder_name={},env=test,"
+ "job_type=RECIPEBRANCHBUILD,virtualized=True".format(
+ build.processor.name,
+ self.builder.name,
+ )
+ ),
+ ]
+ )
+
def test_job_reset_threshold_with_retry(self):
naked_build = removeSecurityProxy(self.build)
self.builder.failure_count = JOB_RESET_THRESHOLD - 1
@@ -1426,6 +1449,7 @@ class TestFailureAssessments(TestCaseWithFactory):
self.assertIn("Requeueing job", log)
self.assertIs(None, self.builder.currentjob)
self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
+ self.assert_statsd_metrics_requeue()
def test_job_reset_threshold_no_retry(self):
naked_build = removeSecurityProxy(self.build)
@@ -1436,6 +1460,7 @@ class TestFailureAssessments(TestCaseWithFactory):
self.assertIn("Requeueing job", log)
self.assertIs(None, self.builder.currentjob)
self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
+ self.assert_statsd_metrics_requeue()
def test_reset_during_cancellation_cancels(self):
self.buildqueue.cancel()
@@ -1449,11 +1474,32 @@ class TestFailureAssessments(TestCaseWithFactory):
self.assertIn("Cancelling job", log)
self.assertIs(None, self.builder.currentjob)
self.assertEqual(BuildStatus.CANCELLED, self.build.status)
+ self.assertEqual(2, self.stats_client.incr.call_count)
+ self.stats_client.incr.assert_has_calls(
+ [
+ mock.call(
+ "builders.job_cancelled,arch={},build=True,"
+ "builder_name={},env=test,virtualized=True".format(
+ naked_build.processor.name,
+ self.builder.name,
+ )
+ ),
+ mock.call(
+ "build.finished,arch={},builder_name={},env=test,"
+ "job_type=RECIPEBRANCHBUILD,status=CANCELLED,"
+ "virtualized=True".format(
+ naked_build.processor.name,
+ self.builder.name,
+ )
+ ),
+ ]
+ )
def test_job_failing_more_than_builder_fails_job(self):
self.build.gotFailure()
self.build.gotFailure()
self.builder.gotFailure()
+ naked_build = removeSecurityProxy(self.build)
log = self._recover_failure("failnotes")
self.assertIn("Failing job", log)
@@ -1461,6 +1507,33 @@ class TestFailureAssessments(TestCaseWithFactory):
self.assertIs(None, self.builder.currentjob)
self.assertEqual(self.build.status, BuildStatus.FAILEDTOBUILD)
self.assertEqual(0, self.builder.failure_count)
+ self.assertEqual(3, self.stats_client.incr.call_count)
+ self.stats_client.incr.assert_has_calls(
+ [
+ mock.call(
+ "build.finished,arch={},builder_name={},env=test,"
+ "job_type=RECIPEBRANCHBUILD,status=FAILEDTOBUILD,"
+ "virtualized=True".format(
+ naked_build.processor.name,
+ self.builder.name,
+ )
+ ),
+ mock.call(
+ "builders.job_failed,arch={},build=True,"
+ "builder_name={},env=test,virtualized=True".format(
+ naked_build.processor.name,
+ self.builder.name,
+ )
+ ),
+ mock.call(
+ "builders.job_bad,arch={},build=True,builder_name={},"
+ "env=test,virtualized=True".format(
+ naked_build.processor.name,
+ self.builder.name,
+ )
+ ),
+ ]
+ )
def test_bad_job_does_not_unsucceed(self):
# If a FULLYBUILT build somehow ends up back in buildd-manager,
@@ -1473,6 +1546,7 @@ class TestFailureAssessments(TestCaseWithFactory):
self.build.gotFailure()
self.build.gotFailure()
self.builder.gotFailure()
+ naked_build = removeSecurityProxy(self.build)
log = self._recover_failure("failnotes")
self.assertIn("Failing job", log)
@@ -1481,6 +1555,33 @@ class TestFailureAssessments(TestCaseWithFactory):
self.assertIs(None, self.builder.currentjob)
self.assertEqual(self.build.status, BuildStatus.FULLYBUILT)
self.assertEqual(0, self.builder.failure_count)
+ self.assertEqual(3, self.stats_client.incr.call_count)
+ self.stats_client.incr.assert_has_calls(
+ [
+ mock.call(
+ "build.finished,arch={},builder_name={},env=test,"
+ "job_type=RECIPEBRANCHBUILD,status=FULLYBUILT,"
+ "virtualized=True".format(
+ naked_build.processor.name,
+ self.builder.name,
+ )
+ ),
+ mock.call(
+ "builders.job_failed,arch={},build=True,"
+ "builder_name={},env=test,virtualized=True".format(
+ naked_build.processor.name,
+ self.builder.name,
+ )
+ ),
+ mock.call(
+ "builders.job_bad,arch={},build=True,builder_name={},"
+ "env=test,virtualized=True".format(
+ naked_build.processor.name,
+ self.builder.name,
+ )
+ ),
+ ]
+ )
def test_failure_during_cancellation_cancels(self):
self.buildqueue.cancel()
@@ -1489,11 +1590,40 @@ class TestFailureAssessments(TestCaseWithFactory):
self.build.gotFailure()
self.build.gotFailure()
self.builder.gotFailure()
+ naked_build = removeSecurityProxy(self.build)
log = self._recover_failure("failnotes")
+
self.assertIn("Cancelling job", log)
self.assertIn("Resetting failure count of builder", log)
self.assertIs(None, self.builder.currentjob)
self.assertEqual(BuildStatus.CANCELLED, self.build.status)
+ self.assertEqual(3, self.stats_client.incr.call_count)
+ self.stats_client.incr.assert_has_calls(
+ [
+ mock.call(
+ "builders.job_cancelled,arch={},build=True,"
+ "builder_name={},env=test,virtualized=True".format(
+ naked_build.processor.name,
+ self.builder.name,
+ )
+ ),
+ mock.call(
+ "build.finished,arch={},builder_name={},env=test,"
+ "job_type=RECIPEBRANCHBUILD,status=CANCELLED,"
+ "virtualized=True".format(
+ naked_build.processor.name,
+ self.builder.name,
+ )
+ ),
+ mock.call(
+ "builders.job_bad,arch={},build=True,"
+ "builder_name={},env=test,virtualized=True".format(
+ naked_build.processor.name,
+ self.builder.name,
+ )
+ ),
+ ]
+ )
def test_bad_builder(self):
self.builder.setCleanStatus(BuilderCleanStatus.CLEAN)