← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] ~lgp171188/launchpad:emit-statsd-metrics-build-failures into launchpad:master

 

Guruprasad has proposed merging ~lgp171188/launchpad:emit-statsd-metrics-build-failures into launchpad:master.

Commit message:
Emit statsd metrics when recovering from build job failure

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~lgp171188/launchpad/+git/launchpad/+merge/440721
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of ~lgp171188/launchpad:emit-statsd-metrics-build-failures into launchpad:master.
diff --git a/lib/lp/buildmaster/manager.py b/lib/lp/buildmaster/manager.py
index ae46630..31eed3f 100644
--- a/lib/lp/buildmaster/manager.py
+++ b/lib/lp/buildmaster/manager.py
@@ -397,10 +397,19 @@ def recover_failure(logger, vitals, builder, retry, exception):
         )
 
     if job is not None and job_action is not None:
+        statsd_client = getUtility(IStatsdClient)
+        labels = {
+            "build": True,
+            "arch": builder.current_build.processor.name,
+            "builder_name": builder.name,
+            "virtualized": str(builder.virtualized),
+        }
+
         if cancelling:
             # We've previously been asked to cancel the job, so just set
             # it to cancelled rather than retrying or failing.
             logger.info("Cancelling job %s.", job.build_cookie)
+            statsd_client.incr("builders.job_cancelled", labels=labels)
             job.markAsCancelled()
         elif job_action == False:
             # Fail and dequeue the job.
@@ -421,15 +430,18 @@ def recover_failure(logger, vitals, builder, retry, exception):
                 job.specific_build.updateStatus(
                     BuildStatus.FAILEDTOBUILD, force_invalid_transition=True
                 )
+            statsd_client.incr("builders.job_failed", labels=labels)
             job.destroySelf()
         elif job_action == True:
             # Reset the job so it will be retried elsewhere.
             logger.info("Requeueing job %s.", job.build_cookie)
+            statsd_client.incr("builders.job_reset", labels=labels)
             job.reset()
 
         if job_action == False:
             # We've decided the job is bad, so unblame the builder.
             logger.info("Resetting failure count of builder %s.", builder.name)
+            statsd_client.incr("builders.job_bad", labels=labels)
             builder.resetFailureCount()
 
     if builder_action == False:
diff --git a/lib/lp/buildmaster/tests/test_manager.py b/lib/lp/buildmaster/tests/test_manager.py
index a568d64..28546e5 100644
--- a/lib/lp/buildmaster/tests/test_manager.py
+++ b/lib/lp/buildmaster/tests/test_manager.py
@@ -1385,7 +1385,7 @@ class TestBuilddManager(TestCase):
         self.assertNotEqual(0, manager.flushLogTails.call_count)
 
 
-class TestFailureAssessments(TestCaseWithFactory):
+class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
 
     layer = ZopelessDatabaseLayer
 
@@ -1396,6 +1396,7 @@ class TestFailureAssessments(TestCaseWithFactory):
         self.buildqueue = self.build.queueBuild()
         self.buildqueue.markAsBuilding(self.builder)
         self.worker = OkWorker()
+        self.setUpStats()
 
     def _recover_failure(self, fail_notes, retry=True):
         # Helper for recover_failure boilerplate.
@@ -1409,6 +1410,28 @@ class TestFailureAssessments(TestCaseWithFactory):
         )
         return logger.getLogBuffer()
 
+    def assert_statsd_metrics_requeue(self):
+        build = removeSecurityProxy(self.build)
+        self.assertEqual(2, self.stats_client.incr.call_count)
+        self.stats_client.incr.assert_has_calls(
+            [
+                mock.call(
+                    "builders.job_reset,arch={},build=True,builder_name={},"
+                    "env=test,virtualized=True".format(
+                        build.processor.name,
+                        self.builder.name,
+                    )
+                ),
+                mock.call(
+                    "build.reset,arch={},builder_name={},env=test,"
+                    "job_type=RECIPEBRANCHBUILD,virtualized=True".format(
+                        build.processor.name,
+                        self.builder.name,
+                    )
+                ),
+            ]
+        )
+
     def test_job_reset_threshold_with_retry(self):
         naked_build = removeSecurityProxy(self.build)
         self.builder.failure_count = JOB_RESET_THRESHOLD - 1
@@ -1426,6 +1449,7 @@ class TestFailureAssessments(TestCaseWithFactory):
         self.assertIn("Requeueing job", log)
         self.assertIs(None, self.builder.currentjob)
         self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
+        self.assert_statsd_metrics_requeue()
 
     def test_job_reset_threshold_no_retry(self):
         naked_build = removeSecurityProxy(self.build)
@@ -1436,6 +1460,7 @@ class TestFailureAssessments(TestCaseWithFactory):
         self.assertIn("Requeueing job", log)
         self.assertIs(None, self.builder.currentjob)
         self.assertEqual(self.build.status, BuildStatus.NEEDSBUILD)
+        self.assert_statsd_metrics_requeue()
 
     def test_reset_during_cancellation_cancels(self):
         self.buildqueue.cancel()
@@ -1449,11 +1474,32 @@ class TestFailureAssessments(TestCaseWithFactory):
         self.assertIn("Cancelling job", log)
         self.assertIs(None, self.builder.currentjob)
         self.assertEqual(BuildStatus.CANCELLED, self.build.status)
+        self.assertEqual(2, self.stats_client.incr.call_count)
+        self.stats_client.incr.assert_has_calls(
+            [
+                mock.call(
+                    "builders.job_cancelled,arch={},build=True,"
+                    "builder_name={},env=test,virtualized=True".format(
+                        naked_build.processor.name,
+                        self.builder.name,
+                    )
+                ),
+                mock.call(
+                    "build.finished,arch={},builder_name={},env=test,"
+                    "job_type=RECIPEBRANCHBUILD,status=CANCELLED,"
+                    "virtualized=True".format(
+                        naked_build.processor.name,
+                        self.builder.name,
+                    )
+                ),
+            ]
+        )
 
     def test_job_failing_more_than_builder_fails_job(self):
         self.build.gotFailure()
         self.build.gotFailure()
         self.builder.gotFailure()
+        naked_build = removeSecurityProxy(self.build)
 
         log = self._recover_failure("failnotes")
         self.assertIn("Failing job", log)
@@ -1461,6 +1507,33 @@ class TestFailureAssessments(TestCaseWithFactory):
         self.assertIs(None, self.builder.currentjob)
         self.assertEqual(self.build.status, BuildStatus.FAILEDTOBUILD)
         self.assertEqual(0, self.builder.failure_count)
+        self.assertEqual(3, self.stats_client.incr.call_count)
+        self.stats_client.incr.assert_has_calls(
+            [
+                mock.call(
+                    "build.finished,arch={},builder_name={},env=test,"
+                    "job_type=RECIPEBRANCHBUILD,status=FAILEDTOBUILD,"
+                    "virtualized=True".format(
+                        naked_build.processor.name,
+                        self.builder.name,
+                    )
+                ),
+                mock.call(
+                    "builders.job_failed,arch={},build=True,"
+                    "builder_name={},env=test,virtualized=True".format(
+                        naked_build.processor.name,
+                        self.builder.name,
+                    )
+                ),
+                mock.call(
+                    "builders.job_bad,arch={},build=True,builder_name={},"
+                    "env=test,virtualized=True".format(
+                        naked_build.processor.name,
+                        self.builder.name,
+                    )
+                ),
+            ]
+        )
 
     def test_bad_job_does_not_unsucceed(self):
         # If a FULLYBUILT build somehow ends up back in buildd-manager,
@@ -1473,6 +1546,7 @@ class TestFailureAssessments(TestCaseWithFactory):
         self.build.gotFailure()
         self.build.gotFailure()
         self.builder.gotFailure()
+        naked_build = removeSecurityProxy(self.build)
 
         log = self._recover_failure("failnotes")
         self.assertIn("Failing job", log)
@@ -1481,6 +1555,33 @@ class TestFailureAssessments(TestCaseWithFactory):
         self.assertIs(None, self.builder.currentjob)
         self.assertEqual(self.build.status, BuildStatus.FULLYBUILT)
         self.assertEqual(0, self.builder.failure_count)
+        self.assertEqual(3, self.stats_client.incr.call_count)
+        self.stats_client.incr.assert_has_calls(
+            [
+                mock.call(
+                    "build.finished,arch={},builder_name={},env=test,"
+                    "job_type=RECIPEBRANCHBUILD,status=FULLYBUILT,"
+                    "virtualized=True".format(
+                        naked_build.processor.name,
+                        self.builder.name,
+                    )
+                ),
+                mock.call(
+                    "builders.job_failed,arch={},build=True,"
+                    "builder_name={},env=test,virtualized=True".format(
+                        naked_build.processor.name,
+                        self.builder.name,
+                    )
+                ),
+                mock.call(
+                    "builders.job_bad,arch={},build=True,builder_name={},"
+                    "env=test,virtualized=True".format(
+                        naked_build.processor.name,
+                        self.builder.name,
+                    )
+                ),
+            ]
+        )
 
     def test_failure_during_cancellation_cancels(self):
         self.buildqueue.cancel()
@@ -1489,11 +1590,40 @@ class TestFailureAssessments(TestCaseWithFactory):
         self.build.gotFailure()
         self.build.gotFailure()
         self.builder.gotFailure()
+        naked_build = removeSecurityProxy(self.build)
         log = self._recover_failure("failnotes")
+
         self.assertIn("Cancelling job", log)
         self.assertIn("Resetting failure count of builder", log)
         self.assertIs(None, self.builder.currentjob)
         self.assertEqual(BuildStatus.CANCELLED, self.build.status)
+        self.assertEqual(3, self.stats_client.incr.call_count)
+        self.stats_client.incr.assert_has_calls(
+            [
+                mock.call(
+                    "builders.job_cancelled,arch={},build=True,"
+                    "builder_name={},env=test,virtualized=True".format(
+                        naked_build.processor.name,
+                        self.builder.name,
+                    )
+                ),
+                mock.call(
+                    "build.finished,arch={},builder_name={},env=test,"
+                    "job_type=RECIPEBRANCHBUILD,status=CANCELLED,"
+                    "virtualized=True".format(
+                        naked_build.processor.name,
+                        self.builder.name,
+                    )
+                ),
+                mock.call(
+                    "builders.job_bad,arch={},build=True,"
+                    "builder_name={},env=test,virtualized=True".format(
+                        naked_build.processor.name,
+                        self.builder.name,
+                    )
+                ),
+            ]
+        )
 
     def test_bad_builder(self):
         self.builder.setCleanStatus(BuilderCleanStatus.CLEAN)