← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] ~wgrant/launchpad:buildd-manager-failure-metrics into launchpad:master

 

William Grant has proposed merging ~wgrant/launchpad:buildd-manager-failure-metrics into launchpad:master with ~wgrant/launchpad:buildd-manager-failure-refactor as a prerequisite.

Commit message:
Tweak buildd-manager failure handling metrics

They all now start with "builders.failure", labels are more consistent,
and builder failure/reset are counted.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~wgrant/launchpad/+git/launchpad/+merge/454693
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of ~wgrant/launchpad:buildd-manager-failure-metrics into launchpad:master.
diff --git a/lib/lp/buildmaster/manager.py b/lib/lp/buildmaster/manager.py
index 0c6d335..2bc0b81 100644
--- a/lib/lp/buildmaster/manager.py
+++ b/lib/lp/buildmaster/manager.py
@@ -312,6 +312,23 @@ class PrefetchedBuilderFactory(BaseBuilderFactory):
         return self.candidates.pop(vitals)
 
 
+def get_statsd_labels(builder, build):
+    labels = {
+        "builder_name": builder.name,
+        "region": builder.region,
+        "virtualized": str(builder.virtualized),
+    }
+    if build is not None:
+        labels.update(
+            {
+                "build": True,
+                "arch": build.processor.name,
+                "job_type": build.job_type.name,
+            }
+        )
+    return labels
+
+
 def judge_failure(builder_count, job_count, exc, retry=True):
     """Judge how to recover from a scan failure.
 
@@ -394,22 +411,15 @@ def recover_failure(logger, vitals, builder, retry, exception):
             job_action,
         )
 
-    if job is not None and job_action is not None:
-        statsd_client = getUtility(IStatsdClient)
-        labels = {
-            "build": True,
-            "arch": job.specific_build.processor.name,
-            "region": builder.region,
-            "builder_name": builder.name,
-            "virtualized": str(builder.virtualized),
-            "job_type": job.specific_build.job_type.name,
-        }
+    statsd_client = getUtility(IStatsdClient)
+    labels = get_statsd_labels(builder, job.specific_build if job else None)
 
+    if job is not None and job_action is not None:
         if cancelling:
             # We've previously been asked to cancel the job, so just set
             # it to cancelled rather than retrying or failing.
             logger.info("Cancelling job %s.", job.build_cookie)
-            statsd_client.incr("builders.job_cancelled", labels=labels)
+            statsd_client.incr("builders.failure.job_cancelled", labels=labels)
             job.markAsCancelled()
         elif job_action == False:
             # Fail and dequeue the job.
@@ -430,12 +440,12 @@ def recover_failure(logger, vitals, builder, retry, exception):
                 job.specific_build.updateStatus(
                     BuildStatus.FAILEDTOBUILD, force_invalid_transition=True
                 )
-            statsd_client.incr("builders.job_failed", labels=labels)
+            statsd_client.incr("builders.failure.job_failed", labels=labels)
             job.destroySelf()
         elif job_action == True:
             # Reset the job so it will be retried elsewhere.
             logger.info("Requeueing job %s.", job.build_cookie)
-            statsd_client.incr("builders.job_reset", labels=labels)
+            statsd_client.incr("builders.failure.job_reset", labels=labels)
             job.reset()
 
         if job_action == False:
@@ -447,12 +457,14 @@ def recover_failure(logger, vitals, builder, retry, exception):
         # We've already tried resetting it enough times, so we have
         # little choice but to give up.
         logger.info("Failing builder %s.", builder.name)
+        statsd_client.incr("builders.failure.builder_failed", labels=labels)
         builder.failBuilder(str(exception))
     elif builder_action == True:
         # Dirty the builder to attempt recovery. In the virtual case,
         # the dirty idleness will cause a reset, giving us a good chance
         # of recovery.
         logger.info("Dirtying builder %s to attempt recovery.", builder.name)
+        statsd_client.incr("builders.failure.builder_reset", labels=labels)
         builder.setCleanStatus(BuilderCleanStatus.DIRTY)
 
 
@@ -583,20 +595,13 @@ class WorkerScanner:
         vitals = self.builder_factory.getVitals(self.builder_name)
         builder = self.builder_factory[self.builder_name]
         try:
+            labels = get_statsd_labels(builder, builder.current_build)
+            self.statsd_client.incr("builders.failure", labels=labels)
+
             builder.gotFailure()
-            labels = {}
             if builder.current_build is not None:
                 builder.current_build.gotFailure()
-                labels.update(
-                    {
-                        "build": True,
-                        "arch": builder.current_build.processor.name,
-                        "region": builder.region,
-                    }
-                )
-            else:
-                labels["build"] = False
-            self.statsd_client.incr("builders.judged_failed", labels=labels)
+
             recover_failure(self.logger, vitals, builder, retry, exc)
             transaction.commit()
         except Exception:
diff --git a/lib/lp/buildmaster/tests/test_manager.py b/lib/lp/buildmaster/tests/test_manager.py
index b87e95c..01e6044 100644
--- a/lib/lp/buildmaster/tests/test_manager.py
+++ b/lib/lp/buildmaster/tests/test_manager.py
@@ -555,13 +555,20 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
         transaction.commit()
 
         yield scanner.singleCycle()
-        self.assertEqual(2, self.stats_client.incr.call_count)
+        self.assertEqual(3, self.stats_client.incr.call_count)
         self.stats_client.incr.assert_has_calls(
             [
                 mock.call(
                     "build.reset,arch=386,env=test,job_type=PACKAGEBUILD"
                 ),
-                mock.call("builders.judged_failed,build=False,env=test"),
+                mock.call(
+                    "builders.failure,builder_name=bob,env=test,"
+                    "region=,virtualized=False"
+                ),
+                mock.call(
+                    "builders.failure.builder_failed,builder_name=bob,"
+                    "env=test,region=,virtualized=False"
+                ),
             ]
         )
 
@@ -1411,9 +1418,9 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
         self.stats_client.incr.assert_has_calls(
             [
                 mock.call(
-                    "builders.job_reset,arch={},build=True,builder_name={},"
-                    "env=test,job_type=RECIPEBRANCHBUILD,region={},"
-                    "virtualized=True".format(
+                    "builders.failure.job_reset,arch={},build=True,"
+                    "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
+                    "region={},virtualized=True".format(
                         build.processor.name,
                         self.builder.name,
                         self.builder.region,
@@ -1477,7 +1484,7 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
         self.stats_client.incr.assert_has_calls(
             [
                 mock.call(
-                    "builders.job_cancelled,arch={},build=True,"
+                    "builders.failure.job_cancelled,arch={},build=True,"
                     "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
                     "region=builder-name,virtualized=True".format(
                         naked_build.processor.name,
@@ -1519,9 +1526,9 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
                     )
                 ),
                 mock.call(
-                    "builders.job_failed,arch={},build=True,builder_name={},"
-                    "env=test,job_type=RECIPEBRANCHBUILD,region=builder-name,"
-                    "virtualized=True".format(
+                    "builders.failure.job_failed,arch={},build=True,"
+                    "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
+                    "region=builder-name,virtualized=True".format(
                         naked_build.processor.name,
                         self.builder.name,
                     )
@@ -1561,9 +1568,9 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
                     )
                 ),
                 mock.call(
-                    "builders.job_failed,arch={},build=True,builder_name={},"
-                    "env=test,job_type=RECIPEBRANCHBUILD,region=builder-name,"
-                    "virtualized=True".format(
+                    "builders.failure.job_failed,arch={},build=True,"
+                    "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
+                    "region=builder-name,virtualized=True".format(
                         naked_build.processor.name,
                         self.builder.name,
                     )
@@ -1588,7 +1595,7 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
         self.stats_client.incr.assert_has_calls(
             [
                 mock.call(
-                    "builders.job_cancelled,arch={},build=True,"
+                    "builders.failure.job_cancelled,arch={},build=True,"
                     "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
                     "region=builder-name,virtualized=True".format(
                         naked_build.processor.name,
@@ -1621,6 +1628,36 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
         self.assertIs(None, self.build.builder)
         self.assertEqual(BuilderCleanStatus.DIRTY, self.builder.clean_status)
         self.assertTrue(self.builder.builderok)
+        self.assertEqual(3, self.stats_client.incr.call_count)
+        self.stats_client.incr.assert_has_calls(
+            [
+                mock.call(
+                    "builders.failure.job_reset,arch={},build=True,"
+                    "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
+                    "region=builder-name,virtualized=True".format(
+                        self.build.processor.name,
+                        self.builder.name,
+                    )
+                ),
+                mock.call(
+                    "build.reset,arch={},builder_name={},env=test,"
+                    "job_type=RECIPEBRANCHBUILD,region=builder-name,"
+                    "virtualized=True".format(
+                        self.build.processor.name,
+                        self.builder.name,
+                    )
+                ),
+                mock.call(
+                    "builders.failure.builder_reset,arch={},build=True,"
+                    "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
+                    "region=builder-name,virtualized=True".format(
+                        self.build.processor.name,
+                        self.builder.name,
+                    )
+                ),
+            ]
+        )
+        self.stats_client.incr.reset_mock()
 
         # But if the builder continues to cause trouble, it will be
         # disabled.
@@ -1634,16 +1671,59 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
         self.assertEqual(BuilderCleanStatus.DIRTY, self.builder.clean_status)
         self.assertFalse(self.builder.builderok)
         self.assertEqual("failnotes", self.builder.failnotes)
+        self.assertEqual(3, self.stats_client.incr.call_count)
+        self.stats_client.incr.assert_has_calls(
+            [
+                mock.call(
+                    "builders.failure.job_reset,arch={},build=True,"
+                    "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
+                    "region=builder-name,virtualized=True".format(
+                        self.build.processor.name,
+                        self.builder.name,
+                    )
+                ),
+                mock.call(
+                    "build.reset,arch={},builder_name={},env=test,"
+                    "job_type=RECIPEBRANCHBUILD,region=builder-name,"
+                    "virtualized=True".format(
+                        self.build.processor.name,
+                        self.builder.name,
+                    )
+                ),
+                mock.call(
+                    "builders.failure.builder_failed,arch={},build=True,"
+                    "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
+                    "region=builder-name,virtualized=True".format(
+                        self.build.processor.name,
+                        self.builder.name,
+                    )
+                ),
+            ]
+        )
+        self.stats_client.incr.reset_mock()
 
     def test_builder_failing_with_no_attached_job(self):
         self.buildqueue.reset()
         self.builder.failure_count = BUILDER_FAILURE_THRESHOLD
+        self.stats_client.incr.reset_mock()
 
         log = self._recover_failure("failnotes")
         self.assertIn("with no job", log)
         self.assertIn("Failing builder", log)
         self.assertFalse(self.builder.builderok)
         self.assertEqual("failnotes", self.builder.failnotes)
+        self.assertEqual(1, self.stats_client.incr.call_count)
+        self.stats_client.incr.assert_has_calls(
+            [
+                mock.call(
+                    "builders.failure.builder_failed,builder_name={},"
+                    "env=test,region=builder-name,virtualized=True".format(
+                        self.builder.name,
+                    )
+                ),
+            ]
+        )
+        self.stats_client.incr.reset_mock()
 
 
 class TestNewBuilders(TestCase):

Follow ups