← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] ~cjwatson/launchpad:builder-metrics-by-region into launchpad:master

 

Colin Watson has proposed merging ~cjwatson/launchpad:builder-metrics-by-region into launchpad:master.

Commit message:
Add region to builder metrics

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~cjwatson/launchpad/+git/launchpad/+merge/444189

Launchpad builders don't really have a built-in notion of "region" (i.e. a collection of builders that are on common infrastructure and that are all managed as a group).  However, in practice, we always create builders with names that allow us to infer a region from their names: for instance, `lcy02-amd64-{001..120}` currently form a region.

Add the region to builder metrics where appropriate, so that we can write alerts of the form "no builder in a given region has been in the idle or building state for the last hour", which would indicate a problem with that region's builder reset handling.
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of ~cjwatson/launchpad:builder-metrics-by-region into launchpad:master.
diff --git a/lib/lp/buildmaster/interactor.py b/lib/lp/buildmaster/interactor.py
index 174411c..1ba8b7b 100644
--- a/lib/lp/buildmaster/interactor.py
+++ b/lib/lp/buildmaster/interactor.py
@@ -375,6 +375,7 @@ BuilderVitals = namedtuple(
         "clean_status",
         "active",
         "failure_count",
+        "region",
     ),
 )
 
@@ -400,6 +401,7 @@ def extract_vitals_from_db(builder, build_queue=_BQ_UNSPECIFIED):
         builder.clean_status,
         builder.active,
         builder.failure_count,
+        builder.region,
     )
 
 
diff --git a/lib/lp/buildmaster/interfaces/builder.py b/lib/lp/buildmaster/interfaces/builder.py
index 9225325..afca0b3 100644
--- a/lib/lp/buildmaster/interfaces/builder.py
+++ b/lib/lp/buildmaster/interfaces/builder.py
@@ -311,6 +311,16 @@ class IBuilderView(IHasBuildRecords, IHasOwner):
         )
     )
 
+    region = TextLine(
+        title=_("Region"),
+        required=True,
+        description=_(
+            "The builder's region.  This is derived purely based on its name "
+            "(e.g. lcy02-amd64-001 is in the 'lcy02-amd64' region), and is "
+            "used for metrics."
+        ),
+    )
+
     def gotFailure():
         """Increment failure_count on the builder."""
 
diff --git a/lib/lp/buildmaster/manager.py b/lib/lp/buildmaster/manager.py
index 67d564a..17c90a6 100644
--- a/lib/lp/buildmaster/manager.py
+++ b/lib/lp/buildmaster/manager.py
@@ -401,6 +401,7 @@ def recover_failure(logger, vitals, builder, retry, exception):
         labels = {
             "build": True,
             "arch": job.specific_build.processor.name,
+            "region": builder.region,
             "builder_name": builder.name,
             "virtualized": str(builder.virtualized),
             "job_type": job.specific_build.job_type.name,
@@ -596,6 +597,7 @@ class WorkerScanner:
                     {
                         "build": True,
                         "arch": builder.current_build.processor.name,
+                        "region": builder.region,
                     }
                 )
             else:
diff --git a/lib/lp/buildmaster/model/builder.py b/lib/lp/buildmaster/model/builder.py
index 9de4908..d03f10a 100644
--- a/lib/lp/buildmaster/model/builder.py
+++ b/lib/lp/buildmaster/model/builder.py
@@ -7,6 +7,7 @@ __all__ = [
     "BuilderSet",
 ]
 
+import re
 from datetime import timezone
 
 from storm.expr import Coalesce, Count, Sum
@@ -43,6 +44,8 @@ from lp.services.propertycache import cachedproperty, get_property_cache
 from lp.soyuz.interfaces.binarypackagebuild import IBinaryPackageBuildSet
 from lp.soyuz.interfaces.buildrecords import IHasBuildRecords
 
+region_re = re.compile(r"(^[a-z0-9][a-z0-9+.-]+)-\d+$")
+
 
 @implementer(IBuilder, IHasBuildRecords)
 class Builder(StormBase):
@@ -234,6 +237,11 @@ class Builder(StormBase):
                 self, status=build_state, user=user
             )
 
+    @property
+    def region(self):
+        region_match = region_re.match(self.name)
+        return region_match.group(1) if region_match is not None else ""
+
 
 class BuilderProcessor(StormBase):
     __storm_table__ = "BuilderProcessor"
diff --git a/lib/lp/buildmaster/model/buildfarmjob.py b/lib/lp/buildmaster/model/buildfarmjob.py
index 5549891..e69651f 100644
--- a/lib/lp/buildmaster/model/buildfarmjob.py
+++ b/lib/lp/buildmaster/model/buildfarmjob.py
@@ -197,6 +197,7 @@ class BuildFarmJobMixin:
                 {
                     "builder_name": self.builder.name,
                     "virtualized": str(self.builder.virtualized),
+                    "region": self.builder.region,
                 }
             )
         labels.update(extra)
diff --git a/lib/lp/buildmaster/model/buildfarmjobbehaviour.py b/lib/lp/buildmaster/model/buildfarmjobbehaviour.py
index f4218df..450e375 100644
--- a/lib/lp/buildmaster/model/buildfarmjobbehaviour.py
+++ b/lib/lp/buildmaster/model/buildfarmjobbehaviour.py
@@ -197,6 +197,7 @@ class BuildFarmJobBehaviourBase:
             labels={
                 "job_type": job_type_name,
                 "builder_name": self._builder.name,
+                "region": self._builder.region,
             },
         )
 
diff --git a/lib/lp/buildmaster/tests/mock_workers.py b/lib/lp/buildmaster/tests/mock_workers.py
index 1840d4b..a880a9a 100644
--- a/lib/lp/buildmaster/tests/mock_workers.py
+++ b/lib/lp/buildmaster/tests/mock_workers.py
@@ -38,6 +38,7 @@ from txfixtures.tachandler import TacTestFixture
 from lp.buildmaster.enums import BuilderCleanStatus, BuilderResetProtocol
 from lp.buildmaster.interactor import BuilderWorker
 from lp.buildmaster.interfaces.builder import CannotFetchFile
+from lp.buildmaster.model.builder import region_re
 from lp.services.config import config
 from lp.services.daemons.tachandler import twistd_script
 from lp.services.webapp import urlappend
@@ -95,6 +96,11 @@ class MockBuilder:
         self.builderok = False
         self.failnotes = reason
 
+    @property
+    def region(self):
+        region_match = region_re.match(self.name)
+        return region_match.group(1) if region_match is not None else ""
+
 
 # XXX: It would be *really* nice to run some set of tests against the real
 # BuilderWorker and this one to prevent interface skew.
diff --git a/lib/lp/buildmaster/tests/test_builder.py b/lib/lp/buildmaster/tests/test_builder.py
index 39da0c8..f122c43 100644
--- a/lib/lp/buildmaster/tests/test_builder.py
+++ b/lib/lp/buildmaster/tests/test_builder.py
@@ -83,6 +83,14 @@ class TestBuilder(TestCaseWithFactory):
         self.assertEqual(proc, builder.processor)
         self.assertEqual([proc], builder.processors)
 
+    def test_region_empty(self):
+        builder = self.factory.makeBuilder(name="some-builder-name")
+        self.assertEqual("", builder.region)
+
+    def test_region(self):
+        builder = self.factory.makeBuilder(name="some-region-001")
+        self.assertEqual("some-region", builder.region)
+
 
 # XXX cjwatson 2020-05-18: All these tests would now make more sense in
 # lp.buildmaster.tests.test_buildqueue, and should be moved there when
diff --git a/lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py b/lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py
index 924b131..a164b11 100644
--- a/lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py
+++ b/lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py
@@ -326,11 +326,11 @@ class TestDispatchBuildToWorker(StatsMixin, TestCase):
         yield behaviour.dispatchBuildToWorker(logger)
         self.assertEqual(1, self.stats_client.incr.call_count)
         self.assertEqual(
-            self.stats_client.incr.call_args_list[0][0],
             (
                 "build.count,builder_name=mock-builder,env=test,"
-                "job_type=UNKNOWN",
+                "job_type=UNKNOWN,region=",
             ),
+            self.stats_client.incr.call_args_list[0][0],
         )
 
 
diff --git a/lib/lp/buildmaster/tests/test_manager.py b/lib/lp/buildmaster/tests/test_manager.py
index dfd7530..a0edb84 100644
--- a/lib/lp/buildmaster/tests/test_manager.py
+++ b/lib/lp/buildmaster/tests/test_manager.py
@@ -1417,17 +1417,20 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
             [
                 mock.call(
                     "builders.job_reset,arch={},build=True,builder_name={},"
-                    "env=test,job_type=RECIPEBRANCHBUILD,"
+                    "env=test,job_type=RECIPEBRANCHBUILD,region={},"
                     "virtualized=True".format(
                         build.processor.name,
                         self.builder.name,
+                        self.builder.region,
                     )
                 ),
                 mock.call(
                     "build.reset,arch={},builder_name={},env=test,"
-                    "job_type=RECIPEBRANCHBUILD,virtualized=True".format(
+                    "job_type=RECIPEBRANCHBUILD,region={},"
+                    "virtualized=True".format(
                         build.processor.name,
                         self.builder.name,
+                        self.builder.region,
                     )
                 ),
             ]
@@ -1481,15 +1484,15 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
                 mock.call(
                     "builders.job_cancelled,arch={},build=True,"
                     "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
-                    "virtualized=True".format(
+                    "region=builder-name,virtualized=True".format(
                         naked_build.processor.name,
                         self.builder.name,
                     )
                 ),
                 mock.call(
                     "build.finished,arch={},builder_name={},env=test,"
-                    "job_type=RECIPEBRANCHBUILD,status=CANCELLED,"
-                    "virtualized=True".format(
+                    "job_type=RECIPEBRANCHBUILD,region=builder-name,"
+                    "status=CANCELLED,virtualized=True".format(
                         naked_build.processor.name,
                         self.builder.name,
                     )
@@ -1514,15 +1517,15 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
             [
                 mock.call(
                     "build.finished,arch={},builder_name={},env=test,"
-                    "job_type=RECIPEBRANCHBUILD,status=FAILEDTOBUILD,"
-                    "virtualized=True".format(
+                    "job_type=RECIPEBRANCHBUILD,region=builder-name,"
+                    "status=FAILEDTOBUILD,virtualized=True".format(
                         naked_build.processor.name,
                         self.builder.name,
                     )
                 ),
                 mock.call(
                     "builders.job_failed,arch={},build=True,builder_name={},"
-                    "env=test,job_type=RECIPEBRANCHBUILD,"
+                    "env=test,job_type=RECIPEBRANCHBUILD,region=builder-name,"
                     "virtualized=True".format(
                         naked_build.processor.name,
                         self.builder.name,
@@ -1556,15 +1559,15 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
             [
                 mock.call(
                     "build.finished,arch={},builder_name={},env=test,"
-                    "job_type=RECIPEBRANCHBUILD,status=FULLYBUILT,"
-                    "virtualized=True".format(
+                    "job_type=RECIPEBRANCHBUILD,region=builder-name,"
+                    "status=FULLYBUILT,virtualized=True".format(
                         naked_build.processor.name,
                         self.builder.name,
                     )
                 ),
                 mock.call(
                     "builders.job_failed,arch={},build=True,builder_name={},"
-                    "env=test,job_type=RECIPEBRANCHBUILD,"
+                    "env=test,job_type=RECIPEBRANCHBUILD,region=builder-name,"
                     "virtualized=True".format(
                         naked_build.processor.name,
                         self.builder.name,
@@ -1592,15 +1595,15 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
                 mock.call(
                     "builders.job_cancelled,arch={},build=True,"
                     "builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
-                    "virtualized=True".format(
+                    "region=builder-name,virtualized=True".format(
                         naked_build.processor.name,
                         self.builder.name,
                     )
                 ),
                 mock.call(
                     "build.finished,arch={},builder_name={},env=test,"
-                    "job_type=RECIPEBRANCHBUILD,status=CANCELLED,"
-                    "virtualized=True".format(
+                    "job_type=RECIPEBRANCHBUILD,region=builder-name,"
+                    "status=CANCELLED,virtualized=True".format(
                         naked_build.processor.name,
                         self.builder.name,
                     )
diff --git a/lib/lp/charms/tests/test_charmrecipebuildbehaviour.py b/lib/lp/charms/tests/test_charmrecipebuildbehaviour.py
index 10e676d..2851a9f 100644
--- a/lib/lp/charms/tests/test_charmrecipebuildbehaviour.py
+++ b/lib/lp/charms/tests/test_charmrecipebuildbehaviour.py
@@ -516,7 +516,9 @@ class TestAsyncCharmRecipeBuildBehaviour(
             self.stats_client.incr.call_args_list[0][0],
             (
                 "build.count,builder_name={},env=test,"
-                "job_type=CHARMRECIPEBUILD".format(builder.name),
+                "job_type=CHARMRECIPEBUILD,region={}".format(
+                    builder.name, builder.region
+                ),
             ),
         )
 
diff --git a/lib/lp/code/model/tests/test_cibuildbehaviour.py b/lib/lp/code/model/tests/test_cibuildbehaviour.py
index 4d6221f..578c38e 100644
--- a/lib/lp/code/model/tests/test_cibuildbehaviour.py
+++ b/lib/lp/code/model/tests/test_cibuildbehaviour.py
@@ -710,7 +710,9 @@ class TestAsyncCIBuildBehaviour(StatsMixin, TestCIBuildBehaviourBase):
             self.stats_client.incr.call_args_list[0][0],
             (
                 "build.count,builder_name={},env=test,"
-                "job_type=CIBUILD".format(builder.name),
+                "job_type=CIBUILD,region={}".format(
+                    builder.name, builder.region
+                ),
             ),
         )
 
diff --git a/lib/lp/oci/tests/test_ocirecipebuildbehaviour.py b/lib/lp/oci/tests/test_ocirecipebuildbehaviour.py
index 8e2c214..90ba1b7 100644
--- a/lib/lp/oci/tests/test_ocirecipebuildbehaviour.py
+++ b/lib/lp/oci/tests/test_ocirecipebuildbehaviour.py
@@ -782,7 +782,9 @@ class TestAsyncOCIRecipeBuildBehaviour(
             self.stats_client.incr.call_args_list[0][0],
             (
                 "build.count,builder_name={},env=test,"
-                "job_type=OCIRECIPEBUILD".format(builder.name),
+                "job_type=OCIRECIPEBUILD,region={}".format(
+                    builder.name, builder.region
+                ),
             ),
         )
 
diff --git a/lib/lp/services/statsd/numbercruncher.py b/lib/lp/services/statsd/numbercruncher.py
index bd8cd3b..b4666cd 100644
--- a/lib/lp/services/statsd/numbercruncher.py
+++ b/lib/lp/services/statsd/numbercruncher.py
@@ -111,7 +111,7 @@ class NumberCruncher(service.Service):
                 continue
             for processor_name in builder.processor_names:
                 counts = counts_by_processor.setdefault(
-                    (processor_name, builder.virtualized),
+                    (processor_name, builder.virtualized, builder.region),
                     {"cleaning": 0, "idle": 0, "disabled": 0, "building": 0},
                 )
                 if not builder.builderok:
@@ -127,7 +127,11 @@ class NumberCruncher(service.Service):
                 builder.failure_count,
                 labels={"builder_name": builder.name},
             )
-        for (processor, virtualized), counts in counts_by_processor.items():
+        for (
+            processor,
+            virtualized,
+            region,
+        ), counts in counts_by_processor.items():
             for count_name, count_value in counts.items():
                 self._sendGauge(
                     "builders",
@@ -135,6 +139,7 @@ class NumberCruncher(service.Service):
                     labels={
                         "status": count_name,
                         "arch": processor,
+                        "region": region,
                         "virtualized": virtualized,
                     },
                 )
diff --git a/lib/lp/services/statsd/tests/test_numbercruncher.py b/lib/lp/services/statsd/tests/test_numbercruncher.py
index a7d4dd7..9db157f 100644
--- a/lib/lp/services/statsd/tests/test_numbercruncher.py
+++ b/lib/lp/services/statsd/tests/test_numbercruncher.py
@@ -59,8 +59,8 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory):
         ]
         expected_gauges.extend(
             [
-                "builders,arch=386,env=test,status=%s,virtualized=True"
-                % status
+                "builders,arch=386,env=test,region=builder-name,status=%s,"
+                "virtualized=True" % status
                 for status in ("building", "cleaning", "disabled", "idle")
             ]
         )
@@ -74,16 +74,21 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory):
     def test_multiple_processor_counts(self):
         builders = [
             self.factory.makeBuilder(
+                name=self.factory.getUniqueUnicode(region),
                 processors=[
                     getUtility(IProcessorSet).getByName(processor_name)
                 ],
                 virtualized=virtualized,
             )
-            for processor_name, virtualized in (
-                ("386", True),
-                ("386", False),
-                ("amd64", True),
-                ("amd64", False),
+            for processor_name, virtualized, region in (
+                ("386", True, "test1"),
+                ("386", True, "test2"),
+                ("386", False, "test1"),
+                ("386", False, "test2"),
+                ("amd64", True, "test1"),
+                ("amd64", True, "test2"),
+                ("amd64", False, "test1"),
+                ("amd64", False, "test2"),
             )
         ]
         for builder in builders:
@@ -101,10 +106,11 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory):
         ]
         expected_gauges.extend(
             [
-                "builders,arch=%s,env=test,status=%s,virtualized=%s"
-                % (arch, status, virtualized)
+                "builders,arch=%s,env=test,region=%s,status=%s,virtualized=%s"
+                % (arch, region, status, virtualized)
                 for arch in ("386", "amd64")
                 for virtualized in (True, False)
+                for region in ("test1", "test2")
                 for status in ("building", "cleaning", "disabled", "idle")
             ]
         )
@@ -182,7 +188,7 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory):
         )
         expected_gauges.update(
             {
-                "builders,arch=amd64,env=test,status=%s,"
+                "builders,arch=amd64,env=test,region=builder-name,status=%s,"
                 "virtualized=True" % status: count
                 for status, count in (
                     ("building", 2),
diff --git a/lib/lp/snappy/tests/test_snapbuildbehaviour.py b/lib/lp/snappy/tests/test_snapbuildbehaviour.py
index ef4e269..28ffc4b 100644
--- a/lib/lp/snappy/tests/test_snapbuildbehaviour.py
+++ b/lib/lp/snappy/tests/test_snapbuildbehaviour.py
@@ -1208,7 +1208,9 @@ class TestAsyncSnapBuildBehaviour(StatsMixin, TestSnapBuildBehaviourBase):
             self.stats_client.incr.call_args_list[0][0],
             (
                 "build.count,builder_name={},env=test,"
-                "job_type=SNAPBUILD".format(builder.name),
+                "job_type=SNAPBUILD,region={}".format(
+                    builder.name, builder.region
+                ),
             ),
         )
 
diff --git a/lib/lp/soyuz/tests/test_binarypackagebuildbehaviour.py b/lib/lp/soyuz/tests/test_binarypackagebuildbehaviour.py
index 073b387..ee8278e 100644
--- a/lib/lp/soyuz/tests/test_binarypackagebuildbehaviour.py
+++ b/lib/lp/soyuz/tests/test_binarypackagebuildbehaviour.py
@@ -235,7 +235,9 @@ class TestBinaryBuildPackageBehaviour(StatsMixin, TestCaseWithFactory):
             self.stats_client.incr.call_args_list[0][0],
             (
                 "build.count,builder_name={},env=test,"
-                "job_type=PACKAGEBUILD".format(builder.name),
+                "job_type=PACKAGEBUILD,region={}".format(
+                    builder.name, builder.region
+                ),
             ),
         )
 
@@ -318,7 +320,9 @@ class TestBinaryBuildPackageBehaviour(StatsMixin, TestCaseWithFactory):
             self.stats_client.incr.call_args_list[0][0],
             (
                 "build.count,builder_name={},env=test,"
-                "job_type=PACKAGEBUILD".format(builder.name),
+                "job_type=PACKAGEBUILD,region={}".format(
+                    builder.name, builder.region
+                ),
             ),
         )