launchpad-reviewers team mailing list archive
-
launchpad-reviewers team
-
Mailing list archive
-
Message #30063
[Merge] ~cjwatson/launchpad:builder-metrics-by-region into launchpad:master
Colin Watson has proposed merging ~cjwatson/launchpad:builder-metrics-by-region into launchpad:master.
Commit message:
Add region to builder metrics
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
For more details, see:
https://code.launchpad.net/~cjwatson/launchpad/+git/launchpad/+merge/444189
Launchpad builders don't really have a built-in notion of "region" (i.e. a collection of builders that are on common infrastructure and that are all managed as a group). However, in practice, we always create builders with names that allow us to infer a region from their names: for instance, `lcy02-amd64-{001..120}` currently form a region.
Add the region to builder metrics where appropriate, so that we can write alerts of the form "no builder in a given region has been in the idle or building state for the last hour", which would indicate a problem with that region's builder reset handling.
--
Your team Launchpad code reviewers is requested to review the proposed merge of ~cjwatson/launchpad:builder-metrics-by-region into launchpad:master.
diff --git a/lib/lp/buildmaster/interactor.py b/lib/lp/buildmaster/interactor.py
index 174411c..1ba8b7b 100644
--- a/lib/lp/buildmaster/interactor.py
+++ b/lib/lp/buildmaster/interactor.py
@@ -375,6 +375,7 @@ BuilderVitals = namedtuple(
"clean_status",
"active",
"failure_count",
+ "region",
),
)
@@ -400,6 +401,7 @@ def extract_vitals_from_db(builder, build_queue=_BQ_UNSPECIFIED):
builder.clean_status,
builder.active,
builder.failure_count,
+ builder.region,
)
diff --git a/lib/lp/buildmaster/interfaces/builder.py b/lib/lp/buildmaster/interfaces/builder.py
index 9225325..afca0b3 100644
--- a/lib/lp/buildmaster/interfaces/builder.py
+++ b/lib/lp/buildmaster/interfaces/builder.py
@@ -311,6 +311,16 @@ class IBuilderView(IHasBuildRecords, IHasOwner):
)
)
+ region = TextLine(
+ title=_("Region"),
+ required=True,
+ description=_(
+ "The builder's region. This is derived purely based on its name "
+ "(e.g. lcy02-amd64-001 is in the 'lcy02-amd64' region), and is "
+ "used for metrics."
+ ),
+ )
+
def gotFailure():
"""Increment failure_count on the builder."""
diff --git a/lib/lp/buildmaster/manager.py b/lib/lp/buildmaster/manager.py
index 67d564a..17c90a6 100644
--- a/lib/lp/buildmaster/manager.py
+++ b/lib/lp/buildmaster/manager.py
@@ -401,6 +401,7 @@ def recover_failure(logger, vitals, builder, retry, exception):
labels = {
"build": True,
"arch": job.specific_build.processor.name,
+ "region": builder.region,
"builder_name": builder.name,
"virtualized": str(builder.virtualized),
"job_type": job.specific_build.job_type.name,
@@ -596,6 +597,7 @@ class WorkerScanner:
{
"build": True,
"arch": builder.current_build.processor.name,
+ "region": builder.region,
}
)
else:
diff --git a/lib/lp/buildmaster/model/builder.py b/lib/lp/buildmaster/model/builder.py
index 9de4908..d03f10a 100644
--- a/lib/lp/buildmaster/model/builder.py
+++ b/lib/lp/buildmaster/model/builder.py
@@ -7,6 +7,7 @@ __all__ = [
"BuilderSet",
]
+import re
from datetime import timezone
from storm.expr import Coalesce, Count, Sum
@@ -43,6 +44,8 @@ from lp.services.propertycache import cachedproperty, get_property_cache
from lp.soyuz.interfaces.binarypackagebuild import IBinaryPackageBuildSet
from lp.soyuz.interfaces.buildrecords import IHasBuildRecords
+region_re = re.compile(r"(^[a-z0-9][a-z0-9+.-]+)-\d+$")
+
@implementer(IBuilder, IHasBuildRecords)
class Builder(StormBase):
@@ -234,6 +237,11 @@ class Builder(StormBase):
self, status=build_state, user=user
)
+ @property
+ def region(self):
+ region_match = region_re.match(self.name)
+ return region_match.group(1) if region_match is not None else ""
+
class BuilderProcessor(StormBase):
__storm_table__ = "BuilderProcessor"
diff --git a/lib/lp/buildmaster/model/buildfarmjob.py b/lib/lp/buildmaster/model/buildfarmjob.py
index 5549891..e69651f 100644
--- a/lib/lp/buildmaster/model/buildfarmjob.py
+++ b/lib/lp/buildmaster/model/buildfarmjob.py
@@ -197,6 +197,7 @@ class BuildFarmJobMixin:
{
"builder_name": self.builder.name,
"virtualized": str(self.builder.virtualized),
+ "region": self.builder.region,
}
)
labels.update(extra)
diff --git a/lib/lp/buildmaster/model/buildfarmjobbehaviour.py b/lib/lp/buildmaster/model/buildfarmjobbehaviour.py
index f4218df..450e375 100644
--- a/lib/lp/buildmaster/model/buildfarmjobbehaviour.py
+++ b/lib/lp/buildmaster/model/buildfarmjobbehaviour.py
@@ -197,6 +197,7 @@ class BuildFarmJobBehaviourBase:
labels={
"job_type": job_type_name,
"builder_name": self._builder.name,
+ "region": self._builder.region,
},
)
diff --git a/lib/lp/buildmaster/tests/mock_workers.py b/lib/lp/buildmaster/tests/mock_workers.py
index 1840d4b..a880a9a 100644
--- a/lib/lp/buildmaster/tests/mock_workers.py
+++ b/lib/lp/buildmaster/tests/mock_workers.py
@@ -38,6 +38,7 @@ from txfixtures.tachandler import TacTestFixture
from lp.buildmaster.enums import BuilderCleanStatus, BuilderResetProtocol
from lp.buildmaster.interactor import BuilderWorker
from lp.buildmaster.interfaces.builder import CannotFetchFile
+from lp.buildmaster.model.builder import region_re
from lp.services.config import config
from lp.services.daemons.tachandler import twistd_script
from lp.services.webapp import urlappend
@@ -95,6 +96,11 @@ class MockBuilder:
self.builderok = False
self.failnotes = reason
+ @property
+ def region(self):
+ region_match = region_re.match(self.name)
+ return region_match.group(1) if region_match is not None else ""
+
# XXX: It would be *really* nice to run some set of tests against the real
# BuilderWorker and this one to prevent interface skew.
diff --git a/lib/lp/buildmaster/tests/test_builder.py b/lib/lp/buildmaster/tests/test_builder.py
index 39da0c8..f122c43 100644
--- a/lib/lp/buildmaster/tests/test_builder.py
+++ b/lib/lp/buildmaster/tests/test_builder.py
@@ -83,6 +83,14 @@ class TestBuilder(TestCaseWithFactory):
self.assertEqual(proc, builder.processor)
self.assertEqual([proc], builder.processors)
+ def test_region_empty(self):
+ builder = self.factory.makeBuilder(name="some-builder-name")
+ self.assertEqual("", builder.region)
+
+ def test_region(self):
+ builder = self.factory.makeBuilder(name="some-region-001")
+ self.assertEqual("some-region", builder.region)
+
# XXX cjwatson 2020-05-18: All these tests would now make more sense in
# lp.buildmaster.tests.test_buildqueue, and should be moved there when
diff --git a/lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py b/lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py
index 924b131..a164b11 100644
--- a/lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py
+++ b/lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py
@@ -326,11 +326,11 @@ class TestDispatchBuildToWorker(StatsMixin, TestCase):
yield behaviour.dispatchBuildToWorker(logger)
self.assertEqual(1, self.stats_client.incr.call_count)
self.assertEqual(
- self.stats_client.incr.call_args_list[0][0],
(
"build.count,builder_name=mock-builder,env=test,"
- "job_type=UNKNOWN",
+ "job_type=UNKNOWN,region=",
),
+ self.stats_client.incr.call_args_list[0][0],
)
diff --git a/lib/lp/buildmaster/tests/test_manager.py b/lib/lp/buildmaster/tests/test_manager.py
index dfd7530..a0edb84 100644
--- a/lib/lp/buildmaster/tests/test_manager.py
+++ b/lib/lp/buildmaster/tests/test_manager.py
@@ -1417,17 +1417,20 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
[
mock.call(
"builders.job_reset,arch={},build=True,builder_name={},"
- "env=test,job_type=RECIPEBRANCHBUILD,"
+ "env=test,job_type=RECIPEBRANCHBUILD,region={},"
"virtualized=True".format(
build.processor.name,
self.builder.name,
+ self.builder.region,
)
),
mock.call(
"build.reset,arch={},builder_name={},env=test,"
- "job_type=RECIPEBRANCHBUILD,virtualized=True".format(
+ "job_type=RECIPEBRANCHBUILD,region={},"
+ "virtualized=True".format(
build.processor.name,
self.builder.name,
+ self.builder.region,
)
),
]
@@ -1481,15 +1484,15 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
mock.call(
"builders.job_cancelled,arch={},build=True,"
"builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
- "virtualized=True".format(
+ "region=builder-name,virtualized=True".format(
naked_build.processor.name,
self.builder.name,
)
),
mock.call(
"build.finished,arch={},builder_name={},env=test,"
- "job_type=RECIPEBRANCHBUILD,status=CANCELLED,"
- "virtualized=True".format(
+ "job_type=RECIPEBRANCHBUILD,region=builder-name,"
+ "status=CANCELLED,virtualized=True".format(
naked_build.processor.name,
self.builder.name,
)
@@ -1514,15 +1517,15 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
[
mock.call(
"build.finished,arch={},builder_name={},env=test,"
- "job_type=RECIPEBRANCHBUILD,status=FAILEDTOBUILD,"
- "virtualized=True".format(
+ "job_type=RECIPEBRANCHBUILD,region=builder-name,"
+ "status=FAILEDTOBUILD,virtualized=True".format(
naked_build.processor.name,
self.builder.name,
)
),
mock.call(
"builders.job_failed,arch={},build=True,builder_name={},"
- "env=test,job_type=RECIPEBRANCHBUILD,"
+ "env=test,job_type=RECIPEBRANCHBUILD,region=builder-name,"
"virtualized=True".format(
naked_build.processor.name,
self.builder.name,
@@ -1556,15 +1559,15 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
[
mock.call(
"build.finished,arch={},builder_name={},env=test,"
- "job_type=RECIPEBRANCHBUILD,status=FULLYBUILT,"
- "virtualized=True".format(
+ "job_type=RECIPEBRANCHBUILD,region=builder-name,"
+ "status=FULLYBUILT,virtualized=True".format(
naked_build.processor.name,
self.builder.name,
)
),
mock.call(
"builders.job_failed,arch={},build=True,builder_name={},"
- "env=test,job_type=RECIPEBRANCHBUILD,"
+ "env=test,job_type=RECIPEBRANCHBUILD,region=builder-name,"
"virtualized=True".format(
naked_build.processor.name,
self.builder.name,
@@ -1592,15 +1595,15 @@ class TestFailureAssessmentsAndStatsdMetrics(StatsMixin, TestCaseWithFactory):
mock.call(
"builders.job_cancelled,arch={},build=True,"
"builder_name={},env=test,job_type=RECIPEBRANCHBUILD,"
- "virtualized=True".format(
+ "region=builder-name,virtualized=True".format(
naked_build.processor.name,
self.builder.name,
)
),
mock.call(
"build.finished,arch={},builder_name={},env=test,"
- "job_type=RECIPEBRANCHBUILD,status=CANCELLED,"
- "virtualized=True".format(
+ "job_type=RECIPEBRANCHBUILD,region=builder-name,"
+ "status=CANCELLED,virtualized=True".format(
naked_build.processor.name,
self.builder.name,
)
diff --git a/lib/lp/charms/tests/test_charmrecipebuildbehaviour.py b/lib/lp/charms/tests/test_charmrecipebuildbehaviour.py
index 10e676d..2851a9f 100644
--- a/lib/lp/charms/tests/test_charmrecipebuildbehaviour.py
+++ b/lib/lp/charms/tests/test_charmrecipebuildbehaviour.py
@@ -516,7 +516,9 @@ class TestAsyncCharmRecipeBuildBehaviour(
self.stats_client.incr.call_args_list[0][0],
(
"build.count,builder_name={},env=test,"
- "job_type=CHARMRECIPEBUILD".format(builder.name),
+ "job_type=CHARMRECIPEBUILD,region={}".format(
+ builder.name, builder.region
+ ),
),
)
diff --git a/lib/lp/code/model/tests/test_cibuildbehaviour.py b/lib/lp/code/model/tests/test_cibuildbehaviour.py
index 4d6221f..578c38e 100644
--- a/lib/lp/code/model/tests/test_cibuildbehaviour.py
+++ b/lib/lp/code/model/tests/test_cibuildbehaviour.py
@@ -710,7 +710,9 @@ class TestAsyncCIBuildBehaviour(StatsMixin, TestCIBuildBehaviourBase):
self.stats_client.incr.call_args_list[0][0],
(
"build.count,builder_name={},env=test,"
- "job_type=CIBUILD".format(builder.name),
+ "job_type=CIBUILD,region={}".format(
+ builder.name, builder.region
+ ),
),
)
diff --git a/lib/lp/oci/tests/test_ocirecipebuildbehaviour.py b/lib/lp/oci/tests/test_ocirecipebuildbehaviour.py
index 8e2c214..90ba1b7 100644
--- a/lib/lp/oci/tests/test_ocirecipebuildbehaviour.py
+++ b/lib/lp/oci/tests/test_ocirecipebuildbehaviour.py
@@ -782,7 +782,9 @@ class TestAsyncOCIRecipeBuildBehaviour(
self.stats_client.incr.call_args_list[0][0],
(
"build.count,builder_name={},env=test,"
- "job_type=OCIRECIPEBUILD".format(builder.name),
+ "job_type=OCIRECIPEBUILD,region={}".format(
+ builder.name, builder.region
+ ),
),
)
diff --git a/lib/lp/services/statsd/numbercruncher.py b/lib/lp/services/statsd/numbercruncher.py
index bd8cd3b..b4666cd 100644
--- a/lib/lp/services/statsd/numbercruncher.py
+++ b/lib/lp/services/statsd/numbercruncher.py
@@ -111,7 +111,7 @@ class NumberCruncher(service.Service):
continue
for processor_name in builder.processor_names:
counts = counts_by_processor.setdefault(
- (processor_name, builder.virtualized),
+ (processor_name, builder.virtualized, builder.region),
{"cleaning": 0, "idle": 0, "disabled": 0, "building": 0},
)
if not builder.builderok:
@@ -127,7 +127,11 @@ class NumberCruncher(service.Service):
builder.failure_count,
labels={"builder_name": builder.name},
)
- for (processor, virtualized), counts in counts_by_processor.items():
+ for (
+ processor,
+ virtualized,
+ region,
+ ), counts in counts_by_processor.items():
for count_name, count_value in counts.items():
self._sendGauge(
"builders",
@@ -135,6 +139,7 @@ class NumberCruncher(service.Service):
labels={
"status": count_name,
"arch": processor,
+ "region": region,
"virtualized": virtualized,
},
)
diff --git a/lib/lp/services/statsd/tests/test_numbercruncher.py b/lib/lp/services/statsd/tests/test_numbercruncher.py
index a7d4dd7..9db157f 100644
--- a/lib/lp/services/statsd/tests/test_numbercruncher.py
+++ b/lib/lp/services/statsd/tests/test_numbercruncher.py
@@ -59,8 +59,8 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory):
]
expected_gauges.extend(
[
- "builders,arch=386,env=test,status=%s,virtualized=True"
- % status
+ "builders,arch=386,env=test,region=builder-name,status=%s,"
+ "virtualized=True" % status
for status in ("building", "cleaning", "disabled", "idle")
]
)
@@ -74,16 +74,21 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory):
def test_multiple_processor_counts(self):
builders = [
self.factory.makeBuilder(
+ name=self.factory.getUniqueUnicode(region),
processors=[
getUtility(IProcessorSet).getByName(processor_name)
],
virtualized=virtualized,
)
- for processor_name, virtualized in (
- ("386", True),
- ("386", False),
- ("amd64", True),
- ("amd64", False),
+ for processor_name, virtualized, region in (
+ ("386", True, "test1"),
+ ("386", True, "test2"),
+ ("386", False, "test1"),
+ ("386", False, "test2"),
+ ("amd64", True, "test1"),
+ ("amd64", True, "test2"),
+ ("amd64", False, "test1"),
+ ("amd64", False, "test2"),
)
]
for builder in builders:
@@ -101,10 +106,11 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory):
]
expected_gauges.extend(
[
- "builders,arch=%s,env=test,status=%s,virtualized=%s"
- % (arch, status, virtualized)
+ "builders,arch=%s,env=test,region=%s,status=%s,virtualized=%s"
+ % (arch, region, status, virtualized)
for arch in ("386", "amd64")
for virtualized in (True, False)
+ for region in ("test1", "test2")
for status in ("building", "cleaning", "disabled", "idle")
]
)
@@ -182,7 +188,7 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory):
)
expected_gauges.update(
{
- "builders,arch=amd64,env=test,status=%s,"
+ "builders,arch=amd64,env=test,region=builder-name,status=%s,"
"virtualized=True" % status: count
for status, count in (
("building", 2),
diff --git a/lib/lp/snappy/tests/test_snapbuildbehaviour.py b/lib/lp/snappy/tests/test_snapbuildbehaviour.py
index ef4e269..28ffc4b 100644
--- a/lib/lp/snappy/tests/test_snapbuildbehaviour.py
+++ b/lib/lp/snappy/tests/test_snapbuildbehaviour.py
@@ -1208,7 +1208,9 @@ class TestAsyncSnapBuildBehaviour(StatsMixin, TestSnapBuildBehaviourBase):
self.stats_client.incr.call_args_list[0][0],
(
"build.count,builder_name={},env=test,"
- "job_type=SNAPBUILD".format(builder.name),
+ "job_type=SNAPBUILD,region={}".format(
+ builder.name, builder.region
+ ),
),
)
diff --git a/lib/lp/soyuz/tests/test_binarypackagebuildbehaviour.py b/lib/lp/soyuz/tests/test_binarypackagebuildbehaviour.py
index 073b387..ee8278e 100644
--- a/lib/lp/soyuz/tests/test_binarypackagebuildbehaviour.py
+++ b/lib/lp/soyuz/tests/test_binarypackagebuildbehaviour.py
@@ -235,7 +235,9 @@ class TestBinaryBuildPackageBehaviour(StatsMixin, TestCaseWithFactory):
self.stats_client.incr.call_args_list[0][0],
(
"build.count,builder_name={},env=test,"
- "job_type=PACKAGEBUILD".format(builder.name),
+ "job_type=PACKAGEBUILD,region={}".format(
+ builder.name, builder.region
+ ),
),
)
@@ -318,7 +320,9 @@ class TestBinaryBuildPackageBehaviour(StatsMixin, TestCaseWithFactory):
self.stats_client.incr.call_args_list[0][0],
(
"build.count,builder_name={},env=test,"
- "job_type=PACKAGEBUILD".format(builder.name),
+ "job_type=PACKAGEBUILD,region={}".format(
+ builder.name, builder.region
+ ),
),
)