launchpad-reviewers team mailing list archive
-
launchpad-reviewers team
-
Mailing list archive
-
Message #26607
[Merge] ~cjwatson/launchpad:build-status-metrics into launchpad:master
Colin Watson has proposed merging ~cjwatson/launchpad:build-status-metrics into launchpad:master.
Commit message:
Emit some more build metrics
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
For more details, see:
https://code.launchpad.net/~cjwatson/launchpad/+git/launchpad/+merge/399444
We now increment a build.finished counter with the new build status name when a build reaches a final state, and build.reset when it is reset. In addition, number-cruncher sends a builders.failure_count gauge with the current failure count of each builder. This should be enough to monitor some operational problems that result in high failure rates of various kinds.
--
Your team Launchpad code reviewers is requested to review the proposed merge of ~cjwatson/launchpad:build-status-metrics into launchpad:master.
diff --git a/lib/lp/buildmaster/interactor.py b/lib/lp/buildmaster/interactor.py
index b786a6d..69976c2 100644
--- a/lib/lp/buildmaster/interactor.py
+++ b/lib/lp/buildmaster/interactor.py
@@ -1,4 +1,4 @@
-# Copyright 2009-2020 Canonical Ltd. This software is licensed under the
+# Copyright 2009-2021 Canonical Ltd. This software is licensed under the
# GNU Affero General Public License version 3 (see the file LICENSE).
__metaclass__ = type
@@ -328,7 +328,7 @@ BuilderVitals = namedtuple(
'BuilderVitals',
('name', 'url', 'processor_names', 'virtualized', 'vm_host',
'vm_reset_protocol', 'builderok', 'manual', 'build_queue', 'version',
- 'clean_status', 'active'))
+ 'clean_status', 'active', 'failure_count'))
_BQ_UNSPECIFIED = object()
@@ -341,7 +341,7 @@ def extract_vitals_from_db(builder, build_queue=_BQ_UNSPECIFIED):
[processor.name for processor in builder.processors],
builder.virtualized, builder.vm_host, builder.vm_reset_protocol,
builder.builderok, builder.manual, build_queue, builder.version,
- builder.clean_status, builder.active)
+ builder.clean_status, builder.active, builder.failure_count)
class BuilderInteractor(object):
diff --git a/lib/lp/buildmaster/interfaces/buildfarmjob.py b/lib/lp/buildmaster/interfaces/buildfarmjob.py
index aaba8b2..0d7a600 100644
--- a/lib/lp/buildmaster/interfaces/buildfarmjob.py
+++ b/lib/lp/buildmaster/interfaces/buildfarmjob.py
@@ -1,4 +1,4 @@
-# Copyright 2009-2016 Canonical Ltd. This software is licensed under the
+# Copyright 2009-2021 Canonical Ltd. This software is licensed under the
# GNU Affero General Public License version 3 (see the file LICENSE).
"""Interface for Soyuz build farm jobs."""
@@ -178,6 +178,14 @@ class IBuildFarmJob(Interface):
def setLog(log):
"""Set the `LibraryFileAlias` that contains the job log."""
+ def emitMetric(metric_name, **extra):
+ """Emit a metric for this build.
+
+ :param metric_name: The name of the metric (which will be prefixed
+ with "build.".
+ :param extra: Extra labels to attach to the metric.
+ """
+
def updateStatus(status, builder=None, slave_status=None,
date_started=None, date_finished=None,
force_invalid_transition=False):
diff --git a/lib/lp/buildmaster/model/buildfarmjob.py b/lib/lp/buildmaster/model/buildfarmjob.py
index e490204..1343932 100644
--- a/lib/lp/buildmaster/model/buildfarmjob.py
+++ b/lib/lp/buildmaster/model/buildfarmjob.py
@@ -1,4 +1,4 @@
-# Copyright 2009-2013 Canonical Ltd. This software is licensed under
+# Copyright 2009-2021 Canonical Ltd. This software is licensed under
# the GNU Affero General Public License version 3 (see the file
# LICENSE).
@@ -24,6 +24,7 @@ from storm.locals import (
Storm,
)
from storm.store import Store
+from zope.component import getUtility
from zope.interface import (
implementer,
provider,
@@ -49,6 +50,7 @@ from lp.services.propertycache import (
cachedproperty,
get_property_cache,
)
+from lp.services.statsd.interfaces.statsd_client import IStatsdClient
VALID_STATUS_TRANSITIONS = {
@@ -177,6 +179,19 @@ class BuildFarmJobMixin:
"""See `IBuildFarmJob`."""
self.log = log
+ def emitMetric(self, metric_name, **extra):
+ """See `IBuildFarmJob`."""
+ labels = {"job_type": self.job_type.name}
+ if self.processor is not None:
+ labels["arch"] = self.processor.name
+ if self.builder is not None:
+ labels.update({
+ "builder_name": self.builder.name,
+ "virtualized": str(self.builder.virtualized),
+ })
+ labels.update(extra)
+ getUtility(IStatsdClient).incr("build.%s" % metric_name, labels=labels)
+
def updateStatus(self, status, builder=None, slave_status=None,
date_started=None, date_finished=None,
force_invalid_transition=False):
@@ -218,6 +233,7 @@ class BuildFarmJobMixin:
# the duration spent building locally.
self.build_farm_job.date_finished = self.date_finished = (
date_finished or datetime.datetime.now(pytz.UTC))
+ self.emitMetric("finished", status=status.name)
def gotFailure(self):
"""See `IBuildFarmJob`."""
diff --git a/lib/lp/buildmaster/model/buildqueue.py b/lib/lp/buildmaster/model/buildqueue.py
index 7a49cb7..decd388 100644
--- a/lib/lp/buildmaster/model/buildqueue.py
+++ b/lib/lp/buildmaster/model/buildqueue.py
@@ -1,4 +1,4 @@
-# Copyright 2009-2020 Canonical Ltd. This software is licensed under the
+# Copyright 2009-2021 Canonical Ltd. This software is licensed under the
# GNU Affero General Public License version 3 (see the file LICENSE).
__metaclass__ = type
@@ -209,6 +209,7 @@ class BuildQueue(StormBase):
def reset(self):
"""See `IBuildQueue`."""
+ self.specific_build.emitMetric("reset")
builder = self.builder
self.builder = None
self.status = BuildQueueStatus.WAITING
diff --git a/lib/lp/buildmaster/tests/mock_slaves.py b/lib/lp/buildmaster/tests/mock_slaves.py
index 54944e3..f1de4cb 100644
--- a/lib/lp/buildmaster/tests/mock_slaves.py
+++ b/lib/lp/buildmaster/tests/mock_slaves.py
@@ -1,4 +1,4 @@
-# Copyright 2009-2020 Canonical Ltd. This software is licensed under the
+# Copyright 2009-2021 Canonical Ltd. This software is licensed under the
# GNU Affero General Public License version 3 (see the file LICENSE).
"""Mock Build objects for tests soyuz buildd-system."""
@@ -74,6 +74,7 @@ class MockBuilder:
self.version = version
self.clean_status = clean_status
self.active = active
+ self.failure_count = 0
def setCleanStatus(self, clean_status):
self.clean_status = clean_status
diff --git a/lib/lp/buildmaster/tests/test_manager.py b/lib/lp/buildmaster/tests/test_manager.py
index 4dc6b47..2da4753 100644
--- a/lib/lp/buildmaster/tests/test_manager.py
+++ b/lib/lp/buildmaster/tests/test_manager.py
@@ -72,6 +72,7 @@ from lp.buildmaster.tests.test_interactor import (
MockBuilderFactory,
)
from lp.registry.interfaces.distribution import IDistributionSet
+from lp.services.compat import mock
from lp.services.config import config
from lp.services.log.logger import BufferLogger
from lp.services.statsd.tests import StatsMixin
@@ -537,7 +538,11 @@ class TestSlaveScannerScan(StatsMixin, TestCaseWithFactory):
transaction.commit()
yield scanner.singleCycle()
- self.assertEqual(1, self.stats_client.incr.call_count)
+ self.assertEqual(2, self.stats_client.incr.call_count)
+ self.stats_client.incr.assert_has_calls([
+ mock.call("build.reset,arch=386,env=test,job_type=PACKAGEBUILD"),
+ mock.call("builders.judged_failed,build=False,env=test"),
+ ])
@defer.inlineCallbacks
def test_fail_to_resume_leaves_it_dirty(self):
diff --git a/lib/lp/services/statsd/numbercruncher.py b/lib/lp/services/statsd/numbercruncher.py
index 6add2e4..1a1417a 100644
--- a/lib/lp/services/statsd/numbercruncher.py
+++ b/lib/lp/services/statsd/numbercruncher.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Canonical Ltd. This software is licensed under the
+# Copyright 2020-2021 Canonical Ltd. This software is licensed under the
# GNU Affero General Public License version 3 (see the file LICENSE).
"""Out of process statsd reporting."""
@@ -31,10 +31,10 @@ from lp.buildmaster.interfaces.builder import IBuilderSet
from lp.buildmaster.manager import PrefetchedBuilderFactory
from lp.code.enums import CodeImportJobState
from lp.code.model.codeimportjob import CodeImportJob
-from lp.soyuz.model.archive import Archive
from lp.services.database.interfaces import IStore
from lp.services.librarian.model import LibraryFileContent
from lp.services.statsd.interfaces.statsd_client import IStatsdClient
+from lp.soyuz.model.archive import Archive
NUMBER_CRUNCHER_LOG_NAME = "number-cruncher"
@@ -126,6 +126,9 @@ class NumberCruncher(service.Service):
counts['building'] += 1
elif builder.clean_status == BuilderCleanStatus.CLEAN:
counts['idle'] += 1
+ self._sendGauge(
+ "builders.failure_count", builder.failure_count,
+ labels={"builder_name": builder.name})
for (processor, virtualized), counts in counts_by_processor.items():
for count_name, count_value in counts.items():
self._sendGauge(
diff --git a/lib/lp/services/statsd/tests/test_numbercruncher.py b/lib/lp/services/statsd/tests/test_numbercruncher.py
index a22938b..2149de1 100644
--- a/lib/lp/services/statsd/tests/test_numbercruncher.py
+++ b/lib/lp/services/statsd/tests/test_numbercruncher.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Canonical Ltd. This software is licensed under the
+# Copyright 2020-2021 Canonical Ltd. This software is licensed under the
# GNU Affero General Public License version 3 (see the file LICENSE).
"""Tests for the stats number cruncher daemon."""
@@ -28,6 +28,7 @@ from lp.buildmaster.enums import (
BuildStatus,
)
from lp.buildmaster.interactor import BuilderSlave
+from lp.buildmaster.interfaces.builder import IBuilderSet
from lp.buildmaster.interfaces.processor import IProcessorSet
from lp.buildmaster.model.buildqueue import BuildQueue
from lp.buildmaster.tests.mock_slaves import OkSlave
@@ -51,6 +52,10 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory):
def setUp(self):
super(TestNumberCruncher, self).setUp()
self.setUpStats()
+ # Deactivate sampledata builders; we only want statistics for the
+ # builders explicitly created in these tests.
+ for builder in getUtility(IBuilderSet):
+ builder.active = False
def test_single_processor_counts(self):
builder = self.factory.makeBuilder()
@@ -62,14 +67,32 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory):
manager.updateBuilderStats()
self.assertFalse(is_transaction_in_progress())
- self.assertEqual(8, self.stats_client.gauge.call_count)
- for call in self.stats_client.mock.gauge.call_args_list:
- self.assertIn('386', call[0][0])
+ expected_gauges = [
+ 'builders.failure_count,builder_name=%s,env=test' % builder.name,
+ ]
+ expected_gauges.extend([
+ 'builders,arch=386,env=test,status=%s,virtualized=True' % status
+ for status in ('building', 'cleaning', 'disabled', 'idle')
+ ])
+ self.assertThat(
+ [call[0][0] for call in self.stats_client.gauge.call_args_list],
+ MatchesSetwise(*(
+ Equals(gauge_name) for gauge_name in expected_gauges)))
def test_multiple_processor_counts(self):
- builder = self.factory.makeBuilder(
- processors=[getUtility(IProcessorSet).getByName('amd64')])
- builder.setCleanStatus(BuilderCleanStatus.CLEAN)
+ builders = [
+ self.factory.makeBuilder(
+ processors=[
+ getUtility(IProcessorSet).getByName(processor_name)],
+ virtualized=virtualized)
+ for processor_name, virtualized in (
+ ('386', True),
+ ('386', False),
+ ('amd64', True),
+ ('amd64', False),
+ )]
+ for builder in builders:
+ builder.setCleanStatus(BuilderCleanStatus.CLEAN)
self.patch(BuilderSlave, 'makeBuilderSlave', FakeMethod(OkSlave()))
transaction.commit()
clock = task.Clock()
@@ -77,31 +100,46 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory):
manager.updateBuilderStats()
self.assertFalse(is_transaction_in_progress())
- self.assertEqual(12, self.stats_client.gauge.call_count)
- i386_calls = [c for c in self.stats_client.gauge.call_args_list
- if '386' in c[0][0]]
- amd64_calls = [c for c in self.stats_client.gauge.call_args_list
- if 'amd64' in c[0][0]]
- self.assertEqual(8, len(i386_calls))
- self.assertEqual(4, len(amd64_calls))
+ expected_gauges = [
+ 'builders.failure_count,builder_name=%s,env=test' % builder.name
+ for builder in builders
+ ]
+ expected_gauges.extend([
+ 'builders,arch=%s,env=test,status=%s,virtualized=%s' % (
+ arch, status, virtualized)
+ for arch in ('386', 'amd64')
+ for virtualized in (True, False)
+ for status in ('building', 'cleaning', 'disabled', 'idle')
+ ])
+ self.assertThat(
+ [call[0][0] for call in self.stats_client.gauge.call_args_list],
+ MatchesSetwise(*(
+ Equals(gauge_name) for gauge_name in expected_gauges)))
def test_correct_values_counts(self):
- for _ in range(3):
- cleaning_builder = self.factory.makeBuilder(
+ cleaning_builders = [
+ self.factory.makeBuilder(
processors=[getUtility(IProcessorSet).getByName('amd64')])
+ for _ in range(3)]
+ for cleaning_builder in cleaning_builders:
+ cleaning_builder.gotFailure()
cleaning_builder.setCleanStatus(BuilderCleanStatus.CLEANING)
- for _ in range(4):
- idle_builder = self.factory.makeBuilder(
+ idle_builders = [
+ self.factory.makeBuilder(
processors=[getUtility(IProcessorSet).getByName('amd64')])
+ for _ in range(4)]
+ for idle_builder in idle_builders:
idle_builder.setCleanStatus(BuilderCleanStatus.CLEAN)
old_build = self.factory.makeSnapBuild()
old_build.queueBuild()
old_build.buildqueue_record.markAsBuilding(builder=idle_builder)
old_build.buildqueue_record.destroySelf()
- builds = []
- for _ in range(2):
- building_builder = self.factory.makeBuilder(
+ building_builders = [
+ self.factory.makeBuilder(
processors=[getUtility(IProcessorSet).getByName('amd64')])
+ for _ in range(2)]
+ builds = []
+ for building_builder in building_builders:
building_builder.setCleanStatus(BuilderCleanStatus.CLEAN)
build = self.factory.makeSnapBuild()
build.queueBuild()
@@ -125,24 +163,29 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory):
manager.updateBuilderStats()
self.assertFalse(is_transaction_in_progress())
- self.assertEqual(12, self.stats_client.gauge.call_count)
- calls = [c[0] for c in self.stats_client.gauge.call_args_list
- if 'amd64' in c[0][0]]
+ expected_gauges = {
+ 'builders.failure_count,builder_name=%s,env=test' % builder.name: 1
+ for builder in cleaning_builders
+ }
+ expected_gauges.update({
+ 'builders.failure_count,builder_name=%s,env=test' % builder.name: 0
+ for builder in idle_builders + building_builders
+ })
+ expected_gauges.update({
+ 'builders,arch=amd64,env=test,status=%s,'
+ 'virtualized=True' % status: count
+ for status, count in (
+ ('building', 2),
+ ('cleaning', 3),
+ ('disabled', 0),
+ ('idle', 4),
+ )
+ })
self.assertThat(
- calls, MatchesSetwise(
- Equals((
- 'builders,arch=amd64,env=test,status=disabled,'
- 'virtualized=True', 0)),
- Equals((
- 'builders,arch=amd64,env=test,status=building,'
- 'virtualized=True', 2)),
- Equals((
- 'builders,arch=amd64,env=test,status=idle,'
- 'virtualized=True', 4)),
- Equals((
- 'builders,arch=amd64,env=test,status=cleaning,'
- 'virtualized=True', 3))
- ))
+ [call[0] for call in self.stats_client.gauge.call_args_list],
+ MatchesSetwise(*(
+ Equals((gauge_name, count))
+ for gauge_name, count in expected_gauges.items())))
def test_updateBuilderStats_error(self):
clock = task.Clock()