← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] ~cjwatson/launchpad:build-status-metrics into launchpad:master

 

Colin Watson has proposed merging ~cjwatson/launchpad:build-status-metrics into launchpad:master.

Commit message:
Emit some more build metrics

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~cjwatson/launchpad/+git/launchpad/+merge/399444

We now increment a build.finished counter with the new build status name when a build reaches a final state, and build.reset when it is reset.  In addition, number-cruncher sends a builders.failure_count gauge with the current failure count of each builder.  This should be enough to monitor some operational problems that result in high failure rates of various kinds.
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of ~cjwatson/launchpad:build-status-metrics into launchpad:master.
diff --git a/lib/lp/buildmaster/interactor.py b/lib/lp/buildmaster/interactor.py
index b786a6d..69976c2 100644
--- a/lib/lp/buildmaster/interactor.py
+++ b/lib/lp/buildmaster/interactor.py
@@ -1,4 +1,4 @@
-# Copyright 2009-2020 Canonical Ltd.  This software is licensed under the
+# Copyright 2009-2021 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 __metaclass__ = type
@@ -328,7 +328,7 @@ BuilderVitals = namedtuple(
     'BuilderVitals',
     ('name', 'url', 'processor_names', 'virtualized', 'vm_host',
      'vm_reset_protocol', 'builderok', 'manual', 'build_queue', 'version',
-     'clean_status', 'active'))
+     'clean_status', 'active', 'failure_count'))
 
 _BQ_UNSPECIFIED = object()
 
@@ -341,7 +341,7 @@ def extract_vitals_from_db(builder, build_queue=_BQ_UNSPECIFIED):
         [processor.name for processor in builder.processors],
         builder.virtualized, builder.vm_host, builder.vm_reset_protocol,
         builder.builderok, builder.manual, build_queue, builder.version,
-        builder.clean_status, builder.active)
+        builder.clean_status, builder.active, builder.failure_count)
 
 
 class BuilderInteractor(object):
diff --git a/lib/lp/buildmaster/interfaces/buildfarmjob.py b/lib/lp/buildmaster/interfaces/buildfarmjob.py
index aaba8b2..0d7a600 100644
--- a/lib/lp/buildmaster/interfaces/buildfarmjob.py
+++ b/lib/lp/buildmaster/interfaces/buildfarmjob.py
@@ -1,4 +1,4 @@
-# Copyright 2009-2016 Canonical Ltd.  This software is licensed under the
+# Copyright 2009-2021 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 """Interface for Soyuz build farm jobs."""
@@ -178,6 +178,14 @@ class IBuildFarmJob(Interface):
     def setLog(log):
         """Set the `LibraryFileAlias` that contains the job log."""
 
+    def emitMetric(metric_name, **extra):
+        """Emit a metric for this build.
+
+        :param metric_name: The name of the metric (which will be prefixed
+            with "build.".
+        :param extra: Extra labels to attach to the metric.
+        """
+
     def updateStatus(status, builder=None, slave_status=None,
                      date_started=None, date_finished=None,
                      force_invalid_transition=False):
diff --git a/lib/lp/buildmaster/model/buildfarmjob.py b/lib/lp/buildmaster/model/buildfarmjob.py
index e490204..1343932 100644
--- a/lib/lp/buildmaster/model/buildfarmjob.py
+++ b/lib/lp/buildmaster/model/buildfarmjob.py
@@ -1,4 +1,4 @@
-# Copyright 2009-2013 Canonical Ltd.  This software is licensed under
+# Copyright 2009-2021 Canonical Ltd.  This software is licensed under
 # the GNU Affero General Public License version 3 (see the file
 # LICENSE).
 
@@ -24,6 +24,7 @@ from storm.locals import (
     Storm,
     )
 from storm.store import Store
+from zope.component import getUtility
 from zope.interface import (
     implementer,
     provider,
@@ -49,6 +50,7 @@ from lp.services.propertycache import (
     cachedproperty,
     get_property_cache,
     )
+from lp.services.statsd.interfaces.statsd_client import IStatsdClient
 
 
 VALID_STATUS_TRANSITIONS = {
@@ -177,6 +179,19 @@ class BuildFarmJobMixin:
         """See `IBuildFarmJob`."""
         self.log = log
 
+    def emitMetric(self, metric_name, **extra):
+        """See `IBuildFarmJob`."""
+        labels = {"job_type": self.job_type.name}
+        if self.processor is not None:
+            labels["arch"] = self.processor.name
+        if self.builder is not None:
+            labels.update({
+                "builder_name": self.builder.name,
+                "virtualized": str(self.builder.virtualized),
+                })
+        labels.update(extra)
+        getUtility(IStatsdClient).incr("build.%s" % metric_name, labels=labels)
+
     def updateStatus(self, status, builder=None, slave_status=None,
                      date_started=None, date_finished=None,
                      force_invalid_transition=False):
@@ -218,6 +233,7 @@ class BuildFarmJobMixin:
             # the duration spent building locally.
             self.build_farm_job.date_finished = self.date_finished = (
                 date_finished or datetime.datetime.now(pytz.UTC))
+            self.emitMetric("finished", status=status.name)
 
     def gotFailure(self):
         """See `IBuildFarmJob`."""
diff --git a/lib/lp/buildmaster/model/buildqueue.py b/lib/lp/buildmaster/model/buildqueue.py
index 7a49cb7..decd388 100644
--- a/lib/lp/buildmaster/model/buildqueue.py
+++ b/lib/lp/buildmaster/model/buildqueue.py
@@ -1,4 +1,4 @@
-# Copyright 2009-2020 Canonical Ltd.  This software is licensed under the
+# Copyright 2009-2021 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 __metaclass__ = type
@@ -209,6 +209,7 @@ class BuildQueue(StormBase):
 
     def reset(self):
         """See `IBuildQueue`."""
+        self.specific_build.emitMetric("reset")
         builder = self.builder
         self.builder = None
         self.status = BuildQueueStatus.WAITING
diff --git a/lib/lp/buildmaster/tests/mock_slaves.py b/lib/lp/buildmaster/tests/mock_slaves.py
index 54944e3..f1de4cb 100644
--- a/lib/lp/buildmaster/tests/mock_slaves.py
+++ b/lib/lp/buildmaster/tests/mock_slaves.py
@@ -1,4 +1,4 @@
-# Copyright 2009-2020 Canonical Ltd.  This software is licensed under the
+# Copyright 2009-2021 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 """Mock Build objects for tests soyuz buildd-system."""
@@ -74,6 +74,7 @@ class MockBuilder:
         self.version = version
         self.clean_status = clean_status
         self.active = active
+        self.failure_count = 0
 
     def setCleanStatus(self, clean_status):
         self.clean_status = clean_status
diff --git a/lib/lp/buildmaster/tests/test_manager.py b/lib/lp/buildmaster/tests/test_manager.py
index 4dc6b47..2da4753 100644
--- a/lib/lp/buildmaster/tests/test_manager.py
+++ b/lib/lp/buildmaster/tests/test_manager.py
@@ -72,6 +72,7 @@ from lp.buildmaster.tests.test_interactor import (
     MockBuilderFactory,
     )
 from lp.registry.interfaces.distribution import IDistributionSet
+from lp.services.compat import mock
 from lp.services.config import config
 from lp.services.log.logger import BufferLogger
 from lp.services.statsd.tests import StatsMixin
@@ -537,7 +538,11 @@ class TestSlaveScannerScan(StatsMixin, TestCaseWithFactory):
         transaction.commit()
 
         yield scanner.singleCycle()
-        self.assertEqual(1, self.stats_client.incr.call_count)
+        self.assertEqual(2, self.stats_client.incr.call_count)
+        self.stats_client.incr.assert_has_calls([
+            mock.call("build.reset,arch=386,env=test,job_type=PACKAGEBUILD"),
+            mock.call("builders.judged_failed,build=False,env=test"),
+            ])
 
     @defer.inlineCallbacks
     def test_fail_to_resume_leaves_it_dirty(self):
diff --git a/lib/lp/services/statsd/numbercruncher.py b/lib/lp/services/statsd/numbercruncher.py
index 6add2e4..1a1417a 100644
--- a/lib/lp/services/statsd/numbercruncher.py
+++ b/lib/lp/services/statsd/numbercruncher.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Canonical Ltd.  This software is licensed under the
+# Copyright 2020-2021 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 """Out of process statsd reporting."""
@@ -31,10 +31,10 @@ from lp.buildmaster.interfaces.builder import IBuilderSet
 from lp.buildmaster.manager import PrefetchedBuilderFactory
 from lp.code.enums import CodeImportJobState
 from lp.code.model.codeimportjob import CodeImportJob
-from lp.soyuz.model.archive import Archive
 from lp.services.database.interfaces import IStore
 from lp.services.librarian.model import LibraryFileContent
 from lp.services.statsd.interfaces.statsd_client import IStatsdClient
+from lp.soyuz.model.archive import Archive
 
 
 NUMBER_CRUNCHER_LOG_NAME = "number-cruncher"
@@ -126,6 +126,9 @@ class NumberCruncher(service.Service):
                     counts['building'] += 1
                 elif builder.clean_status == BuilderCleanStatus.CLEAN:
                     counts['idle'] += 1
+            self._sendGauge(
+                "builders.failure_count", builder.failure_count,
+                labels={"builder_name": builder.name})
         for (processor, virtualized), counts in counts_by_processor.items():
             for count_name, count_value in counts.items():
                 self._sendGauge(
diff --git a/lib/lp/services/statsd/tests/test_numbercruncher.py b/lib/lp/services/statsd/tests/test_numbercruncher.py
index a22938b..2149de1 100644
--- a/lib/lp/services/statsd/tests/test_numbercruncher.py
+++ b/lib/lp/services/statsd/tests/test_numbercruncher.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Canonical Ltd.  This software is licensed under the
+# Copyright 2020-2021 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 """Tests for the stats number cruncher daemon."""
@@ -28,6 +28,7 @@ from lp.buildmaster.enums import (
     BuildStatus,
     )
 from lp.buildmaster.interactor import BuilderSlave
+from lp.buildmaster.interfaces.builder import IBuilderSet
 from lp.buildmaster.interfaces.processor import IProcessorSet
 from lp.buildmaster.model.buildqueue import BuildQueue
 from lp.buildmaster.tests.mock_slaves import OkSlave
@@ -51,6 +52,10 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory):
     def setUp(self):
         super(TestNumberCruncher, self).setUp()
         self.setUpStats()
+        # Deactivate sampledata builders; we only want statistics for the
+        # builders explicitly created in these tests.
+        for builder in getUtility(IBuilderSet):
+            builder.active = False
 
     def test_single_processor_counts(self):
         builder = self.factory.makeBuilder()
@@ -62,14 +67,32 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory):
         manager.updateBuilderStats()
 
         self.assertFalse(is_transaction_in_progress())
-        self.assertEqual(8, self.stats_client.gauge.call_count)
-        for call in self.stats_client.mock.gauge.call_args_list:
-            self.assertIn('386', call[0][0])
+        expected_gauges = [
+            'builders.failure_count,builder_name=%s,env=test' % builder.name,
+            ]
+        expected_gauges.extend([
+            'builders,arch=386,env=test,status=%s,virtualized=True' % status
+            for status in ('building', 'cleaning', 'disabled', 'idle')
+            ])
+        self.assertThat(
+            [call[0][0] for call in self.stats_client.gauge.call_args_list],
+            MatchesSetwise(*(
+                Equals(gauge_name) for gauge_name in expected_gauges)))
 
     def test_multiple_processor_counts(self):
-        builder = self.factory.makeBuilder(
-            processors=[getUtility(IProcessorSet).getByName('amd64')])
-        builder.setCleanStatus(BuilderCleanStatus.CLEAN)
+        builders = [
+            self.factory.makeBuilder(
+                processors=[
+                    getUtility(IProcessorSet).getByName(processor_name)],
+                virtualized=virtualized)
+            for processor_name, virtualized in (
+                ('386', True),
+                ('386', False),
+                ('amd64', True),
+                ('amd64', False),
+                )]
+        for builder in builders:
+            builder.setCleanStatus(BuilderCleanStatus.CLEAN)
         self.patch(BuilderSlave, 'makeBuilderSlave', FakeMethod(OkSlave()))
         transaction.commit()
         clock = task.Clock()
@@ -77,31 +100,46 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory):
         manager.updateBuilderStats()
 
         self.assertFalse(is_transaction_in_progress())
-        self.assertEqual(12, self.stats_client.gauge.call_count)
-        i386_calls = [c for c in self.stats_client.gauge.call_args_list
-                      if '386' in c[0][0]]
-        amd64_calls = [c for c in self.stats_client.gauge.call_args_list
-                       if 'amd64' in c[0][0]]
-        self.assertEqual(8, len(i386_calls))
-        self.assertEqual(4, len(amd64_calls))
+        expected_gauges = [
+            'builders.failure_count,builder_name=%s,env=test' % builder.name
+            for builder in builders
+            ]
+        expected_gauges.extend([
+            'builders,arch=%s,env=test,status=%s,virtualized=%s' % (
+                arch, status, virtualized)
+            for arch in ('386', 'amd64')
+            for virtualized in (True, False)
+            for status in ('building', 'cleaning', 'disabled', 'idle')
+            ])
+        self.assertThat(
+            [call[0][0] for call in self.stats_client.gauge.call_args_list],
+            MatchesSetwise(*(
+                Equals(gauge_name) for gauge_name in expected_gauges)))
 
     def test_correct_values_counts(self):
-        for _ in range(3):
-            cleaning_builder = self.factory.makeBuilder(
+        cleaning_builders = [
+            self.factory.makeBuilder(
                 processors=[getUtility(IProcessorSet).getByName('amd64')])
+            for _ in range(3)]
+        for cleaning_builder in cleaning_builders:
+            cleaning_builder.gotFailure()
             cleaning_builder.setCleanStatus(BuilderCleanStatus.CLEANING)
-        for _ in range(4):
-            idle_builder = self.factory.makeBuilder(
+        idle_builders = [
+            self.factory.makeBuilder(
                 processors=[getUtility(IProcessorSet).getByName('amd64')])
+            for _ in range(4)]
+        for idle_builder in idle_builders:
             idle_builder.setCleanStatus(BuilderCleanStatus.CLEAN)
             old_build = self.factory.makeSnapBuild()
             old_build.queueBuild()
             old_build.buildqueue_record.markAsBuilding(builder=idle_builder)
             old_build.buildqueue_record.destroySelf()
-        builds = []
-        for _ in range(2):
-            building_builder = self.factory.makeBuilder(
+        building_builders = [
+            self.factory.makeBuilder(
                 processors=[getUtility(IProcessorSet).getByName('amd64')])
+            for _ in range(2)]
+        builds = []
+        for building_builder in building_builders:
             building_builder.setCleanStatus(BuilderCleanStatus.CLEAN)
             build = self.factory.makeSnapBuild()
             build.queueBuild()
@@ -125,24 +163,29 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory):
         manager.updateBuilderStats()
 
         self.assertFalse(is_transaction_in_progress())
-        self.assertEqual(12, self.stats_client.gauge.call_count)
-        calls = [c[0] for c in self.stats_client.gauge.call_args_list
-                 if 'amd64' in c[0][0]]
+        expected_gauges = {
+            'builders.failure_count,builder_name=%s,env=test' % builder.name: 1
+            for builder in cleaning_builders
+            }
+        expected_gauges.update({
+            'builders.failure_count,builder_name=%s,env=test' % builder.name: 0
+            for builder in idle_builders + building_builders
+            })
+        expected_gauges.update({
+            'builders,arch=amd64,env=test,status=%s,'
+            'virtualized=True' % status: count
+            for status, count in (
+                ('building', 2),
+                ('cleaning', 3),
+                ('disabled', 0),
+                ('idle', 4),
+                )
+            })
         self.assertThat(
-            calls, MatchesSetwise(
-                Equals((
-                    'builders,arch=amd64,env=test,status=disabled,'
-                    'virtualized=True', 0)),
-                Equals((
-                    'builders,arch=amd64,env=test,status=building,'
-                    'virtualized=True', 2)),
-                Equals((
-                    'builders,arch=amd64,env=test,status=idle,'
-                    'virtualized=True', 4)),
-                Equals((
-                    'builders,arch=amd64,env=test,status=cleaning,'
-                    'virtualized=True', 3))
-                ))
+            [call[0] for call in self.stats_client.gauge.call_args_list],
+            MatchesSetwise(*(
+                Equals((gauge_name, count))
+                for gauge_name, count in expected_gauges.items())))
 
     def test_updateBuilderStats_error(self):
         clock = task.Clock()