launchpad-reviewers team mailing list archive

Thread
Date

[Merge] ~twom/launchpad:stats-better-builder-stats-stacking into launchpad:master

To: mp+392892@xxxxxxxxxxxxxxxxxx
From: Tom Wardill <tom.wardill@xxxxxxxxxxxxx>
Date: Tue, 27 Oct 2020 16:02:22 -0000
Reply-to: mp+392892@xxxxxxxxxxxxxxxxxx
Sender: bounces@xxxxxxxxxxxxx

Tom Wardill has proposed merging ~twom/launchpad:stats-better-builder-stats-stacking into launchpad:master.

Commit message:
Move to use statsd environment config variable

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~twom/launchpad/+git/launchpad/+merge/392892

The prefix setting in statsd doesn't let us do the final queries in quite the way we wanted, so add another setting for it and apply it as a label in all the metrics.

Also increase number cruncher logging level to DEBUG to see if we can work out why it appears to stop from the logs.
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of ~twom/launchpad:stats-better-builder-stats-stacking into launchpad:master.

diff --git a/lib/lp/buildmaster/manager.py b/lib/lp/buildmaster/manager.py
index 7bbe4aa..e9732ea 100644
--- a/lib/lp/buildmaster/manager.py
+++ b/lib/lp/buildmaster/manager.py
@@ -532,10 +532,13 @@ class SlaveScanner:
             if builder.current_build is not None:
                 builder.current_build.gotFailure()
                 self.statsd_client.incr(
-                    'builders.judged_failed,build=True,arch={}'.format(
-                        builder.current_build.processor.name))
+                    'builders.judged_failed,build=True,arch={},env={}'.format(
+                        builder.current_build.processor.name,
+                        self.statsd_client.lp_environment))
             else:
-                self.statsd_client.incr('builders.judged_failed,build=False')
+                self.statsd_client.incr(
+                    'builders.judged_failed,build=False,env={}'.format(
+                        self.statsd_client.lp_environment))
             recover_failure(self.logger, vitals, builder, retry, failure.value)
             transaction.commit()
         except Exception:
diff --git a/lib/lp/buildmaster/model/buildfarmjobbehaviour.py b/lib/lp/buildmaster/model/buildfarmjobbehaviour.py
index edff53b..88ede6a 100644
--- a/lib/lp/buildmaster/model/buildfarmjobbehaviour.py
+++ b/lib/lp/buildmaster/model/buildfarmjobbehaviour.py
@@ -156,10 +156,12 @@ class BuildFarmJobBehaviourBase:
         # Update stats
         job_type = getattr(self.build, 'job_type', None)
         job_type_name = job_type.name if job_type else 'UNKNOWN'
-        getUtility(IStatsdClient).incr(
-            'build.count,job_type={},builder_name={}'.format(
+        statsd_client = getUtility(IStatsdClient)
+        statsd_client.incr(
+            'build.count,job_type={},builder_name={},env={}'.format(
                 job_type_name,
-                self._builder.name))
+                self._builder.name,
+                statsd_client.lp_environment))
 
         logger.info(
             "Job %s (%s) started on %s: %s %s"
diff --git a/lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py b/lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py
index 3bdf53d..06b0e3f 100644
--- a/lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py
+++ b/lib/lp/buildmaster/tests/test_buildfarmjobbehaviour.py
@@ -275,7 +275,8 @@ class TestDispatchBuildToSlave(StatsMixin, TestCase):
         self.assertEqual(1, self.stats_client.incr.call_count)
         self.assertEqual(
             self.stats_client.incr.call_args_list[0][0],
-            ('build.count,job_type=UNKNOWN,builder_name=mock-builder',))
+            ('build.count,job_type=UNKNOWN,'
+             'builder_name=mock-builder,env=test',))
 
 
 class TestGetUploadMethodsMixin:
diff --git a/lib/lp/services/config/schema-lazr.conf b/lib/lp/services/config/schema-lazr.conf
index c16270a..aaa9d30 100644
--- a/lib/lp/services/config/schema-lazr.conf
+++ b/lib/lp/services/config/schema-lazr.conf
@@ -2028,3 +2028,4 @@ concurrency: 1
 host: none
 port: none
 prefix: none
+environment: none
diff --git a/lib/lp/services/statsd/model/statsd_client.py b/lib/lp/services/statsd/model/statsd_client.py
index 868af30..6ccf027 100644
--- a/lib/lp/services/statsd/model/statsd_client.py
+++ b/lib/lp/services/statsd/model/statsd_client.py
@@ -43,6 +43,12 @@ class StatsdClient:
         self._make_client()
 
     def __getattr__(self, name):
+        # This is a convenience to keep all statsd client related
+        # items on the client. Having this separate from prefix
+        # allows us to use it as a label, rather than as part of the
+        # gauge name
+        if name == 'lp_environment':
+            return config.statsd.environment
         if self._client is not None:
             return getattr(self._client, name)
         else:
diff --git a/lib/lp/services/statsd/numbercruncher.py b/lib/lp/services/statsd/numbercruncher.py
index d2301d0..0a0552c 100644
--- a/lib/lp/services/statsd/numbercruncher.py
+++ b/lib/lp/services/statsd/numbercruncher.py
@@ -47,7 +47,7 @@ class NumberCruncher(service.Service):
     def _setupLogger(self):
         """Set up a 'number-cruncher' logger that redirects to twisted.
         """
-        level = logging.INFO
+        level = logging.DEBUG
         logger = logging.getLogger(NUMBER_CRUNCHER_LOG_NAME)
         logger.propagate = False
 
@@ -74,8 +74,8 @@ class NumberCruncher(service.Service):
         for queue_type, contents in queue_details.items():
             virt = queue_type == 'virt'
             for arch, value in contents.items():
-                gauge_name = "buildqueue,virtualized={},arch={}".format(
-                    virt, arch)
+                gauge_name = "buildqueue,virtualized={},arch={},env={}".format(
+                    virt, arch, self.statsd_client.lp_environment)
                 self.logger.debug("{}: {}".format(gauge_name, value[0]))
                 self.statsd_client.gauge(gauge_name, value[0])
         self.logger.debug("Build queue stats update complete.")
@@ -107,8 +107,8 @@ class NumberCruncher(service.Service):
                     counts['idle'] += 1
         for processor, counts in counts_by_processor.items():
             for count_name, count_value in counts.items():
-                gauge_name = "builders.{},arch={}".format(
-                    count_name, processor)
+                gauge_name = "builders,status={},arch={},env={}".format(
+                    count_name, processor, self.statsd_client.lp_environment)
                 self.logger.debug("{}: {}".format(gauge_name, count_value))
                 self.statsd_client.gauge(gauge_name, count_value)
         self.logger.debug("Builder stats update complete.")
diff --git a/lib/lp/services/statsd/tests/__init__.py b/lib/lp/services/statsd/tests/__init__.py
index 20885a4..23fb067 100644
--- a/lib/lp/services/statsd/tests/__init__.py
+++ b/lib/lp/services/statsd/tests/__init__.py
@@ -19,5 +19,6 @@ class StatsMixin:
         # Install a mock statsd client so we can assert against the call
         # counts and args.
         self.stats_client = mock.Mock()
+        self.stats_client.lp_environment = "test"
         self.useFixture(
             ZopeUtilityFixture(self.stats_client, IStatsdClient))
diff --git a/lib/lp/services/statsd/tests/test_numbercruncher.py b/lib/lp/services/statsd/tests/test_numbercruncher.py
index 47c633f..c5a4ff6 100644
--- a/lib/lp/services/statsd/tests/test_numbercruncher.py
+++ b/lib/lp/services/statsd/tests/test_numbercruncher.py
@@ -85,10 +85,18 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory):
                  if 'amd64' in c[0][0]]
         self.assertThat(
             calls, MatchesListwise(
-                [Equals(('builders.disabled,arch=amd64,virtualized=True', 0)),
-                 Equals(('builders.building,arch=amd64,virtualized=True', 0)),
-                 Equals(('builders.idle,arch=amd64,virtualized=True', 0)),
-                 Equals(('builders.cleaning,arch=amd64,virtualized=True', 1))
+                [Equals((
+                    'builders,status=disabled,arch=amd64,'
+                    'virtualized=True,env=test', 0)),
+                 Equals((
+                     'builders,status=building,arch=amd64,'
+                     'virtualized=True,env=test', 0)),
+                 Equals((
+                     'builders,status=idle,arch=amd64,'
+                     'virtualized=True,env=test', 0)),
+                 Equals((
+                     'builders,status=cleaning,arch=amd64,'
+                     'virtualized=True,env=test', 1))
                  ]))
 
     def test_updateBuilderQueues(self):
@@ -109,9 +117,9 @@ class TestNumberCruncher(StatsMixin, TestCaseWithFactory):
         self.assertThat(
             [x[0] for x in self.stats_client.gauge.call_args_list],
             MatchesListwise(
-                [Equals(('buildqueue,virtualized=True,arch={}'.format(
+                [Equals(('buildqueue,virtualized=True,arch={},env=test'.format(
                     build.processor.name), 1)),
-                 Equals(('buildqueue,virtualized=False,arch=386', 1))
+                 Equals(('buildqueue,virtualized=False,arch=386,env=test', 1))
                  ]))
 
     def test_startService_starts_update_queues_loop(self):
diff --git a/lib/lp/services/statsd/tests/test_statsd_client.py b/lib/lp/services/statsd/tests/test_statsd_client.py
index 6852af7..2e1a8e1 100644
--- a/lib/lp/services/statsd/tests/test_statsd_client.py
+++ b/lib/lp/services/statsd/tests/test_statsd_client.py
@@ -26,7 +26,8 @@ class TestClientConfiguration(TestCase):
         self.addCleanup(client.reload)
         config.push(
             'statsd_test',
-            "[statsd]\nhost: 127.0.01\nport: 9999\nprefix: test\n")
+            "[statsd]\nhost: 127.0.01\n"
+            "port: 9999\nprefix: test\nenvironment: test\n")
         client.reload()
         self.assertIsInstance(client._client, StatsClient)
 
@@ -45,6 +46,17 @@ class TestClientConfiguration(TestCase):
         self.addCleanup(client.reload)
         config.push(
             'statsd_test',
-            "[statsd]\nhost: 127.0.01\nport: 9999\nprefix: test\n")
+            "[statsd]\nhost: 127.0.01\n"
+            "port: 9999\nprefix: test\nenvironment: test\n")
         client.reload()
         self.assertIsInstance(client._client, StatsClient)
+
+    def test_lp_environment_is_available(self):
+        client = getUtility(IStatsdClient)
+        self.addCleanup(client.reload)
+        config.push(
+            'statsd_test',
+            "[statsd]\nhost: 127.0.01\n"
+            "port: 9999\nprefix: test\nenvironment: test\n")
+        client.reload()
+        self.assertEqual(client.lp_environment, "test")
diff --git a/lib/lp/services/webapp/publication.py b/lib/lp/services/webapp/publication.py
index 7325d9f..9a9a0e3 100644
--- a/lib/lp/services/webapp/publication.py
+++ b/lib/lp/services/webapp/publication.py
@@ -795,13 +795,16 @@ class LaunchpadBrowserPublication(
                 status = request.response.getStatus()
                 if status == 404:  # Not Found
                     OpStats.stats['404s'] += 1
-                    statsd_client.incr('errors.404')
+                    statsd_client.incr('errors.404,env={}'.format(
+                        statsd_client.lp_environment))
                 elif status == 500:  # Unhandled exceptions
                     OpStats.stats['500s'] += 1
-                    statsd_client.incr('errors.500')
+                    statsd_client.incr('errors.500,env={}'.format(
+                        statsd_client.lp_environment))
                 elif status == 503:  # Timeouts
                     OpStats.stats['503s'] += 1
-                    statsd_client.incr('errors.503')
+                    statsd_client.incr('errors.503,env={}'.format(
+                        statsd_client.lp_environment))
 
                 # Increment counters for status code groups.
                 status_group = str(status)[0] + 'XXs'
@@ -810,7 +813,8 @@ class LaunchpadBrowserPublication(
                 # Increment counter for 5XXs_b.
                 if is_browser(request) and status_group == '5XXs':
                     OpStats.stats['5XXs_b'] += 1
-                    statsd_client.incr('errors.5XX')
+                    statsd_client.incr('errors.5XX,env={}'.format(
+                        statsd_client.lp_environment))
 
         # Make sure our databases are in a sane state for the next request.
         thread_name = threading.current_thread().name