← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] ~cjwatson/launchpad:disabled-cronscript-metrics into launchpad:master

 

Colin Watson has proposed merging ~cjwatson/launchpad:disabled-cronscript-metrics into launchpad:master.

Commit message:
Emit statsd metric when a cron script is disabled

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~cjwatson/launchpad/+git/launchpad/+merge/423644

We sometimes disable cron scripts for certain kinds of deployments, in particular schema updates.  This can cause spurious alerts, because the usual timing metrics we emit when scripts complete aren't emitted for the duration of the deployment; this deters us from having appropriately-sensitive alerts, and means we have to micromanage the timing of certain deployments unnecessarily.

To avoid this problem, emit a zero-duration timing metric when a cron script exits early due to being disabled.  The trade-off here is that scripts will take somewhat longer to exit in this situation since they need to load the Zope component architecture first, but that's not a serious problem.
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of ~cjwatson/launchpad:disabled-cronscript-metrics into launchpad:master.
diff --git a/lib/lp/services/scripts/base.py b/lib/lp/services/scripts/base.py
index 62dc799..e7acbdb 100644
--- a/lib/lp/services/scripts/base.py
+++ b/lib/lp/services/scripts/base.py
@@ -47,6 +47,7 @@ from lp.services.features import (
 from lp.services.mail.sendmail import set_immediate_mail_delivery
 from lp.services.scripts.interfaces.scriptactivity import IScriptActivitySet
 from lp.services.scripts.logger import OopsHandler
+from lp.services.scripts.metrics import emit_script_activity_metric
 from lp.services.timeout import (
     override_timeout,
     urlfetch,
@@ -193,12 +194,12 @@ class LaunchpadScript:
 
         if logger is None:
             scripts.logger_options(self.parser, default=self.loglevel)
-            self.parser.add_option(
-                '--profile', dest='profile', metavar='FILE', help=(
-                        "Run the script under the profiler and save the "
-                        "profiling stats in FILE."))
         else:
             scripts.dummy_logger_options(self.parser)
+        self.parser.add_option(
+            '--profile', dest='profile', metavar='FILE', help=(
+                    "Run the script under the profiler and save the "
+                    "profiling stats in FILE."))
 
         self.add_my_options()
         self.options, self.args = self.parser.parse_args(args=test_args)
@@ -384,14 +385,7 @@ class LaunchpadCronScript(LaunchpadScript):
                  ignore_cron_control=False):
         super().__init__(name, dbuser, test_args=test_args, logger=logger)
 
-        # self.name is used instead of the name argument, since it may have
-        # have been overridden by command-line parameters or by
-        # overriding the name property.
-        if not ignore_cron_control:
-            enabled = cronscript_enabled(
-                config.canonical.cron_control_url, self.name, self.logger)
-            if not enabled:
-                sys.exit(0)
+        self.ignore_cron_control = ignore_cron_control
 
         # Configure the IErrorReportingUtility we use with defaults.
         # Scripts can override this if they want.
@@ -404,6 +398,27 @@ class LaunchpadCronScript(LaunchpadScript):
         oops_hdlr = OopsHandler(self.name, logger=self.logger)
         logging.getLogger().addHandler(oops_hdlr)
 
+    def _init_db(self, isolation):
+        # This runs a bit late: we initialize the whole Zope component
+        # architecture before getting here, which is slow.  However, doing
+        # this allows us to emit a script activity metric via the
+        # IStatsdClient utility, and if cron scripts are disabled then we
+        # still exit before anything important like database access happens.
+        if not self.ignore_cron_control:
+            enabled = cronscript_enabled(
+                config.canonical.cron_control_url, self.name, self.logger)
+            if not enabled:
+                # Emit a basic script activity metric so that alerts don't
+                # fire while scripts are intentionally disabled (e.g. during
+                # schema updates).  We set the duration to 0 so that these
+                # can be distinguished from real completions.  Avoid
+                # touching the database here, since that could be
+                # problematic during schema updates.
+                emit_script_activity_metric(self.name, datetime.timedelta(0))
+                sys.exit(0)
+
+        super()._init_db(isolation)
+
     @log_unhandled_exception_and_exit
     def record_activity(self, date_started, date_completed):
         """Record the successful completion of the script."""
diff --git a/lib/lp/services/scripts/doc/script-monitoring.txt b/lib/lp/services/scripts/doc/script-monitoring.txt
index f2765ab..5896dc2 100644
--- a/lib/lp/services/scripts/doc/script-monitoring.txt
+++ b/lib/lp/services/scripts/doc/script-monitoring.txt
@@ -18,16 +18,12 @@ database.  This is performed with a call to
 IScriptActivitySet.recordSuccess():
 
     >>> import datetime
-    >>> import os
     >>> import socket
-    >>> import subprocess
-    >>> import tempfile
     >>> from textwrap import dedent
     >>> from unittest import mock
 
     >>> from fixtures import MockPatchObject
     >>> import pytz
-    >>> import transaction
     >>> from zope.component import getUtility
 
     >>> from lp.services.config import config
@@ -100,75 +96,3 @@ script ran on, as determined by 'socket.gethostname()':
     ...     date_completed=datetime.datetime.now(UTC))
     >>> local_activity.hostname == socket.gethostname()
     True
-
-
-LaunchpadScript Integration
----------------------------
-
-A LaunchpadScript subclass is provided that will automatically log the
-result of successful runs.  This is intended for use by cron scripts
-and others where it is useful to monitor the result.
-
-    >>> script_file = tempfile.NamedTemporaryFile()
-    >>> _ = script_file.write(b"""
-    ... from lp.services.scripts.base import LaunchpadCronScript
-    ...
-    ... class TestScript(LaunchpadCronScript):
-    ...     def main(self):
-    ...         # Fail if we are told to do so
-    ...         if self.args[0] == 'fail':
-    ...             raise RuntimeError('Some failure')
-    ...
-    ... if __name__ == '__main__':
-    ...     script = TestScript('test-script', 'garbo_daily')
-    ...     script.run()
-    ... """)
-    >>> script_file.flush()
-    >>> transaction.commit()
-
-Prepare an environment to run the testing script.
-
-    >>> import lp
-    >>> lp_py = os.path.join(
-    ...     os.path.dirname(lp.__file__), os.pardir, os.pardir, 'bin/py')
-
-We'll now run this script, telling it to fail:
-
-    >>> proc = subprocess.Popen([lp_py, script_file.name, 'fail'],
-    ...                         stdin=subprocess.PIPE, stdout=subprocess.PIPE,
-    ...                         stderr=subprocess.PIPE)
-    >>> (out, err) = proc.communicate()
-    >>> transaction.abort()
-
-The process failed:
-
-    >>> print(proc.returncode)
-    1
-
-And no activity got recorded:
-
-    >>> print(getUtility(IScriptActivitySet).getLastActivity('test-script'))
-    None
-
-If we run it such that it succeeds, we will get an activity record:
-
-    >>> proc = subprocess.Popen(
-    ...     [lp_py, script_file.name, 'pass'], stdin=subprocess.PIPE,
-    ...     stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    >>> (out, err) = proc.communicate()
-    >>> transaction.abort()
-
-    >>> print(proc.returncode)
-    0
-    >>> activity = getUtility(IScriptActivitySet).getLastActivity(
-    ...     'test-script')
-    >>> activity is not None
-    True
-    >>> print(activity.name)
-    test-script
-    >>> activity.hostname == socket.gethostname()
-    True
-
-Remove the temporary script file:
-
-    >>> script_file.close()
diff --git a/lib/lp/services/scripts/metrics.py b/lib/lp/services/scripts/metrics.py
new file mode 100644
index 0000000..2113aba
--- /dev/null
+++ b/lib/lp/services/scripts/metrics.py
@@ -0,0 +1,24 @@
+# Copyright 2022 Canonical Ltd.  This software is licensed under the
+# GNU Affero General Public License version 3 (see the file LICENSE).
+
+"""Metrics for Launchpad scripts."""
+
+__all__ = [
+    "emit_script_activity_metric",
+    ]
+
+from datetime import timedelta
+
+from zope.component import getUtility
+
+from lp.services.statsd.interfaces.statsd_client import IStatsdClient
+
+
+def emit_script_activity_metric(name: str, duration: timedelta):
+    """Inform statsd about a script completing."""
+    # Don't bother with labels for hostname information, since telegraf adds
+    # that sort of thing.
+    getUtility(IStatsdClient).timing(
+        "script_activity",
+        duration.total_seconds() * 1000,
+        labels={"name": name})
diff --git a/lib/lp/services/scripts/model/scriptactivity.py b/lib/lp/services/scripts/model/scriptactivity.py
index 728252c..8269d63 100644
--- a/lib/lp/services/scripts/model/scriptactivity.py
+++ b/lib/lp/services/scripts/model/scriptactivity.py
@@ -15,7 +15,6 @@ from storm.locals import (
     Int,
     Unicode,
     )
-from zope.component import getUtility
 from zope.interface import implementer
 
 from lp.services.database.interfaces import IStore
@@ -24,7 +23,7 @@ from lp.services.scripts.interfaces.scriptactivity import (
     IScriptActivity,
     IScriptActivitySet,
     )
-from lp.services.statsd.interfaces.statsd_client import IStatsdClient
+from lp.services.scripts.metrics import emit_script_activity_metric
 
 
 @implementer(IScriptActivity)
@@ -58,12 +57,8 @@ class ScriptActivitySet:
             name=six.ensure_text(name), hostname=six.ensure_text(hostname),
             date_started=date_started, date_completed=date_completed)
         IStore(ScriptActivity).add(activity)
-        # Pass equivalent information through to statsd as well.  (Don't
-        # bother with the hostname, since telegraf adds that.)
-        getUtility(IStatsdClient).timing(
-            'script_activity',
-            (date_completed - date_started).total_seconds() * 1000,
-            labels={'name': name})
+        # Pass equivalent information through to statsd as well.
+        emit_script_activity_metric(name, date_completed - date_started)
         return activity
 
     def getLastActivity(self, name):
diff --git a/lib/lp/services/scripts/tests/test_base.py b/lib/lp/services/scripts/tests/test_base.py
new file mode 100644
index 0000000..2eeb84d
--- /dev/null
+++ b/lib/lp/services/scripts/tests/test_base.py
@@ -0,0 +1,92 @@
+# Copyright 2022 Canonical Ltd.  This software is licensed under the
+# GNU Affero General Public License version 3 (see the file LICENSE).
+
+"""Tests for lp.services.scripts.base."""
+
+import os.path
+import socket
+from unittest import mock
+
+from testtools.matchers import MatchesStructure
+from testtools.testcase import ExpectedException
+from zope.component import getUtility
+
+from lp.services.log.logger import DevNullLogger
+from lp.services.scripts.base import LaunchpadCronScript
+from lp.services.scripts.interfaces.scriptactivity import IScriptActivitySet
+from lp.services.statsd.tests import StatsMixin
+from lp.testing import TestCase
+from lp.testing.layers import ZopelessDatabaseLayer
+
+
+class TestScript(LaunchpadCronScript):
+
+    def _init_zca(self, use_web_security):
+        # Already done by test layer.
+        pass
+
+    def main(self):
+        # Fail if we are told to do so.
+        if self.args[0] == "fail":
+            raise RuntimeError("Some failure")
+
+
+class TestLaunchpadCronScript(StatsMixin, TestCase):
+    """Test cron script integration.
+
+    `LaunchpadCronScript` is a `LaunchpadScript` subclass that automatically
+    logs the result of successful runs.  This is intended for use by cron
+    scripts and others where it is useful to monitor the result.
+    """
+
+    layer = ZopelessDatabaseLayer
+
+    def setCronControlConfig(self, body):
+        tempdir = self.makeTemporaryDirectory()
+        config_path = os.path.join(tempdir, "cron-control.ini")
+        with open(config_path, "w") as config_file:
+            config_file.write(body)
+        self.pushConfig(
+            "canonical", cron_control_url="file://%s" % config_path)
+
+    def test_cronscript_disabled(self):
+        # If scripts are centrally disabled, there is no activity record in
+        # the database but there is an activity metric in statsd.
+        self.setCronControlConfig("[DEFAULT]\nenabled: False\n")
+        self.setUpStats()
+        script = TestScript(
+            "script-name", test_args=["fail"], logger=DevNullLogger())
+        with ExpectedException(
+                SystemExit, MatchesStructure.byEquality(code=0)):
+            script.run()
+        self.assertIsNone(
+            getUtility(IScriptActivitySet).getLastActivity("script-name"))
+        self.stats_client.timing.assert_called_once_with(
+            "script_activity,env=test,name=script-name", 0.0)
+
+    def test_script_fails(self):
+        # If the script fails, there is no activity record in the database
+        # and no activity metric in statsd.
+        self.setUpStats()
+        script = TestScript(
+            "script-name", test_args=["fail"], logger=DevNullLogger())
+        with ExpectedException(
+                SystemExit, MatchesStructure.byEquality(code=1)):
+            script.run()
+        self.assertIsNone(
+            getUtility(IScriptActivitySet).getLastActivity("script-name"))
+        self.stats_client.timing.assert_not_called()
+
+    def test_script_succeeds(self):
+        # If the script succeeds, there is an activity record in the
+        # database and an activity metric in statsd.
+        self.setUpStats()
+        script = TestScript(
+            "script-name", test_args=["pass"], logger=DevNullLogger())
+        script.run()
+        self.assertThat(
+            getUtility(IScriptActivitySet).getLastActivity("script-name"),
+            MatchesStructure.byEquality(
+                name="script-name", hostname=socket.gethostname()))
+        self.stats_client.timing.assert_called_once_with(
+            "script_activity,env=test,name=script-name", mock.ANY)