← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] ~cjwatson/launchpad:statsd-for-opstats into launchpad:master

 

Colin Watson has proposed merging ~cjwatson/launchpad:statsd-for-opstats into launchpad:master with ~cjwatson/launchpad:rearrange-opstats-timeout as a prerequisite.

Commit message:
Emit statsd metrics for all remaining opstats

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)
Related bugs:
  Bug #636804 in Launchpad itself: "request expired being raised does not increased the opstats timeout count"
  https://bugs.launchpad.net/launchpad/+bug/636804

For more details, see:
https://code.launchpad.net/~cjwatson/launchpad/+git/launchpad/+merge/403883

The old +opstats endpoint needs to go away: its approach isn't very gunicorn-friendly, and it's close to being entirely superseded by statsd.  Emit statsd metrics corresponding to all the remaining opstats metrics, which will allow us to remove +opstats once we stop querying it.
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of ~cjwatson/launchpad:statsd-for-opstats into launchpad:master.
diff --git a/lib/lp/app/stories/basics/xx-opstats.txt b/lib/lp/app/stories/basics/xx-opstats.txt
index beb5d93..558131b 100644
--- a/lib/lp/app/stories/basics/xx-opstats.txt
+++ b/lib/lp/app/stories/basics/xx-opstats.txt
@@ -11,11 +11,29 @@ We can access them via XML-RPC:
     ...     transport=XMLRPCTestTransport()
     ...     )
 
+We also emit similar metrics to statsd, so set that up.
+
+    >>> from textwrap import dedent
+    >>> from fixtures import MockPatchObject
+    >>> from zope.component import getUtility
+    >>> from lp.services.compat import mock
+    >>> from lp.services.config import config
+    >>> from lp.services.statsd.interfaces.statsd_client import IStatsdClient
+
+    >>> config.push('statsd_test', dedent('''
+    ...     [statsd]
+    ...     environment: test
+    ...     '''))
+    >>> statsd_client = getUtility(IStatsdClient)
+    >>> stats_client = mock.Mock()
+
 Create a function to report our stats for these tests.
 
+    >>> from collections import Counter
     >>> def reset():
     ...     from lp.services.webapp.opstats import OpStats
     ...     OpStats.resetStats()
+    ...     stats_client.reset_mock()
     ...
     >>> def report():
     ...     stats = lp_xmlrpc.opstats()
@@ -23,8 +41,11 @@ Create a function to report our stats for these tests.
     ...         value = stats[stat_key]
     ...         if value > 0:
     ...             print("%s: %d" % (stat_key, value))
+    ...     for statsd_key, value in sorted(Counter(
+    ...             call[0][0]
+    ...             for call in stats_client.incr.call_args_list).items()):
+    ...         print("statsd: %s: %d" % (statsd_key, value))
     ...     reset()
-    ...
 
 Number of requests and XML-RPC requests
 ---------------------------------------
@@ -35,7 +56,8 @@ the 'requests' and 'xml-rpc requests' statistics, but this will not be
 visible until the next time we access the statistics as the statistics
 are adjusted after the request has been served:
 
-    >>> stats = lp_xmlrpc.opstats()
+    >>> with MockPatchObject(statsd_client, '_client', stats_client):
+    ...     stats = lp_xmlrpc.opstats()
     >>> for key in sorted(stats.keys()):
     ...     # Print all so new keys added to OpStats.stats will trigger
     ...     # failures in this test prompting developers to extend it.
@@ -60,17 +82,23 @@ are adjusted after the request has been served:
     >>> report()
     requests: 1
     xml-rpc requests: 1
+    statsd: requests.all,env=test: 1
+    statsd: requests.xmlrpc,env=test: 1
 
 Number of HTTP requests and success codes
 -----------------------------------------
 
-    >>> output = http("GET / HTTP/1.1\nHost: bugs.launchpad.test\n")
+    >>> with MockPatchObject(statsd_client, '_client', stats_client):
+    ...     output = http("GET / HTTP/1.1\nHost: bugs.launchpad.test\n")
     >>> output.getStatus()
     200
     >>> report()
     2XXs: 1
     http requests: 1
     requests: 1
+    statsd: errors.2XX,env=test: 1
+    statsd: requests.all,env=test: 1
+    statsd: requests.http,env=test: 1
 
 Number of 404s
 --------------
@@ -80,7 +108,8 @@ database policy, this request first uses the slave DB. The requested
 information is not found in there, so a retry is attempted against the
 master DB in case the information is missing due to replication lag.
 
-    >>> output = http("GET http://launchpad.test/non-existant HTTP/1.1\n")
+    >>> with MockPatchObject(statsd_client, '_client', stats_client):
+    ...     output = http("GET http://launchpad.test/non-existant HTTP/1.1\n")
     >>> output.getStatus()
     404
     >>> report()
@@ -89,6 +118,11 @@ master DB in case the information is missing due to replication lag.
     http requests: 1
     requests: 1
     retries: 1
+    statsd: errors.404,env=test: 1
+    statsd: errors.4XX,env=test: 1
+    statsd: requests.all,env=test: 1
+    statsd: requests.http,env=test: 1
+    statsd: requests.retries,env=test: 1
 
 Number of 500 Internal Server Errors (unhandled exceptions)
 -----------------------------------------------------------
@@ -110,7 +144,8 @@ particular need to differentiate these cases though:
     >>> error_view_fixture = ZopeAdapterFixture(
     ...     ErrorView, (None, IDefaultBrowserLayer), Interface, "error-test")
     >>> error_view_fixture.setUp()
-    >>> output = http("GET /error-test HTTP/1.1\nHost: launchpad.test\n")
+    >>> with MockPatchObject(statsd_client, '_client', stats_client):
+    ...     output = http("GET /error-test HTTP/1.1\nHost: launchpad.test\n")
     >>> output.getStatus()
     500
     >>> report()
@@ -118,18 +153,22 @@ particular need to differentiate these cases though:
     5XXs: 1
     http requests: 1
     requests: 1
+    statsd: errors.500,env=test: 1
+    statsd: errors.5XX,env=test: 1
+    statsd: requests.all,env=test: 1
+    statsd: requests.http,env=test: 1
 
 We also have a special metric counting server errors returned to known
 web browsers (5XXs_b) - in the production environment we care more
 about errors returned to people than robots crawling obscure parts of
 the site.
 
-    >>> from textwrap import dedent
-    >>> output = http(dedent("""\
-    ...     GET /error-test HTTP/1.1
-    ...     Host: launchpad.test
-    ...     User-Agent: Mozilla/42.0
-    ...     """))
+    >>> with MockPatchObject(statsd_client, '_client', stats_client):
+    ...     output = http(dedent("""\
+    ...         GET /error-test HTTP/1.1
+    ...         Host: launchpad.test
+    ...         User-Agent: Mozilla/42.0
+    ...         """))
     >>> output.getStatus()
     500
     >>> report()
@@ -138,36 +177,45 @@ the site.
     5XXs_b: 1
     http requests: 1
     requests: 1
+    statsd: errors.500,env=test: 1
+    statsd: errors.5XX,env=test: 1
+    statsd: errors.5XX.browser,env=test: 1
+    statsd: requests.all,env=test: 1
+    statsd: requests.http,env=test: 1
 
     >>> error_view_fixture.cleanUp()
 
 Number of XML-RPC Faults
 ------------------------
 
-    >>> try:
-    ...     opstats = lp_xmlrpc.invalid() # XXX: Need a HTTP test too
-    ...     print('Should have raised a Fault exception!')
-    ... except xmlrpc_client.Fault:
-    ...     pass
+    >>> with MockPatchObject(statsd_client, '_client', stats_client):
+    ...     try:
+    ...         opstats = lp_xmlrpc.invalid() # XXX: Need a HTTP test too
+    ...         print('Should have raised a Fault exception!')
+    ...     except xmlrpc_client.Fault:
+    ...         pass
     >>> report()
     requests: 1
     xml-rpc faults: 1
     xml-rpc requests: 1
+    statsd: errors.xmlrpc,env=test: 1
+    statsd: requests.all,env=test: 1
+    statsd: requests.xmlrpc,env=test: 1
 
 
 Number of soft timeouts
 -----------------------
 
-    >>> from lp.services.config import config
     >>> test_data = dedent("""
     ...     [database]
     ...     soft_request_timeout: 1
     ...     """)
     >>> config.push('base_test_data', test_data)
-    >>> output = http(r"""
-    ... GET /+soft-timeout HTTP/1.1
-    ... Authorization: Basic Zm9vLmJhckBjYW5vbmljYWwuY29tOnRlc3Q=
-    ... """)
+    >>> with MockPatchObject(statsd_client, '_client', stats_client):
+    ...     output = http(dedent(r"""
+    ...         GET /+soft-timeout HTTP/1.1
+    ...         Authorization: Basic Zm9vLmJhckBjYW5vbmljYWwuY29tOnRlc3Q=
+    ...         """))
     >>> output.getStatus()
     200
     >>> report()
@@ -175,6 +223,10 @@ Number of soft timeouts
     http requests: 1
     requests: 1
     soft timeouts: 1
+    statsd: errors.2XX,env=test: 1
+    statsd: requests.all,env=test: 1
+    statsd: requests.http,env=test: 1
+    statsd: timeouts.soft,env=test: 1
 
 Number of Timeouts
 ------------------
@@ -189,10 +241,11 @@ also return a timeout Fault:
     ...     soft_request_timeout: 2
     ...     """)
     >>> config.push('test_data', test_data)
-    >>> output = http(r"""
-    ... GET /+soft-timeout HTTP/1.1
-    ... Authorization: Basic Zm9vLmJhckBjYW5vbmljYWwuY29tOnRlc3Q=
-    ... """)
+    >>> with MockPatchObject(statsd_client, '_client', stats_client):
+    ...     output = http(dedent(r"""
+    ...         GET /+soft-timeout HTTP/1.1
+    ...         Authorization: Basic Zm9vLmJhckBjYW5vbmljYWwuY29tOnRlc3Q=
+    ...         """))
     >>> output.getStatus()
     503
     >>> # Reset the timeouts so +opstat doesn't die.
@@ -203,6 +256,11 @@ also return a timeout Fault:
     http requests: 1
     requests: 1
     timeouts: 1
+    statsd: errors.503,env=test: 1
+    statsd: errors.5XX,env=test: 1
+    statsd: requests.all,env=test: 1
+    statsd: requests.http,env=test: 1
+    statsd: timeouts.hard,env=test: 1
 
 
 HTTP access for Cricket
@@ -210,8 +268,9 @@ HTTP access for Cricket
 
 Stats can also be retrieved via HTTP in cricket-graph format:
 
-    >>> output = http("GET / HTTP/1.1\nHost: launchpad.test\n")
-    >>> output = http("GET / HTTP/1.1\nHost: launchpad.test\n")
+    >>> with MockPatchObject(statsd_client, '_client', stats_client):
+    ...     output = http("GET / HTTP/1.1\nHost: launchpad.test\n")
+    ...     output = http("GET / HTTP/1.1\nHost: launchpad.test\n")
     >>> print(http("GET /+opstats HTTP/1.1\nHost: launchpad.test\n"))
     HTTP/1.1 200 Ok
     ...
@@ -259,7 +318,6 @@ to somewhere that doesn't exist.
 Then we need to drop all our existing connections, so when we reconnect
 the new connection information is used.
 
-    >>> from zope.component import getUtility
     >>> from storm.zope.interfaces import IZStorm
     >>> getUtility(IZStorm)._reset()
 
@@ -302,3 +360,7 @@ But our database connections are broken.
 
     >>> print(IStore(Person).find(Person, name='janitor').one().name)
     janitor
+
+Clean up.
+
+    >>> _ = config.pop('statsd_test')
diff --git a/lib/lp/services/webapp/errorlog.py b/lib/lp/services/webapp/errorlog.py
index 0ec77cb..ed25ace 100644
--- a/lib/lp/services/webapp/errorlog.py
+++ b/lib/lp/services/webapp/errorlog.py
@@ -1,4 +1,4 @@
-# Copyright 2009-2019 Canonical Ltd.  This software is licensed under the
+# Copyright 2009-2021 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 """Error logging facilities."""
@@ -21,6 +21,7 @@ import oops_timeline
 import pytz
 import six
 from six.moves.urllib.parse import urlparse
+from zope.component import getUtility
 from zope.component.interfaces import ObjectEvent
 from zope.error.interfaces import IErrorReportingUtility
 from zope.event import notify
@@ -33,6 +34,7 @@ from lp.app import versioninfo
 from lp.layers import WebServiceLayer
 from lp.services.config import config
 from lp.services.messaging import rabbit
+from lp.services.statsd.interfaces.statsd_client import IStatsdClient
 from lp.services.timeline.requesttimeline import get_request_timeline
 from lp.services.webapp.adapter import (
     get_request_duration,
@@ -508,6 +510,7 @@ def end_request(event):
     # the soft timeout has expired, log an OOPS.
     if event.request.oopsid is None and soft_timeout_expired():
         OpStats.stats['soft timeouts'] += 1
+        getUtility(IStatsdClient).incr('timeouts.soft')
         globalErrorUtility.raising(
             (SoftRequestTimeout, SoftRequestTimeout(event.object), None),
             event.request)
diff --git a/lib/lp/services/webapp/publication.py b/lib/lp/services/webapp/publication.py
index e832679..85a8ab3 100644
--- a/lib/lp/services/webapp/publication.py
+++ b/lib/lp/services/webapp/publication.py
@@ -699,6 +699,7 @@ class LaunchpadBrowserPublication(
 
         if isinstance(exc_info[1], (da.RequestExpired, TimeoutError)):
             OpStats.stats['timeouts'] += 1
+            getUtility(IStatsdClient).incr('timeouts.hard')
 
         def should_retry(exc_info):
             if not retry_allowed:
@@ -810,8 +811,10 @@ class LaunchpadBrowserPublication(
         # Maintain operational statistics.
         if getattr(request, '_wants_retry', False):
             OpStats.stats['retries'] += 1
+            statsd_client.incr('requests.retries')
         else:
             OpStats.stats['requests'] += 1
+            statsd_client.incr('requests.all')
 
             # Increment counters for HTTP status codes we track individually
             # NB. We use IBrowserRequest, as other request types such as
@@ -820,6 +823,7 @@ class LaunchpadBrowserPublication(
             # and XML-RPC requests.
             if IBrowserRequest.providedBy(request):
                 OpStats.stats['http requests'] += 1
+                statsd_client.incr('requests.http')
                 status = request.response.getStatus()
                 if status == 404:  # Not Found
                     OpStats.stats['404s'] += 1
@@ -832,13 +836,14 @@ class LaunchpadBrowserPublication(
                     statsd_client.incr('errors.503')
 
                 # Increment counters for status code groups.
-                status_group = str(status)[0] + 'XXs'
-                OpStats.stats[status_group] += 1
+                status_group = str(status)[0] + 'XX'
+                OpStats.stats[status_group + 's'] += 1
+                statsd_client.incr('errors.%s' % status_group)
 
                 # Increment counter for 5XXs_b.
-                if is_browser(request) and status_group == '5XXs':
+                if is_browser(request) and status_group == '5XX':
                     OpStats.stats['5XXs_b'] += 1
-                    statsd_client.incr('errors.5XX')
+                    statsd_client.incr('errors.5XX.browser')
 
         # Make sure our databases are in a sane state for the next request.
         thread_name = threading.current_thread().name
diff --git a/lib/lp/services/webapp/servers.py b/lib/lp/services/webapp/servers.py
index fb05ab1..57de022 100644
--- a/lib/lp/services/webapp/servers.py
+++ b/lib/lp/services/webapp/servers.py
@@ -78,6 +78,7 @@ from lp.services.oauth.interfaces import (
     TokenException,
     )
 from lp.services.propertycache import cachedproperty
+from lp.services.statsd.interfaces.statsd_client import IStatsdClient
 from lp.services.webapp.authentication import (
     check_oauth_signature,
     get_oauth_authorization,
@@ -1439,9 +1440,11 @@ class PublicXMLRPCPublication(LaunchpadBrowserPublication):
         LaunchpadBrowserPublication.handleException(
                 self, object, request, exc_info, retry_allowed)
         OpStats.stats['xml-rpc faults'] += 1
+        getUtility(IStatsdClient).incr('errors.xmlrpc')
 
     def endRequest(self, request, object):
         OpStats.stats['xml-rpc requests'] += 1
+        getUtility(IStatsdClient).incr('requests.xmlrpc')
         return LaunchpadBrowserPublication.endRequest(self, request, object)