launchpad-reviewers team mailing list archive
-
launchpad-reviewers team
-
Mailing list archive
-
Message #27194
[Merge] ~cjwatson/launchpad:statsd-for-opstats into launchpad:master
Colin Watson has proposed merging ~cjwatson/launchpad:statsd-for-opstats into launchpad:master with ~cjwatson/launchpad:rearrange-opstats-timeout as a prerequisite.
Commit message:
Emit statsd metrics for all remaining opstats
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
Related bugs:
Bug #636804 in Launchpad itself: "request expired being raised does not increased the opstats timeout count"
https://bugs.launchpad.net/launchpad/+bug/636804
For more details, see:
https://code.launchpad.net/~cjwatson/launchpad/+git/launchpad/+merge/403883
The old +opstats endpoint needs to go away: its approach isn't very gunicorn-friendly, and it's close to being entirely superseded by statsd. Emit statsd metrics corresponding to all the remaining opstats metrics, which will allow us to remove +opstats once we stop querying it.
--
Your team Launchpad code reviewers is requested to review the proposed merge of ~cjwatson/launchpad:statsd-for-opstats into launchpad:master.
diff --git a/lib/lp/app/stories/basics/xx-opstats.txt b/lib/lp/app/stories/basics/xx-opstats.txt
index beb5d93..558131b 100644
--- a/lib/lp/app/stories/basics/xx-opstats.txt
+++ b/lib/lp/app/stories/basics/xx-opstats.txt
@@ -11,11 +11,29 @@ We can access them via XML-RPC:
... transport=XMLRPCTestTransport()
... )
+We also emit similar metrics to statsd, so set that up.
+
+ >>> from textwrap import dedent
+ >>> from fixtures import MockPatchObject
+ >>> from zope.component import getUtility
+ >>> from lp.services.compat import mock
+ >>> from lp.services.config import config
+ >>> from lp.services.statsd.interfaces.statsd_client import IStatsdClient
+
+ >>> config.push('statsd_test', dedent('''
+ ... [statsd]
+ ... environment: test
+ ... '''))
+ >>> statsd_client = getUtility(IStatsdClient)
+ >>> stats_client = mock.Mock()
+
Create a function to report our stats for these tests.
+ >>> from collections import Counter
>>> def reset():
... from lp.services.webapp.opstats import OpStats
... OpStats.resetStats()
+ ... stats_client.reset_mock()
...
>>> def report():
... stats = lp_xmlrpc.opstats()
@@ -23,8 +41,11 @@ Create a function to report our stats for these tests.
... value = stats[stat_key]
... if value > 0:
... print("%s: %d" % (stat_key, value))
+ ... for statsd_key, value in sorted(Counter(
+ ... call[0][0]
+ ... for call in stats_client.incr.call_args_list).items()):
+ ... print("statsd: %s: %d" % (statsd_key, value))
... reset()
- ...
Number of requests and XML-RPC requests
---------------------------------------
@@ -35,7 +56,8 @@ the 'requests' and 'xml-rpc requests' statistics, but this will not be
visible until the next time we access the statistics as the statistics
are adjusted after the request has been served:
- >>> stats = lp_xmlrpc.opstats()
+ >>> with MockPatchObject(statsd_client, '_client', stats_client):
+ ... stats = lp_xmlrpc.opstats()
>>> for key in sorted(stats.keys()):
... # Print all so new keys added to OpStats.stats will trigger
... # failures in this test prompting developers to extend it.
@@ -60,17 +82,23 @@ are adjusted after the request has been served:
>>> report()
requests: 1
xml-rpc requests: 1
+ statsd: requests.all,env=test: 1
+ statsd: requests.xmlrpc,env=test: 1
Number of HTTP requests and success codes
-----------------------------------------
- >>> output = http("GET / HTTP/1.1\nHost: bugs.launchpad.test\n")
+ >>> with MockPatchObject(statsd_client, '_client', stats_client):
+ ... output = http("GET / HTTP/1.1\nHost: bugs.launchpad.test\n")
>>> output.getStatus()
200
>>> report()
2XXs: 1
http requests: 1
requests: 1
+ statsd: errors.2XX,env=test: 1
+ statsd: requests.all,env=test: 1
+ statsd: requests.http,env=test: 1
Number of 404s
--------------
@@ -80,7 +108,8 @@ database policy, this request first uses the slave DB. The requested
information is not found in there, so a retry is attempted against the
master DB in case the information is missing due to replication lag.
- >>> output = http("GET http://launchpad.test/non-existant HTTP/1.1\n")
+ >>> with MockPatchObject(statsd_client, '_client', stats_client):
+ ... output = http("GET http://launchpad.test/non-existant HTTP/1.1\n")
>>> output.getStatus()
404
>>> report()
@@ -89,6 +118,11 @@ master DB in case the information is missing due to replication lag.
http requests: 1
requests: 1
retries: 1
+ statsd: errors.404,env=test: 1
+ statsd: errors.4XX,env=test: 1
+ statsd: requests.all,env=test: 1
+ statsd: requests.http,env=test: 1
+ statsd: requests.retries,env=test: 1
Number of 500 Internal Server Errors (unhandled exceptions)
-----------------------------------------------------------
@@ -110,7 +144,8 @@ particular need to differentiate these cases though:
>>> error_view_fixture = ZopeAdapterFixture(
... ErrorView, (None, IDefaultBrowserLayer), Interface, "error-test")
>>> error_view_fixture.setUp()
- >>> output = http("GET /error-test HTTP/1.1\nHost: launchpad.test\n")
+ >>> with MockPatchObject(statsd_client, '_client', stats_client):
+ ... output = http("GET /error-test HTTP/1.1\nHost: launchpad.test\n")
>>> output.getStatus()
500
>>> report()
@@ -118,18 +153,22 @@ particular need to differentiate these cases though:
5XXs: 1
http requests: 1
requests: 1
+ statsd: errors.500,env=test: 1
+ statsd: errors.5XX,env=test: 1
+ statsd: requests.all,env=test: 1
+ statsd: requests.http,env=test: 1
We also have a special metric counting server errors returned to known
web browsers (5XXs_b) - in the production environment we care more
about errors returned to people than robots crawling obscure parts of
the site.
- >>> from textwrap import dedent
- >>> output = http(dedent("""\
- ... GET /error-test HTTP/1.1
- ... Host: launchpad.test
- ... User-Agent: Mozilla/42.0
- ... """))
+ >>> with MockPatchObject(statsd_client, '_client', stats_client):
+ ... output = http(dedent("""\
+ ... GET /error-test HTTP/1.1
+ ... Host: launchpad.test
+ ... User-Agent: Mozilla/42.0
+ ... """))
>>> output.getStatus()
500
>>> report()
@@ -138,36 +177,45 @@ the site.
5XXs_b: 1
http requests: 1
requests: 1
+ statsd: errors.500,env=test: 1
+ statsd: errors.5XX,env=test: 1
+ statsd: errors.5XX.browser,env=test: 1
+ statsd: requests.all,env=test: 1
+ statsd: requests.http,env=test: 1
>>> error_view_fixture.cleanUp()
Number of XML-RPC Faults
------------------------
- >>> try:
- ... opstats = lp_xmlrpc.invalid() # XXX: Need a HTTP test too
- ... print('Should have raised a Fault exception!')
- ... except xmlrpc_client.Fault:
- ... pass
+ >>> with MockPatchObject(statsd_client, '_client', stats_client):
+ ... try:
+ ... opstats = lp_xmlrpc.invalid() # XXX: Need a HTTP test too
+ ... print('Should have raised a Fault exception!')
+ ... except xmlrpc_client.Fault:
+ ... pass
>>> report()
requests: 1
xml-rpc faults: 1
xml-rpc requests: 1
+ statsd: errors.xmlrpc,env=test: 1
+ statsd: requests.all,env=test: 1
+ statsd: requests.xmlrpc,env=test: 1
Number of soft timeouts
-----------------------
- >>> from lp.services.config import config
>>> test_data = dedent("""
... [database]
... soft_request_timeout: 1
... """)
>>> config.push('base_test_data', test_data)
- >>> output = http(r"""
- ... GET /+soft-timeout HTTP/1.1
- ... Authorization: Basic Zm9vLmJhckBjYW5vbmljYWwuY29tOnRlc3Q=
- ... """)
+ >>> with MockPatchObject(statsd_client, '_client', stats_client):
+ ... output = http(dedent(r"""
+ ... GET /+soft-timeout HTTP/1.1
+ ... Authorization: Basic Zm9vLmJhckBjYW5vbmljYWwuY29tOnRlc3Q=
+ ... """))
>>> output.getStatus()
200
>>> report()
@@ -175,6 +223,10 @@ Number of soft timeouts
http requests: 1
requests: 1
soft timeouts: 1
+ statsd: errors.2XX,env=test: 1
+ statsd: requests.all,env=test: 1
+ statsd: requests.http,env=test: 1
+ statsd: timeouts.soft,env=test: 1
Number of Timeouts
------------------
@@ -189,10 +241,11 @@ also return a timeout Fault:
... soft_request_timeout: 2
... """)
>>> config.push('test_data', test_data)
- >>> output = http(r"""
- ... GET /+soft-timeout HTTP/1.1
- ... Authorization: Basic Zm9vLmJhckBjYW5vbmljYWwuY29tOnRlc3Q=
- ... """)
+ >>> with MockPatchObject(statsd_client, '_client', stats_client):
+ ... output = http(dedent(r"""
+ ... GET /+soft-timeout HTTP/1.1
+ ... Authorization: Basic Zm9vLmJhckBjYW5vbmljYWwuY29tOnRlc3Q=
+ ... """))
>>> output.getStatus()
503
>>> # Reset the timeouts so +opstat doesn't die.
@@ -203,6 +256,11 @@ also return a timeout Fault:
http requests: 1
requests: 1
timeouts: 1
+ statsd: errors.503,env=test: 1
+ statsd: errors.5XX,env=test: 1
+ statsd: requests.all,env=test: 1
+ statsd: requests.http,env=test: 1
+ statsd: timeouts.hard,env=test: 1
HTTP access for Cricket
@@ -210,8 +268,9 @@ HTTP access for Cricket
Stats can also be retrieved via HTTP in cricket-graph format:
- >>> output = http("GET / HTTP/1.1\nHost: launchpad.test\n")
- >>> output = http("GET / HTTP/1.1\nHost: launchpad.test\n")
+ >>> with MockPatchObject(statsd_client, '_client', stats_client):
+ ... output = http("GET / HTTP/1.1\nHost: launchpad.test\n")
+ ... output = http("GET / HTTP/1.1\nHost: launchpad.test\n")
>>> print(http("GET /+opstats HTTP/1.1\nHost: launchpad.test\n"))
HTTP/1.1 200 Ok
...
@@ -259,7 +318,6 @@ to somewhere that doesn't exist.
Then we need to drop all our existing connections, so when we reconnect
the new connection information is used.
- >>> from zope.component import getUtility
>>> from storm.zope.interfaces import IZStorm
>>> getUtility(IZStorm)._reset()
@@ -302,3 +360,7 @@ But our database connections are broken.
>>> print(IStore(Person).find(Person, name='janitor').one().name)
janitor
+
+Clean up.
+
+ >>> _ = config.pop('statsd_test')
diff --git a/lib/lp/services/webapp/errorlog.py b/lib/lp/services/webapp/errorlog.py
index 0ec77cb..ed25ace 100644
--- a/lib/lp/services/webapp/errorlog.py
+++ b/lib/lp/services/webapp/errorlog.py
@@ -1,4 +1,4 @@
-# Copyright 2009-2019 Canonical Ltd. This software is licensed under the
+# Copyright 2009-2021 Canonical Ltd. This software is licensed under the
# GNU Affero General Public License version 3 (see the file LICENSE).
"""Error logging facilities."""
@@ -21,6 +21,7 @@ import oops_timeline
import pytz
import six
from six.moves.urllib.parse import urlparse
+from zope.component import getUtility
from zope.component.interfaces import ObjectEvent
from zope.error.interfaces import IErrorReportingUtility
from zope.event import notify
@@ -33,6 +34,7 @@ from lp.app import versioninfo
from lp.layers import WebServiceLayer
from lp.services.config import config
from lp.services.messaging import rabbit
+from lp.services.statsd.interfaces.statsd_client import IStatsdClient
from lp.services.timeline.requesttimeline import get_request_timeline
from lp.services.webapp.adapter import (
get_request_duration,
@@ -508,6 +510,7 @@ def end_request(event):
# the soft timeout has expired, log an OOPS.
if event.request.oopsid is None and soft_timeout_expired():
OpStats.stats['soft timeouts'] += 1
+ getUtility(IStatsdClient).incr('timeouts.soft')
globalErrorUtility.raising(
(SoftRequestTimeout, SoftRequestTimeout(event.object), None),
event.request)
diff --git a/lib/lp/services/webapp/publication.py b/lib/lp/services/webapp/publication.py
index e832679..85a8ab3 100644
--- a/lib/lp/services/webapp/publication.py
+++ b/lib/lp/services/webapp/publication.py
@@ -699,6 +699,7 @@ class LaunchpadBrowserPublication(
if isinstance(exc_info[1], (da.RequestExpired, TimeoutError)):
OpStats.stats['timeouts'] += 1
+ getUtility(IStatsdClient).incr('timeouts.hard')
def should_retry(exc_info):
if not retry_allowed:
@@ -810,8 +811,10 @@ class LaunchpadBrowserPublication(
# Maintain operational statistics.
if getattr(request, '_wants_retry', False):
OpStats.stats['retries'] += 1
+ statsd_client.incr('requests.retries')
else:
OpStats.stats['requests'] += 1
+ statsd_client.incr('requests.all')
# Increment counters for HTTP status codes we track individually
# NB. We use IBrowserRequest, as other request types such as
@@ -820,6 +823,7 @@ class LaunchpadBrowserPublication(
# and XML-RPC requests.
if IBrowserRequest.providedBy(request):
OpStats.stats['http requests'] += 1
+ statsd_client.incr('requests.http')
status = request.response.getStatus()
if status == 404: # Not Found
OpStats.stats['404s'] += 1
@@ -832,13 +836,14 @@ class LaunchpadBrowserPublication(
statsd_client.incr('errors.503')
# Increment counters for status code groups.
- status_group = str(status)[0] + 'XXs'
- OpStats.stats[status_group] += 1
+ status_group = str(status)[0] + 'XX'
+ OpStats.stats[status_group + 's'] += 1
+ statsd_client.incr('errors.%s' % status_group)
# Increment counter for 5XXs_b.
- if is_browser(request) and status_group == '5XXs':
+ if is_browser(request) and status_group == '5XX':
OpStats.stats['5XXs_b'] += 1
- statsd_client.incr('errors.5XX')
+ statsd_client.incr('errors.5XX.browser')
# Make sure our databases are in a sane state for the next request.
thread_name = threading.current_thread().name
diff --git a/lib/lp/services/webapp/servers.py b/lib/lp/services/webapp/servers.py
index fb05ab1..57de022 100644
--- a/lib/lp/services/webapp/servers.py
+++ b/lib/lp/services/webapp/servers.py
@@ -78,6 +78,7 @@ from lp.services.oauth.interfaces import (
TokenException,
)
from lp.services.propertycache import cachedproperty
+from lp.services.statsd.interfaces.statsd_client import IStatsdClient
from lp.services.webapp.authentication import (
check_oauth_signature,
get_oauth_authorization,
@@ -1439,9 +1440,11 @@ class PublicXMLRPCPublication(LaunchpadBrowserPublication):
LaunchpadBrowserPublication.handleException(
self, object, request, exc_info, retry_allowed)
OpStats.stats['xml-rpc faults'] += 1
+ getUtility(IStatsdClient).incr('errors.xmlrpc')
def endRequest(self, request, object):
OpStats.stats['xml-rpc requests'] += 1
+ getUtility(IStatsdClient).incr('requests.xmlrpc')
return LaunchpadBrowserPublication.endRequest(self, request, object)