← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] lp:~cjwatson/lp-dev-utils/ppr-talisker into lp:lp-dev-utils

 

Colin Watson has proposed merging lp:~cjwatson/lp-dev-utils/ppr-talisker into lp:lp-dev-utils.

Commit message:
Add Talisker log format support to the PPR.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~cjwatson/lp-dev-utils/ppr-talisker/+merge/405633
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~cjwatson/lp-dev-utils/ppr-talisker into lp:lp-dev-utils.
=== modified file 'page-performance-report.ini'
--- page-performance-report.ini	2021-07-09 13:19:44 +0000
+++ page-performance-report.ini	2021-07-13 13:44:38 +0000
@@ -51,8 +51,8 @@
 Translations - Front page=^https?://translations\.[^/]+/translations/\+index$
 Translations - Overview=^https?://translations\..*/\+lang/\w+(/\+index)?$
 
-Public XML-RPC=^https://(launchpad|xmlrpc)[^/]+/bazaar/\w+$
-Private XML-RPC=^https://(launchpad|xmlrpc)[^/]+/
+Public XML-RPC=^https?://(launchpad|xmlrpc)[^/]+/bazaar/\w+$
+Private XML-RPC=^https?://(launchpad|xmlrpc)[^/]+/
     (archive|authserver|bugs|codehosting|
      codeimportscheduler|mailinglists|
      featureflags|git)/\w+$

=== modified file 'pageperformancereport.py'
--- pageperformancereport.py	2021-07-09 13:15:54 +0000
+++ pageperformancereport.py	2021-07-13 13:44:38 +0000
@@ -23,6 +23,7 @@
 import sre_constants
 from textwrap import dedent
 import time
+from urlparse import urlunsplit
 
 
 logging.basicConfig()
@@ -136,6 +137,44 @@
         return interval.seconds + interval.microseconds / 1000000.0
 
 
+class TaliskerRequest:
+
+    def __init__(self, dt, logfmt):
+        self.dt = dt
+        host = logfmt['host']
+        if logfmt['scheme'] == 'https' and host.endswith(':443'):
+            host = host[:-len(':443')]
+        self.url = urlunsplit(
+            (logfmt['scheme'], host, logfmt['path'], logfmt.get('qs', ''), ''))
+        self.interval = timedelta(milliseconds=float(logfmt['duration_ms']))
+        self.traversal_interval = (
+            timedelta(milliseconds=float(logfmt['traversal_duration_ms']))
+            if 'traversal_duration_ms' in logfmt else None)
+        self.publication_interval = (
+            timedelta(milliseconds=float(logfmt['publication_duration_ms']))
+            if 'publication_duration_ms' in logfmt else None)
+        self.pageid = logfmt.get('pageid')
+        self.sql_statements = (
+            int(logfmt['sql_statements'])
+            if 'sql_statements' in logfmt else None)
+        self.sql_seconds = (
+            float(logfmt['sql_ms']) / 1000 if 'sql_ms' in logfmt else None)
+
+    @property
+    def app_seconds(self):
+        if self.traversal_interval is not None:
+            interval = self.traversal_interval
+            if self.publication_interval is not None:
+                interval += self.publication_interval
+        else:
+            interval = self.interval
+        return interval.total_seconds()
+
+    @property
+    def total_seconds(self):
+        return self.interval.total_seconds()
+
+
 class Category:
     """A Category in our report.
 
@@ -872,6 +911,8 @@
             parse_launchpad(args, times, options)
         elif log_format == 'access':
             parse_access(args, times, options)
+        elif log_format == 'talisker':
+            parse_talisker(args, times, options)
         else:
             log.fatal("No parser available for log_format: %r" % log_format)
             return 1
@@ -998,12 +1039,15 @@
 
 
 _zserver_ts_re = re.compile(
-    '^(\d{4})-(\d\d)-(\d\d)\s(\d\d):(\d\d):(\d\d)(?:.(\d{6}))?$')
+    r'^(\d{4})-(\d\d)-(\d\d)\s(\d\d):(\d\d):(\d\d)(?:.(\d{6}))?$')
 
 _access_ts_re = re.compile(
-    '^\[(\d{2})/([A-Za-z]+)/(\d{4}):(\d{2}):(\d{2}):(\d{2})[+-]\d{4}\]$')
-    
-      
+    r'^\[(\d{2})/([A-Za-z]+)/(\d{4}):(\d{2}):(\d{2}):(\d{2})[+-]\d{4}\]$')
+
+_talisker_ts_re = re.compile(
+    r'^(\d{4})-(\d\d)-(\d\d)\s(\d\d):(\d\d):(\d\d)\.(\d{3})Z$')
+
+
 def parse_timestamp_zserverlog(ts_string):
     match = _zserver_ts_re.search(ts_string)
     if match is None:
@@ -1020,7 +1064,16 @@
     month = time.strptime(alpha_month,'%b').tm_mon # get numeric month
     return datetime(
         *(int(elem) for elem in (year, month, day, hour, min, sec)))
-        
+
+
+def parse_timestamp_taliskerlog(ts_string):
+    match = _talisker_ts_re.search(ts_string)
+    if match is None:
+        raise ValueError("Invalid timestamp")
+    year, month, day, hour, min, sec, millisec = match.groups()
+    return datetime(
+        int(year), int(month), int(day), int(hour), int(min), int(sec),
+        int(millisec) * 1000)
 
 
 def parse_launchpad(tracefiles, times, options):
@@ -1180,8 +1233,75 @@
             except MalformedLine as x:
                 log.error(
                     "Malformed line %s (%s)" % (repr(line), x))
-                    
-                    
+
+
+def unquote_logfmt_value(value):
+    if value.startswith('"') and value.endswith('"'):
+        value = value[1:-1]
+    return value.replace(r'\"', '"')
+
+
+def parse_talisker(tracefiles, times, options):
+    talisker_re = re.compile(
+        r'^(?P<timestamp>\d+-\d+-\d+ \d+:\d+:\d+\.\d+Z)'
+        r' (?P<levelname>\S+) (?P<module>[0-9a-zA-Z_]+(?:\.[0-9a-zA-Z_]+)*)'
+        r' "(?P<message>(?:[^"\\]|\\.)*(?:"|\.\.\.))'
+        r' ?(?P<logfmt>.*)')
+    logfmt_re = re.compile(
+        r'(?P<key>.+?)=(?P<value>"(?:[^"\\]|\\.)*?"|\S+)\s*')
+    requests = {}
+    total_requests = 0
+    for tracefile in tracefiles:
+        log.info('Processing %s', tracefile)
+        for line in smart_open(tracefile):
+            line = line.rstrip()
+            if not line or not line[0].isdigit():
+                # Log entries start with a digit.  Other lines are things
+                # like tracebacks or warnings, so just ignore those.
+                continue
+
+            try:
+                match = talisker_re.match(line)
+                if not match:
+                    raise MalformedLine()
+
+                try:
+                    dt = parse_timestamp_taliskerlog(
+                        match.group('timestamp'))
+                except ValueError:
+                    raise MalformedLine(
+                        'Invalid timestamp %r' % match.group('timestamp'))
+
+                # Filter entries by command line date range.
+                if options.from_ts is not None and dt < options.from_ts:
+                    continue # Skip to next line.
+                if options.until_ts is not None and dt > options.until_ts:
+                    break # Skip to next log file.
+
+                # Parse logfmt.
+                logfmt = {
+                    m.group('key'): unquote_logfmt_value(m.group('value'))
+                    for m in logfmt_re.finditer(match.group('logfmt'))}
+                if not all(
+                        key in logfmt for key in
+                        ('request_id', 'method', 'scheme', 'host', 'path')):
+                    # Some other log entry, not an HTTP request.
+                    continue
+
+                url = urlunsplit(
+                    (logfmt['scheme'], logfmt['host'], logfmt['path'],
+                     logfmt.get('qs', ''), ''))
+                request = TaliskerRequest(dt, logfmt)
+                requests[logfmt['request_id']] = request
+                total_requests += 1
+                if total_requests % 10000 == 0:
+                    log.debug("Parsed %d requests", total_requests)
+                # Add the request to any matching categories.
+                times.add_request(request)
+            except MalformedLine as x:
+                log.error("Malformed line %r (%s)" % (line, x))
+
+
 def html_report(
     outf, category_times, pageid_times, url_times,
     ninetyninth_percentile_threshold=None, histogram_resolution=0.5,