← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] lp:~sinzui/launchpad/mailman-heartbeat-0 into lp:launchpad

 

Curtis Hovey has proposed merging lp:~sinzui/launchpad/mailman-heartbeat-0 into lp:launchpad.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)
Related bugs:
  #183372 Mailman's qrunner doesn't provide any way to be monitored
  https://bugs.launchpad.net/bugs/183372

For more details, see:
https://code.launchpad.net/~sinzui/launchpad/mailman-heartbeat-0/+merge/46160

Add a heartbeat to Mailman's xmlrpc log.

    Launchpad bug: https://bugs.launchpad.net/bugs/183372
    Pre-implementation: flacoste
    Test command: ./bin/test -vv -t TestXMLRPCRunnerHeatBeat

We currently don't have any way to monitor the health of the qrunner process,
besides a check for the process.

Can we add a variable which controls how often something is written to either
the xmlrpc or qrunner log file so that if there's no activity within that
specified period there will be a "--MARK--" entry with a timestamp. This way
an external process (such as nagios) can easily check for the health of the
qrunner.

--------------------------------------------------------------------

RULES

    * Add a heartbeat method of Launchpad's Mailman xmlrpc runner.
      The method should limit the frequency to 5 minutes. Most syncs happen
      in 1 minutes. The longest time we have observed a sync to happen is 10
      minutes. We think 20-30 minutes without a sync should raise an alarm.
    * Update the runner __init__ method to record a heartbeat
      because we have seen it fail to start in the past.
    * Update the _oneloop() method to record a heartbeat if no exceptions
      were encountered.
      If repeated exceptions are encountered mailman will continue to run,
      but we know it is not talking to Launchpad. If this happen too often
      we want an alarm to sound.


QA

    * Setup nagios to poll <logpath>/mailman/logs/xmlrpc for --MARK--
    * Verify we see a heartbeat.


LINT

Linting changed files:
    lib/lp/services/mailman/monkeypatches/xmlrpcrunner.py
    lib/lp/services/mailman/tests/test_xmlrpcrunner.py


IMPLEMENTATION

Added a heartbeat method that records --MARK-- in the xmlrpc log when
now is greater exceeds the heartbeat frequency. Add the the heartbeat to
__init__() and _oneloop().
    lib/lp/services/mailman/monkeypatches/xmlrpcrunner.py
    lib/lp/services/mailman/tests/test_xmlrpcrunner.py
-- 
https://code.launchpad.net/~sinzui/launchpad/mailman-heartbeat-0/+merge/46160
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~sinzui/launchpad/mailman-heartbeat-0 into lp:launchpad.
=== modified file 'lib/lp/services/mailman/monkeypatches/xmlrpcrunner.py'
--- lib/lp/services/mailman/monkeypatches/xmlrpcrunner.py	2010-09-20 02:10:01 +0000
+++ lib/lp/services/mailman/monkeypatches/xmlrpcrunner.py	2011-01-13 17:52:42 +0000
@@ -9,6 +9,10 @@
     ]
 
 from cStringIO import StringIO
+from datetime import (
+    datetime,
+    timedelta,
+    )
 import errno
 import os
 from random import shuffle
@@ -104,6 +108,9 @@
         self._proxy = get_mailing_list_api_proxy()
         # Ensure that the log file exists, mostly for the test suite.
         syslog('xmlrpc', 'XMLRPC runner starting')
+        self.heartbeat_frequency = timedelta(minutes=5)
+        self.last_heartbeat = None
+        self._heartbeat()
 
     def _oneloop(self):
         """Check to see if there's anything for Mailman to do.
@@ -120,13 +127,23 @@
             self._check_list_actions()
             self._get_subscriptions()
             self._check_held_messages()
-        # pylint: disable-msg=W0702
         except:
             # Log the exception and report an OOPS.
             log_exception('Unexpected XMLRPCRunner exception')
+        else:
+            self._heartbeat()
         # Snooze for a while.
         return 0
 
+    def _heartbeat(self):
+        """Add a heartbeat to the log for a monitor to watch."""
+        now = datetime.now()
+        last_heartbeat = self.last_heartbeat
+        if (last_heartbeat is None
+            or now - last_heartbeat >= self.heartbeat_frequency):
+            syslog('xmlrpc', '--MARK--')
+            self.last_heartbeat = now
+
     def _log(self, exc):
         """Log the exception in a log file and as an OOPS."""
         Runner._log(self, exc)

=== added file 'lib/lp/services/mailman/tests/test_xmlrpcrunner.py'
--- lib/lp/services/mailman/tests/test_xmlrpcrunner.py	1970-01-01 00:00:00 +0000
+++ lib/lp/services/mailman/tests/test_xmlrpcrunner.py	2011-01-13 17:52:42 +0000
@@ -0,0 +1,105 @@
+# Copyright 20011 Canonical Ltd.  This software is licensed under the
+# GNU Affero General Public License version 3 (see the file LICENSE).
+"""Test the Launchpad XMLRPC runner."""
+
+__metaclass__ = type
+__all__ = []
+
+from contextlib import contextmanager
+from datetime import datetime
+
+from Mailman.Logging.Syslog import syslog
+from Mailman.Queue.XMLRPCRunner import XMLRPCRunner
+
+from canonical.testing.layers import DatabaseFunctionalLayer
+from lp.services.mailman.testing import (
+     get_mailing_list_api_test_proxy,
+     MailmanTestCase,
+     )
+
+
+@contextmanager
+def one_loop_exception(runner):
+
+    def raise_exception():
+        raise Exception('Test exception handling.')
+
+    original__check_list_actions = runner._check_list_actions
+    runner._check_list_actions = raise_exception
+    try:
+        yield
+    finally:
+        runner._check_list_actions= original__check_list_actions
+
+
+class TestXMLRPCRunnerHeatBeat(MailmanTestCase):
+    """Test XMLRPCRunner._hearbeat method."""
+
+    layer = DatabaseFunctionalLayer
+
+    def setUp(self):
+        super(TestXMLRPCRunnerHeatBeat, self).setUp()
+        self.mm_list = None
+        syslog.write_ex('xmlrpc', 'Ensure the log is open.')
+        self.reset_log()
+        self.runner = XMLRPCRunner()
+        # MailmanTestCase's setup of the test proxy is ignored because
+        # the runner had a reference to the true proxy in its __init__.
+        self.runner._proxy = get_mailing_list_api_test_proxy()
+
+    def get_mark(self):
+        """Return the first mark line found in the log."""
+        log_path = syslog._logfiles['xmlrpc']._Logger__filename
+        mark = None
+        with open(log_path, 'r') as log_file:
+            for line in log_file.readlines():
+                if '--MARK--' in line:
+                    mark = line
+                    break
+        return mark
+
+    def reset_log(self):
+        """Truncate the log."""
+        log_path = syslog._logfiles['xmlrpc']._Logger__filename
+        syslog._logfiles['xmlrpc'].close()
+        with open(log_path, 'w') as log_file:
+            log_file.truncate()
+        syslog.write_ex('xmlrpc', 'Reset by test.')
+
+    def test_heartbeat_on_start(self):
+        # A heartbeat is recorded in the log on start.
+        mark = self.get_mark()
+        self.assertTrue(mark is not None)
+
+    def test_heatbeat_frequency_no_heartbeat(self):
+        # A hartbeat is not recorded when the that last beat less than
+        # the heartbeat_frequency.
+        self.runner._heartbeat()
+        self.reset_log()
+        self.runner._heartbeat()
+        now = datetime.now()
+        last_heartbeat = self.runner.last_heartbeat
+        self.assertTrue(
+            now - last_heartbeat < self.runner.heartbeat_frequency)
+        mark = self.get_mark()
+        self.assertTrue(mark is None)
+
+    def test__oneloop_success_heartbeat(self):
+        # A heartbeat is recorded when the loop completes successfully.
+        self.reset_log()
+        self.runner.last_heartbeat = (
+            self.runner.last_heartbeat - self.runner.heartbeat_frequency)
+        self.runner._oneloop()
+        mark = self.get_mark()
+        self.assertTrue(mark is not None)
+
+    def test__oneloop_exception_no_heartbeat(self):
+        # A heartbeat is not recorded when there is an exception in the loop.
+        self.reset_log()
+        self.runner.last_heartbeat = (
+            self.runner.last_heartbeat - self.runner.heartbeat_frequency)
+        # Hack runner to raise an oops.
+        with one_loop_exception(self.runner):
+            self.runner._oneloop()
+        mark = self.get_mark()
+        self.assertTrue(mark is None)