← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] lp:~wgrant/launchpad/no-fdt-oopses into lp:launchpad

 

William Grant has proposed merging lp:~wgrant/launchpad/no-fdt-oopses into lp:launchpad.

Commit message:
Don't OOPS throughout fastdowntime DB patches and staging DB restores.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)
Related bugs:
  Bug #504291 in Launchpad itself: "DisconnectionErrors (already disconnected) happening again"
  https://bugs.launchpad.net/launchpad/+bug/504291
  Bug #1523382 in Launchpad itself: "full-update.py and staging_restore.sh cause ongoing DisconnectionError OOPSes"
  https://bugs.launchpad.net/launchpad/+bug/1523382

For more details, see:
https://code.launchpad.net/~wgrant/launchpad/no-fdt-oopses/+merge/279741

Don't OOPS throughout fastdowntime DB patches and staging DB restores.

Three fixes:

 - I finally tracked down and fixed AssertionErrors that have plagued us for several years.

 - We now filter out DisconnectionError OOPSes that are caused by pgbouncer DB disablement. I whitelisted two messages because production is running a slightly non-final version of my DISABLE/ENABLE patch, with a different error.

 - Staging DB restores are wrapped in DISABLE/ENABLE to make them look like a fastdowntime and not OOPS heavily.
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~wgrant/launchpad/no-fdt-oopses into lp:launchpad.
=== modified file 'database/replication/Makefile'
--- database/replication/Makefile	2014-11-17 06:49:34 +0000
+++ database/replication/Makefile	2015-12-07 09:27:08 +0000
@@ -54,14 +54,20 @@
 # place.
 stagingsetup: DUMPLIST:=$(shell tempfile --suffix=.lst --prefix=lpstag)
 stagingsetup:
-	${STAGING_PGBOUNCER} -c 'kill launchpad_staging'
-	${STAGING_PGBOUNCER} -c 'kill launchpad_staging_slave'
+	# Deny new connections to the main DBs and kill any leftovers.
+	${STAGING_PGBOUNCER} -c 'DISABLE launchpad_staging'
+	${STAGING_PGBOUNCER} -c 'DISABLE launchpad_staging_slave'
+	${STAGING_PGBOUNCER} -c 'KILL launchpad_staging'
+	${STAGING_PGBOUNCER} -c 'KILL launchpad_staging_slave'
+	${STAGING_PGBOUNCER} -c 'RESUME launchpad_staging'
+	${STAGING_PGBOUNCER} -c 'RESUME launchpad_staging_slave'
 	-${PGMASSACRE} lpmain_staging
 
-	# Quickly clear out the session DB.
-	${STAGING_PGBOUNCER} -c 'kill session_staging'
+	# Quickly clear out the session DB. No need to DISABLE here, as
+	# we bring the DB back quickly.
+	${STAGING_PGBOUNCER} -c 'KILL session_staging'
 	psql -d session_staging -c 'TRUNCATE sessiondata CASCADE;'
-	${STAGING_PGBOUNCER} -c 'resume session_staging'
+	${STAGING_PGBOUNCER} -c 'RESUME session_staging'
 
 	# Create the DB with the desired default tablespace.
 	${CREATEDB} --tablespace ${STAGING_TABLESPACE} lpmain_staging
@@ -85,8 +91,8 @@
 	@echo Setting feature flags
 	-psql -d lpmain_staging -c "INSERT INTO featureflag (flag, scope, priority, value) VALUES ('profiling.enabled', 'team:launchpad', 0, 'on')"
 	-psql -d lpmain_staging -c "INSERT INTO featureflag (flag, scope, priority, value) VALUES ('librarian.swift.enabled', 'default', 0, 'on')"
-	${STAGING_PGBOUNCER} -c 'resume launchpad_staging'
-	${STAGING_PGBOUNCER} -c 'resume launchpad_staging_slave'
+	${STAGING_PGBOUNCER} -c 'ENABLE launchpad_staging'
+	${STAGING_PGBOUNCER} -c 'ENABLE launchpad_staging_slave'
 
 stagingswitch:
 	echo Nothing to do. Staging already built inplace.

=== modified file 'lib/lp/services/webapp/errorlog.py'
--- lib/lp/services/webapp/errorlog.py	2015-07-08 16:05:11 +0000
+++ lib/lp/services/webapp/errorlog.py	2015-12-07 09:27:08 +0000
@@ -378,6 +378,9 @@
         #  - have a missing or offset REFERER header with a type listed in
         #    self._ignored_exceptions_for_offsite_referer
         self._oops_config.filters.append(self._filter_bad_urls_by_referer)
+        #  - look like a deliberate DB outage, to cut down fastdowntime
+        #    noise.
+        self._oops_config.filters.append(self._filter_deliberate_db_outages)
 
     @property
     def oops_prefix(self):
@@ -425,6 +428,17 @@
                     return True
         return False
 
+    def _filter_deliberate_db_outages(self, report):
+        if _get_type(report) == 'DisconnectionError':
+            message = report.get('value', '')
+            for ok in (
+                    "database does not allow connections",
+                    "pgbouncer database is disabled"):
+                if (message.startswith(ok)
+                        or message.startswith("ERROR:  " + ok)):
+                    return True
+        return False
+
     def _attach_messages(self, report, context):
         """merges self._oops_messages into the report req_vars variable."""
         # XXX AaronBentley 2009-11-26 bug=488950: There should be separate

=== modified file 'lib/lp/services/webapp/publication.py'
--- lib/lp/services/webapp/publication.py	2015-09-28 17:38:45 +0000
+++ lib/lp/services/webapp/publication.py	2015-12-07 09:27:08 +0000
@@ -591,9 +591,11 @@
             pass
 
         # Log an OOPS for DisconnectionErrors: we don't expect to see
-        # disconnections as a routine event, so having information about them
-        # is important. See Bug #373837 for more information.
-        # We need to do this before we re-raise the exception as a Retry.
+        # most types of disconnections as a routine event
+        # (full-update.py produces very specific pgbouncer errors), so
+        # having information about them is important. See Bug #373837
+        # for more information.  We need to do this before we re-raise
+        # the exception as a Retry.
         if isinstance(exc_info[1], DisconnectionError):
             getUtility(IErrorReportingUtility).raising(exc_info, request)
 
@@ -645,6 +647,10 @@
             request._wants_retry = True
             if isinstance(exc_info[1], Retry):
                 raise
+            # Abort any in-progress transaction and reset any
+            # disconnected stores. ZopePublication.handleException would
+            # do this for us if we weren't bypassing it.
+            transaction.abort()
             raise Retry(exc_info)
 
         superclass = zope.app.publication.browser.BrowserPublication

=== modified file 'lib/lp/services/webapp/tests/test_error.py'
--- lib/lp/services/webapp/tests/test_error.py	2012-09-06 10:46:06 +0000
+++ lib/lp/services/webapp/tests/test_error.py	2015-12-07 09:27:08 +0000
@@ -9,6 +9,7 @@
 import time
 import urllib2
 
+import psycopg2
 from storm.exceptions import (
     DisconnectionError,
     OperationalError,
@@ -26,10 +27,14 @@
 from lp.services.webapp.servers import LaunchpadTestRequest
 from lp.testing import TestCase
 from lp.testing.fixture import (
+    CaptureOops,
     PGBouncerFixture,
     Urllib2Fixture,
     )
-from lp.testing.layers import LaunchpadFunctionalLayer
+from lp.testing.layers import (
+    DatabaseLayer,
+    LaunchpadFunctionalLayer,
+    )
 from lp.testing.matchers import Contains
 
 
@@ -125,21 +130,78 @@
         url = 'http://launchpad.dev/'
         self.retryConnection(url, bouncer)
         # Now break the database, and we get an exception, along with
-        # our view.
+        # our view and several OOPSes from the retries.
         bouncer.stop()
-        error = self.getHTTPError(url)
+
+        with CaptureOops() as oopses:
+            error = self.getHTTPError(url)
         self.assertEqual(503, error.code)
         self.assertThat(error.read(),
                         Contains(DisconnectionErrorView.reason))
+        self.assertEqual(
+            ([('DisconnectionError', 'error with no message from the libpq')]
+             * 2) +
+            ([('DisconnectionError',
+               'could not connect to server: Connection refused')]
+             * 6),
+            [(oops['type'], oops['value'].split('\n')[0])
+             for oops in oopses.oopses])
+
         # We keep seeing the correct exception on subsequent requests.
-        error = self.getHTTPError(url)
+        with CaptureOops() as oopses:
+            error = self.getHTTPError(url)
         self.assertEqual(503, error.code)
         self.assertThat(error.read(),
                         Contains(DisconnectionErrorView.reason))
+        self.assertEqual(
+            ([('DisconnectionError',
+               'could not connect to server: Connection refused')]
+             * 8),
+            [(oops['type'], oops['value'].split('\n')[0])
+             for oops in oopses.oopses])
+
         # When the database is available again, requests succeed.
         bouncer.start()
         self.retryConnection(url, bouncer)
 
+        # If we ask pgbouncer to disable the database, requests fail and
+        # get the same error page, but we don't log OOPSes except for
+        # the initial connection terminations. Disablement is always
+        # explicit maintenance, and we don't need lots of ongoing OOPSes
+        # to tell us about maintenance that we're doing.
+        dbname = DatabaseLayer._db_fixture.dbname
+        conn = psycopg2.connect("dbname=pgbouncer")
+        conn.autocommit = True
+        cur = conn.cursor()
+        cur.execute("DISABLE " + dbname)
+        cur.execute("KILL " + dbname)
+        cur.execute("RESUME " + dbname)
+
+        with CaptureOops() as oopses:
+            error = self.getHTTPError(url)
+        self.assertEqual(503, error.code)
+        self.assertThat(error.read(),
+                        Contains(DisconnectionErrorView.reason))
+        self.assertEqual(
+            [('DisconnectionError', 'database removed')],
+            [(oops['type'], oops['value'].split('\n')[0])
+             for oops in oopses.oopses])
+
+        # A second request doesn't log any OOPSes.
+        with CaptureOops() as oopses:
+            error = self.getHTTPError(url)
+        self.assertEqual(503, error.code)
+        self.assertThat(error.read(),
+                        Contains(DisconnectionErrorView.reason))
+        self.assertEqual(
+            [],
+            [(oops['type'], oops['value'].split('\n')[0])
+             for oops in oopses.oopses])
+
+        # When the database is available again, requests succeed.
+        cur.execute("ENABLE %s" % DatabaseLayer._db_fixture.dbname)
+        self.retryConnection(url, bouncer)
+
     def test_disconnectionerror_view(self):
         request = LaunchpadTestRequest()
         DisconnectionErrorView(DisconnectionError(), request)


Follow ups