launchpad-reviewers team mailing list archive
-
launchpad-reviewers team
-
Mailing list archive
-
Message #19790
[Merge] lp:~wgrant/launchpad/no-fdt-oopses into lp:launchpad
William Grant has proposed merging lp:~wgrant/launchpad/no-fdt-oopses into lp:launchpad.
Commit message:
Don't OOPS throughout fastdowntime DB patches and staging DB restores.
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
Related bugs:
Bug #504291 in Launchpad itself: "DisconnectionErrors (already disconnected) happening again"
https://bugs.launchpad.net/launchpad/+bug/504291
Bug #1523382 in Launchpad itself: "full-update.py and staging_restore.sh cause ongoing DisconnectionError OOPSes"
https://bugs.launchpad.net/launchpad/+bug/1523382
For more details, see:
https://code.launchpad.net/~wgrant/launchpad/no-fdt-oopses/+merge/279741
Don't OOPS throughout fastdowntime DB patches and staging DB restores.
Three fixes:
- I finally tracked down and fixed AssertionErrors that have plagued us for several years.
- We now filter out DisconnectionError OOPSes that are caused by pgbouncer DB disablement. I whitelisted two messages because production is running a slightly non-final version of my DISABLE/ENABLE patch, with a different error.
- Staging DB restores are wrapped in DISABLE/ENABLE to make them look like a fastdowntime and not OOPS heavily.
--
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~wgrant/launchpad/no-fdt-oopses into lp:launchpad.
=== modified file 'database/replication/Makefile'
--- database/replication/Makefile 2014-11-17 06:49:34 +0000
+++ database/replication/Makefile 2015-12-07 09:27:08 +0000
@@ -54,14 +54,20 @@
# place.
stagingsetup: DUMPLIST:=$(shell tempfile --suffix=.lst --prefix=lpstag)
stagingsetup:
- ${STAGING_PGBOUNCER} -c 'kill launchpad_staging'
- ${STAGING_PGBOUNCER} -c 'kill launchpad_staging_slave'
+ # Deny new connections to the main DBs and kill any leftovers.
+ ${STAGING_PGBOUNCER} -c 'DISABLE launchpad_staging'
+ ${STAGING_PGBOUNCER} -c 'DISABLE launchpad_staging_slave'
+ ${STAGING_PGBOUNCER} -c 'KILL launchpad_staging'
+ ${STAGING_PGBOUNCER} -c 'KILL launchpad_staging_slave'
+ ${STAGING_PGBOUNCER} -c 'RESUME launchpad_staging'
+ ${STAGING_PGBOUNCER} -c 'RESUME launchpad_staging_slave'
-${PGMASSACRE} lpmain_staging
- # Quickly clear out the session DB.
- ${STAGING_PGBOUNCER} -c 'kill session_staging'
+ # Quickly clear out the session DB. No need to DISABLE here, as
+ # we bring the DB back quickly.
+ ${STAGING_PGBOUNCER} -c 'KILL session_staging'
psql -d session_staging -c 'TRUNCATE sessiondata CASCADE;'
- ${STAGING_PGBOUNCER} -c 'resume session_staging'
+ ${STAGING_PGBOUNCER} -c 'RESUME session_staging'
# Create the DB with the desired default tablespace.
${CREATEDB} --tablespace ${STAGING_TABLESPACE} lpmain_staging
@@ -85,8 +91,8 @@
@echo Setting feature flags
-psql -d lpmain_staging -c "INSERT INTO featureflag (flag, scope, priority, value) VALUES ('profiling.enabled', 'team:launchpad', 0, 'on')"
-psql -d lpmain_staging -c "INSERT INTO featureflag (flag, scope, priority, value) VALUES ('librarian.swift.enabled', 'default', 0, 'on')"
- ${STAGING_PGBOUNCER} -c 'resume launchpad_staging'
- ${STAGING_PGBOUNCER} -c 'resume launchpad_staging_slave'
+ ${STAGING_PGBOUNCER} -c 'ENABLE launchpad_staging'
+ ${STAGING_PGBOUNCER} -c 'ENABLE launchpad_staging_slave'
stagingswitch:
echo Nothing to do. Staging already built inplace.
=== modified file 'lib/lp/services/webapp/errorlog.py'
--- lib/lp/services/webapp/errorlog.py 2015-07-08 16:05:11 +0000
+++ lib/lp/services/webapp/errorlog.py 2015-12-07 09:27:08 +0000
@@ -378,6 +378,9 @@
# - have a missing or offset REFERER header with a type listed in
# self._ignored_exceptions_for_offsite_referer
self._oops_config.filters.append(self._filter_bad_urls_by_referer)
+ # - look like a deliberate DB outage, to cut down fastdowntime
+ # noise.
+ self._oops_config.filters.append(self._filter_deliberate_db_outages)
@property
def oops_prefix(self):
@@ -425,6 +428,17 @@
return True
return False
+ def _filter_deliberate_db_outages(self, report):
+ if _get_type(report) == 'DisconnectionError':
+ message = report.get('value', '')
+ for ok in (
+ "database does not allow connections",
+ "pgbouncer database is disabled"):
+ if (message.startswith(ok)
+ or message.startswith("ERROR: " + ok)):
+ return True
+ return False
+
def _attach_messages(self, report, context):
"""merges self._oops_messages into the report req_vars variable."""
# XXX AaronBentley 2009-11-26 bug=488950: There should be separate
=== modified file 'lib/lp/services/webapp/publication.py'
--- lib/lp/services/webapp/publication.py 2015-09-28 17:38:45 +0000
+++ lib/lp/services/webapp/publication.py 2015-12-07 09:27:08 +0000
@@ -591,9 +591,11 @@
pass
# Log an OOPS for DisconnectionErrors: we don't expect to see
- # disconnections as a routine event, so having information about them
- # is important. See Bug #373837 for more information.
- # We need to do this before we re-raise the exception as a Retry.
+ # most types of disconnections as a routine event
+ # (full-update.py produces very specific pgbouncer errors), so
+ # having information about them is important. See Bug #373837
+ # for more information. We need to do this before we re-raise
+ # the exception as a Retry.
if isinstance(exc_info[1], DisconnectionError):
getUtility(IErrorReportingUtility).raising(exc_info, request)
@@ -645,6 +647,10 @@
request._wants_retry = True
if isinstance(exc_info[1], Retry):
raise
+ # Abort any in-progress transaction and reset any
+ # disconnected stores. ZopePublication.handleException would
+ # do this for us if we weren't bypassing it.
+ transaction.abort()
raise Retry(exc_info)
superclass = zope.app.publication.browser.BrowserPublication
=== modified file 'lib/lp/services/webapp/tests/test_error.py'
--- lib/lp/services/webapp/tests/test_error.py 2012-09-06 10:46:06 +0000
+++ lib/lp/services/webapp/tests/test_error.py 2015-12-07 09:27:08 +0000
@@ -9,6 +9,7 @@
import time
import urllib2
+import psycopg2
from storm.exceptions import (
DisconnectionError,
OperationalError,
@@ -26,10 +27,14 @@
from lp.services.webapp.servers import LaunchpadTestRequest
from lp.testing import TestCase
from lp.testing.fixture import (
+ CaptureOops,
PGBouncerFixture,
Urllib2Fixture,
)
-from lp.testing.layers import LaunchpadFunctionalLayer
+from lp.testing.layers import (
+ DatabaseLayer,
+ LaunchpadFunctionalLayer,
+ )
from lp.testing.matchers import Contains
@@ -125,21 +130,78 @@
url = 'http://launchpad.dev/'
self.retryConnection(url, bouncer)
# Now break the database, and we get an exception, along with
- # our view.
+ # our view and several OOPSes from the retries.
bouncer.stop()
- error = self.getHTTPError(url)
+
+ with CaptureOops() as oopses:
+ error = self.getHTTPError(url)
self.assertEqual(503, error.code)
self.assertThat(error.read(),
Contains(DisconnectionErrorView.reason))
+ self.assertEqual(
+ ([('DisconnectionError', 'error with no message from the libpq')]
+ * 2) +
+ ([('DisconnectionError',
+ 'could not connect to server: Connection refused')]
+ * 6),
+ [(oops['type'], oops['value'].split('\n')[0])
+ for oops in oopses.oopses])
+
# We keep seeing the correct exception on subsequent requests.
- error = self.getHTTPError(url)
+ with CaptureOops() as oopses:
+ error = self.getHTTPError(url)
self.assertEqual(503, error.code)
self.assertThat(error.read(),
Contains(DisconnectionErrorView.reason))
+ self.assertEqual(
+ ([('DisconnectionError',
+ 'could not connect to server: Connection refused')]
+ * 8),
+ [(oops['type'], oops['value'].split('\n')[0])
+ for oops in oopses.oopses])
+
# When the database is available again, requests succeed.
bouncer.start()
self.retryConnection(url, bouncer)
+ # If we ask pgbouncer to disable the database, requests fail and
+ # get the same error page, but we don't log OOPSes except for
+ # the initial connection terminations. Disablement is always
+ # explicit maintenance, and we don't need lots of ongoing OOPSes
+ # to tell us about maintenance that we're doing.
+ dbname = DatabaseLayer._db_fixture.dbname
+ conn = psycopg2.connect("dbname=pgbouncer")
+ conn.autocommit = True
+ cur = conn.cursor()
+ cur.execute("DISABLE " + dbname)
+ cur.execute("KILL " + dbname)
+ cur.execute("RESUME " + dbname)
+
+ with CaptureOops() as oopses:
+ error = self.getHTTPError(url)
+ self.assertEqual(503, error.code)
+ self.assertThat(error.read(),
+ Contains(DisconnectionErrorView.reason))
+ self.assertEqual(
+ [('DisconnectionError', 'database removed')],
+ [(oops['type'], oops['value'].split('\n')[0])
+ for oops in oopses.oopses])
+
+ # A second request doesn't log any OOPSes.
+ with CaptureOops() as oopses:
+ error = self.getHTTPError(url)
+ self.assertEqual(503, error.code)
+ self.assertThat(error.read(),
+ Contains(DisconnectionErrorView.reason))
+ self.assertEqual(
+ [],
+ [(oops['type'], oops['value'].split('\n')[0])
+ for oops in oopses.oopses])
+
+ # When the database is available again, requests succeed.
+ cur.execute("ENABLE %s" % DatabaseLayer._db_fixture.dbname)
+ self.retryConnection(url, bouncer)
+
def test_disconnectionerror_view(self):
request = LaunchpadTestRequest()
DisconnectionErrorView(DisconnectionError(), request)
Follow ups