launchpad-reviewers team mailing list archive
-
launchpad-reviewers team
-
Mailing list archive
-
Message #10092
[Merge] lp:~stub/launchpad/db-deploy into lp:launchpad
Stuart Bishop has proposed merging lp:~stub/launchpad/db-deploy into lp:launchpad.
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
Related bugs:
Bug #906222 in Launchpad itself: "More BAD_USERS creating false positives during fast downtime updates"
https://bugs.launchpad.net/launchpad/+bug/906222
Bug #911417 in Launchpad itself: "Staging db restore fails on make -C database/replication stagingsetup"
https://bugs.launchpad.net/launchpad/+bug/911417
Bug #1025396 in Launchpad itself: "deployment scripts need to cope with streaming replication"
https://bugs.launchpad.net/launchpad/+bug/1025396
Bug #1025399 in Launchpad itself: "full-update.py should wait for streaming slaves before completing"
https://bugs.launchpad.net/launchpad/+bug/1025399
For more details, see:
https://code.launchpad.net/~stub/launchpad/db-deploy/+merge/115761
= Summary =
We should wait for streaming replicas to catch up after applying db patches, before letting things reconnect.
== Proposed fix ==
== Pre-implementation notes ==
== LOC Rationale ==
== Implementation details ==
== Tests ==
== Demo and Q/A ==
= Launchpad lint =
Checking for conflicts and issues in changed files.
Linting changed files:
database/schema/full-update.py
./database/schema/full-update.py
7: '_pythonpath' imported but unused
--
https://code.launchpad.net/~stub/launchpad/db-deploy/+merge/115761
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~stub/launchpad/db-deploy into lp:launchpad.
=== modified file 'database/schema/full-update.py'
--- database/schema/full-update.py 2012-07-19 14:38:04 +0000
+++ database/schema/full-update.py 2012-07-19 15:07:23 +0000
@@ -1,5 +1,5 @@
#!/usr/bin/python2.6 -S
-# Copyright 2011 Canonical Ltd. This software is licensed under the
+# Copyright 2011-2012 Canonical Ltd. This software is licensed under the
# GNU Affero General Public License version 3 (see the file LICENSE).
"""Full update process."""
@@ -11,6 +11,10 @@
import subprocess
import sys
+from lp.services.database.sqlbase import (
+ connect,
+ ISOLATION_LEVEL_AUTOCOMMIT,
+ )
from lp.services.scripts import (
db_options,
logger,
@@ -19,6 +23,7 @@
from preflight import (
KillConnectionsPreflight,
NoConnectionCheckPreflight,
+ streaming_sync
)
import security # security.py script
import upgrade # upgrade.py script
@@ -159,7 +164,22 @@
return security_rc
security_run = True
- log.info("All database upgrade steps completed")
+ log.info("All database upgrade steps completed. Waiting for sync.")
+
+ # Increase this timeout once we are confident in the implementation.
+ # We don't want to block rollouts unnecessarily with slow
+ # timeouts and a flaky sync detection implementation.
+ streaming_sync_timeout = 60
+
+ sync = streaming_sync(
+ connect(isolation=ISOLATION_LEVEL_AUTOCOMMIT),
+ streaming_sync_timeout)
+ if sync:
+ log.debug('Streaming replicas in sync.')
+ else:
+ log.error(
+ 'Streaming replicas failed to sync after %d seconds.',
+ streaming_sync_timeout)
log.info("Restarting pgbouncer")
pgbouncer_rc = run_pgbouncer(log, 'start')
=== modified file 'database/schema/preflight.py'
--- database/schema/preflight.py 2012-07-19 14:38:04 +0000
+++ database/schema/preflight.py 2012-07-19 15:07:23 +0000
@@ -8,6 +8,7 @@
'DatabasePreflight',
'KillConnectionsPreflight',
'NoConnectionCheckPreflight',
+ 'streaming_sync',
]
import _pythonpath
@@ -327,34 +328,22 @@
slony_success = replication.helpers.sync(30, exit_on_fail=False)
if slony_success:
self.log.info(
- "Replication events are being propagated.")
+ "Slony replication events are being propagated.")
else:
self.log.fatal(
- "Replication events are not being propagated.")
- self.log.fatal(
- "One or more replication daemons may be down.")
- self.log.fatal(
- "Bounce the replication daemons and check the logs.")
+ "Slony replication events are not being propagated.")
+ self.log.fatal(
+ "One or more slony replication daemons may be down.")
+ self.log.fatal(
+ "Bounce the slony replication daemons and check logs.")
- streaming_success = False
# PG 9.1 streaming replication, or no replication.
- #
- cur = self.lpmain_master_node.con.cursor()
- # Force a WAL switch, returning the current position.
- cur.execute('SELECT pg_switch_xlog()')
- wal_point = cur.fetchone()[0]
- self.log.debug('WAL at %s', wal_point)
- start_time = time.time()
- while time.time() < start_time + 30:
- cur.execute("""
- SELECT FALSE FROM pg_stat_replication
- WHERE replay_location < %s LIMIT 1
- """, (wal_point,))
- if cur.fetchone() is None:
- # All slaves, possibly 0, are in sync.
- streaming_success = True
- break
- time.sleep(0.2)
+ streaming_success = streaming_sync(self.lpmain_master_node.con, 30)
+ if streaming_success:
+ self.log.info("Streaming replicas syncing.")
+ else:
+ self.log.fatal("Streaming replicas not syncing.")
+
return slony_success and streaming_success
def report_patches(self):
@@ -448,6 +437,32 @@
return all_clear
+def streaming_sync(con, timeout=None):
+ """Wait for streaming replicas to synchronize with master as of now.
+
+ :param timeout: seconds to wait, None for no timeout.
+
+    :returns: True if sync happened or there are no streaming replicas;
+        False if the timeout was passed.
+ """
+ cur = con.cursor()
+
+ # Force a WAL switch, returning the current position.
+ cur.execute('SELECT pg_switch_xlog()')
+ wal_point = cur.fetchone()[0]
+ start_time = time.time()
+ while timeout is None or time.time() < start_time + timeout:
+ cur.execute("""
+ SELECT FALSE FROM pg_stat_replication
+ WHERE replay_location < %s LIMIT 1
+ """, (wal_point,))
+ if cur.fetchone() is None:
+ # All slaves, possibly 0, are in sync.
+ return True
+ time.sleep(0.2)
+ return False
+
+
def main():
parser = OptionParser()
db_options(parser)
Follow ups