[Merge] lp:~stub/launchpad/db-deploy into lp:launchpad

 

Stuart Bishop has proposed merging lp:~stub/launchpad/db-deploy into lp:launchpad.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)
Related bugs:
  Bug #906222 in Launchpad itself: "More BAD_USERS creating false positives during fast downtime updates"
  https://bugs.launchpad.net/launchpad/+bug/906222
  Bug #911417 in Launchpad itself: "Staging db restore fails on make -C database/replication stagingsetup"
  https://bugs.launchpad.net/launchpad/+bug/911417
  Bug #1025396 in Launchpad itself: "deployment scripts need to cope with streaming replication"
  https://bugs.launchpad.net/launchpad/+bug/1025396
  Bug #1025399 in Launchpad itself: "full-update.py should wait for streaming slaves before completing"
  https://bugs.launchpad.net/launchpad/+bug/1025399

For more details, see:
https://code.launchpad.net/~stub/launchpad/db-deploy/+merge/115761

= Summary =

After applying database patches, we should wait for the streaming replicas to catch up before letting clients reconnect.

== Proposed fix ==

Extract the streaming replication sync check from preflight.py into a
reusable streaming_sync() function, and call it from full-update.py once
all the upgrade and security steps have completed. full-update.py waits up
to 60 seconds for the streaming replicas to catch up before restarting
pgbouncer; on timeout it logs an error but lets the rollout proceed.
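For reviewers skimming without the diff: the technique is to force a WAL
segment switch on the master and then poll pg_stat_replication (PostgreSQL
9.1) until no replica's replay_location is behind that point. A minimal
standalone sketch of the same idea, using psycopg2 directly and a
hypothetical wait_for_streaming_sync() helper rather than the
lp.services.database wrappers:

    import time

    import psycopg2
    from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT


    def wait_for_streaming_sync(dsn, timeout=60):
        """Return True once all streaming replicas have replayed past now.

        Also returns True if there are no streaming replicas at all;
        returns False if the timeout expires first.
        """
        con = psycopg2.connect(dsn)
        con.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
        cur = con.cursor()
        # Force a WAL switch; the returned location is the point every
        # replica must replay past before we consider it in sync.
        cur.execute('SELECT pg_switch_xlog()')
        wal_point = cur.fetchone()[0]
        start = time.time()
        while time.time() < start + timeout:
            # Any row returned here is a replica still behind wal_point.
            cur.execute("""
                SELECT FALSE FROM pg_stat_replication
                WHERE replay_location < %s LIMIT 1
                """, (wal_point,))
            if cur.fetchone() is None:
                return True
            time.sleep(0.2)
        return False

The 0.2 second poll interval and 60 second default timeout mirror the
values in the patch below; the timeout is deliberately short until the
sync detection has proven itself in production.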

= Launchpad lint =

Checking for conflicts and issues in changed files.

Linting changed files:
  database/schema/full-update.py

./database/schema/full-update.py
       7: '_pythonpath' imported but unused
-- 
https://code.launchpad.net/~stub/launchpad/db-deploy/+merge/115761
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~stub/launchpad/db-deploy into lp:launchpad.
=== modified file 'database/schema/full-update.py'
--- database/schema/full-update.py	2012-07-19 14:38:04 +0000
+++ database/schema/full-update.py	2012-07-19 15:07:23 +0000
@@ -1,5 +1,5 @@
 #!/usr/bin/python2.6 -S
-# Copyright 2011 Canonical Ltd.  This software is licensed under the
+# Copyright 2011-2012 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 """Full update process."""
@@ -11,6 +11,10 @@
 import subprocess
 import sys
 
+from lp.services.database.sqlbase import (
+    connect,
+    ISOLATION_LEVEL_AUTOCOMMIT,
+    )
 from lp.services.scripts import (
     db_options,
     logger,
@@ -19,6 +23,7 @@
 from preflight import (
     KillConnectionsPreflight,
     NoConnectionCheckPreflight,
+    streaming_sync
     )
 import security  # security.py script
 import upgrade  # upgrade.py script
@@ -159,7 +164,22 @@
             return security_rc
         security_run = True
 
-        log.info("All database upgrade steps completed")
+        log.info("All database upgrade steps completed. Waiting for sync.")
+
+        # Increase this timeout once we are confident in the implementation.
+        # We don't want to block rollouts unnecessarily with slow
+        # timeouts and a flaky sync detection implementation.
+        streaming_sync_timeout = 60
+
+        sync = streaming_sync(
+            connect(isolation=ISOLATION_LEVEL_AUTOCOMMIT),
+            streaming_sync_timeout)
+        if sync:
+            log.debug('Streaming replicas in sync.')
+        else:
+            log.error(
+                'Streaming replicas failed to sync after %d seconds.',
+                streaming_sync_timeout)
 
         log.info("Restarting pgbouncer")
         pgbouncer_rc = run_pgbouncer(log, 'start')

=== modified file 'database/schema/preflight.py'
--- database/schema/preflight.py	2012-07-19 14:38:04 +0000
+++ database/schema/preflight.py	2012-07-19 15:07:23 +0000
@@ -8,6 +8,7 @@
     'DatabasePreflight',
     'KillConnectionsPreflight',
     'NoConnectionCheckPreflight',
+    'streaming_sync',
     ]
 
 import _pythonpath
@@ -327,34 +328,22 @@
             slony_success = replication.helpers.sync(30, exit_on_fail=False)
             if slony_success:
                 self.log.info(
-                    "Replication events are being propagated.")
+                    "Slony replication events are being propagated.")
             else:
                 self.log.fatal(
-                    "Replication events are not being propagated.")
-                self.log.fatal(
-                    "One or more replication daemons may be down.")
-                self.log.fatal(
-                    "Bounce the replication daemons and check the logs.")
+                    "Slony replication events are not being propagated.")
+                self.log.fatal(
+                    "One or more slony replication daemons may be down.")
+                self.log.fatal(
+                    "Bounce the slony replication daemons and check logs.")
 
-        streaming_success = False
         # PG 9.1 streaming replication, or no replication.
-        #
-        cur = self.lpmain_master_node.con.cursor()
-        # Force a WAL switch, returning the current position.
-        cur.execute('SELECT pg_switch_xlog()')
-        wal_point = cur.fetchone()[0]
-        self.log.debug('WAL at %s', wal_point)
-        start_time = time.time()
-        while time.time() < start_time + 30:
-            cur.execute("""
-                SELECT FALSE FROM pg_stat_replication
-                WHERE replay_location < %s LIMIT 1
-                """, (wal_point,))
-            if cur.fetchone() is None:
-                # All slaves, possibly 0, are in sync.
-                streaming_success = True
-                break
-            time.sleep(0.2)
+        streaming_success = streaming_sync(self.lpmain_master_node.con, 30)
+        if streaming_success:
+            self.log.info("Streaming replicas syncing.")
+        else:
+            self.log.fatal("Streaming replicas not syncing.")
+
         return slony_success and streaming_success
 
     def report_patches(self):
@@ -448,6 +437,32 @@
         return all_clear
 
 
+def streaming_sync(con, timeout=None):
+    """Wait for streaming replicas to synchronize with master as of now.
+
+    :param timeout: seconds to wait, None for no timeout.
+
+    :returns: True if sync happened or no streaming replicas
+              False if the timeout was passed.
+    """
+    cur = con.cursor()
+
+    # Force a WAL switch, returning the current position.
+    cur.execute('SELECT pg_switch_xlog()')
+    wal_point = cur.fetchone()[0]
+    start_time = time.time()
+    while timeout is None or time.time() < start_time + timeout:
+        cur.execute("""
+            SELECT FALSE FROM pg_stat_replication
+            WHERE replay_location < %s LIMIT 1
+            """, (wal_point,))
+        if cur.fetchone() is None:
+            # All slaves, possibly 0, are in sync.
+            return True
+        time.sleep(0.2)
+    return False
+
+
 def main():
     parser = OptionParser()
     db_options(parser)

