← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] ~cjwatson/launchpad:librarian-gc-ram into launchpad:master

 

Colin Watson has proposed merging ~cjwatson/launchpad:librarian-gc-ram into launchpad:master.

Commit message:
librarian-gc: Use a server-side cursor for iterating LFC IDs

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)
Related bugs:
  Bug #853066 in Launchpad itself: "librarian-gc uses the Storm store's raw DB connection"
  https://bugs.launchpad.net/launchpad/+bug/853066
  Bug #1939876 in Launchpad itself: "librarian-gc has very high memory usage"
  https://bugs.launchpad.net/launchpad/+bug/1939876

For more details, see:
https://code.launchpad.net/~cjwatson/launchpad/+git/launchpad/+merge/407071

If we use a normal client-side cursor, then psycopg2 reads all the rows into memory up-front, defeating our attempt to avoid doing so.

In order to do this, I had to also fix bug 853066, because Storm's `ConnectionWrapper.cursor` doesn't currently support creating server-side cursors (though that could certainly be fixed).
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of ~cjwatson/launchpad:librarian-gc-ram into launchpad:master.
diff --git a/cronscripts/librarian-gc.py b/cronscripts/librarian-gc.py
index cc18c12..c767e73 100755
--- a/cronscripts/librarian-gc.py
+++ b/cronscripts/librarian-gc.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python3 -S
 #
-# Copyright 2009-2011 Canonical Ltd.  This software is licensed under the
+# Copyright 2009-2021 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 """Librarian garbage collector.
@@ -16,9 +16,14 @@ import _pythonpath  # noqa: F401
 
 import logging
 
-from lp.services.config import config
-from lp.services.database.interfaces import IStore
-from lp.services.librarian.model import LibraryFileAlias
+from lp.services.config import (
+    config,
+    dbconfig,
+    )
+from lp.services.database.sqlbase import (
+    connect,
+    ISOLATION_LEVEL_AUTOCOMMIT,
+    )
 from lp.services.librarianserver import librariangc
 from lp.services.scripts.base import LaunchpadCronScript
 
@@ -63,15 +68,12 @@ class LibrarianGC(LaunchpadCronScript):
         if self.options.loglevel <= logging.DEBUG:
             librariangc.debug = True
 
-        # XXX wgrant 2011-09-18 bug=853066: Using Storm's raw connection
-        # here is wrong. We should either create our own or use
-        # Store.execute or cursor() and the transaction module.
-        store = IStore(LibraryFileAlias)
-        conn = store._connection._raw_connection
+        conn = connect(
+            user=dbconfig.dbuser, isolation=ISOLATION_LEVEL_AUTOCOMMIT)
 
         # Refuse to run if we have significant clock skew between the
         # librarian and the database.
-        librariangc.confirm_no_clock_skew(store)
+        librariangc.confirm_no_clock_skew(conn)
 
         # Note that each of these next steps will issue commit commands
         # as appropriate to make this script transaction friendly
diff --git a/lib/lp/services/librarianserver/librariangc.py b/lib/lp/services/librarianserver/librariangc.py
index b924409..71c3355 100644
--- a/lib/lp/services/librarianserver/librariangc.py
+++ b/lib/lp/services/librarianserver/librariangc.py
@@ -1,4 +1,4 @@
-# Copyright 2009-2020 Canonical Ltd.  This software is licensed under the
+# Copyright 2009-2021 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 """Librarian garbage collection routines"""
@@ -31,7 +31,6 @@ from lp.services.database.postgresql import (
     listReferences,
     quoteIdentifier,
     )
-from lp.services.database.sqlbase import get_transaction_timestamp
 from lp.services.features import getFeatureFlag
 from lp.services.librarianserver import swift
 from lp.services.librarianserver.storage import (
@@ -116,14 +115,19 @@ def sha1_file(content_id):
     return hasher.hexdigest(), length
 
 
-def confirm_no_clock_skew(store):
+def confirm_no_clock_skew(con):
     """Raise an exception if there is significant clock skew between the
     database and this machine.
 
     It is theoretically possible to lose data if there is more than several
     hours of skew.
     """
-    db_now = get_transaction_timestamp(store)
+    cur = con.cursor()
+    try:
+        cur.execute("SELECT CURRENT_TIMESTAMP AT TIME ZONE 'UTC'")
+        db_now = cur.fetchone()[0].replace(tzinfo=pytz.UTC)
+    finally:
+        cur.close()
     local_now = _utcnow()
     five_minutes = timedelta(minutes=5)
 
@@ -610,10 +614,18 @@ def delete_unreferenced_content(con):
 
 
 def delete_unwanted_files(con):
-    delete_unwanted_disk_files(con)
-    swift_enabled = getFeatureFlag('librarian.swift.enabled') or False
-    if swift_enabled:
-        delete_unwanted_swift_files(con)
+    con.rollback()
+    orig_autocommit = con.autocommit
+    try:
+        # Disable autocommit so that we can use named cursors.
+        con.autocommit = False
+        delete_unwanted_disk_files(con)
+        swift_enabled = getFeatureFlag('librarian.swift.enabled') or False
+        if swift_enabled:
+            delete_unwanted_swift_files(con)
+    finally:
+        con.rollback()
+        con.autocommit = orig_autocommit
 
 
 def delete_unwanted_disk_files(con):
@@ -629,7 +641,7 @@ def delete_unwanted_disk_files(con):
 
     swift_enabled = getFeatureFlag('librarian.swift.enabled') or False
 
-    cur = con.cursor()
+    cur = con.cursor(name="librariangc_disk_lfcs")
 
     # Calculate all stored LibraryFileContent ids that we want to keep.
     # Results are ordered so we don't have to suck them all in at once.
@@ -737,6 +749,7 @@ def delete_unwanted_disk_files(con):
                 "was not found on disk." % next_wanted_content_id)
             next_wanted_content_id = get_next_wanted_content_id()
 
+    cur.close()
     log.info(
         "Deleted %d files from disk that were no longer referenced "
         "in the db." % removed_count)
@@ -791,12 +804,16 @@ def delete_unwanted_swift_files(con):
 
     log.info("Deleting unwanted files from Swift.")
 
-    cur = con.cursor()
-
     # Get the largest LibraryFileContent id in the database. This lets
     # us know when to stop looking in Swift for more files.
-    cur.execute("SELECT max(id) FROM LibraryFileContent")
-    max_lfc_id = cur.fetchone()[0]
+    cur = con.cursor()
+    try:
+        cur.execute("SELECT max(id) FROM LibraryFileContent")
+        max_lfc_id = cur.fetchone()[0]
+    finally:
+        cur.close()
+
+    cur = con.cursor(name="librariangc_swift_lfcs")
 
     # Calculate all stored LibraryFileContent ids that we want to keep.
     # Results are ordered so we don't have to suck them all in at once.
@@ -877,6 +894,7 @@ def delete_unwanted_swift_files(con):
                 "but was not found in Swift.".format(next_wanted_content_id))
         next_wanted_content_id = get_next_wanted_content_id()
 
+    cur.close()
     log.info(
         "Deleted {0} files from Swift that were no longer referenced "
         "in the db.".format(removed_count))
diff --git a/lib/lp/services/librarianserver/tests/test_gc.py b/lib/lp/services/librarianserver/tests/test_gc.py
index 9163899..31c3830 100644
--- a/lib/lp/services/librarianserver/tests/test_gc.py
+++ b/lib/lp/services/librarianserver/tests/test_gc.py
@@ -1,4 +1,4 @@
-# Copyright 2009-2020 Canonical Ltd.  This software is licensed under the
+# Copyright 2009-2021 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 """Librarian garbage collection tests"""
@@ -96,8 +96,8 @@ class TestLibrarianGarbageCollectionBase:
         # Make sure that every file the database knows about exists on disk.
         # We manually remove them for tests that need to cope with missing
         # library items.
-        self.store = IMasterStore(LibraryFileContent)
-        for content in self.store.find(LibraryFileContent):
+        store = IMasterStore(LibraryFileContent)
+        for content in store.find(LibraryFileContent):
             path = librariangc.get_file_path(content.id)
             if not os.path.exists(path):
                 if not os.path.exists(os.path.dirname(path)):
@@ -596,13 +596,13 @@ class TestLibrarianGarbageCollectionBase:
 
     def test_confirm_no_clock_skew(self):
         # There should not be any clock skew when running the test suite.
-        librariangc.confirm_no_clock_skew(self.store)
+        librariangc.confirm_no_clock_skew(self.con)
 
         # To test this function raises an excption when it should,
         # fool the garbage collector into thinking it is tomorrow.
         with self.librariangc_thinking_it_is_tomorrow():
             self.assertRaises(
-                Exception, librariangc.confirm_no_clock_skew, (self.store,)
+                Exception, librariangc.confirm_no_clock_skew, (self.con,)
                 )
 
 
diff --git a/lib/lp/testing/pgsql.py b/lib/lp/testing/pgsql.py
index c4dffc4..373b021 100644
--- a/lib/lp/testing/pgsql.py
+++ b/lib/lp/testing/pgsql.py
@@ -1,4 +1,4 @@
-# Copyright 2009-2011 Canonical Ltd.  This software is licensed under the
+# Copyright 2009-2021 Canonical Ltd.  This software is licensed under the
 # GNU Affero General Public License version 3 (see the file LICENSE).
 
 '''
@@ -69,8 +69,8 @@ class ConnectionWrapper:
         finally:
             ConnectionWrapper.committed = True
 
-    def cursor(self):
-        return CursorWrapper(self.real_connection.cursor())
+    def cursor(self, *args, **kwargs):
+        return CursorWrapper(self.real_connection.cursor(*args, **kwargs))
 
     def __getattr__(self, key):
         return getattr(self.real_connection, key)