← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] ~ilkeremrekoc/launchpad:librarian-gc-file-dryrun into launchpad:master

 

İlker Emre Koç has proposed merging ~ilkeremrekoc/launchpad:librarian-gc-file-dryrun into launchpad:master.

Commit message:
Add dry-run to librarian-gc file deletions

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~ilkeremrekoc/launchpad/+git/launchpad/+merge/491422

To test a change to the librarian-gc in a safe manner, we need a
dry-run to ensure nothing is deleted when we run the file deletion
script.

-- 
Your team Launchpad code reviewers is requested to review the proposed merge of ~ilkeremrekoc/launchpad:librarian-gc-file-dryrun into launchpad:master.
diff --git a/cronscripts/librarian-gc.py b/cronscripts/librarian-gc.py
index 7f8d39e..b5aa082 100755
--- a/cronscripts/librarian-gc.py
+++ b/cronscripts/librarian-gc.py
@@ -24,6 +24,14 @@ class LibrarianGC(LaunchpadCronScript):
     def add_my_options(self):
         self.parser.add_option(
             "",
+            "--dry-run-files",
+            action="store_true",
+            default=False,
+            dest="dry_run_files",
+            help="Run over the file deletions without deleting any files",
+        )
+        self.parser.add_option(
+            "",
             "--skip-duplicates",
             action="store_true",
             default=False,
@@ -103,7 +111,7 @@ class LibrarianGC(LaunchpadCronScript):
             # Second sweep.
             librariangc.delete_unreferenced_content(conn)
         if not self.options.skip_files:
-            librariangc.delete_unwanted_files(conn)
+            librariangc.delete_unwanted_files(conn, self.options.dry_run_files)
 
 
 if __name__ == "__main__":
diff --git a/lib/lp/services/librarianserver/librariangc.py b/lib/lp/services/librarianserver/librariangc.py
index 5c04c55..1b82f11 100644
--- a/lib/lp/services/librarianserver/librariangc.py
+++ b/lib/lp/services/librarianserver/librariangc.py
@@ -669,22 +669,22 @@ def delete_unreferenced_content(con):
     loop_tuner.run()
 
 
-def delete_unwanted_files(con):
+def delete_unwanted_files(con, dry_run=False):
     con.rollback()
     orig_autocommit = con.autocommit
     try:
         # Disable autocommit so that we can use named cursors.
         con.autocommit = False
-        delete_unwanted_disk_files(con)
+        delete_unwanted_disk_files(con, dry_run)
         swift_enabled = getFeatureFlag("librarian.swift.enabled") or False
         if swift_enabled:
-            delete_unwanted_swift_files(con)
+            delete_unwanted_swift_files(con, dry_run)
     finally:
         con.rollback()
         con.autocommit = orig_autocommit
 
 
-def delete_unwanted_disk_files(con):
+def delete_unwanted_disk_files(con, dry_run=False):
     """Delete files found on disk that have no corresponding record in the
     database.
 
@@ -693,7 +693,10 @@ def delete_unwanted_disk_files(con):
     the database records committed.
     """
 
-    log.info("Deleting unwanted files from disk.")
+    if dry_run:
+        log.info("Dry run - not deleting any files from disk.")
+    else:
+        log.info("Deleting unwanted files from disk.")
 
     swift_enabled = getFeatureFlag("librarian.swift.enabled") or False
 
@@ -819,8 +822,11 @@ def delete_unwanted_disk_files(con):
                     )
                 else:
                     # File uploaded a while ago but no longer wanted.
-                    os.unlink(path)
-                    log.debug3("Deleted %s" % path)
+                    if dry_run:
+                        log.info("Would delete %s" % path)
+                    else:
+                        os.unlink(path)
+                        log.debug3("Deleted %s" % path)
                     removed_count += 1
 
     # Report any remaining LibraryFileContent that the database says
@@ -837,10 +843,16 @@ def delete_unwanted_disk_files(con):
             next_wanted_content_id = next(content_id_iter, None)
 
     cur.close()
-    log.info(
-        "Deleted %d files from disk that were no longer referenced "
-        "in the db." % removed_count
-    )
+    if dry_run:
+        log.info(
+            "Would have deleted %d files from disk that were no longer "
+            "referenced in the db." % removed_count
+        )
+    else:
+        log.info(
+            "Deleted %d files from disk that were no longer referenced "
+            "in the db." % removed_count
+        )
 
 
 def swift_files(max_lfc_id):
@@ -896,11 +908,14 @@ def swift_files(max_lfc_id):
                 seen_names.add((obj["name"], pool_index))
 
 
-def delete_unwanted_swift_files(con):
+def delete_unwanted_swift_files(con, dry_run=False):
     """Delete files found in Swift that have no corresponding db record."""
     assert getFeatureFlag("librarian.swift.enabled")
 
-    log.info("Deleting unwanted files from Swift.")
+    if dry_run:
+        log.info("Dry run - not deleting any files from Swift.")
+    else:
+        log.info("Deleting unwanted files from Swift.")
 
     # Get the largest LibraryFileContent id in the database. This lets
     # us know when to stop looking in Swift for more files.
@@ -992,13 +1007,21 @@ def delete_unwanted_swift_files(con):
             else:
                 with swift.connection(connection_pool) as swift_connection:
                     try:
-                        swift_connection.delete_object(container, name)
-                        log.debug3(
-                            "Deleted (%s, %s) from Swift (%s)",
-                            container,
-                            name,
-                            connection_pool.os_auth_url,
-                        )
+                        if dry_run:
+                            log.info(
+                                "Would delete (%s, %s) from Swift (%s)",
+                                container,
+                                name,
+                                connection_pool.os_auth_url,
+                            )
+                        else:
+                            swift_connection.delete_object(container, name)
+                            log.debug3(
+                                "Deleted (%s, %s) from Swift (%s)",
+                                container,
+                                name,
+                                connection_pool.os_auth_url,
+                            )
                     except swiftclient.ClientException as e:
                         if e.http_status != 404:
                             log.exception(
@@ -1032,10 +1055,18 @@ def delete_unwanted_swift_files(con):
         next_wanted_content_id = next(content_id_iter, None)
 
     cur.close()
-    log.info(
-        "Deleted {} files from Swift that were no longer referenced "
-        "in the db.".format(removed_count)
-    )
+
+    if dry_run:
+        log.info(
+            "Dry run - would have deleted {} files from Swift.".format(
+                removed_count
+            )
+        )
+    else:
+        log.info(
+            "Deleted {} files from Swift that were no longer referenced "
+            "in the db.".format(removed_count)
+        )
 
 
 def get_file_path(content_id):
diff --git a/lib/lp/services/librarianserver/tests/test_gc.py b/lib/lp/services/librarianserver/tests/test_gc.py
index 23a7eac..efec95f 100644
--- a/lib/lp/services/librarianserver/tests/test_gc.py
+++ b/lib/lp/services/librarianserver/tests/test_gc.py
@@ -517,6 +517,59 @@ class TestLibrarianGarbageCollectionBase:
         for content_id in (row[0] for row in cur.fetchall()):
             self.assertTrue(self.file_exists(content_id))
 
+    def test_delete_unwanted_files_dry_run(self):
+        # Test the --dry-run-files option to the librarian-gc script.
+
+        self.ztm.begin()
+        cur = cursor()
+
+        # We may find files in the LibraryFileContent repository
+        # that do not have an corresponding LibraryFileContent row.
+
+        # Find a content_id we can easily delete and do so. This row is
+        # removed from the database, leaving an orphaned file on the
+        # filesystem that should be removed.
+        cur.execute(
+            """
+            SELECT LibraryFileContent.id
+            FROM LibraryFileContent
+            LEFT OUTER JOIN LibraryFileAlias
+                ON LibraryFileContent.id = content
+            WHERE LibraryFileAlias.id IS NULL
+            LIMIT 1
+            """
+        )
+        content_id = cur.fetchone()[0]
+        cur.execute(
+            """
+                DELETE FROM LibraryFileContent WHERE id=%s
+                """,
+            (content_id,),
+        )
+        self.ztm.commit()
+
+        self.assertTrue(self.file_exists(content_id))
+
+        # To test removal does occur when we want it to, we need to trick
+        # the garbage collector into thinking it is tomorrow.
+        with self.librariangc_thinking_it_is_tomorrow():
+            # Add the dry_run=True argument to ensure no files are deleted.
+            librariangc.delete_unwanted_files(self.con, dry_run=True)
+
+        # The file should still be there.
+        self.assertTrue(self.file_exists(content_id))
+
+        # Make sure nothing else has been removed from disk
+        self.ztm.begin()
+        cur = cursor()
+        cur.execute(
+            """
+                SELECT id FROM LibraryFileContent
+                """
+        )
+        for content_id in (row[0] for row in cur.fetchall()):
+            self.assertTrue(self.file_exists(content_id))
+
     def test_delete_unwanted_files_bug437084(self):
         # There was a bug where delete_unwanted_files() would die
         # if the last file found on disk was unwanted.