
launchpad-reviewers team mailing list archive

[Merge] lp:~wgrant/launchpad/gc-dupe-lighter into lp:launchpad

 

William Grant has proposed merging lp:~wgrant/launchpad/gc-dupe-lighter into lp:launchpad.

Commit message:
Don't hash duplicates in librarian-gc.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~wgrant/launchpad/gc-dupe-lighter/+merge/297878

Don't hash duplicates in librarian-gc.

Instead of comparing everything, just check that the original file's hash matches the DB and trust that the rest haven't collided. Pedantically verifying that every duplicate file matches the original file bit by bit is very slow now that we have a lot of duplicates and most of them have to be downloaded from Swift.
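
In other words, the collector now streams the kept file once, hashing and counting bytes as it reads, and compares the result against the LibraryFileContent row; the duplicates are trusted to match on (sha1, filesize) alone. A minimal sketch of that check, assuming a binary read stream (verify_original and CHUNK_SIZE are illustrative names, not part of the patch; the real helper is sha1_file in the diff below):

    import hashlib

    CHUNK_SIZE = 256 * 1024  # illustrative; the patch uses STREAM_CHUNK_SIZE

    def verify_original(stream, expected_sha1, expected_size):
        # One pass over the file: hash and count bytes as we read.
        hasher = hashlib.sha1()
        length = 0
        for chunk in iter(lambda: stream.read(CHUNK_SIZE), b''):
            hasher.update(chunk)
            length += len(chunk)
        # A mismatch means the original itself is corrupt, so its
        # duplicates must not be deleted; the patch logs and exits.
        return hasher.hexdigest() == expected_sha1 and length == expected_size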
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~wgrant/launchpad/gc-dupe-lighter into lp:launchpad.
=== modified file 'lib/lp/services/librarianserver/librariangc.py'
--- lib/lp/services/librarianserver/librariangc.py	2016-05-05 06:10:32 +0000
+++ lib/lp/services/librarianserver/librariangc.py	2016-06-20 08:26:14 +0000
@@ -10,6 +10,7 @@
     timedelta,
     )
 import errno
+import hashlib
 import os
 import re
 import sys
@@ -91,17 +92,15 @@
     return None  # File not found.
 
 
-def same_file(content_id_1, content_id_2):
-    file1 = open_stream(content_id_1)
-    file2 = open_stream(content_id_2)
-
-    chunks_iter = iter(
-        lambda: (file1.read(STREAM_CHUNK_SIZE), file2.read(STREAM_CHUNK_SIZE)),
-        ('', ''))
-    for chunk1, chunk2 in chunks_iter:
-        if chunk1 != chunk2:
-            return False
-    return True
+def sha1_file(content_id):
+    file = open_stream(content_id)
+    chunks_iter = iter(lambda: file.read(STREAM_CHUNK_SIZE), '')
+    length = 0
+    hasher = hashlib.sha1()
+    for chunk in chunks_iter:
+        hasher.update(chunk)
+        length += len(chunk)
+    return hasher.hexdigest(), length
 
 
 def confirm_no_clock_skew(con):
@@ -222,18 +221,18 @@
         # most likely to exist on the staging server (it should be
         # irrelevant on production).
         cur.execute("""
-            SELECT id
+            SELECT id, sha1, filesize
             FROM LibraryFileContent
             WHERE sha1=%(sha1)s AND filesize=%(filesize)s
             ORDER BY datecreated DESC
             """, vars())
-        dupes = [row[0] for row in cur.fetchall()]
+        dupes = cur.fetchall()
 
         if debug:
             log.debug("Found duplicate LibraryFileContents")
             # Spit out more info in case it helps work out where
             # dupes are coming from.
-            for dupe_id in dupes:
+            for dupe_id, _, _ in dupes:
                 cur.execute("""
                     SELECT id, filename, mimetype FROM LibraryFileAlias
                     WHERE content = %(dupe_id)s
@@ -246,7 +245,7 @@
         # and cope - just report and skip. However, on staging this will
         # be more common because database records has been synced from
         # production but the actual librarian contents has not.
-        dupe1_id = dupes[0]
+        dupe1_id = dupes[0][0]
         if not file_exists(dupe1_id):
             if config.instance_name == 'staging':
                 log.debug(
@@ -256,31 +255,25 @@
                         "LibraryFileContent %d data is missing", dupe1_id)
             continue
 
-        # Do a manual check that they really are identical, because we
-        # employ paranoids. And we might as well cope with someone breaking
-        # SHA1 enough that it becomes possible to create a SHA1 collision
-        # with an identical filesize to an existing file. Which is pretty
-        # unlikely. Where did I leave my tin foil hat?
-        for dupe2_id in (dupe for dupe in dupes[1:]):
-            # Check paths exist, because on staging they may not!
-            if (file_exists(dupe2_id) and not same_file(dupe1_id, dupe2_id)):
-                log.error(
-                        "SHA-1 collision found. LibraryFileContent %d and "
-                        "%d have the same SHA1 and filesize, but are not "
-                        "byte-for-byte identical.",
-                        dupe1_id, dupe2_id
-                        )
-                sys.exit(1)
+        # Check that the first file is intact. Don't want to delete
+        # dupes if we might need them to recover the original.
+        actual_sha1, actual_size = sha1_file(dupe1_id)
+        if actual_sha1 != dupes[0][1] or actual_size != dupes[0][2]:
+            log.error(
+                "Corruption found. LibraryFileContent %d has SHA-1 %s and "
+                "size %d, expected %s and %d.", dupes[0][0],
+                actual_sha1, actual_size, dupes[0][1], dupes[0][2])
+            sys.exit(1)
 
         # Update all the LibraryFileAlias entries to point to a single
         # LibraryFileContent
-        prime_id = dupes[0]
-        other_ids = ', '.join(str(dupe) for dupe in dupes[1:])
+        prime_id = dupes[0][0]
+        other_ids = ', '.join(str(dupe) for dupe, _, _ in dupes[1:])
         log.debug(
             "Making LibraryFileAliases referencing %s reference %s instead",
             other_ids, prime_id
             )
-        for other_id in dupes[1:]:
+        for other_id, _, _ in dupes[1:]:
             cur.execute("""
                 UPDATE LibraryFileAlias SET content=%(prime_id)s
                 WHERE content = %(other_id)s

=== modified file 'lib/lp/services/librarianserver/tests/test_gc.py'
--- lib/lp/services/librarianserver/tests/test_gc.py	2015-01-14 06:02:50 +0000
+++ lib/lp/services/librarianserver/tests/test_gc.py	2016-06-20 08:26:14 +0000
@@ -851,16 +851,27 @@
         self.unexpired_blob_id = cur.fetchone()[0]
         self.layer.txn.commit()
 
-        # Make sure all the librarian files actually exist on disk
+        # Make sure all the librarian files actually exist on disk with
+        # hashes matching the DB. We use the hash as the new file
+        # content, to preserve existing duplicate relationships.
+        switch_dbuser('testadmin')
         cur = cursor()
-        cur.execute("SELECT id FROM LibraryFileContent")
-        for content_id in (row[0] for row in cur.fetchall()):
+        cur.execute("SELECT id, sha1 FROM LibraryFileContent")
+        for content_id, sha1 in cur.fetchall():
             path = librariangc.get_file_path(content_id)
             if not os.path.exists(path):
                 if not os.path.exists(os.path.dirname(path)):
                     os.makedirs(os.path.dirname(path))
-                open(path, 'w').write('whatever')
-        self.layer.txn.abort()
+                data = sha1
+                open(path, 'w').write(data)
+                cur.execute(
+                    "UPDATE LibraryFileContent "
+                    "SET md5 = %s, sha1 = %s, sha256 = %s, filesize = %s "
+                    "WHERE id = %s",
+                    (hashlib.md5(data).hexdigest(),
+                     hashlib.sha1(data).hexdigest(),
+                     hashlib.sha256(data).hexdigest(), len(data), content_id))
+        self.layer.txn.commit()
 
         switch_dbuser(config.librarian_gc.dbuser)
 

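A note on the test change above: the fixture now writes each row's old sha1 hex digest as the missing file's content, so any LibraryFileContent rows that shared a sha1 before the rewrite receive byte-identical data and remain duplicates under the recomputed hashes. A toy illustration, with invented rows:

    import hashlib

    # Hypothetical (id, old sha1) rows, two sharing a digest.
    rows = [(1, 'aaa'), (2, 'aaa'), (3, 'bbb')]

    new_sha1 = {}
    for content_id, old_sha1 in rows:
        data = old_sha1  # the fixture writes the old digest as file content
        new_sha1[content_id] = hashlib.sha1(data.encode()).hexdigest()

    # Rows 1 and 2 still collide; row 3 still does not.
    assert new_sha1[1] == new_sha1[2] != new_sha1[3]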
