launchpad-reviewers team mailing list archive

Thread
Date

[Merge] lp:~stub/launchpad/large-swift-files into lp:launchpad

To: mp+198222@xxxxxxxxxxxxxxxxxx
From: Stuart Bishop <stuart.bishop@xxxxxxxxxxxxx>
Date: Mon, 09 Dec 2013 10:54:26 -0000
Reply-to: mp+198222@xxxxxxxxxxxxxxxxxx
Sender: bounces@xxxxxxxxxxxxx

Stuart Bishop has proposed merging lp:~stub/launchpad/large-swift-files into lp:launchpad with lp:~stub/launchpad/swift-librarian-config as a prerequisite.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)
Related bugs:
  Bug #1257638 in Launchpad itself: "Large file support for Librarian Swift storage system"
  https://bugs.launchpad.net/launchpad/+bug/1257638

For more details, see:
https://code.launchpad.net/~stub/launchpad/large-swift-files/+merge/198222

Swift requires some gymnastics from the client to store objects larger than 5GB.
-- 
https://code.launchpad.net/~stub/launchpad/large-swift-files/+merge/198222
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~stub/launchpad/large-swift-files into lp:launchpad.

=== modified file 'lib/lp/services/librarianserver/librariangc.py'
--- lib/lp/services/librarianserver/librariangc.py	2013-11-07 14:28:46 +0000
+++ lib/lp/services/librarianserver/librariangc.py	2013-12-09 10:53:28 +0000
@@ -769,7 +769,12 @@
 
     for container, obj in swift_files(max_lfc_id):
         name = obj['name']
-        content_id = int(name)
+
+        # We may have a segment of a large file.
+        if '/' in name:
+            content_id = int(name.split('/', 1)[0])
+        else:
+            content_id = int(name)
 
         while (next_wanted_content_id is not None
             and content_id > next_wanted_content_id):

=== modified file 'lib/lp/services/librarianserver/swift.py'
--- lib/lp/services/librarianserver/swift.py	2013-12-09 10:53:28 +0000
+++ lib/lp/services/librarianserver/swift.py	2013-12-09 10:53:28 +0000
@@ -15,6 +15,7 @@
 import re
 import sys
 import time
+import urllib
 
 from swiftclient import client as swiftclient
 
@@ -24,6 +25,7 @@
 
 
 SWIFT_CONTAINER_PREFIX = 'librarian_'
+MAX_SWIFT_OBJECT_SIZE = 5 * 1024 ** 3  # 5GB Swift limit.
 
 ONE_DAY = 24 * 60 * 60
 
@@ -130,29 +132,57 @@
             except swiftclient.ClientException as x:
                 if x.http_status != 404:
                     raise
-                log.info(
-                    'Putting {0} into Swift ({1}, {2})'.format(
-                        lfc, container, obj_name))
-                md5_stream = HashStream(open(fs_path, 'rb'))
-                db_md5_hash = ISlaveStore(LibraryFileContent).get(
-                    LibraryFileContent, lfc).md5
-                swift_md5_hash = swift_connection.put_object(
-                    container, obj_name, md5_stream, os.path.getsize(fs_path))
-                disk_md5_hash = md5_stream.hash.hexdigest()
-                if not (disk_md5_hash == db_md5_hash == swift_md5_hash):
-                    log.error(
-                        "LibraryFileContent({0}) corrupt. "
-                        "disk md5={1}, db md5={2}, swift md5={3}".format(
-                            lfc, disk_md5_hash, db_md5_hash, swift_md5_hash))
-                    try:
-                        swift_connection.delete_object(container, obj_name)
-                    except Exception as x:
-                        log.error('Failed to delete corrupt file from Swift')
-                    raise AssertionError('md5 mismatch')
+                log.info('Putting {0} into Swift ({1}, {2})'.format(
+                    lfc, container, obj_name))
+                _put(log, swift_connection, lfc, container, obj_name, fs_path)
             if remove:
                 os.unlink(fs_path)
 
 
+def _put(log, swift_connection, lfc_id, container, obj_name, fs_path):
+    fs_size = os.path.getsize(fs_path)
+    fs_file = open(fs_path, 'rb')
+    if fs_size <= MAX_SWIFT_OBJECT_SIZE:
+        md5_stream = HashStream(fs_file)
+        db_md5_hash = ISlaveStore(LibraryFileContent).get(
+            LibraryFileContent, lfc_id).md5
+        swift_md5_hash = swift_connection.put_object(
+            container, obj_name, md5_stream, fs_size)
+        disk_md5_hash = md5_stream.hash.hexdigest()
+        if not (disk_md5_hash == db_md5_hash == swift_md5_hash):
+            log.error(
+                "LibraryFileContent({0}) corrupt. "
+                "disk md5={1}, db md5={2}, swift md5={3}".format(
+                    lfc_id, disk_md5_hash, db_md5_hash, swift_md5_hash))
+            try:
+                swift_connection.delete_object(container, obj_name)
+            except Exception:
+                log.exception('Failed to delete corrupt file from Swift')
+            raise AssertionError('md5 mismatch')
+    else:
+        # Large file upload. Create the segments first, then the
+        # manifest. This order prevents partial downloads, and lets us
+        # detect interrupted uploads and clean up.
+        segment = 0
+        while fs_file.tell() < fs_size:
+            assert segment <= 9999, 'Insane number of segments'
+            seg_name = '%s/%04d' % (obj_name, segment)
+            seg_size = min(fs_size - fs_file.tell(), MAX_SWIFT_OBJECT_SIZE)
+            md5_stream = HashStream(fs_file)
+            swift_md5_hash = swift_connection.put_object(
+                container, seg_name, md5_stream, seg_size)
+            segment_md5_hash = md5_stream.hash.hexdigest()
+            assert swift_md5_hash == segment_md5_hash, (
+                "LibraryFileContent({0}) segment {1} upload corrupted".format(
+                    lfc_id, segment))
+            segment = segment + 1
+        manifest = '{0}/{1}/'.format(
+            urllib.quote(container), urllib.quote(obj_name))
+        manifest_headers = {'x-object-manifest': manifest}
+        swift_connection.put_object(
+            container, obj_name, '', 0, headers=manifest_headers)
+
+
 def swift_location(lfc_id):
     '''Return the (container, obj_name) used to store a file.
 
@@ -165,7 +195,9 @@
     # storage, as objects will no longer be found in the expected
     # container. This value and the container prefix are deliberatly
     # hard coded to avoid cockups with values specified in config files.
-    max_objects_per_container = 1000000
+    # While the suggested number is 'under a million', the rare large files
+    # will take up multiple slots so we choose a more conservative number.
+    max_objects_per_container = 500000
 
     container_num = lfc_id // max_objects_per_container
 

=== modified file 'lib/lp/services/librarianserver/tests/test_swift.py'
--- lib/lp/services/librarianserver/tests/test_swift.py	2013-11-12 15:03:38 +0000
+++ lib/lp/services/librarianserver/tests/test_swift.py	2013-12-09 10:53:28 +0000
@@ -10,6 +10,7 @@
 import time
 
 from mock import patch
+from swiftclient import client as swiftclient
 import transaction
 
 from lp.services.database import write_transaction
@@ -147,8 +148,11 @@
             data = self.librarian_client.getFileByAlias(lfa_id).read()
             self.assertEqual(content, data)
 
-    def test_large_binary_files_from_disk(self):
-        # Generate a large blob, including null bytes for kicks.
+    def test_largish_binary_files_from_disk(self):
+        # Generate a largish blob, including null bytes for kicks.
+        # A largish file is large enough that the HTTP upload needs
+        # to be done in multiple chunks, but small enough that it is
+        # stored in Swift as a single object.
         size = 512 * 1024  # 512KB
         expected_content = ''.join(chr(i % 256) for i in range(0, size))
         lfa_id = self.add_file('hello_bigboy.xls', expected_content)
@@ -157,9 +161,12 @@
         lfa = self.librarian_client.getFileByAlias(lfa_id)
         self.assertEqual(expected_content, lfa.read())
 
-    def test_large_binary_files_from_swift(self):
+    def test_largish_binary_files_from_swift(self):
         # Generate large blob, multiple of the chunk size.
         # Including null bytes for kicks.
+        # A largish file is large enough that the HTTP upload needs
+        # to be done in multiple chunks, but small enough that it is
+        # stored in Swift as a single object.
         size = LibrarianStorage.CHUNK_SIZE * 50
         self.assert_(size > 1024 * 1024)
         expected_content = ''.join(chr(i % 256) for i in range(0, size))
@@ -176,10 +183,12 @@
         lfa = self.librarian_client.getFileByAlias(lfa_id)
         self.assertEqual(expected_content, lfa.read())
 
-
-    def test_large_binary_files_from_swift_offset(self):
+    def test_largish_binary_files_from_swift_offset(self):
         # Generate large blob, but NOT a multiple of the chunk size.
         # Including null bytes for kicks.
+        # A largish file is large enough that the HTTP upload needs
+        # to be done in multiple chunks, but small enough that it is
+        # stored in Swift as a single object.
         size = LibrarianStorage.CHUNK_SIZE * 50 + 1
         self.assert_(size > 1024 * 1024)
         expected_content = ''.join(chr(i % 256) for i in range(0, size))
@@ -195,3 +204,44 @@
         lfa = self.librarian_client.getFileByAlias(lfa_id)
         self.failIf(os.path.exists(swift.filesystem_path(lfc.id)))
         self.assertEqual(expected_content, lfa.read())
+
+    def test_large_file_to_swift(self):
+        # Generate a blob large enough that Swift requires us to store
+        # it as multiple objects plus a manifest.
+        size = LibrarianStorage.CHUNK_SIZE * 50
+        self.assert_(size > 1024 * 1024)
+        expected_content = ''.join(chr(i % 256) for i in range(0, size))
+        lfa_id = self.add_file('hello_bigboy.xls', expected_content)
+        lfa = IStore(LibraryFileAlias).get(LibraryFileAlias, lfa_id)
+        lfc = lfa.content
+
+        # We don't really want to upload a file >5GB to our mock Swift,
+        # so change the constant instead. Set it so we need 3 segments.
+        def _reset_max(val):
+            swift.MAX_SWIFT_OBJECT_SIZE = val
+        self.addCleanup(_reset_max, swift.MAX_SWIFT_OBJECT_SIZE)
+        swift.MAX_SWIFT_OBJECT_SIZE = int(size / 2) - 1
+
+        # Shove the file requiring multiple segments into Swift.
+        swift.to_swift(BufferLogger(), remove=False)
+
+        # As our mock Swift does not support multi-segment files,
+        # instead we examine it directly in Swift as best we can.
+        swift_client = self.swift_fixture.connect()
+
+        # The manifest exists. Unfortunately, we can't test that the
+        # magic manifest header is set correctly.
+        container, name = swift.swift_location(lfc.id)
+        headers, obj = swift_client.get_object(container, name)
+        self.assertEqual(obj, '')
+
+        # The segments we expect are all in their expected locations.
+        _, obj1 = swift_client.get_object(container, '{0}/0000'.format(name))
+        _, obj2 = swift_client.get_object(container, '{0}/0001'.format(name))
+        _, obj3 = swift_client.get_object(container, '{0}/0002'.format(name))
+        self.assertRaises(
+            swiftclient.ClientException, swift_client.get_object,
+            container, '{0}/0003'.format(name))
+
+        # Our object round tripped
+        self.assertEqual(obj1 + obj2 + obj3, expected_content)

Follow ups

[Merge] lp:~stub/launchpad/large-swift-files into lp:launchpad
From: noreply, 2014-01-14
[Merge] lp:~stub/launchpad/large-swift-files into lp:launchpad
From: Stuart Bishop, 2014-01-14
Re: [Merge] lp:~stub/launchpad/large-swift-files into lp:launchpad
From: William Grant, 2014-01-14