[Merge] lp:~stub/launchpad/large-swift-files into lp:launchpad
Stuart Bishop has proposed merging lp:~stub/launchpad/large-swift-files into lp:launchpad with lp:~stub/launchpad/swift-librarian-config as a prerequisite.
Requested reviews:
Launchpad code reviewers (launchpad-reviewers)
Related bugs:
Bug #1257638 in Launchpad itself: "Large file support for Librarian Swift storage system"
https://bugs.launchpad.net/launchpad/+bug/1257638
For more details, see:
https://code.launchpad.net/~stub/launchpad/large-swift-files/+merge/198222
Swift requires some gymnastics from the client to store objects larger than 5GB.
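For background, this is Swift's "dynamic large object" mechanism: the data is uploaded as consecutive segment objects under a common prefix, followed by a zero-byte manifest object whose x-object-manifest header names that prefix. A minimal sketch of the pattern with python-swiftclient (illustrative names, not the branch's code):

    import os
    import urllib

    MAX_SWIFT_OBJECT_SIZE = 5 * 1024 ** 3  # Swift's per-object limit.

    def upload_large_object(connection, container, obj_name, fs_path):
        '''Store fs_path as segment objects plus a manifest (sketch).

        `connection` is an authenticated swiftclient.client.Connection.
        '''
        fs_size = os.path.getsize(fs_path)
        fs_file = open(fs_path, 'rb')
        segment = 0
        while fs_file.tell() < fs_size:
            seg_size = min(fs_size - fs_file.tell(), MAX_SWIFT_OBJECT_SIZE)
            # put_object stops reading after seg_size bytes, so the same
            # file handle carries over into the next segment.
            connection.put_object(
                container, '%s/%04d' % (obj_name, segment), fs_file, seg_size)
            segment += 1
        # The manifest itself is empty; a GET on it streams the segments
        # back in lexicographic order of their names.
        connection.put_object(
            container, obj_name, '', 0,
            headers={'x-object-manifest': '%s/%s/' % (
                urllib.quote(container), urllib.quote(obj_name))})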
--
https://code.launchpad.net/~stub/launchpad/large-swift-files/+merge/198222
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~stub/launchpad/large-swift-files into lp:launchpad.
=== modified file 'lib/lp/services/librarianserver/librariangc.py'
--- lib/lp/services/librarianserver/librariangc.py 2013-11-07 14:28:46 +0000
+++ lib/lp/services/librarianserver/librariangc.py 2013-12-09 10:53:28 +0000
@@ -769,7 +769,12 @@
for container, obj in swift_files(max_lfc_id):
name = obj['name']
- content_id = int(name)
+
+ # We may have a segment of a large file.
+ if '/' in name:
+ content_id = int(name.split('/', 1)[0])
+ else:
+ content_id = int(name)
while (next_wanted_content_id is not None
and content_id > next_wanted_content_id):
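In other words, segments of a large file are stored under names of the form '<content_id>/<nnnn>', so the garbage collector takes everything before the first '/' as the content id. For example (illustrative values, not code from the branch):

    name = '12345/0003'                      # a segment of content 12345
    content_id = int(name.split('/', 1)[0])  # -> 12345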
=== modified file 'lib/lp/services/librarianserver/swift.py'
--- lib/lp/services/librarianserver/swift.py 2013-12-09 10:53:28 +0000
+++ lib/lp/services/librarianserver/swift.py 2013-12-09 10:53:28 +0000
@@ -15,6 +15,7 @@
import re
import sys
import time
+import urllib
from swiftclient import client as swiftclient
@@ -24,6 +25,7 @@
SWIFT_CONTAINER_PREFIX = 'librarian_'
+MAX_SWIFT_OBJECT_SIZE = 5 * 1024 ** 3 # 5GB Swift limit.
ONE_DAY = 24 * 60 * 60
@@ -130,29 +132,57 @@
except swiftclient.ClientException as x:
if x.http_status != 404:
raise
- log.info(
- 'Putting {0} into Swift ({1}, {2})'.format(
- lfc, container, obj_name))
- md5_stream = HashStream(open(fs_path, 'rb'))
- db_md5_hash = ISlaveStore(LibraryFileContent).get(
- LibraryFileContent, lfc).md5
- swift_md5_hash = swift_connection.put_object(
- container, obj_name, md5_stream, os.path.getsize(fs_path))
- disk_md5_hash = md5_stream.hash.hexdigest()
- if not (disk_md5_hash == db_md5_hash == swift_md5_hash):
- log.error(
- "LibraryFileContent({0}) corrupt. "
- "disk md5={1}, db md5={2}, swift md5={3}".format(
- lfc, disk_md5_hash, db_md5_hash, swift_md5_hash))
- try:
- swift_connection.delete_object(container, obj_name)
- except Exception as x:
- log.error('Failed to delete corrupt file from Swift')
- raise AssertionError('md5 mismatch')
+ log.info('Putting {0} into Swift ({1}, {2})'.format(
+ lfc, container, obj_name))
+ _put(log, swift_connection, lfc, container, obj_name, fs_path)
if remove:
os.unlink(fs_path)
+def _put(log, swift_connection, lfc_id, container, obj_name, fs_path):
+ fs_size = os.path.getsize(fs_path)
+ fs_file = open(fs_path, 'rb')
+ if fs_size <= MAX_SWIFT_OBJECT_SIZE:
+ md5_stream = HashStream(fs_file)
+ db_md5_hash = ISlaveStore(LibraryFileContent).get(
+ LibraryFileContent, lfc_id).md5
+ swift_md5_hash = swift_connection.put_object(
+ container, obj_name, md5_stream, fs_size)
+ disk_md5_hash = md5_stream.hash.hexdigest()
+ if not (disk_md5_hash == db_md5_hash == swift_md5_hash):
+ log.error(
+ "LibraryFileContent({0}) corrupt. "
+ "disk md5={1}, db md5={2}, swift md5={3}".format(
+ lfc_id, disk_md5_hash, db_md5_hash, swift_md5_hash))
+ try:
+ swift_connection.delete_object(container, obj_name)
+ except Exception:
+ log.exception('Failed to delete corrupt file from Swift')
+ raise AssertionError('md5 mismatch')
+ else:
+ # Large file upload. Create the segments first, then the
+ # manifest. This order prevents partial downloads, and lets us
+ # detect interrupted uploads and clean up.
+ segment = 0
+ while fs_file.tell() < fs_size:
+ assert segment <= 9999, 'Insane number of segments'
+ seg_name = '%s/%04d' % (obj_name, segment)
+ seg_size = min(fs_size - fs_file.tell(), MAX_SWIFT_OBJECT_SIZE)
+ md5_stream = HashStream(fs_file)
+ swift_md5_hash = swift_connection.put_object(
+ container, seg_name, md5_stream, seg_size)
+ segment_md5_hash = md5_stream.hash.hexdigest()
+ assert swift_md5_hash == segment_md5_hash, (
+ "LibraryFileContent({0}) segment {1} upload corrupted".format(
+ lfc_id, segment))
+ segment = segment + 1
+ manifest = '{0}/{1}/'.format(
+ urllib.quote(container), urllib.quote(obj_name))
+ manifest_headers = {'x-object-manifest': manifest}
+ swift_connection.put_object(
+ container, obj_name, '', 0, headers=manifest_headers)
+
+
def swift_location(lfc_id):
'''Return the (container, obj_name) used to store a file.
@@ -165,7 +195,9 @@
# storage, as objects will no longer be found in the expected
# container. This value and the container prefix are deliberately
# hard coded to avoid cockups with values specified in config files.
- max_objects_per_container = 1000000
+ # While the suggested number is 'under a million', the rare large files
+ # will take up multiple slots, so we choose a more conservative number.
+ max_objects_per_container = 500000
container_num = lfc_id // max_objects_per_container
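The sharding arithmetic above, spelled out (a sketch; the exact container naming is assumed here, since this hunk only shows the divisor):

    SWIFT_CONTAINER_PREFIX = 'librarian_'
    max_objects_per_container = 500000

    def example_swift_location(lfc_id):
        # e.g. 1234567 // 500000 == 2, so content 1234567 would land in
        # something like the 'librarian_2' container (naming assumed).
        container_num = lfc_id // max_objects_per_container
        return '%s%d' % (SWIFT_CONTAINER_PREFIX, container_num), str(lfc_id)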
=== modified file 'lib/lp/services/librarianserver/tests/test_swift.py'
--- lib/lp/services/librarianserver/tests/test_swift.py 2013-11-12 15:03:38 +0000
+++ lib/lp/services/librarianserver/tests/test_swift.py 2013-12-09 10:53:28 +0000
@@ -10,6 +10,7 @@
import time
from mock import patch
+from swiftclient import client as swiftclient
import transaction
from lp.services.database import write_transaction
@@ -147,8 +148,11 @@
data = self.librarian_client.getFileByAlias(lfa_id).read()
self.assertEqual(content, data)
- def test_large_binary_files_from_disk(self):
- # Generate a large blob, including null bytes for kicks.
+ def test_largish_binary_files_from_disk(self):
+ # Generate a largish blob, including null bytes for kicks.
+ # A largish file is large enough that the HTTP upload needs
+ # to be done in multiple chunks, but small enough that it is
+ # stored in Swift as a single object.
size = 512 * 1024 # 512KB
expected_content = ''.join(chr(i % 256) for i in range(0, size))
lfa_id = self.add_file('hello_bigboy.xls', expected_content)
@@ -157,9 +161,12 @@
lfa = self.librarian_client.getFileByAlias(lfa_id)
self.assertEqual(expected_content, lfa.read())
- def test_large_binary_files_from_swift(self):
+ def test_largish_binary_files_from_swift(self):
# Generate large blob, multiple of the chunk size.
# Including null bytes for kicks.
+ # A largish file is large enough that the HTTP upload needs
+ # to be done in multiple chunks, but small enough that it is
+ # stored in Swift as a single object.
size = LibrarianStorage.CHUNK_SIZE * 50
self.assert_(size > 1024 * 1024)
expected_content = ''.join(chr(i % 256) for i in range(0, size))
@@ -176,10 +183,12 @@
lfa = self.librarian_client.getFileByAlias(lfa_id)
self.assertEqual(expected_content, lfa.read())
-
- def test_large_binary_files_from_swift_offset(self):
+ def test_largish_binary_files_from_swift_offset(self):
# Generate large blob, but NOT a multiple of the chunk size.
# Including null bytes for kicks.
+ # A largish file is large enough that the HTTP upload needs
+ # to be done in multiple chunks, but small enough that it is
+ # stored in Swift as a single object.
size = LibrarianStorage.CHUNK_SIZE * 50 + 1
self.assert_(size > 1024 * 1024)
expected_content = ''.join(chr(i % 256) for i in range(0, size))
@@ -195,3 +204,44 @@
lfa = self.librarian_client.getFileByAlias(lfa_id)
self.failIf(os.path.exists(swift.filesystem_path(lfc.id)))
self.assertEqual(expected_content, lfa.read())
+
+ def test_large_file_to_swift(self):
+ # Generate a blob large enough that Swift requires us to store
+ # it as multiple objects plus a manifest.
+ size = LibrarianStorage.CHUNK_SIZE * 50
+ self.assert_(size > 1024 * 1024)
+ expected_content = ''.join(chr(i % 256) for i in range(0, size))
+ lfa_id = self.add_file('hello_bigboy.xls', expected_content)
+ lfa = IStore(LibraryFileAlias).get(LibraryFileAlias, lfa_id)
+ lfc = lfa.content
+
+ # We don't really want to upload a file >5GB to our mock Swift,
+ # so change the constant instead. Set it so we need 3 segments.
+ def _reset_max(val):
+ swift.MAX_SWIFT_OBJECT_SIZE = val
+ self.addCleanup(_reset_max, swift.MAX_SWIFT_OBJECT_SIZE)
+ swift.MAX_SWIFT_OBJECT_SIZE = int(size / 2) - 1
+
+ # Shove the file requiring multiple segments into Swift.
+ swift.to_swift(BufferLogger(), remove=False)
+
+ # As our mock Swift does not support multi-segment files,
+ # we instead examine it directly in Swift as best we can.
+ swift_client = self.swift_fixture.connect()
+
+ # The manifest exists. Unfortunately, we can't test that the
+ # magic manifest header is set correctly.
+ container, name = swift.swift_location(lfc.id)
+ headers, obj = swift_client.get_object(container, name)
+ self.assertEqual(obj, '')
+
+ # The segments we expect are all in their expected locations.
+ _, obj1 = swift_client.get_object(container, '{0}/0000'.format(name))
+ _, obj2 = swift_client.get_object(container, '{0}/0001'.format(name))
+ _, obj3 = swift_client.get_object(container, '{0}/0002'.format(name))
+ self.assertRaises(
+ swiftclient.ClientException, swift_client.get_object,
+ container, '{0}/0003'.format(name))
+
+ # Our object round-tripped.
+ self.assertEqual(obj1 + obj2 + obj3, expected_content)
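Against a real Swift deployment, rather than the in-memory mock used in these tests, the manifest header could be checked with something along these lines (untested sketch, not part of this branch):

    headers = swift_client.head_object(container, name)
    # Real Swift preserves the DLO manifest header; the test double does not.
    self.assertEqual(
        '{0}/{1}/'.format(container, name),
        headers.get('x-object-manifest'))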