launchpad-reviewers team mailing list archive

[Merge] lp:~cjwatson/launchpad/archive-index-by-hash into lp:launchpad

 

Colin Watson has proposed merging lp:~cjwatson/launchpad/archive-index-by-hash into lp:launchpad with lp:~cjwatson/launchpad/ds-publish-by-hash as a prerequisite.

Commit message:
Add files indexed by Release to the librarian and to ArchiveFile.  Publish them in by-hash directories, keeping old versions for a day.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)
Related bugs:
  Bug #1430011 in Launchpad itself: "support apt by-hash mirrors"
  https://bugs.launchpad.net/launchpad/+bug/1430011

For more details, see:
https://code.launchpad.net/~cjwatson/launchpad/archive-index-by-hash/+merge/289379

Add files indexed by Release to the librarian and to ArchiveFile.  Publish them in by-hash directories, keeping old versions for a day.

The DistroSeries.publish_by_hash flag lets us enable this only for series whose version of apt can make use of it, and it also serves as a circuit breaker in case something goes wrong.
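
For reference, the layout this branch writes places each index file under a by-hash/<HashName>/<digest> directory next to the index itself, using the subdirectory names apt expects (MD5Sum, SHA1, SHA256), and superseded entries are kept for a day before pruning. A minimal sketch of the path derivation, using a hypothetical by_hash_path helper that is not part of this branch:

    import hashlib
    import os

    def by_hash_path(index_dir, content, hashname="SHA256"):
        # One subdirectory per hash algorithm (MD5Sum/SHA1/SHA256), each
        # containing files named by the hex digest of the index contents.
        hashers = {
            "MD5Sum": hashlib.md5,
            "SHA1": hashlib.sha1,
            "SHA256": hashlib.sha256,
            }
        digest = hashers[hashname](content).hexdigest()
        return os.path.join(index_dir, "by-hash", hashname, digest)

    # e.g. by_hash_path("dists/breezy-autotest/main/source", "Source: foo\n")
    #   -> "dists/breezy-autotest/main/source/by-hash/SHA256/<sha256 hex>"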
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~cjwatson/launchpad/archive-index-by-hash into lp:launchpad.
=== modified file 'lib/lp/archivepublisher/model/ftparchive.py'
--- lib/lp/archivepublisher/model/ftparchive.py	2016-02-09 15:51:19 +0000
+++ lib/lp/archivepublisher/model/ftparchive.py	2016-03-17 14:51:01 +0000
@@ -54,10 +54,14 @@
     """Ensure that the path exists and is an empty directory."""
     if os.path.isdir(path):
         for name in os.listdir(path):
+            if name == "by-hash":
+                # Ignore existing by-hash directories; they will be cleaned
+                # up to match the rest of the directory tree later.
+                continue
             child_path = os.path.join(path, name)
             # Directories containing index files should never have
-            # subdirectories.  Guard against expensive mistakes by not
-            # recursing here.
+            # subdirectories other than by-hash.  Guard against expensive
+            # mistakes by not recursing here.
             os.unlink(child_path)
     else:
         os.makedirs(path, 0o755)

=== modified file 'lib/lp/archivepublisher/publishing.py'
--- lib/lp/archivepublisher/publishing.py	2016-03-11 11:45:56 +0000
+++ lib/lp/archivepublisher/publishing.py	2016-03-17 14:51:01 +0000
@@ -12,7 +12,11 @@
 __metaclass__ = type
 
 import bz2
-from datetime import datetime
+from collections import defaultdict
+from datetime import (
+    datetime,
+    timedelta,
+    )
 import errno
 import gzip
 import hashlib
@@ -32,6 +36,7 @@
 from storm.expr import Desc
 from zope.component import getUtility
 
+from lp.app.errors import NotFoundError
 from lp.app.interfaces.launchpad import ILaunchpadCelebrities
 from lp.archivepublisher import HARDCODED_COMPONENT_ORDER
 from lp.archivepublisher.config import getPubConfig
@@ -64,7 +69,9 @@
 from lp.services.database.constants import UTC_NOW
 from lp.services.database.interfaces import IStore
 from lp.services.features import getFeatureFlag
+from lp.services.helpers import filenameToContentType
 from lp.services.librarian.client import LibrarianClient
+from lp.services.osutils import open_for_writing
 from lp.services.utils import file_exists
 from lp.soyuz.enums import (
     ArchivePurpose,
@@ -73,6 +80,7 @@
     PackagePublishingStatus,
     )
 from lp.soyuz.interfaces.archive import NoSuchPPA
+from lp.soyuz.interfaces.archivefile import IArchiveFileSet
 from lp.soyuz.interfaces.publishing import (
     active_publishing_status,
     IPublishingSet,
@@ -95,6 +103,10 @@
     }
 
 
+# Number of days before unreferenced files are removed from by-hash.
+BY_HASH_STAY_OF_EXECUTION = 1
+
+
 def reorder_components(components):
     """Return a list of the components provided.
 
@@ -231,6 +243,93 @@
         return max(len(str(item['size'])) for item in self[key])
 
 
+class ByHash:
+    """Represents a single by-hash directory tree."""
+
+    # Subdirectory names expected by apt.
+    supported_hashes = ("MD5Sum", "SHA1", "SHA256")
+
+    def __init__(self, root, key):
+        self.root = root
+        self.path = os.path.join(root, key, "by-hash")
+        self.known_digests = defaultdict(set)
+
+    @staticmethod
+    def getHashFromLFA(lfa, name):
+        attr = {
+            "MD5Sum": "md5",
+            "SHA1": "sha1",
+            "SHA256": "sha256",
+            }[name]
+        return getattr(lfa.content, attr)
+
+    def add(self, lfa, copy_from_path=None):
+        """Ensure that by-hash entries for a single file exist.
+
+        :param lfa: The `ILibraryFileAlias` to add.
+        :param copy_from_path: If not None, copy file content from here
+            rather than fetching it from the librarian.  This can be used
+            for newly-added files to avoid needing to commit the transaction
+            before calling this method.
+        """
+        for hashname in self.supported_hashes:
+            digest = self.getHashFromLFA(lfa, hashname)
+            digest_path = os.path.join(self.path, hashname, digest)
+            self.known_digests[hashname].add(digest)
+            if not os.path.exists(digest_path):
+                with open_for_writing(digest_path, "wb") as outfile:
+                    if copy_from_path is not None:
+                        infile = open(
+                            os.path.join(self.root, copy_from_path), "rb")
+                    else:
+                        lfa.open()
+                        infile = lfa
+                    try:
+                        shutil.copyfileobj(infile, outfile, 4 * 1024 * 1024)
+                    finally:
+                        infile.close()
+
+    def exists(self, hashname, digest):
+        """Do we know about a file with this digest?"""
+        return digest in self.known_digests[hashname]
+
+    def prune(self):
+        """Remove all by-hash entries that we have not been told to add."""
+        if any(self.known_digests.values()):
+            for hashname in self.supported_hashes:
+                hash_path = os.path.join(self.path, hashname)
+                if os.path.exists(hash_path):
+                    for digest in list(os.listdir(hash_path)):
+                        if not self.exists(hashname, digest):
+                            os.unlink(os.path.join(hash_path, digest))
+        elif os.path.exists(self.path):
+            shutil.rmtree(self.path)
+
+
+class ByHashes:
+    """Represents all by-hash directory trees in an archive."""
+
+    def __init__(self, root):
+        self.root = root
+        self.children = {}
+
+    def getChild(self, path):
+        key = os.path.dirname(path)
+        if key not in self.children:
+            self.children[key] = ByHash(self.root, key)
+        return self.children[key]
+
+    def add(self, path, lfa, copy_from_path=None):
+        self.getChild(path).add(lfa, copy_from_path=copy_from_path)
+
+    def exists(self, path, hashname, digest):
+        return self.getChild(path).exists(hashname, digest)
+
+    def prune(self):
+        for child in self.children.values():
+            child.prune()
+
+
 class Publisher(object):
     """Publisher is the class used to provide the facility to publish
     files in the pool of a Distribution. The publisher objects will be
@@ -501,7 +600,18 @@
             *conditions).config(distinct=True).order_by(
                 DistroSeries.id, BinaryPackagePublishingHistory.pocket)
 
-        for distroseries, pocket in chain(source_suites, binary_suites):
+        archive_file_suites = []
+        for container in getUtility(IArchiveFileSet).getContainersToReap(
+                self.archive, container_prefix=u"release:"):
+            try:
+                distroseries, pocket = self.distro.getDistroSeriesAndPocket(
+                    container[len(u"release:"):])
+                archive_file_suites.append((distroseries, pocket))
+            except NotFoundError:
+                pass
+
+        for distroseries, pocket in chain(
+                source_suites, binary_suites, archive_file_suites):
             if self.isDirty(distroseries, pocket):
                 continue
             if (cannot_modify_suite(self.archive, distroseries, pocket)
@@ -796,6 +906,69 @@
             return self.distro.displayname
         return "LP-PPA-%s" % get_ppa_reference(self.archive)
 
+    def _updateByHash(self, suite, release_data):
+        """Update by-hash files for a suite."""
+        archive_file_set = getUtility(IArchiveFileSet)
+        by_hashes = ByHashes(self._config.archiveroot)
+        suite_dir = os.path.relpath(
+            os.path.join(self._config.distsroot, suite),
+            self._config.archiveroot)
+        container = "release:%s" % suite
+
+        # Remove any condemned files from the database.  We ensure that we
+        # know about all the relevant by-hash directory trees before doing
+        # any removals so that we can prune them properly later.
+        for archive_file in archive_file_set.getByArchive(
+                self.archive, container=container):
+            by_hashes.getChild(archive_file.path)
+        archive_file_set.reap(self.archive, container=container)
+
+        # Gather information.
+        archive_files = archive_file_set.getByArchive(
+            self.archive, container=container, eager_load=True)
+        active_files = {}
+        for active_entry in release_data["SHA256"]:
+            path = os.path.join(suite_dir, active_entry["name"])
+            active_files[path] = (active_entry["size"], active_entry["sha256"])
+
+        # Ensure that all files recorded in the database are in by-hash.
+        current_files = {}
+        for archive_file in archive_files:
+            by_hashes.add(archive_file.path, archive_file.library_file)
+            if archive_file.scheduled_deletion_date is None:
+                current_files[archive_file.path] = archive_file
+
+        # Supersede any database records that do not correspond to active
+        # index files.
+        superseded_files = set()
+        for archive_file in archive_files:
+            path = archive_file.path
+            if (path not in active_files or
+                not by_hashes.exists(
+                    path, "SHA256", active_files[path][1])):
+                superseded_files.add(archive_file)
+        archive_file_set.scheduleDeletion(
+            superseded_files, timedelta(days=BY_HASH_STAY_OF_EXECUTION))
+
+        # Ensure that all the active index files are in by-hash and have
+        # corresponding database entries.
+        # XXX cjwatson 2016-03-15: This should possibly use bulk creation,
+        # although we can only avoid about a third of the queries since the
+        # librarian client has no bulk upload methods.
+        for path, (size, sha256) in active_files.items():
+            full_path = os.path.join(self._config.archiveroot, path)
+            if (os.path.exists(full_path) and
+                    not by_hashes.exists(path, "SHA256", sha256)):
+                archive_file = archive_file_set.newFromFile(
+                    self.archive, container, self._config.archiveroot, path,
+                    size, filenameToContentType(path))
+                by_hashes.add(
+                    path, archive_file.library_file, copy_from_path=path)
+
+        # Finally, remove any files from disk that aren't recorded in the
+        # database and aren't active.
+        by_hashes.prune()
+
     def _writeReleaseFile(self, suite, release_data):
         """Write a Release file to the archive.
 
@@ -907,6 +1080,10 @@
             release_file.setdefault("SHA1", []).append(hashes["sha1"])
             release_file.setdefault("SHA256", []).append(hashes["sha256"])
 
+        if distroseries.publish_by_hash:
+            self._updateByHash(suite, release_file)
+            release_file["Acquire-By-Hash"] = "yes"
+
         self._writeReleaseFile(suite, release_file)
         core_files.add("Release")
 
@@ -1014,16 +1191,14 @@
         # Schedule this for inclusion in the Release file.
         all_series_files.add(os.path.join(component, "i18n", "Index"))
 
-    def _readIndexFileHashes(self, distroseries_name, file_name,
-                             subpath=None):
+    def _readIndexFileHashes(self, suite, file_name, subpath=None):
         """Read an index file and return its hashes.
 
-        :param distroseries_name: Distro series name
+        :param suite: Suite name.
         :param file_name: Filename relative to the parent container directory.
-        :param subpath: Optional subpath within the distroseries root.
-            Generated indexes will not include this path. If omitted,
-            filenames are assumed to be relative to the distroseries
-            root.
+        :param subpath: Optional subpath within the suite root.  Generated
+            indexes will not include this path.  If omitted, filenames are
+            assumed to be relative to the suite root.
         :return: A dictionary mapping hash field names to dictionaries of
             their components as defined by debian.deb822.Release (e.g.
             {"md5sum": {"md5sum": ..., "size": ..., "name": ...}}), or None
@@ -1031,8 +1206,7 @@
         """
         open_func = open
         full_name = os.path.join(
-            self._config.distsroot, distroseries_name, subpath or '.',
-            file_name)
+            self._config.distsroot, suite, subpath or '.', file_name)
         if not os.path.exists(full_name):
             if os.path.exists(full_name + '.gz'):
                 open_func = gzip.open

=== modified file 'lib/lp/archivepublisher/tests/test_publisher.py'
--- lib/lp/archivepublisher/tests/test_publisher.py	2016-03-11 11:45:56 +0000
+++ lib/lp/archivepublisher/tests/test_publisher.py	2016-03-17 14:51:01 +0000
@@ -7,6 +7,10 @@
 
 import bz2
 import crypt
+from datetime import (
+    datetime,
+    timedelta,
+    )
 from functools import partial
 import gzip
 import hashlib
@@ -22,9 +26,17 @@
     import lzma
 except ImportError:
     from backports import lzma
+import pytz
 from testtools.matchers import (
     ContainsAll,
+    DirContains,
+    Equals,
+    FileContains,
     LessThan,
+    Matcher,
+    MatchesSetwise,
+    Not,
+    PathExists,
     )
 import transaction
 from zope.component import getUtility
@@ -36,6 +48,8 @@
     IArchiveSigningKey,
     )
 from lp.archivepublisher.publishing import (
+    ByHash,
+    ByHashes,
     getPublisher,
     I18nIndex,
     Publisher,
@@ -58,6 +72,7 @@
     BufferLogger,
     DevNullLogger,
     )
+from lp.services.osutils import open_for_writing
 from lp.services.utils import file_exists
 from lp.soyuz.enums import (
     ArchivePurpose,
@@ -68,12 +83,16 @@
     PackageUploadStatus,
     )
 from lp.soyuz.interfaces.archive import IArchiveSet
+from lp.soyuz.interfaces.archivefile import IArchiveFileSet
 from lp.soyuz.tests.test_publishing import TestNativePublishingBase
 from lp.testing import TestCaseWithFactory
 from lp.testing.fakemethod import FakeMethod
 from lp.testing.gpgkeys import gpgkeysdir
 from lp.testing.keyserver import KeyServerTac
-from lp.testing.layers import ZopelessDatabaseLayer
+from lp.testing.layers import (
+    LaunchpadZopelessLayer,
+    ZopelessDatabaseLayer,
+    )
 
 
 RELEASE = PackagePublishingPocket.RELEASE
@@ -423,6 +442,223 @@
             'i386', publications[0].distroarchseries.architecturetag)
 
 
+class ByHashHasContents(Matcher):
+    """Matches if a by-hash directory has exactly the specified contents."""
+
+    def __init__(self, contents):
+        self.contents = contents
+
+    def match(self, by_hash_path):
+        mismatch = DirContains(["MD5Sum", "SHA1", "SHA256"]).match(
+            by_hash_path)
+        if mismatch is not None:
+            return mismatch
+        for hashname, hashattr in (
+                ("MD5Sum", "md5"), ("SHA1", "sha1"), ("SHA256", "sha256")):
+            digests = {
+                getattr(hashlib, hashattr)(content).hexdigest(): content
+                for content in self.contents}
+            path = os.path.join(by_hash_path, hashname)
+            mismatch = DirContains(digests.keys()).match(path)
+            if mismatch is not None:
+                return mismatch
+            for digest, content in digests.items():
+                mismatch = FileContains(content).match(
+                    os.path.join(path, digest))
+                if mismatch is not None:
+                    return mismatch
+
+
+class ByHashesHaveContents(Matcher):
+    """Matches if only these by-hash directories exist with proper contents."""
+
+    def __init__(self, path_contents):
+        self.path_contents = path_contents
+
+    def match(self, root):
+        children = set()
+        for dirpath, dirnames, _ in os.walk(root):
+            if "by-hash" in dirnames:
+                children.add(os.path.relpath(dirpath, root))
+        mismatch = MatchesSetwise(
+            *(Equals(path) for path in self.path_contents)).match(children)
+        if mismatch is not None:
+            return mismatch
+        for path, contents in self.path_contents.items():
+            by_hash_path = os.path.join(root, path, "by-hash")
+            mismatch = ByHashHasContents(contents).match(by_hash_path)
+            if mismatch is not None:
+                return mismatch
+
+
+class TestByHash(TestCaseWithFactory):
+    """Unit tests for details of handling a single by-hash directory tree."""
+
+    layer = LaunchpadZopelessLayer
+
+    def test_add(self):
+        root = self.makeTemporaryDirectory()
+        contents = ["abc\n", "def\n"]
+        lfas = [
+            self.factory.makeLibraryFileAlias(content=content)
+            for content in contents]
+        transaction.commit()
+        by_hash = ByHash(root, "dists/foo/main/source")
+        for lfa in lfas:
+            by_hash.add(lfa)
+        by_hash_path = os.path.join(root, "dists/foo/main/source/by-hash")
+        self.assertThat(by_hash_path, ByHashHasContents(contents))
+
+    def test_add_copy_from_path(self):
+        root = self.makeTemporaryDirectory()
+        content = "abc\n"
+        sources_path = "dists/foo/main/source/Sources"
+        with open_for_writing(
+                os.path.join(root, sources_path), "w") as sources:
+            sources.write(content)
+        lfa = self.factory.makeLibraryFileAlias(content=content, db_only=True)
+        by_hash = ByHash(root, "dists/foo/main/source")
+        by_hash.add(lfa, copy_from_path=sources_path)
+        by_hash_path = os.path.join(root, "dists/foo/main/source/by-hash")
+        self.assertThat(by_hash_path, ByHashHasContents([content]))
+
+    def test_add_existing(self):
+        root = self.makeTemporaryDirectory()
+        content = "abc\n"
+        lfa = self.factory.makeLibraryFileAlias(content=content)
+        by_hash_path = os.path.join(root, "dists/foo/main/source/by-hash")
+        for hashname, hashattr in (
+                ("MD5Sum", "md5"), ("SHA1", "sha1"), ("SHA256", "sha256")):
+            digest = getattr(hashlib, hashattr)(content).hexdigest()
+            with open_for_writing(
+                    os.path.join(by_hash_path, hashname, digest), "w") as f:
+                f.write(content)
+        by_hash = ByHash(root, "dists/foo/main/source")
+        self.assertThat(by_hash_path, ByHashHasContents([content]))
+        by_hash.add(lfa)
+        self.assertThat(by_hash_path, ByHashHasContents([content]))
+
+    def test_exists(self):
+        root = self.makeTemporaryDirectory()
+        content = "abc\n"
+        with open_for_writing(os.path.join(root, "abc"), "w") as f:
+            f.write(content)
+        lfa = self.factory.makeLibraryFileAlias(content=content, db_only=True)
+        by_hash = ByHash(root, "")
+        md5 = hashlib.md5(content).hexdigest()
+        sha1 = hashlib.sha1(content).hexdigest()
+        sha256 = hashlib.sha256(content).hexdigest()
+        self.assertFalse(by_hash.exists("MD5Sum", md5))
+        self.assertFalse(by_hash.exists("SHA1", sha1))
+        self.assertFalse(by_hash.exists("SHA256", sha256))
+        by_hash.add(lfa, copy_from_path="abc")
+        self.assertTrue(by_hash.exists("MD5Sum", md5))
+        self.assertTrue(by_hash.exists("SHA1", sha1))
+        self.assertTrue(by_hash.exists("SHA256", sha256))
+
+    def test_prune(self):
+        root = self.makeTemporaryDirectory()
+        content = "abc\n"
+        sources_path = "dists/foo/main/source/Sources"
+        with open_for_writing(os.path.join(root, sources_path), "w") as f:
+            f.write(content)
+        lfa = self.factory.makeLibraryFileAlias(content=content, db_only=True)
+        by_hash = ByHash(root, "dists/foo/main/source")
+        by_hash.add(lfa, copy_from_path=sources_path)
+        by_hash_path = os.path.join(root, "dists/foo/main/source/by-hash")
+        with open_for_writing(os.path.join(by_hash_path, "MD5Sum/0"), "w"):
+            pass
+        self.assertThat(by_hash_path, Not(ByHashHasContents([content])))
+        by_hash.prune()
+        self.assertThat(by_hash_path, ByHashHasContents([content]))
+
+    def test_prune_empty(self):
+        root = self.makeTemporaryDirectory()
+        by_hash = ByHash(root, "dists/foo/main/source")
+        by_hash_path = os.path.join(root, "dists/foo/main/source/by-hash")
+        with open_for_writing(os.path.join(by_hash_path, "MD5Sum/0"), "w"):
+            pass
+        self.assertThat(by_hash_path, PathExists())
+        by_hash.prune()
+        self.assertThat(by_hash_path, Not(PathExists()))
+
+
+class TestByHashes(TestCaseWithFactory):
+    """Unit tests for details of handling a set of by-hash directory trees."""
+
+    layer = LaunchpadZopelessLayer
+
+    def test_add(self):
+        root = self.makeTemporaryDirectory()
+        self.assertThat(root, ByHashesHaveContents({}))
+        path_contents = {
+            "dists/foo/main/source": {"Sources": "abc\n"},
+            "dists/foo/main/binary-amd64": {
+                "Packages.gz": "def\n", "Packages.xz": "ghi\n"},
+            }
+        by_hashes = ByHashes(root)
+        for dirpath, contents in path_contents.items():
+            for name, content in contents.items():
+                path = os.path.join(dirpath, name)
+                with open_for_writing(os.path.join(root, path), "w") as f:
+                    f.write(content)
+                lfa = self.factory.makeLibraryFileAlias(
+                    content=content, db_only=True)
+                by_hashes.add(path, lfa, copy_from_path=path)
+        self.assertThat(root, ByHashesHaveContents({
+            path: contents.values()
+            for path, contents in path_contents.items()}))
+
+    def test_exists(self):
+        root = self.makeTemporaryDirectory()
+        content = "abc\n"
+        sources_path = "dists/foo/main/source/Sources"
+        with open_for_writing(os.path.join(root, sources_path), "w") as f:
+            f.write(content)
+        lfa = self.factory.makeLibraryFileAlias(content=content, db_only=True)
+        by_hashes = ByHashes(root)
+        md5 = hashlib.md5(content).hexdigest()
+        sha1 = hashlib.sha1(content).hexdigest()
+        sha256 = hashlib.sha256(content).hexdigest()
+        self.assertFalse(by_hashes.exists(sources_path, "MD5Sum", md5))
+        self.assertFalse(by_hashes.exists(sources_path, "SHA1", sha1))
+        self.assertFalse(by_hashes.exists(sources_path, "SHA256", sha256))
+        by_hashes.add(sources_path, lfa, copy_from_path=sources_path)
+        self.assertTrue(by_hashes.exists(sources_path, "MD5Sum", md5))
+        self.assertTrue(by_hashes.exists(sources_path, "SHA1", sha1))
+        self.assertTrue(by_hashes.exists(sources_path, "SHA256", sha256))
+
+    def test_prune(self):
+        root = self.makeTemporaryDirectory()
+        path_contents = {
+            "dists/foo/main/source": {"Sources": "abc\n"},
+            "dists/foo/main/binary-amd64": {
+                "Packages.gz": "def\n", "Packages.xz": "ghi\n"},
+            }
+        by_hashes = ByHashes(root)
+        for dirpath, contents in path_contents.items():
+            for name, content in contents.items():
+                path = os.path.join(dirpath, name)
+                with open_for_writing(os.path.join(root, path), "w") as f:
+                    f.write(content)
+                lfa = self.factory.makeLibraryFileAlias(
+                    content=content, db_only=True)
+                by_hashes.add(path, lfa, copy_from_path=path)
+        strays = [
+            "dists/foo/main/source/by-hash/MD5Sum/0",
+            "dists/foo/main/binary-amd64/by-hash/MD5Sum/0",
+            ]
+        for stray in strays:
+            with open_for_writing(os.path.join(root, stray), "w"):
+                pass
+        matcher = ByHashesHaveContents({
+            path: contents.values()
+            for path, contents in path_contents.items()})
+        self.assertThat(root, Not(matcher))
+        by_hashes.prune()
+        self.assertThat(root, matcher)
+
+
 class TestPublisher(TestPublisherBase):
     """Testing `Publisher` behaviour."""
 
@@ -1557,6 +1793,34 @@
         # are marked as dirty.
         self.checkDirtyPockets(publisher, expected=allowed_suites)
 
+    def testDirtyingPocketsWithReapableArchiveFiles(self):
+        """Pockets are dirty if they contain reapable archive files."""
+        allowed_suites = []
+        publisher = getPublisher(
+            self.ubuntutest.main_archive, allowed_suites, self.logger)
+        publisher.A2_markPocketsWithDeletionsDirty()
+        self.checkDirtyPockets(publisher, expected=[])
+
+        lfa = self.factory.makeLibraryFileAlias()
+        getUtility(IArchiveFileSet).new(
+            self.ubuntutest.main_archive, u"stray", u"foo", lfa)
+        publisher.A2_markPocketsWithDeletionsDirty()
+        self.checkDirtyPockets(publisher, expected=[])
+
+        archive_file = getUtility(IArchiveFileSet).new(
+            self.ubuntutest.main_archive, u"release:breezy-autotest", u"foo",
+            lfa)
+        publisher.A2_markPocketsWithDeletionsDirty()
+        self.checkDirtyPockets(publisher, expected=[])
+
+        removeSecurityProxy(archive_file).scheduled_deletion_date = (
+            datetime.now(pytz.UTC) - timedelta(days=1))
+        publisher.A2_markPocketsWithDeletionsDirty()
+        expected_dirty_pockets = [
+            ('breezy-autotest', PackagePublishingPocket.RELEASE),
+            ]
+        self.checkDirtyPockets(publisher, expected=expected_dirty_pockets)
+
     def testReleaseFile(self):
         """Test release file writing.
 
@@ -1908,6 +2172,160 @@
                 os.stat(os.path.join(dep11_path, name)).st_mtime,
                 LessThan(now - 59))
 
+    def testUpdateByHashDisabled(self):
+        # The publisher does not create by-hash directories if it is
+        # disabled in the series configuration.
+        self.assertFalse(self.breezy_autotest.publish_by_hash)
+        publisher = Publisher(
+            self.logger, self.config, self.disk_pool,
+            self.ubuntutest.main_archive)
+
+        self.getPubSource(filecontent='Source: foo\n')
+
+        publisher.A_publish(False)
+        publisher.C_doFTPArchive(False)
+        publisher.D_writeReleaseFiles(False)
+
+        suite_path = partial(
+            os.path.join, self.config.distsroot, 'breezy-autotest')
+        self.assertThat(
+            suite_path('main', 'source', 'by-hash'), Not(PathExists()))
+        release = self.parseRelease(suite_path('Release'))
+        self.assertNotIn('Acquire-By-Hash', release)
+
+    def testUpdateByHashInitial(self):
+        # An initial publisher run populates by-hash directories and leaves
+        # no archive files scheduled for deletion.
+        self.breezy_autotest.publish_by_hash = True
+        publisher = Publisher(
+            self.logger, self.config, self.disk_pool,
+            self.ubuntutest.main_archive)
+
+        self.getPubSource(filecontent='Source: foo\n')
+
+        publisher.A_publish(False)
+        publisher.C_doFTPArchive(False)
+        publisher.D_writeReleaseFiles(False)
+
+        suite_path = partial(
+            os.path.join, self.config.distsroot, 'breezy-autotest')
+        contents = []
+        for name in ('Release', 'Sources.gz', 'Sources.bz2'):
+            with open(suite_path('main', 'source', name), 'rb') as f:
+                contents.append(f.read())
+
+        self.assertThat(
+            suite_path('main', 'source', 'by-hash'),
+            ByHashHasContents(contents))
+
+        archive_files = getUtility(IArchiveFileSet).getByArchive(
+            self.ubuntutest.main_archive)
+        self.assertNotEqual([], archive_files)
+        self.assertEqual([], [
+            archive_file for archive_file in archive_files
+            if archive_file.scheduled_deletion_date is not None])
+
+    def testUpdateByHashSubsequent(self):
+        # A subsequent publisher run updates by-hash directories where
+        # necessary, and marks inactive index files for later deletion.
+        self.breezy_autotest.publish_by_hash = True
+        publisher = Publisher(
+            self.logger, self.config, self.disk_pool,
+            self.ubuntutest.main_archive)
+
+        self.getPubSource(filecontent='Source: foo\n')
+
+        publisher.A_publish(False)
+        publisher.C_doFTPArchive(False)
+        publisher.D_writeReleaseFiles(False)
+
+        suite_path = partial(
+            os.path.join, self.config.distsroot, 'breezy-autotest')
+        main_contents = []
+        universe_contents = []
+        for name in ('Release', 'Sources.gz', 'Sources.bz2'):
+            with open(suite_path('main', 'source', name), 'rb') as f:
+                main_contents.append(f.read())
+            with open(suite_path('universe', 'source', name), 'rb') as f:
+                universe_contents.append(f.read())
+
+        self.getPubSource(sourcename='baz', filecontent='Source: baz\n')
+
+        publisher.A_publish(False)
+        publisher.C_doFTPArchive(False)
+        publisher.D_writeReleaseFiles(False)
+
+        for name in ('Release', 'Sources.gz', 'Sources.bz2'):
+            with open(suite_path('main', 'source', name), 'rb') as f:
+                main_contents.append(f.read())
+
+        self.assertThat(
+            suite_path('main', 'source', 'by-hash'),
+            ByHashHasContents(main_contents))
+        self.assertThat(
+            suite_path('universe', 'source', 'by-hash'),
+            ByHashHasContents(universe_contents))
+
+        archive_files = getUtility(IArchiveFileSet).getByArchive(
+            self.ubuntutest.main_archive)
+        self.assertContentEqual(
+            ['dists/breezy-autotest/main/source/Sources.bz2',
+             'dists/breezy-autotest/main/source/Sources.gz'],
+            [archive_file.path for archive_file in archive_files
+             if archive_file.scheduled_deletion_date is not None])
+
+    def testUpdateByHashPrune(self):
+        # The publisher prunes files from by-hash that were superseded more
+        # than a day ago.
+        self.breezy_autotest.publish_by_hash = True
+        publisher = Publisher(
+            self.logger, self.config, self.disk_pool,
+            self.ubuntutest.main_archive)
+
+        suite_path = partial(
+            os.path.join, self.config.distsroot, 'breezy-autotest')
+        main_contents = set()
+        for sourcename in ('foo', 'bar'):
+            self.getPubSource(
+                sourcename=sourcename, filecontent='Source: %s\n' % sourcename)
+            publisher.A_publish(False)
+            publisher.C_doFTPArchive(False)
+            publisher.D_writeReleaseFiles(False)
+            for name in ('Release', 'Sources.gz', 'Sources.bz2'):
+                with open(suite_path('main', 'source', name), 'rb') as f:
+                    main_contents.add(f.read())
+        transaction.commit()
+
+        self.assertThat(
+            suite_path('main', 'source', 'by-hash'),
+            ByHashHasContents(main_contents))
+        old_archive_files = []
+        for archive_file in getUtility(IArchiveFileSet).getByArchive(
+                self.ubuntutest.main_archive):
+            if ('main/source' in archive_file.path and
+                    archive_file.scheduled_deletion_date is not None):
+                old_archive_files.append(archive_file)
+        self.assertEqual(2, len(old_archive_files))
+
+        now = datetime.now(pytz.UTC)
+        removeSecurityProxy(old_archive_files[0]).scheduled_deletion_date = (
+            now + timedelta(hours=12))
+        removeSecurityProxy(old_archive_files[1]).scheduled_deletion_date = (
+            now - timedelta(hours=12))
+        old_archive_files[1].library_file.open()
+        try:
+            main_contents.remove(old_archive_files[1].library_file.read())
+        finally:
+            old_archive_files[1].library_file.close()
+        self.assertThat(
+            suite_path('main', 'source', 'by-hash'),
+            Not(ByHashHasContents(main_contents)))
+
+        publisher.D_writeReleaseFiles(True)
+        self.assertThat(
+            suite_path('main', 'source', 'by-hash'),
+            ByHashHasContents(main_contents))
+
     def testCreateSeriesAliasesNoAlias(self):
         """createSeriesAliases has nothing to do by default."""
         publisher = Publisher(

=== modified file 'lib/lp/services/helpers.py'
--- lib/lp/services/helpers.py	2014-05-07 15:28:50 +0000
+++ lib/lp/services/helpers.py	2016-03-17 14:51:01 +0000
@@ -10,6 +10,7 @@
 
 __metaclass__ = type
 
+from collections import OrderedDict
 from difflib import unified_diff
 import re
 from StringIO import StringIO
@@ -224,19 +225,37 @@
 
     >>> filenameToContentType('test.tgz')
     'application/octet-stream'
+
+    Build logs
+    >>> filenameToContentType('buildlog.txt.gz')
+    'text/plain'
+
+    Various compressed files
+
+    >>> filenameToContentType('Packages.gz')
+    'application/x-gzip'
+    >>> filenameToContentType('Packages.bz2')
+    'application/x-bzip2'
+    >>> filenameToContentType('Packages.xz')
+    'application/x-xz'
     """
-    ftmap = {".dsc": "text/plain",
-             ".changes": "text/plain",
-             ".deb": "application/x-debian-package",
-             ".udeb": "application/x-debian-package",
-             ".txt": "text/plain",
-             # For the build master logs
-             ".txt.gz": "text/plain",
-             # For live filesystem builds
-             ".manifest": "text/plain",
-             ".manifest-remove": "text/plain",
-             ".size": "text/plain",
-             }
+    ftmap = OrderedDict([
+        (".dsc", "text/plain"),
+        (".changes", "text/plain"),
+        (".deb", "application/x-debian-package"),
+        (".udeb", "application/x-debian-package"),
+        (".txt", "text/plain"),
+        # For the build master logs
+        (".txt.gz", "text/plain"),
+        # For live filesystem builds
+        (".manifest", "text/plain"),
+        (".manifest-remove", "text/plain"),
+        (".size", "text/plain"),
+        # Compressed files
+        (".gz", "application/x-gzip"),
+        (".bz2", "application/x-bzip2"),
+        (".xz", "application/x-xz"),
+        ])
     for ending in ftmap:
         if fname.endswith(ending):
             return ftmap[ending]

