launchpad-reviewers team mailing list archive

Re: [Merge] lp:~cjwatson/launchpad/archive-index-by-hash into lp:launchpad

 

Review: Needs Fixing code



Diff comments:

> 
> === modified file 'lib/lp/archivepublisher/publishing.py'
> --- lib/lp/archivepublisher/publishing.py	2016-03-17 17:08:49 +0000
> +++ lib/lp/archivepublisher/publishing.py	2016-03-22 12:51:52 +0000
> @@ -810,6 +971,84 @@
>              return self.distro.displayname
>          return "LP-PPA-%s" % get_ppa_reference(self.archive)
>  
> +    def _updateByHash(self, suite, release_data):
> +        """Update by-hash files for a suite.
> +
> +        This takes Release file data which references a set of on-disk
> +        files, injects any newly-modified files from that set into the
> +        librarian and the ArchiveFile table, and updates the on-disk by-hash
> +        directories to be in sync with ArchiveFile.  Any on-disk by-hash
> +        entries that ceased to be current sufficiently long ago are removed.
> +        """
> +        archive_file_set = getUtility(IArchiveFileSet)
> +        by_hashes = ByHashes(self._config.archiveroot)
> +        suite_dir = os.path.relpath(
> +            os.path.join(self._config.distsroot, suite),
> +            self._config.archiveroot)
> +        container = "release:%s" % suite
> +
> +        # Gather information on entries in the current Release file, and
> +        # make sure nothing there is condemned.
> +        current_files = {}
> +        for current_entry in release_data["SHA256"]:
> +            path = os.path.join(suite_dir, current_entry["name"])
> +            current_files[path] = (
> +                current_entry["size"], current_entry["sha256"])
> +        archive_file_set.unscheduleDeletion(
> +            self.archive, container=container,
> +            sha256_checksums=set(
> +                sha256 for _, sha256 in current_files.values()))
> +
> +        # Remove any condemned files from the database whose stay of
> +        # execution has elapsed.  We ensure that we know about all the
> +        # relevant by-hash directory trees before doing any removals so that
> +        # we can prune them properly later.
> +        for db_file in archive_file_set.getByArchive(
> +                self.archive, container=container):
> +            by_hashes.registerChild(db_file.path)
> +        archive_file_set.reap(self.archive, container=container)
> +
> +        # Ensure that all files recorded in the database are in by-hash.
> +        db_files = archive_file_set.getByArchive(
> +            self.archive, container=container, eager_load=True)
> +        for db_file in db_files:
> +            by_hashes.add(db_file.path, db_file.library_file)
> +
> +        # Condemn any database records that do not correspond to current
> +        # index files.
> +        condemned_files = set()
> +        for db_file in db_files:
> +            if db_file.scheduled_deletion_date is None:
> +                path = db_file.path
> +                if path in current_files:
> +                    current_sha256 = current_files[path][1]
> +                else:
> +                    current_sha256 = None
> +                if db_file.library_file.content.sha256 != current_sha256:
> +                    condemned_files.add(db_file)
> +        if condemned_files:
> +            archive_file_set.scheduleDeletion(
> +                condemned_files, timedelta(days=BY_HASH_STAY_OF_EXECUTION))
> +
> +        # Ensure that all the current index files are in by-hash and have
> +        # corresponding database entries.
> +        # XXX cjwatson 2016-03-15: This should possibly use bulk creation,
> +        # although we can only avoid about a third of the queries since the
> +        # librarian client has no bulk upload methods.
> +        for path, (size, sha256) in current_files.items():
> +            full_path = os.path.join(self._config.archiveroot, path)
> +            if (os.path.exists(full_path) and
> +                    not by_hashes.known(path, "SHA256", sha256)):

I missed this before because the method somewhat deceptively takes a path argument: this only checks that the SHA256 is still there, not that it's there for the path that we care about. So two empty files will share the same ArchiveFile, with the path of whichever showed up first.

Currently there's no functional difference (beyond the fact that uncondemning a file at one path also uncondemns every ArchiveFile row matching its SHA256, even rows recorded under a different path), but it is going to look a bit weird if we ever have to debug database state, and it makes diskless archives challenging at best.
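
Roughly what I have in mind, purely as a sketch (not tested, and it reuses the db_files / condemned_files / current_files locals from earlier in _updateByHash rather than proposing an API change to ByHash): require a match on the (path, SHA256) pair, not just the hash, before deciding a file is already covered:

    # Hypothetical replacement for the by_hashes.known() test: (path, sha256)
    # pairs that still have a live ArchiveFile row after the condemnation
    # step above.
    live_pairs = set(
        (db_file.path, db_file.library_file.content.sha256)
        for db_file in db_files
        if db_file not in condemned_files and
            db_file.scheduled_deletion_date is None)
    for path, (size, sha256) in current_files.items():
        full_path = os.path.join(self._config.archiveroot, path)
        if os.path.exists(full_path) and (path, sha256) not in live_pairs:
            with open(full_path, "rb") as fileobj:
                db_file = archive_file_set.newFromFile(
                    self.archive, container, path, fileobj,
                    size, filenameToContentType(path))
            by_hashes.add(path, db_file.library_file, copy_from_path=path)

Alternatively ByHash.known() could grow a path-aware variant; I don't mind which, as long as the path ends up being part of the check.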

> +                with open(full_path, "rb") as fileobj:
> +                    db_file = archive_file_set.newFromFile(
> +                        self.archive, container, path, fileobj,
> +                        size, filenameToContentType(path))
> +                by_hashes.add(path, db_file.library_file, copy_from_path=path)
> +
> +        # Finally, remove any files from disk that aren't recorded in the
> +        # database and aren't active.
> +        by_hashes.prune()

The whole of _updateByHash should probably log each new/condemned/etc. file at DEBUG. It's going to be slightly noisy, but primary archive publication log output is already O(indices), and we're going to regret not having logs for this complex set of operations eventually.
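
Something like the following is all I have in mind (a sketch only; it assumes the logger is available as self.log, as elsewhere in the publisher):

    # In the condemnation step:
    for db_file in condemned_files:
        self.log.debug(
            "by-hash: Condemning %s (%s)." %
            (db_file.path, db_file.library_file.content.sha256))

    # ... and just before newFromFile in the final loop:
    self.log.debug("by-hash: Creating new record for %s." % path)

plus similar lines for reprieves and prunes (the prune ones presumably belong in ByHash.prune itself, with the logger passed down).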

> +
>      def _writeReleaseFile(self, suite, release_data):
>          """Write a Release file to the archive.
>  
> 
> === modified file 'lib/lp/archivepublisher/tests/test_publisher.py'
> --- lib/lp/archivepublisher/tests/test_publisher.py	2016-03-11 11:45:56 +0000
> +++ lib/lp/archivepublisher/tests/test_publisher.py	2016-03-22 12:51:52 +0000
> @@ -1908,6 +2164,265 @@
>                  os.stat(os.path.join(dep11_path, name)).st_mtime,
>                  LessThan(now - 59))
>  
> +    def testUpdateByHashDisabled(self):
> +        # The publisher does not create by-hash directories if it is
> +        # disabled in the series configuration.
> +        self.assertFalse(self.breezy_autotest.publish_by_hash)
> +        self.assertFalse(self.breezy_autotest.advertise_by_hash)
> +        publisher = Publisher(
> +            self.logger, self.config, self.disk_pool,
> +            self.ubuntutest.main_archive)
> +
> +        self.getPubSource(filecontent='Source: foo\n')
> +
> +        publisher.A_publish(False)
> +        publisher.C_doFTPArchive(False)
> +        publisher.D_writeReleaseFiles(False)
> +
> +        suite_path = partial(
> +            os.path.join, self.config.distsroot, 'breezy-autotest')
> +        self.assertThat(
> +            suite_path('main', 'source', 'by-hash'), Not(PathExists()))
> +        release = self.parseRelease(suite_path('Release'))
> +        self.assertNotIn('Acquire-By-Hash', release)
> +
> +    def testUpdateByHashUnadvertised(self):
> +        # If the series configuration sets publish_by_hash but not
> +        # advertise_by_hash, then by-hash directories are created but not
> +        # advertised in Release.  This is useful for testing.
> +        self.breezy_autotest.publish_by_hash = True
> +        self.assertFalse(self.breezy_autotest.advertise_by_hash)
> +        publisher = Publisher(
> +            self.logger, self.config, self.disk_pool,
> +            self.ubuntutest.main_archive)
> +
> +        self.getPubSource(filecontent='Source: foo\n')
> +
> +        publisher.A_publish(False)
> +        publisher.C_doFTPArchive(False)
> +        publisher.D_writeReleaseFiles(False)
> +
> +        suite_path = partial(
> +            os.path.join, self.config.distsroot, 'breezy-autotest')
> +        self.assertThat(suite_path('main', 'source', 'by-hash'), PathExists())
> +        release = self.parseRelease(suite_path('Release'))
> +        self.assertNotIn('Acquire-By-Hash', release)
> +
> +    def testUpdateByHashInitial(self):
> +        # An initial publisher run populates by-hash directories and leaves
> +        # no archive files scheduled for deletion.
> +        self.breezy_autotest.publish_by_hash = True
> +        self.breezy_autotest.advertise_by_hash = True
> +        publisher = Publisher(
> +            self.logger, self.config, self.disk_pool,
> +            self.ubuntutest.main_archive)
> +
> +        self.getPubSource(filecontent='Source: foo\n')
> +
> +        publisher.A_publish(False)
> +        publisher.C_doFTPArchive(False)
> +        publisher.D_writeReleaseFiles(False)
> +
> +        suite_path = partial(
> +            os.path.join, self.config.distsroot, 'breezy-autotest')
> +        contents = set()
> +        for name in ('Release', 'Sources.gz', 'Sources.bz2'):
> +            with open(suite_path('main', 'source', name), 'rb') as f:
> +                contents.add(f.read())
> +
> +        self.assertThat(
> +            suite_path('main', 'source', 'by-hash'),
> +            ByHashHasContents(contents))
> +
> +        archive_files = getUtility(IArchiveFileSet).getByArchive(
> +            self.ubuntutest.main_archive)
> +        self.assertNotEqual([], archive_files)
> +        self.assertEqual([], [
> +            archive_file for archive_file in archive_files
> +            if archive_file.scheduled_deletion_date is not None])
> +
> +    def testUpdateByHashSubsequent(self):
> +        # A subsequent publisher run updates by-hash directories where
> +        # necessary, and marks inactive index files for later deletion.
> +        self.breezy_autotest.publish_by_hash = True
> +        self.breezy_autotest.advertise_by_hash = True
> +        publisher = Publisher(
> +            self.logger, self.config, self.disk_pool,
> +            self.ubuntutest.main_archive)
> +
> +        self.getPubSource(filecontent='Source: foo\n')
> +
> +        publisher.A_publish(False)
> +        publisher.C_doFTPArchive(False)
> +        publisher.D_writeReleaseFiles(False)
> +
> +        suite_path = partial(
> +            os.path.join, self.config.distsroot, 'breezy-autotest')
> +        main_contents = set()
> +        universe_contents = set()
> +        for name in ('Release', 'Sources.gz', 'Sources.bz2'):
> +            with open(suite_path('main', 'source', name), 'rb') as f:
> +                main_contents.add(f.read())
> +            with open(suite_path('universe', 'source', name), 'rb') as f:
> +                universe_contents.add(f.read())
> +
> +        self.getPubSource(sourcename='baz', filecontent='Source: baz\n')
> +
> +        publisher.A_publish(False)
> +        publisher.C_doFTPArchive(False)
> +        publisher.D_writeReleaseFiles(False)
> +
> +        for name in ('Release', 'Sources.gz', 'Sources.bz2'):
> +            with open(suite_path('main', 'source', name), 'rb') as f:
> +                main_contents.add(f.read())
> +
> +        self.assertThat(
> +            suite_path('main', 'source', 'by-hash'),
> +            ByHashHasContents(main_contents))
> +        self.assertThat(
> +            suite_path('universe', 'source', 'by-hash'),
> +            ByHashHasContents(universe_contents))
> +
> +        archive_files = getUtility(IArchiveFileSet).getByArchive(
> +            self.ubuntutest.main_archive)
> +        self.assertContentEqual(
> +            ['dists/breezy-autotest/main/source/Sources.bz2',
> +             'dists/breezy-autotest/main/source/Sources.gz'],
> +            [archive_file.path for archive_file in archive_files
> +             if archive_file.scheduled_deletion_date is not None])
> +
> +    def testUpdateByHashReprieve(self):

Technically it's not a reprieve but a pardon or a commutation, since the file is later condemned for the separate crime of a different publication becoming obsolete.

Though I guess since you're reusing the ArchiveFile row... respite? Important details. Anyway, carry on.

> +        # If a newly-modified index file is identical to a
> +        # previously-condemned one, then it is reprieved and not pruned.
> +        self.breezy_autotest.publish_by_hash = True
> +        # Enable uncompressed index files to avoid relying on stable output
> +        # from compressors in this test.
> +        self.breezy_autotest.index_compressors = [
> +            IndexCompressionType.UNCOMPRESSED]
> +        publisher = Publisher(
> +            self.logger, self.config, self.disk_pool,
> +            self.ubuntutest.main_archive)
> +
> +        # Publish empty index files.
> +        publisher.markPocketDirty(
> +            self.breezy_autotest, PackagePublishingPocket.RELEASE)
> +        publisher.A_publish(False)
> +        publisher.C_doFTPArchive(False)
> +        publisher.D_writeReleaseFiles(False)
> +        suite_path = partial(
> +            os.path.join, self.config.distsroot, 'breezy-autotest')
> +        main_contents = set()
> +        for name in ('Release', 'Sources'):
> +            with open(suite_path('main', 'source', name), 'rb') as f:
> +                main_contents.add(f.read())
> +
> +        # Add a source package so that Sources is non-empty.
> +        pub_source = self.getPubSource(filecontent='Source: foo\n')
> +        publisher.A_publish(False)
> +        publisher.C_doFTPArchive(False)
> +        publisher.D_writeReleaseFiles(False)
> +        transaction.commit()
> +        with open(suite_path('main', 'source', 'Sources'), 'rb') as f:
> +            main_contents.add(f.read())
> +        self.assertEqual(3, len(main_contents))
> +        self.assertThat(
> +            suite_path('main', 'source', 'by-hash'),
> +            ByHashHasContents(main_contents))
> +
> +        # Make the empty Sources file ready to prune.
> +        old_archive_files = []
> +        for archive_file in getUtility(IArchiveFileSet).getByArchive(
> +                self.ubuntutest.main_archive):
> +            if ('main/source' in archive_file.path and
> +                    archive_file.scheduled_deletion_date is not None):
> +                old_archive_files.append(archive_file)
> +        self.assertEqual(1, len(old_archive_files))
> +        removeSecurityProxy(old_archive_files[0]).scheduled_deletion_date = (
> +            datetime.now(pytz.UTC) - timedelta(hours=1))
> +
> +        # Delete the source package so that Sources is empty again.  The
> +        # empty file is reprieved and the non-empty one is condemned.
> +        pub_source.requestDeletion(self.ubuntutest.owner)
> +        publisher.A_publish(False)
> +        publisher.C_doFTPArchive(False)
> +        publisher.D_writeReleaseFiles(False)
> +        transaction.commit()
> +        self.assertThat(
> +            suite_path('main', 'source', 'by-hash'),
> +            ByHashHasContents(main_contents))
> +        archive_files = [
> +            archive_file
> +            for archive_file in getUtility(IArchiveFileSet).getByArchive(
> +                self.ubuntutest.main_archive)
> +            if archive_file.path == 'dists/breezy-autotest/main/source/Sources'
> +            ]
> +        self.assertThat(
> +            sorted(archive_files, key=attrgetter('id')),
> +            MatchesListwise([
> +                MatchesStructure(scheduled_deletion_date=Is(None)),
> +                MatchesStructure(scheduled_deletion_date=Not(Is(None))),
> +                ]))
> +
> +    def testUpdateByHashPrune(self):
> +        # The publisher prunes files from by-hash that were condemned more
> +        # than a day ago.
> +        self.breezy_autotest.publish_by_hash = True
> +        self.breezy_autotest.advertise_by_hash = True
> +        publisher = Publisher(
> +            self.logger, self.config, self.disk_pool,
> +            self.ubuntutest.main_archive)
> +
> +        suite_path = partial(
> +            os.path.join, self.config.distsroot, 'breezy-autotest')
> +        main_contents = set()
> +        for sourcename in ('foo', 'bar'):
> +            self.getPubSource(
> +                sourcename=sourcename, filecontent='Source: %s\n' % sourcename)
> +            publisher.A_publish(False)
> +            publisher.C_doFTPArchive(False)
> +            publisher.D_writeReleaseFiles(False)
> +            for name in ('Release', 'Sources.gz', 'Sources.bz2'):
> +                with open(suite_path('main', 'source', name), 'rb') as f:
> +                    main_contents.add(f.read())
> +        transaction.commit()
> +        # Undo any previous determination that breezy-autotest is dirty, so
> +        # that we can use that to check that future runs don't force index
> +        # regeneration.
> +        publisher.dirty_pockets = set()
> +
> +        self.assertThat(
> +            suite_path('main', 'source', 'by-hash'),
> +            ByHashHasContents(main_contents))
> +        old_archive_files = []
> +        for archive_file in getUtility(IArchiveFileSet).getByArchive(
> +                self.ubuntutest.main_archive):
> +            if ('main/source' in archive_file.path and
> +                    archive_file.scheduled_deletion_date is not None):
> +                old_archive_files.append(archive_file)
> +        self.assertEqual(2, len(old_archive_files))
> +
> +        now = datetime.now(pytz.UTC)
> +        removeSecurityProxy(old_archive_files[0]).scheduled_deletion_date = (
> +            now + timedelta(hours=12))
> +        removeSecurityProxy(old_archive_files[1]).scheduled_deletion_date = (
> +            now - timedelta(hours=12))
> +        old_archive_files[1].library_file.open()
> +        try:
> +            main_contents.remove(old_archive_files[1].library_file.read())
> +        finally:
> +            old_archive_files[1].library_file.close()
> +        self.assertThat(
> +            suite_path('main', 'source', 'by-hash'),
> +            Not(ByHashHasContents(main_contents)))
> +
> +        publisher.A2_markPocketsWithDeletionsDirty()
> +        publisher.C_doFTPArchive(False)
> +        publisher.D_writeReleaseFiles(False)
> +        self.assertEqual(set(), publisher.dirty_pockets)
> +        self.assertThat(
> +            suite_path('main', 'source', 'by-hash'),
> +            ByHashHasContents(main_contents))
> +
>      def testCreateSeriesAliasesNoAlias(self):
>          """createSeriesAliases has nothing to do by default."""
>          publisher = Publisher(
> 
> === modified file 'lib/lp/registry/model/distribution.py'
> --- lib/lp/registry/model/distribution.py	2015-10-13 13:22:08 +0000
> +++ lib/lp/registry/model/distribution.py	2016-03-22 12:51:52 +0000
> @@ -1283,10 +1283,22 @@
>              bin_query, clauseTables=['BinaryPackagePublishingHistory'],
>              orderBy=['archive.id'], distinct=True)
>  
> +        reapable_af_query = """
> +        Archive.purpose = %s AND
> +        Archive.distribution = %s AND
> +        ArchiveFile.archive = archive.id AND
> +        ArchiveFile.scheduled_deletion_date < %s
> +        """ % sqlvalues(ArchivePurpose.PPA, self, UTC_NOW)
> +
> +        reapable_af_archives = Archive.select(
> +            reapable_af_query, clauseTables=['ArchiveFile'],
> +            orderBy=['archive.id'], distinct=True)

Does this query perform OK? It probably makes successful use of archivefile__archive__scheduled_deletion_date__container, but it's worth checking.
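
If it does need checking, something along these lines from a staging shell would show the plan (a rough sketch: the SQL is my by-hand reconstruction of what Archive.select generates here, and the literal purpose/distribution values are placeholders for illustration only):

    from lp.services.database.interfaces import IStore
    from lp.soyuz.model.archive import Archive

    # Ask Postgres for the plan of (approximately) the generated query and
    # check that it uses an ArchiveFile index rather than a seq scan.
    result = IStore(Archive).execute("""
        EXPLAIN
        SELECT DISTINCT Archive.id
        FROM Archive, ArchiveFile
        WHERE Archive.purpose = 2
          AND Archive.distribution = 1
          AND ArchiveFile.archive = Archive.id
          AND ArchiveFile.scheduled_deletion_date < CURRENT_TIMESTAMP
        ORDER BY Archive.id
    """)
    for row in result.get_all():
        print row[0]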

> +
>          deleting_archives = Archive.selectBy(
>              status=ArchiveStatus.DELETING).orderBy(['archive.id'])
>  
> -        return src_archives.union(bin_archives).union(deleting_archives)
> +        return src_archives.union(bin_archives).union(
> +            reapable_af_archives).union(deleting_archives)
>  
>      def getArchiveByComponent(self, component_name):
>          """See `IDistribution`."""


-- 
https://code.launchpad.net/~cjwatson/launchpad/archive-index-by-hash/+merge/289379
Your team Launchpad code reviewers is subscribed to branch lp:launchpad.

