Re: [Merge] lp:~cjwatson/launchpad/archive-index-by-hash into lp:launchpad
Diff comments:
>
> === modified file 'lib/lp/archivepublisher/publishing.py'
> --- lib/lp/archivepublisher/publishing.py 2016-03-11 11:45:56 +0000
> +++ lib/lp/archivepublisher/publishing.py 2016-03-17 14:51:01 +0000
> @@ -501,7 +600,18 @@
> *conditions).config(distinct=True).order_by(
> DistroSeries.id, BinaryPackagePublishingHistory.pocket)
>
> - for distroseries, pocket in chain(source_suites, binary_suites):
> + archive_file_suites = []
> + for container in getUtility(IArchiveFileSet).getContainersToReap(
> + self.archive, container_prefix=u"release:"):
The latter is true: publishdistro.py will only process the PPA if it shows up in getPendingPublicationPPAs, which requires an SPPH or BPPH to be in the right state, or Archive.status == DELETING.
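Roughly the gate I mean, as an illustrative sketch only (should_process_ppa and has_pending_publications are made-up names; the real query lives in getPendingPublicationPPAs):

    def should_process_ppa(archive):
        # The PPA shows up in getPendingPublicationPPAs if it has an
        # SPPH or BPPH in the right (pending) state...
        if has_pending_publications(archive):
            return True
        # ...or if the whole archive has been marked for deletion.
        return archive.status == ArchiveStatus.DELETING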
> + try:
> + distroseries, pocket = self.distro.getDistroSeriesAndPocket(
> + container[len(u"release:"):])
> + archive_file_suites.append((distroseries, pocket))
> + except NotFoundError:
> + pass
> +
> + for distroseries, pocket in chain(
> + source_suites, binary_suites, archive_file_suites):
> if self.isDirty(distroseries, pocket):
> continue
> if (cannot_modify_suite(self.archive, distroseries, pocket)
> @@ -796,6 +906,69 @@
> return self.distro.displayname
> return "LP-PPA-%s" % get_ppa_reference(self.archive)
>
> + def _updateByHash(self, suite, release_data):
> + """Update by-hash files for a suite."""
The state that this writes back to the DB does ultimately come from the DB, but the flow is pretty unique within the publisher, so the docstring is important.
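E.g. something along these lines (the wording is only a sketch, based on what the method in this diff actually does):

    def _updateByHash(self, suite, release_data):
        """Update by-hash files for a suite.

        Take the Release file data naming the currently-active index
        files, inject any newly-modified files from that set into the
        librarian and the ArchiveFile table, and then rewrite the
        on-disk by-hash directories to match what the database records,
        pruning entries that are no longer referenced.
        """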
> + archive_file_set = getUtility(IArchiveFileSet)
> + by_hashes = ByHashes(self._config.archiveroot)
> + suite_dir = os.path.relpath(
> + os.path.join(self._config.distsroot, suite),
> + self._config.archiveroot)
> + container = "release:%s" % suite
> +
> + # Remove any condemned files from the database. We ensure that we
> + # know about all the relevant by-hash directory trees before doing
> + # any removals so that we can prune them properly later.
> + for archive_file in archive_file_set.getByArchive(
> + self.archive, container=container):
> + by_hashes.getChild(archive_file.path)
> + archive_file_set.reap(self.archive, container=container)
> +
> + # Gather information.
> + archive_files = archive_file_set.getByArchive(
> + self.archive, container=container, eager_load=True)
> + active_files = {}
> + for active_entry in release_data["SHA256"]:
> + path = os.path.join(suite_dir, active_entry["name"])
> + active_files[path] = (active_entry["size"], active_entry["sha256"])
> +
> + # Ensure that all files recorded in the database are in by-hash.
> + current_files = {}
> + for archive_file in archive_files:
> + by_hashes.add(archive_file.path, archive_file.library_file)
> + if archive_file.scheduled_deletion_date is None:
> + current_files[archive_file.path] = archive_file
> +
> + # Supersede any database records that do not correspond to active
> + # index files.
> + superseded_files = set()
> + for archive_file in archive_files:
> + path = archive_file.path
> + if (path not in active_files or
> + not by_hashes.exists(
> + path, "SHA256", active_files[path][1])):
> + superseded_files.add(archive_file)
> + archive_file_set.scheduleDeletion(
> + superseded_files, timedelta(days=BY_HASH_STAY_OF_EXECUTION))
> +
> + # Ensure that all the active index files are in by-hash and have
> + # corresponding database entries.
> + # XXX cjwatson 2016-03-15: This should possibly use bulk creation,
> + # although we can only avoid about a third of the queries since the
> + # librarian client has no bulk upload methods.
> + for path, (size, sha256) in active_files.items():
> + full_path = os.path.join(self._config.archiveroot, path)
> + if (os.path.exists(full_path) and
> + not by_hashes.exists(path, "SHA256", sha256)):
You're correct about my misunderstanding (my "9 lines" case is wrong, obviously), but I think there's still a race. Any ArchiveFile that survives reaping is added to the ByHashes via db_files, even if the ArchiveFile is superseded. The ArchiveFile that satisfies the .known here could expire in 5 minutes, and be reaped and pruned in the next publisher run rather than 24 hours later.
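To make the race concrete (my reading of the current diff):

    run N:   an old ArchiveFile row for path P with hash S is already
             superseded and due to be reaped in a few minutes.
    run N:   a new index at P with the same hash S becomes active;
             by_hashes.exists(P, "SHA256", S) is true thanks to the old
             row, so no fresh ArchiveFile row is created for it.
    run N+1: the old row is reaped and prune() removes S from disk,
             even though the current Release file still references it.

One possible guard, as a hypothetical sketch (current_files comes from the diff above; I'm assuming LibraryFileContent.sha256 is the right thing to compare against):

    # Only treat the hash as covered if a *current* (non-superseded)
    # ArchiveFile provides it; otherwise create a fresh row so that it
    # gets a new stay of execution.
    for path, (size, sha256) in active_files.items():
        full_path = os.path.join(self._config.archiveroot, path)
        if not os.path.exists(full_path):
            continue
        covered = (
            path in current_files and
            current_files[path].library_file.content.sha256 == sha256)
        if not covered:
            archive_file = archive_file_set.newFromFile(
                self.archive, container, self._config.archiveroot, path,
                size, filenameToContentType(path))
            by_hashes.add(
                path, archive_file.library_file, copy_from_path=path)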
> + archive_file = archive_file_set.newFromFile(
> + self.archive, container, self._config.archiveroot, path,
> + size, filenameToContentType(path))
> + by_hashes.add(
> + path, archive_file.library_file, copy_from_path=path)
> +
> + # Finally, remove any files from disk that aren't recorded in the
> + # database and aren't active.
> + by_hashes.prune()
> +
> def _writeReleaseFile(self, suite, release_data):
> """Write a Release file to the archive.
>
--
https://code.launchpad.net/~cjwatson/launchpad/archive-index-by-hash/+merge/289379
Your team Launchpad code reviewers is subscribed to branch lp:launchpad.