← Back to team overview

launchpad-reviewers team mailing list archive

Re: [Merge] lp:~cjwatson/launchpad/archive-index-by-hash into lp:launchpad

 


Diff comments:

> 
> === modified file 'lib/lp/archivepublisher/publishing.py'
> --- lib/lp/archivepublisher/publishing.py	2016-03-17 17:08:49 +0000
> +++ lib/lp/archivepublisher/publishing.py	2016-03-22 12:51:52 +0000
> @@ -231,6 +247,141 @@
>          return max(len(str(item['size'])) for item in self[key])
>  
>  
> +class IArchiveHash(Interface):
> +    """Represents a hash algorithm used for index files."""
> +
> +    hash_factory = Attribute("A hashlib class suitable for this algorithm.")
> +    deb822_name = Attribute(
> +        "Algorithm name expected by debian.deb822.Release.")
> +    apt_name = Attribute(
> +        "Algorithm name used by apt in Release files and by-hash "
> +        "subdirectories.")
> +    lfc_name = Attribute(
> +        "LibraryFileContent attribute name corresponding to this algorithm.")
> +
> +
> +@implementer(IArchiveHash)
> +class MD5ArchiveHash:
> +    hash_factory = hashlib.md5
> +    deb822_name = "md5sum"
> +    apt_name = "MD5Sum"
> +    lfc_name = "md5"
> +
> +
> +@implementer(IArchiveHash)
> +class SHA1ArchiveHash:
> +    hash_factory = hashlib.sha1
> +    deb822_name = "sha1"
> +    apt_name = "SHA1"
> +    lfc_name = "sha1"
> +
> +
> +@implementer(IArchiveHash)
> +class SHA256ArchiveHash:
> +    hash_factory = hashlib.sha256
> +    deb822_name = "sha256"
> +    apt_name = "SHA256"
> +    lfc_name = "sha256"
> +
> +
> +archive_hashes = [
> +    MD5ArchiveHash(),
> +    SHA1ArchiveHash(),
> +    SHA256ArchiveHash(),
> +    ]
> +
> +
> +class ByHash:
> +    """Represents a single by-hash directory tree."""
> +
> +    def __init__(self, root, key):
> +        self.root = root
> +        self.path = os.path.join(root, key, "by-hash")
> +        self.known_digests = defaultdict(set)
> +
> +    def add(self, lfa, copy_from_path=None):
> +        """Ensure that by-hash entries for a single file exist.
> +
> +        :param lfa: The `ILibraryFileAlias` to add.
> +        :param copy_from_path: If not None, copy file content from here
> +            rather than fetching it from the librarian.  This can be used
> +            for newly-added files to avoid needing to commit the transaction
> +            before calling this method.
> +        """
> +        for archive_hash in archive_hashes:
> +            digest = getattr(lfa.content, archive_hash.lfc_name)
> +            digest_path = os.path.join(
> +                self.path, archive_hash.apt_name, digest)
> +            self.known_digests[archive_hash.apt_name].add(digest)
> +            if not os.path.exists(digest_path):
> +                ensure_directory_exists(os.path.dirname(digest_path))
> +                if copy_from_path is not None:
> +                    os.link(
> +                        os.path.join(self.root, copy_from_path), digest_path)
> +                else:
> +                    with open(digest_path, "wb") as outfile:
> +                        lfa.open()
> +                        try:
> +                            shutil.copyfileobj(lfa, outfile, 4 * 1024 * 1024)
> +                        finally:
> +                            lfa.close()
> +
> +    def known(self, hashname, digest):
> +        """Do we know about a file with this digest?"""
> +        return digest in self.known_digests[hashname]
> +
> +    def prune(self):
> +        """Remove all by-hash entries that we have not been told to add.
> +
> +        This also removes the by-hash directory itself if no entries remain.
> +        """
> +        prune_directory = True
> +        for archive_hash in archive_hashes:
> +            hash_path = os.path.join(self.path, archive_hash.apt_name)
> +            if os.path.exists(hash_path):
> +                prune_hash_directory = True
> +                for digest in list(os.listdir(hash_path)):
> +                    if not self.known(archive_hash.apt_name, digest):
> +                        os.unlink(os.path.join(hash_path, digest))
> +                    else:
> +                        prune_hash_directory = False
> +                if prune_hash_directory:
> +                    os.rmdir(hash_path)
> +                else:
> +                    prune_directory = False
> +        if prune_directory and os.path.exists(self.path):
> +            os.rmdir(self.path)
> +
> +
> +class ByHashes:
> +    """Represents all by-hash directory trees in an archive."""
> +
> +    def __init__(self, root):
> +        self.root = root
> +        self.children = {}
> +
> +    def registerChild(self, path):
> +        """Register a single by-hash directory.

`path` isn't actually the directory being registered, but a child of it (the key is `os.path.dirname(path)`); the docstring should make that clear.

> +
> +        Only directories that have been registered here will be pruned by
> +        the `prune` method.
> +        """
> +        key = os.path.dirname(path)
> +        if key not in self.children:
> +            self.children[key] = ByHash(self.root, key)
> +        return self.children[key]
> +
> +    def add(self, path, lfa, copy_from_path=None):
> +        self.registerChild(path).add(lfa, copy_from_path=copy_from_path)
> +
> +    def known(self, path, hashname, digest):
> +        return self.registerChild(path).known(hashname, digest)
> +
> +    def prune(self):
> +        for child in self.children.values():
> +            child.prune()
> +
> +
>  class Publisher(object):
>      """Publisher is the class used to provide the facility to publish
>      files in the pool of a Distribution. The publisher objects will be


-- 
https://code.launchpad.net/~cjwatson/launchpad/archive-index-by-hash/+merge/289379
Your team Launchpad code reviewers is subscribed to branch lp:launchpad.


References