← Back to team overview

launchpad-reviewers team mailing list archive

[Merge] lp:~cjwatson/launchpad/fewer-by-hash-algorithms into lp:launchpad

 

Colin Watson has proposed merging lp:~cjwatson/launchpad/fewer-by-hash-algorithms into lp:launchpad.

Commit message:
Drop MD5Sum and SHA1 from by-hash; all versions of apt that support by-hash also support SHA256.

Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)

For more details, see:
https://code.launchpad.net/~cjwatson/launchpad/fewer-by-hash-algorithms/+merge/291871

Drop MD5Sum and SHA1 from by-hash; all versions of apt that support by-hash also support SHA256.  I wanted to make sure that we had the flexibility to support multiple hash algorithms here for the sake of future SHA512 or whatever, but we don't really need to include the obsolete ones just to prove a point.
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~cjwatson/launchpad/fewer-by-hash-algorithms into lp:launchpad.
=== modified file 'lib/lp/archivepublisher/publishing.py'
--- lib/lp/archivepublisher/publishing.py	2016-04-06 00:37:09 +0000
+++ lib/lp/archivepublisher/publishing.py	2016-04-14 10:34:30 +0000
@@ -261,6 +261,8 @@
         "subdirectories.")
     lfc_name = Attribute(
         "LibraryFileContent attribute name corresponding to this algorithm.")
+    write_by_hash = Attribute(
+        "Whether to write by-hash subdirectories for this algorithm.")
 
 
 @implementer(IArchiveHash)
@@ -269,6 +271,7 @@
     deb822_name = "md5sum"
     apt_name = "MD5Sum"
     lfc_name = "md5"
+    write_by_hash = False
 
 
 @implementer(IArchiveHash)
@@ -277,6 +280,7 @@
     deb822_name = "sha1"
     apt_name = "SHA1"
     lfc_name = "sha1"
+    write_by_hash = False
 
 
 @implementer(IArchiveHash)
@@ -285,6 +289,7 @@
     deb822_name = "sha256"
     apt_name = "SHA256"
     lfc_name = "sha256"
+    write_by_hash = True
 
 
 archive_hashes = [
@@ -303,6 +308,14 @@
         self.log = log
         self.known_digests = defaultdict(lambda: defaultdict(set))
 
+    @property
+    def _usable_archive_hashes(self):
+        usable = []
+        for archive_hash in archive_hashes:
+            if archive_hash.write_by_hash:
+                usable.append(archive_hash)
+        return usable
+
     def add(self, name, lfa, copy_from_path=None):
         """Ensure that by-hash entries for a single file exist.
 
@@ -313,9 +326,9 @@
             for newly-added files to avoid needing to commit the transaction
             before calling this method.
         """
-        best_hash = archive_hashes[-1]
+        best_hash = self._usable_archive_hashes[-1]
         best_digest = getattr(lfa.content, best_hash.lfc_name)
-        for archive_hash in reversed(archive_hashes):
+        for archive_hash in reversed(self._usable_archive_hashes):
             digest = getattr(lfa.content, archive_hash.lfc_name)
             digest_path = os.path.join(
                 self.path, archive_hash.apt_name, digest)

=== modified file 'lib/lp/archivepublisher/tests/test_publisher.py'
--- lib/lp/archivepublisher/tests/test_publisher.py	2016-04-06 00:37:09 +0000
+++ lib/lp/archivepublisher/tests/test_publisher.py	2016-04-14 10:34:30 +0000
@@ -6,6 +6,7 @@
 __metaclass__ = type
 
 import bz2
+from collections import OrderedDict
 import crypt
 from datetime import (
     datetime,
@@ -453,14 +454,16 @@
 
     def __init__(self, contents):
         self.contents = contents
+        self.expected_hashes = OrderedDict([
+            ("SHA256", "sha256"),
+            ])
 
     def match(self, by_hash_path):
-        mismatch = DirContains(["MD5Sum", "SHA1", "SHA256"]).match(
-            by_hash_path)
+        mismatch = DirContains(self.expected_hashes.keys()).match(by_hash_path)
         if mismatch is not None:
             return mismatch
-        for hashname, hashattr in (
-                ("MD5Sum", "md5"), ("SHA1", "sha1"), ("SHA256", "sha256")):
+        best_hashname, best_hashattr = self.expected_hashes.items()[-1]
+        for hashname, hashattr in self.expected_hashes.items():
             digests = {
                 getattr(hashlib, hashattr)(content).hexdigest(): content
                 for content in self.contents}
@@ -470,14 +473,14 @@
                 return mismatch
             for digest, content in digests.items():
                 full_path = os.path.join(path, digest)
-                if hashname == "SHA256":
+                if hashname == best_hashname:
                     mismatch = FileContains(content).match(full_path)
                     if mismatch is not None:
                         return mismatch
                 else:
                     best_path = os.path.join(
-                        by_hash_path, "SHA256",
-                        hashlib.sha256(content).hexdigest())
+                        by_hash_path, best_hashname,
+                        getattr(hashlib, best_hashattr)(content).hexdigest())
                     mismatch = SamePath(best_path).match(full_path)
                     if mismatch is not None:
                         return mismatch
@@ -545,12 +548,6 @@
         with open_for_writing(
                 os.path.join(by_hash_path, "SHA256", sha256_digest), "w") as f:
             f.write(content)
-        for hashname, hashattr in (("MD5Sum", "md5"), ("SHA1", "sha1")):
-            digest = getattr(hashlib, hashattr)(content).hexdigest()
-            os.makedirs(os.path.join(by_hash_path, hashname))
-            os.symlink(
-                os.path.join(os.pardir, "SHA256", sha256_digest),
-                os.path.join(by_hash_path, hashname, digest))
         by_hash = ByHash(root, "dists/foo/main/source", DevNullLogger())
         self.assertThat(by_hash_path, ByHashHasContents([content]))
         by_hash.add("Sources", lfa)
@@ -570,8 +567,8 @@
         self.assertFalse(by_hash.known("abc", "SHA1", sha1))
         self.assertFalse(by_hash.known("abc", "SHA256", sha256))
         by_hash.add("abc", lfa, copy_from_path="abc")
-        self.assertTrue(by_hash.known("abc", "MD5Sum", md5))
-        self.assertTrue(by_hash.known("abc", "SHA1", sha1))
+        self.assertFalse(by_hash.known("abc", "MD5Sum", md5))
+        self.assertFalse(by_hash.known("abc", "SHA1", sha1))
         self.assertTrue(by_hash.known("abc", "SHA256", sha256))
         self.assertFalse(by_hash.known("def", "SHA256", sha256))
         by_hash.add("def", lfa, copy_from_path="abc")
@@ -587,7 +584,7 @@
         by_hash = ByHash(root, "dists/foo/main/source", DevNullLogger())
         by_hash.add("Sources", lfa, copy_from_path=sources_path)
         by_hash_path = os.path.join(root, "dists/foo/main/source/by-hash")
-        with open_for_writing(os.path.join(by_hash_path, "MD5Sum/0"), "w"):
+        with open_for_writing(os.path.join(by_hash_path, "SHA256/0"), "w"):
             pass
         self.assertThat(by_hash_path, Not(ByHashHasContents([content])))
         by_hash.prune()
@@ -597,12 +594,35 @@
         root = self.makeTemporaryDirectory()
         by_hash = ByHash(root, "dists/foo/main/source", DevNullLogger())
         by_hash_path = os.path.join(root, "dists/foo/main/source/by-hash")
-        with open_for_writing(os.path.join(by_hash_path, "MD5Sum/0"), "w"):
+        with open_for_writing(os.path.join(by_hash_path, "SHA256/0"), "w"):
             pass
         self.assertThat(by_hash_path, PathExists())
         by_hash.prune()
         self.assertThat(by_hash_path, Not(PathExists()))
 
+    def test_prune_old_hashes(self):
+        # The initial implementation of by-hash included MD5Sum and SHA1,
+        # but we since decided that this was unnecessary cruft.  If they
+        # exist on disk, they are pruned in their entirety.
+        root = self.makeTemporaryDirectory()
+        content = "abc\n"
+        lfa = self.factory.makeLibraryFileAlias(content=content)
+        by_hash_path = os.path.join(root, "dists/foo/main/source/by-hash")
+        sha256_digest = hashlib.sha256(content).hexdigest()
+        with open_for_writing(
+                os.path.join(by_hash_path, "SHA256", sha256_digest), "w") as f:
+            f.write(content)
+        for hashname, hashattr in (("MD5Sum", "md5"), ("SHA1", "sha1")):
+            digest = getattr(hashlib, hashattr)(content).hexdigest()
+            os.makedirs(os.path.join(by_hash_path, hashname))
+            os.symlink(
+                os.path.join(os.pardir, "SHA256", sha256_digest),
+                os.path.join(by_hash_path, hashname, digest))
+        by_hash = ByHash(root, "dists/foo/main/source", DevNullLogger())
+        by_hash.add("Sources", lfa)
+        by_hash.prune()
+        self.assertThat(by_hash_path, ByHashHasContents([content]))
+
 
 class TestByHashes(TestCaseWithFactory):
     """Unit tests for details of handling a set of by-hash directory trees."""
@@ -645,8 +665,8 @@
         self.assertFalse(by_hashes.known(sources_path, "SHA1", sha1))
         self.assertFalse(by_hashes.known(sources_path, "SHA256", sha256))
         by_hashes.add(sources_path, lfa, copy_from_path=sources_path)
-        self.assertTrue(by_hashes.known(sources_path, "MD5Sum", md5))
-        self.assertTrue(by_hashes.known(sources_path, "SHA1", sha1))
+        self.assertFalse(by_hashes.known(sources_path, "MD5Sum", md5))
+        self.assertFalse(by_hashes.known(sources_path, "SHA1", sha1))
         self.assertTrue(by_hashes.known(sources_path, "SHA256", sha256))
 
     def test_prune(self):
@@ -666,8 +686,8 @@
                     content=content, db_only=True)
                 by_hashes.add(path, lfa, copy_from_path=path)
         strays = [
-            "dists/foo/main/source/by-hash/MD5Sum/0",
-            "dists/foo/main/binary-amd64/by-hash/MD5Sum/0",
+            "dists/foo/main/source/by-hash/SHA256/0",
+            "dists/foo/main/binary-amd64/by-hash/SHA256/0",
             ]
         for stray in strays:
             with open_for_writing(os.path.join(root, stray), "w"):


Follow ups