launchpad-reviewers team mailing list archive
  
  - 
     launchpad-reviewers team launchpad-reviewers team
- 
    Mailing list archive
  
- 
    Message #18167
  
 [Merge] lp:~cjwatson/launchpad/git-ref-scanner-commits into lp:launchpad
  
Colin Watson has proposed merging lp:~cjwatson/launchpad/git-ref-scanner-commits into lp:launchpad.
Commit message:
Scan author/committer/commit-message information from the tip commit of Git references.
Requested reviews:
  Launchpad code reviewers (launchpad-reviewers)
Related bugs:
  Bug #1032731 in Launchpad itself: "Support for Launchpad-hosted Git repositories"
  https://bugs.launchpad.net/launchpad/+bug/1032731
For more details, see:
https://code.launchpad.net/~cjwatson/launchpad/git-ref-scanner-commits/+merge/253501
Scan author/committer/commit-message information from the tip commit of Git references.
I ended up refactoring GitRepository.synchroniseRefs a bit in the process; it now has a separate planning stage that takes a GitHostingClient so that it can go off and fetch commit information.  The method split is mainly to make it easier to test.
-- 
Your team Launchpad code reviewers is requested to review the proposed merge of lp:~cjwatson/launchpad/git-ref-scanner-commits into lp:launchpad.
=== modified file 'lib/lp/code/errors.py'
--- lib/lp/code/errors.py	2015-03-12 15:21:27 +0000
+++ lib/lp/code/errors.py	2015-03-19 11:31:11 +0000
@@ -36,7 +36,7 @@
     'GitRepositoryCreatorNotMemberOfOwnerTeam',
     'GitRepositoryCreatorNotOwner',
     'GitRepositoryExists',
-    'GitRepositoryRefScanFault',
+    'GitRepositoryScanFault',
     'GitTargetError',
     'InvalidBranchMergeProposal',
     'InvalidMergeQueueConfig',
@@ -382,8 +382,8 @@
     """Raised when there is a hosting fault creating a Git repository."""
 
 
-class GitRepositoryRefScanFault(Exception):
-    """Raised when there is a fault getting the refs for a repository."""
+class GitRepositoryScanFault(Exception):
+    """Raised when there is a fault scanning a repository."""
 
 
 class GitTargetError(Exception):
=== modified file 'lib/lp/code/githosting.py'
--- lib/lp/code/githosting.py	2015-03-12 15:21:27 +0000
+++ lib/lp/code/githosting.py	2015-03-19 11:31:11 +0000
@@ -15,7 +15,7 @@
 
 from lp.code.errors import (
     GitRepositoryCreationFault,
-    GitRepositoryRefScanFault,
+    GitRepositoryScanFault,
     )
 
 
@@ -60,13 +60,37 @@
                 urlutils.join(self.endpoint, "repo", path, "refs"),
                 timeout=self.timeout)
         except Exception as e:
-            raise GitRepositoryRefScanFault(
+            raise GitRepositoryScanFault(
                 "Failed to get refs from Git repository: %s" % unicode(e))
         if response.status_code != 200:
-            raise GitRepositoryRefScanFault(
+            raise GitRepositoryScanFault(
                 "Failed to get refs from Git repository: %s" % response.text)
         try:
             return response.json()
         except ValueError as e:
-            raise GitRepositoryRefScanFault(
+            raise GitRepositoryScanFault(
                 "Failed to decode ref-scan response: %s" % unicode(e))
+
+    def get_commits(self, path, commit_oids):
+        try:
+            # XXX cjwatson 2015-03-01: Once we're on requests >= 2.4.2, we
+            # should just use post(json=) and drop the explicit Content-Type
+            # header.
+            response = self._makeSession().post(
+                urlutils.join(self.endpoint, "repo", path, "commits"),
+                headers={"Content-Type": "application/json"},
+                data=json.dumps(commit_oids),
+                timeout=self.timeout)
+        except Exception as e:
+            raise GitRepositoryScanFault(
+                "Failed to get commit details from Git repository: %s" %
+                unicode(e))
+        if response.status_code != 200:
+            raise GitRepositoryScanFault(
+                "Failed to get commit details from Git repository: %s" %
+                response.text)
+        try:
+            return response.json()
+        except ValueError as e:
+            raise GitRepositoryScanFault(
+                "Failed to decode commit-scan response: %s" % unicode(e))
=== modified file 'lib/lp/code/interfaces/gitref.py'
--- lib/lp/code/interfaces/gitref.py	2015-03-13 14:15:24 +0000
+++ lib/lp/code/interfaces/gitref.py	2015-03-19 11:31:11 +0000
@@ -15,6 +15,8 @@
     )
 from zope.schema import (
     Choice,
+    Datetime,
+    Text,
     TextLine,
     )
 
@@ -42,6 +44,24 @@
         title=_("Object type"), required=True, readonly=True,
         vocabulary=GitObjectType)
 
+    author = Attribute(
+        "The author of the commit pointed to by this reference.")
+    author_date = Datetime(
+        title=_("The author date of the commit pointed to by this reference."),
+        required=False, readonly=True)
+
+    committer = Attribute(
+        "The committer of the commit pointed to by this reference.")
+    committer_date = Datetime(
+        title=_(
+            "The committer date of the commit pointed to by this reference."),
+        required=False, readonly=True)
+
+    commit_message = Text(
+        title=_(
+            "The commit message of the commit pointed to by this reference."),
+        required=False, readonly=True)
+
     display_name = TextLine(
         title=_("Display name"), required=True, readonly=True,
         description=_("Display name of the reference."))
=== modified file 'lib/lp/code/interfaces/gitrepository.py'
--- lib/lp/code/interfaces/gitrepository.py	2015-03-17 10:42:24 +0000
+++ lib/lp/code/interfaces/gitrepository.py	2015-03-19 11:31:11 +0000
@@ -213,12 +213,21 @@
         :params paths: An iterable of paths.
         """
 
-    def synchroniseRefs(hosting_refs, logger=None):
+    def planRefChanges(hosting_client, hosting_path, logger=None):
+        """Plan ref changes based on information from the hosting service.
+
+        :param hosting_client: A `GitHostingClient`.
+        :param hosting_path: A path on the hosting service.
+        :param logger: An optional logger.
+        """
+
+    def synchroniseRefs(refs_to_upsert, refs_to_remove):
         """Synchronise references with those from the hosting service.
 
-        :param hosting_refs: A dictionary of reference information returned
-            from the hosting service's `/repo/PATH/refs` collection.
-        :param logger: An optional logger.
+        :param refs_to_upsert: A dictionary mapping ref paths to
+            dictionaries of their fields; these refs will be created or
+            updated as appropriate.
+        :param refs_to_remove: A set of ref paths to remove.
         """
 
     def setOwnerDefault(value):
=== modified file 'lib/lp/code/model/gitjob.py'
--- lib/lp/code/model/gitjob.py	2015-03-17 10:42:24 +0000
+++ lib/lp/code/model/gitjob.py	2015-03-19 11:31:11 +0000
@@ -189,8 +189,10 @@
                     LockType.GIT_REF_SCAN, self.repository.id,
                     Store.of(self.repository)):
                 hosting_path = self.repository.getInternalPath()
-                self.repository.synchroniseRefs(
-                    self._hosting_client.get_refs(hosting_path), logger=log)
+                refs_to_upsert, refs_to_remove = (
+                    self.repository.planRefChanges(
+                        self._hosting_client, hosting_path, logger=log))
+                self.repository.synchroniseRefs(refs_to_upsert, refs_to_remove)
         except LostObjectError:
             log.info(
                 "Skipping repository %s because it has been deleted." %
=== modified file 'lib/lp/code/model/gitref.py'
--- lib/lp/code/model/gitref.py	2015-03-13 14:15:24 +0000
+++ lib/lp/code/model/gitref.py	2015-03-19 11:31:11 +0000
@@ -6,7 +6,9 @@
     'GitRef',
     ]
 
+import pytz
 from storm.locals import (
+    DateTime,
     Int,
     Reference,
     Unicode,
@@ -36,6 +38,18 @@
 
     object_type = EnumCol(enum=GitObjectType, notNull=True)
 
+    author_id = Int(name='author', allow_none=True)
+    author = Reference(author_id, 'RevisionAuthor.id')
+    author_date = DateTime(
+        name='author_date', tzinfo=pytz.UTC, allow_none=True)
+
+    committer_id = Int(name='committer', allow_none=True)
+    committer = Reference(committer_id, 'RevisionAuthor.id')
+    committer_date = DateTime(
+        name='committer_date', tzinfo=pytz.UTC, allow_none=True)
+
+    commit_message = Unicode(name='commit_message', allow_none=True)
+
     @property
     def display_name(self):
         return self.path.split("/", 2)[-1]
=== modified file 'lib/lp/code/model/gitrepository.py'
--- lib/lp/code/model/gitrepository.py	2015-03-17 10:42:24 +0000
+++ lib/lp/code/model/gitrepository.py	2015-03-19 11:31:11 +0000
@@ -8,6 +8,8 @@
     'GitRepositorySet',
     ]
 
+from datetime import datetime
+import email
 from itertools import chain
 
 from bzrlib import urlutils
@@ -68,6 +70,7 @@
     IGitRepositorySet,
     user_has_special_git_repository_access,
     )
+from lp.code.interfaces.revision import IRevisionSet
 from lp.code.model.gitref import GitRef
 from lp.registry.enums import PersonVisibility
 from lp.registry.errors import CannotChangeInformationType
@@ -358,16 +361,28 @@
         store.flush()
 
         # Try a bulk update first.
-        column_names = ["repository_id", "path", "commit_sha1", "object_type"]
+        column_names = [
+            "repository_id", "path", "commit_sha1", "object_type",
+            "author_id", "author_date", "committer_id", "committer_date",
+            "commit_message",
+            ]
         column_types = [
             ("repository", "integer"),
             ("path", "text"),
             ("commit_sha1", "character(40)"),
             ("object_type", "integer"),
+            ("author", "integer"),
+            ("author_date", "timestamp without time zone"),
+            ("committer", "integer"),
+            ("committer_date", "timestamp without time zone"),
+            ("commit_message", "text"),
             ]
         columns = [getattr(GitRef, name) for name in column_names]
         values = [
-            (self.id, path, info["sha1"], info["type"])
+            (self.id, path, info["sha1"], info["type"],
+             info.get("author"), info.get("author_date"),
+             info.get("committer"), info.get("committer_date"),
+             info.get("commit_message"))
             for path, info in refs_info.items()]
         db_values = dbify_values(values)
         new_refs_expr = Values("new_refs", column_types, db_values)
@@ -409,23 +424,84 @@
             GitRef.repository == self, GitRef.path.is_in(paths)).remove()
         del get_property_cache(self).refs
 
-    def synchroniseRefs(self, hosting_refs, logger=None):
+    def planRefChanges(self, hosting_client, hosting_path, logger=None):
         """See `IGitRepository`."""
         new_refs = {}
-        for path, info in hosting_refs.items():
+        for path, info in hosting_client.get_refs(hosting_path).items():
             try:
                 new_refs[path] = self._convertRefInfo(info)
             except ValueError as e:
                 logger.warning("Unconvertible ref %s %s: %s" % (path, info, e))
+
+        # Plan the changes.
         current_refs = {ref.path: ref for ref in self.refs}
         refs_to_upsert = {}
         for path, info in new_refs.items():
             current_ref = current_refs.get(path)
             if (current_ref is None or
                 info["sha1"] != current_ref.commit_sha1 or
-                info["type"] != current_ref.object_type):
+                info["type"] != current_ref.object_type or
+                current_ref.author_id is None or
+                current_ref.author is None or
+                current_ref.committer_id is None or
+                current_ref.committer is None or
+                current_ref.commit_message is None):
                 refs_to_upsert[path] = info
+        oids_to_upsert = {
+            info["sha1"]: path for path, info in refs_to_upsert.items()}
         refs_to_remove = set(current_refs) - set(new_refs)
+
+        # Fetch any required commit information, and fill it into the
+        # appropriate info dictionaries.
+        commits = hosting_client.get_commits(
+            hosting_path, list(oids_to_upsert))
+        authors_to_acquire = {}
+        committers_to_acquire = {}
+        for commit in commits:
+            oid = commit["sha1"]
+            if oid in oids_to_upsert:
+                info = refs_to_upsert[oids_to_upsert[oid]]
+            else:
+                # We got a stray commit from the hosting service for some
+                # reason.  Just ignore it.
+                continue
+            author = commit.get("author")
+            if author is not None:
+                if "time" in author:
+                    info["author_date"] = datetime.fromtimestamp(
+                        author["time"], tz=pytz.UTC)
+                if "name" in author and "email" in author:
+                    author_addr = email.utils.formataddr(
+                        (author["name"], author["email"]))
+                    authors_to_acquire[author_addr] = info
+            committer = commit.get("committer")
+            if committer is not None:
+                if "time" in committer:
+                    info["committer_date"] = datetime.fromtimestamp(
+                        committer["time"], tz=pytz.UTC)
+                if "name" in committer and "email" in committer:
+                    committer_addr = email.utils.formataddr(
+                        (committer["name"], committer["email"]))
+                    committers_to_acquire[committer_addr] = info
+            if "message" in commit:
+                info["commit_message"] = commit["message"]
+        revision_authors = getUtility(IRevisionSet).acquireRevisionAuthors(
+            list(authors_to_acquire) + list(committers_to_acquire))
+        for addr, revision_author in revision_authors.items():
+            # Removing the security proxy here is safe because
+            # RevisionAuthors are always public.  We need to do this for the
+            # sake of dbify_value later.
+            if addr in authors_to_acquire:
+                authors_to_acquire[addr]["author"] = removeSecurityProxy(
+                    revision_author).id
+            if addr in committers_to_acquire:
+                committers_to_acquire[addr]["committer"] = removeSecurityProxy(
+                    revision_author).id
+
+        return refs_to_upsert, refs_to_remove
+
+    def synchroniseRefs(self, refs_to_upsert, refs_to_remove):
+        """See `IGitRepository`."""
         if refs_to_upsert:
             self.createOrUpdateRefs(refs_to_upsert)
         if refs_to_remove:
=== modified file 'lib/lp/code/model/tests/test_gitjob.py'
--- lib/lp/code/model/tests/test_gitjob.py	2015-03-17 10:51:15 +0000
+++ lib/lp/code/model/tests/test_gitjob.py	2015-03-19 11:31:11 +0000
@@ -5,8 +5,13 @@
 
 __metaclass__ = type
 
+from datetime import (
+    datetime,
+    timedelta,
+    )
 import hashlib
 
+import pytz
 from testtools.matchers import (
     MatchesSetwise,
     MatchesStructure,
@@ -25,7 +30,10 @@
     GitRefScanJob,
     )
 from lp.services.features.testing import FeatureFixture
-from lp.testing import TestCaseWithFactory
+from lp.testing import (
+    TestCaseWithFactory,
+    time_counter,
+    )
 from lp.testing.dbuser import dbuser
 from lp.testing.fakemethod import FakeMethod
 from lp.testing.layers import (
@@ -72,6 +80,27 @@
                 }}
             for path in paths}
 
+    @staticmethod
+    def makeFakeCommits(author, author_date_gen, paths):
+        epoch = datetime.fromtimestamp(0, tz=pytz.UTC)
+        dates = {path: next(author_date_gen) for path in paths}
+        return [{
+            "sha1": hashlib.sha1(path).hexdigest(),
+            "message": "tip of %s" % path,
+            "author": {
+                "name": author.displayname,
+                "email": author.preferredemail.email,
+                "time": int((dates[path] - epoch).total_seconds()),
+                },
+            "committer": {
+                "name": author.displayname,
+                "email": author.preferredemail.email,
+                "time": int((dates[path] - epoch).total_seconds()),
+                },
+            "parents": [],
+            "tree": hashlib.sha1("").hexdigest(),
+            } for path in paths]
+
     def assertRefsMatch(self, refs, repository, paths):
         matchers = [
             MatchesStructure.byEquality(
@@ -104,6 +133,11 @@
         paths = (u"refs/heads/master", u"refs/tags/1.0")
         job._hosting_client.get_refs = FakeMethod(
             result=self.makeFakeRefs(paths))
+        author = repository.owner
+        author_date_start = datetime(2015, 01, 01, tzinfo=pytz.UTC)
+        author_date_gen = time_counter(author_date_start, timedelta(days=1))
+        job._hosting_client.get_commits = FakeMethod(
+            result=self.makeFakeCommits(author, author_date_gen, paths))
         with dbuser("branchscanner"):
             job.run()
         self.assertRefsMatch(repository.refs, repository, paths)
@@ -113,6 +147,7 @@
         job = GitRefScanJob.create(repository)
         job._hosting_client.get_refs = FakeMethod(
             result={u"refs/heads/master": {}})
+        job._hosting_client.get_commits = FakeMethod(result=[])
         expected_message = (
             'Unconvertible ref refs/heads/master {}: '
             'ref info does not contain "object" key')
=== modified file 'lib/lp/code/model/tests/test_gitrepository.py'
--- lib/lp/code/model/tests/test_gitrepository.py	2015-03-17 10:47:59 +0000
+++ lib/lp/code/model/tests/test_gitrepository.py	2015-03-19 11:31:11 +0000
@@ -43,6 +43,7 @@
     IGitRepository,
     IGitRepositorySet,
     )
+from lp.code.interfaces.revision import IRevisionSet
 from lp.code.model.gitrepository import GitRepository
 from lp.registry.enums import (
     BranchSharingPolicy,
@@ -72,6 +73,7 @@
     TestCaseWithFactory,
     verifyObject,
     )
+from lp.testing.fakemethod import FakeMethod
 from lp.testing.layers import (
     DatabaseFunctionalLayer,
     ZopelessDatabaseLayer,
@@ -535,6 +537,93 @@
                 object_type=GitObjectType.BLOB,
                 ))
 
+    def test_planRefChanges(self):
+        # planRefChanges copes with planning changes to refs in a repository
+        # where some refs have been created, some deleted, and some changed.
+        repository = self.factory.makeGitRepository()
+        paths = (u"refs/heads/master", u"refs/heads/foo", u"refs/heads/bar")
+        self.factory.makeGitRefs(repository=repository, paths=paths)
+        self.assertRefsMatch(repository.refs, repository, paths)
+        master_sha1 = repository.getRefByPath(u"refs/heads/master").commit_sha1
+        foo_sha1 = repository.getRefByPath(u"refs/heads/foo").commit_sha1
+        author = self.factory.makePerson()
+        with person_logged_in(author):
+            author_email = author.preferredemail.email
+        epoch = datetime.fromtimestamp(0, tz=pytz.UTC)
+        author_date = datetime(2015, 1, 1, tzinfo=pytz.UTC)
+        committer_date = datetime(2015, 1, 2, tzinfo=pytz.UTC)
+        hosting_client = FakeMethod()
+        hosting_client.get_refs = FakeMethod(result={
+            u"refs/heads/master": {
+                u"object": {
+                    u"sha1": u"1111111111111111111111111111111111111111",
+                    u"type": u"commit",
+                    },
+                },
+            u"refs/heads/foo": {
+                u"object": {
+                    u"sha1": foo_sha1,
+                    u"type": u"commit",
+                    },
+                },
+            u"refs/tags/1.0": {
+                u"object": {
+                    u"sha1": master_sha1,
+                    u"type": u"commit",
+                    },
+                },
+            })
+        hosting_client.get_commits = FakeMethod(result=[
+            {
+                u"sha1": u"1111111111111111111111111111111111111111",
+                u"message": u"tip of master",
+                u"author": {
+                    u"name": author.displayname,
+                    u"email": author_email,
+                    u"time": int((author_date - epoch).total_seconds()),
+                    },
+                u"committer": {
+                    u"name": u"New Person",
+                    u"email": u"new-person@xxxxxxxxxxx",
+                    u"time": int((committer_date - epoch).total_seconds()),
+                    },
+                u"parents": [],
+                u"tree": unicode(hashlib.sha1("").hexdigest()),
+                }])
+        refs_to_upsert, refs_to_remove = repository.planRefChanges(
+            hosting_client, "dummy")
+
+        expected_oids = [
+            u"1111111111111111111111111111111111111111", foo_sha1, master_sha1]
+        [(_, observed_oids)] = hosting_client.get_commits.extract_args()
+        self.assertContentEqual(expected_oids, observed_oids)
+        [expected_author] = getUtility(IRevisionSet).acquireRevisionAuthors(
+            ["%s <%s>" % (author.displayname, author_email)]).values()
+        [expected_committer] = getUtility(IRevisionSet).acquireRevisionAuthors(
+            ["New Person <new-person@xxxxxxxxxxx>"]).values()
+        expected_upsert = {
+            u"refs/heads/master": {
+                u"sha1": u"1111111111111111111111111111111111111111",
+                u"type": GitObjectType.COMMIT,
+                u"author": removeSecurityProxy(expected_author).id,
+                u"author_date": author_date,
+                u"committer": removeSecurityProxy(expected_committer).id,
+                u"committer_date": committer_date,
+                u"commit_message": u"tip of master",
+                },
+            u"refs/heads/foo": {
+                u"sha1": unicode(hashlib.sha1(u"refs/heads/foo").hexdigest()),
+                u"type": GitObjectType.COMMIT,
+                },
+            u"refs/tags/1.0": {
+                u"sha1": unicode(
+                    hashlib.sha1(u"refs/heads/master").hexdigest()),
+                u"type": GitObjectType.COMMIT,
+                },
+            }
+        self.assertEqual(expected_upsert, refs_to_upsert)
+        self.assertEqual(set([u"refs/heads/bar"]), refs_to_remove)
+
     def test_synchroniseRefs(self):
         # synchroniseRefs copes with synchronising a repository where some
         # refs have been created, some deleted, and some changed.
@@ -542,28 +631,24 @@
         paths = (u"refs/heads/master", u"refs/heads/foo", u"refs/heads/bar")
         self.factory.makeGitRefs(repository=repository, paths=paths)
         self.assertRefsMatch(repository.refs, repository, paths)
-        repository.synchroniseRefs({
+        refs_to_upsert = {
             u"refs/heads/master": {
-                u"object": {
-                    u"sha1": u"1111111111111111111111111111111111111111",
-                    u"type": u"commit",
-                    },
+                u"sha1": u"1111111111111111111111111111111111111111",
+                u"type": GitObjectType.COMMIT,
                 },
             u"refs/heads/foo": {
-                u"object": {
-                    u"sha1": repository.getRefByPath(
-                        u"refs/heads/foo").commit_sha1,
-                    u"type": u"commit",
-                    },
+                u"sha1": repository.getRefByPath(
+                    u"refs/heads/foo").commit_sha1,
+                u"type": GitObjectType.COMMIT,
                 },
             u"refs/tags/1.0": {
-                u"object": {
-                    u"sha1": repository.getRefByPath(
-                        u"refs/heads/master").commit_sha1,
-                    u"type": u"commit",
-                    },
+                u"sha1": repository.getRefByPath(
+                    u"refs/heads/master").commit_sha1,
+                u"type": GitObjectType.COMMIT,
                 },
-            })
+            }
+        refs_to_remove = set([u"refs/heads/bar"])
+        repository.synchroniseRefs(refs_to_upsert, refs_to_remove)
         expected_sha1s = [
             (u"refs/heads/master",
              u"1111111111111111111111111111111111111111"),
Follow ups