launchpad-reviewers team mailing list archive
-
launchpad-reviewers team
-
Mailing list archive
-
Message #21085
Re: [Merge] lp:~cjwatson/launchpad/codeimport-git-worker into lp:launchpad
Review: Approve code
Diff comments:
> === modified file 'configs/development/launchpad-lazr.conf'
> --- configs/development/launchpad-lazr.conf 2016-06-30 16:05:11 +0000
> +++ configs/development/launchpad-lazr.conf 2016-10-11 15:36:32 +0000
> @@ -55,6 +55,7 @@
>
> [codeimport]
> bazaar_branch_store: file:///tmp/bazaar-branches
> +git_repository_store: https://git.launchpad.dev/
It's entirely confusing that git_repository_store and bazaar_branch_store are pretty radically different things. bazaar_branch_store is the intermediate tree on maple (formerly escudero), whereas git_repository_store is the actual hosting tree. The lack of puller for git imports is certainly a relief, but the chance of this not causing someone to faceplant is tiny.
A comment might be sufficient, but could we also get away with using the existing git_browse_root (GitRepository.git_https_url would be ideal, but returns None for private repos which are technically a possibility for imports)?
> foreign_tree_store: file:///tmp/foreign-branches
>
> [codeimportdispatcher]
>
> === modified file 'lib/lp/codehosting/codeimport/tests/test_worker.py'
> --- lib/lp/codehosting/codeimport/tests/test_worker.py 2016-10-11 15:36:32 +0000
> +++ lib/lp/codehosting/codeimport/tests/test_worker.py 2016-10-11 15:36:32 +0000
> @@ -1416,6 +1453,19 @@
> str(code_import.branch.id), 'git',
> 'git://git.example.com/project.git']))
>
> + def test_git_to_git_arguments(self):
> + self.pushConfig('codeimport', macaroon_secret_key='some-secret')
> + self.useFixture(GitHostingFixture())
> + code_import = self.factory.makeCodeImport(
> + git_repo_url="git://git.example.com/project.git",
> + target_rcs_type=TargetRevisionControlSystems.GIT)
> + self.assertArgumentsMatch(
> + code_import, MatchesListwise([
> + Equals(code_import.git_repository.unique_name),
> + Equals('git:git'), Equals('git://git.example.com/project.git'),
> + CodeImportJobMacaroonVerifies(code_import.git_repository)]),
> + start_job=True)
A long time in the future in an editor far, far away: "Why would I want to start the job? Silly test author, I'll drop that pointlessness while I'm refactoring. Why is the macaroon verification silently failing now?"
> +
> def test_cvs_arguments(self):
> code_import = self.factory.makeCodeImport(
> cvs_root=':pserver:foo@xxxxxxxxxxx/bar', cvs_module='bar')
>
> === modified file 'lib/lp/codehosting/codeimport/worker.py'
> --- lib/lp/codehosting/codeimport/worker.py 2016-10-11 15:36:32 +0000
> +++ lib/lp/codehosting/codeimport/worker.py 2016-10-11 15:36:32 +0000
> @@ -269,36 +282,50 @@
> of the information suitable for passing around on executables' command
> lines.
>
> - :ivar target_id: The id of the Bazaar branch associated with this code
> - import, used for locating the existing import and the foreign tree.
> + :ivar target_id: The id of the Bazaar branch or the path of the Git
> + repository associated with this code import, used for locating the
> + existing import and the foreign tree.
> :ivar rcstype: 'cvs', 'git', 'bzr-svn', 'bzr' as appropriate.
> + :ivar target_rcstype: 'bzr' or 'git' as appropriate.
> :ivar url: The branch URL if rcstype in ['bzr-svn', 'git', 'bzr'], None
> otherwise.
> :ivar cvs_root: The $CVSROOT if rcstype == 'cvs', None otherwise.
> :ivar cvs_module: The CVS module if rcstype == 'cvs', None otherwise.
> :ivar stacked_on_url: The URL of the branch that the associated branch
> is stacked on, if any.
> + :ivar macaroon: A macaroon granting authority to push to the target
> + repository if target_rcstype == 'git', None otherwise.
> """
>
> - def __init__(self, target_id, rcstype, url=None, cvs_root=None,
> - cvs_module=None, stacked_on_url=None):
> + def __init__(self, target_id, rcstype, target_rcstype, url=None,
> + cvs_root=None, cvs_module=None, stacked_on_url=None,
> + macaroon=None):
> self.target_id = target_id
> self.rcstype = rcstype
> + self.target_rcstype = target_rcstype
> self.url = url
> self.cvs_root = cvs_root
> self.cvs_module = cvs_module
> self.stacked_on_url = stacked_on_url
> + self.macaroon = macaroon
>
> @classmethod
> def fromArguments(cls, arguments):
> """Convert command line-style arguments to an instance."""
> - target_id = int(arguments.pop(0))
> + target_id = arguments.pop(0)
> rcstype = arguments.pop(0)
> + if ':' in rcstype:
> + rcstype, target_rcstype = rcstype.split(':', 1)
> + else:
> + target_rcstype = 'bzr'
XXX this for cleanup post-deployment?
> if rcstype in ['bzr-svn', 'git', 'bzr']:
> url = arguments.pop(0)
> - try:
> - stacked_on_url = arguments.pop(0)
> - except IndexError:
> + if target_rcstype == 'bzr':
> + try:
> + stacked_on_url = arguments.pop(0)
> + except IndexError:
> + stacked_on_url = None
> + else:
> stacked_on_url = None
> cvs_root = cvs_module = None
> elif rcstype == 'cvs':
> @@ -307,35 +334,54 @@
> [cvs_root, cvs_module] = arguments
> else:
> raise AssertionError("Unknown rcstype %r." % rcstype)
> + if target_rcstype == 'bzr':
> + target_id = int(target_id)
> + macaroon = None
> + elif target_rcstype == 'git':
> + macaroon = Macaroon.deserialize(arguments.pop(0))
/proc/self/cmdline isn't the most secure of password managers, but I suppose alternate means are awkward.
> + else:
> + raise AssertionError("Unknown target_rcstype %r." % target_rcstype)
> return cls(
> - target_id, rcstype, url, cvs_root, cvs_module, stacked_on_url)
> + target_id, rcstype, target_rcstype, url, cvs_root, cvs_module,
> + stacked_on_url, macaroon)
>
> @classmethod
> def fromCodeImportJob(cls, job):
> """Convert a `CodeImportJob` to an instance."""
> code_import = job.code_import
> target = code_import.target
> - if target.stacked_on is not None and not target.stacked_on.private:
> - stacked_path = branch_id_alias(target.stacked_on)
> - stacked_on_url = compose_public_url('http', stacked_path)
> + if IBranch.providedBy(target):
> + if target.stacked_on is not None and not target.stacked_on.private:
> + stacked_path = branch_id_alias(target.stacked_on)
> + stacked_on_url = compose_public_url('http', stacked_path)
> + else:
> + stacked_on_url = None
> + target_id = target.id
> else:
> - stacked_on_url = None
> + target_id = target.unique_name
It's moderately unfortunate that we're using a mutable identifier here, but since the translatePath is anonymous the worst that can happen is that the job fails due to macaroon auth failure.
> if code_import.rcs_type == RevisionControlSystems.BZR_SVN:
> return cls(
> - target.id, 'bzr-svn', str(code_import.url),
> + target_id, 'bzr-svn', 'bzr', str(code_import.url),
> stacked_on_url=stacked_on_url)
> elif code_import.rcs_type == RevisionControlSystems.CVS:
> return cls(
> - target.id, 'cvs',
> + target_id, 'cvs', 'bzr',
> cvs_root=str(code_import.cvs_root),
> cvs_module=str(code_import.cvs_module))
> elif code_import.rcs_type == RevisionControlSystems.GIT:
> - return cls(
> - target.id, 'git', str(code_import.url),
> - stacked_on_url=stacked_on_url)
> + if IBranch.providedBy(target):
> + return cls(
> + target_id, 'git', 'bzr', str(code_import.url),
> + stacked_on_url=stacked_on_url)
> + else:
> + issuer = getUtility(IMacaroonIssuer, 'code-import-job')
> + macaroon = removeSecurityProxy(issuer).issueMacaroon(job)
> + return cls(
> + target_id, 'git', 'git', str(code_import.url),
> + macaroon=macaroon)
> elif code_import.rcs_type == RevisionControlSystems.BZR:
> return cls(
> - target.id, 'bzr', str(code_import.url),
> + target_id, 'bzr', 'bzr', str(code_import.url),
> stacked_on_url=stacked_on_url)
> else:
> raise AssertionError("Unknown rcstype %r." % code_import.rcs_type)
> @@ -343,7 +389,14 @@
> def asArguments(self):
> """Return a list of arguments suitable for passing to a child process.
> """
> - result = [str(self.target_id), self.rcstype]
> + result = [str(self.target_id)]
> + if self.target_rcstype == 'bzr':
> + result.append(self.rcstype)
XXX to drop this special case later?
> + elif self.target_rcstype == 'git':
> + result.append('%s:%s' % (self.rcstype, self.target_rcstype))
> + else:
> + raise AssertionError(
> + "Unknown target_rcstype %r." % self.target_rcstype)
> if self.rcstype in ['bzr-svn', 'git', 'bzr']:
> result.append(self.url)
> if self.stacked_on_url is not None:
> @@ -918,3 +973,67 @@
> """See `PullingImportWorker.probers`."""
> from bzrlib.bzrdir import BzrProber, RemoteBzrProber
> return [BzrProber, RemoteBzrProber]
> +
> +
> +class GitToGitImportWorker(ImportWorker):
> + """An import worker for imports from Git to Git."""
> +
> + def _runGit(self, *args, **kwargs):
> + """Run git with arguments, sending output to the logger."""
> + cmd = ["git"] + list(args)
> + git_process = subprocess.Popen(
> + cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, **kwargs)
> + for line in git_process.stdout:
> + line = line.decode("UTF-8", "replace").rstrip("\n")
> + # Remove any user/password data from URLs.
> + line = re.sub(r"://([^:]*:[^@]*@)(\S+)", r"://\2", line)
This is eerily similar to lp.buildmaster.model.buildfarmjobbehaviour.sanitise_arguments.
I suppose someone could devise a malicious git server that injects stderr at just the right time to split a URL logged to stdout and bypass the sanitiser, but that'd be pretty impressive.
> + self._logger.info(line)
> + retcode = git_process.wait()
> + if retcode:
> + raise subprocess.CalledProcessError(retcode, cmd)
> +
> + def _doImport(self):
> + self._logger.info("Starting job.")
> + self._logger.info(config.codeimport.git_repository_store)
> + try:
> + self._opener_policy.checkOneURL(self.source_details.url)
How clever is git at following redirects?
> + except BadUrl as e:
> + self._logger.info("Invalid URL: %s" % e)
> + return CodeImportWorkerExitCode.FAILURE_FORBIDDEN
> + unauth_target_url = urljoin(
> + config.codeimport.git_repository_store,
> + self.source_details.target_id)
> + split = urlsplit(unauth_target_url)
> + if split.hostname:
> + target_netloc = ":%s@%s" % (
> + self.source_details.macaroon.serialize(), split.hostname)
> + else:
> + target_netloc = ""
> + target_url = urlunsplit([
> + split.scheme, target_netloc, split.path, "", ""])
> + # XXX cjwatson 2016-10-11: Ideally we'd put credentials in a
> + # credentials store instead. However, git only accepts credentials
> + # that have both a non-empty username and a non-empty password.
> + self._logger.info("Getting existing repository from hosting service.")
> + try:
> + self._runGit("clone", "--bare", target_url, "repository")
Do we want a --mirror? Though --bare seems to confusingly already implicitly fetch the branches to the right place, it doesn't fetch other refs that will then be have to be obtained from the Internet by --mirror=fetch.
> + except subprocess.CalledProcessError as e:
> + self._logger.info(
> + "Unable to get existing repository from hosting service: "
> + "%s" % e)
> + return CodeImportWorkerExitCode.FAILURE
> + self._logger.info("Fetching remote repository.")
> + try:
> + self._runGit(
> + "remote", "add", "-f", "--mirror=fetch",
> + "mirror", self.source_details.url, cwd="repository")
Can we just "git fetch $URL +refs/*:refs/*"? We don't actually need the remote layer here.
> + except subprocess.CalledProcessError as e:
> + self._logger.info("Unable to fetch remote repository: %s" % e)
> + return CodeImportWorkerExitCode.FAILURE_INVALID
> + self._logger.info("Pushing repository to hosting service.")
> + try:
> + self._runGit("push", "--mirror", target_url, cwd="repository")
> + except subprocess.CalledProcessError as e:
> + self._logger.info("Unable to push to hosting service: %s" % e)
> + return CodeImportWorkerExitCode.FAILURE
> + return CodeImportWorkerExitCode.SUCCESS
>
> === modified file 'lib/lp/services/config/schema-lazr.conf'
> --- lib/lp/services/config/schema-lazr.conf 2016-10-11 15:36:32 +0000
> +++ lib/lp/services/config/schema-lazr.conf 2016-10-11 15:36:32 +0000
> @@ -417,7 +421,7 @@
> svn_revisions_import_limit: 500
>
> # Secret key for macaroons used to grant git push permission to workers.
> -macaroon_secret_key:
> +macaroon_secret_key: none
I really don't understand how config dynamic typing works, but I hope that means things will crash appropriate if this isn't set on production.
>
> [codeimportdispatcher]
> # The directory where the code import worker should be directed to
--
https://code.launchpad.net/~cjwatson/launchpad/codeimport-git-worker/+merge/308142
Your team Launchpad code reviewers is subscribed to branch lp:launchpad.
References