← Back to team overview

cloud-init-dev team mailing list archive

Re: [Merge] ~powersj/cloud-init:cii-restructure-ssh into cloud-init:master

 


Diff comments:

> diff --git a/tests/cloud_tests/platforms/instances.py b/tests/cloud_tests/platforms/instances.py
> index 3bad021..fa392b7 100644
> --- a/tests/cloud_tests/platforms/instances.py
> +++ b/tests/cloud_tests/platforms/instances.py
> @@ -98,21 +103,21 @@ class Instance(TargetBase):
>          client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
>          private_key = paramiko.RSAKey.from_private_key_file(self.ssh_key_file)
>  
> -        retries = 30
> +        retries = 3
>          while retries:
>              try:
>                  client.connect(username=self.ssh_username,
>                                 hostname=self.ssh_ip, port=self.ssh_port,
> -                               pkey=private_key, banner_timeout=30)

If we don't have a timeout on the client.connect method, then is this test now unbounded?  Or is there a default timeout for failed connections?

> +                               pkey=private_key)
>                  self._ssh_client = client
>                  return client
>              except (ConnectionRefusedError, AuthenticationException,
>                      BadHostKeyException, ConnectionResetError, SSHException,
> -                    OSError) as e:
> +                    OSError):
>                  retries -= 1
> -                time.sleep(10)
> +                time.sleep(3)

This seems like an arbitrary change;  I think it would be good to know the upper bound on how long we want to wait
before we call _ssh_connect() a bust, and then work backwards w.r.t the number of tries and how long to sleep 
within the total time we plan to wait.

>  
> -        ssh_cmd = 'Failed ssh connection to %s@%s:%s after 300 seconds' % (
> +        ssh_cmd = 'Failed ssh connection to %s@%s:%s after 3 retries' % (
>              self.ssh_username, self.ssh_ip, self.ssh_port
>          )
>          raise util.InTargetExecuteError(b'', b'', 1, ssh_cmd, 'ssh')
> @@ -120,6 +125,10 @@ class Instance(TargetBase):
>      def _wait_for_system(self, wait_for_cloud_init):
>          """Wait until system has fully booted and cloud-init has finished.
>  
> +        Attempt to connect to the instance for the duration of the
> +        boot_timeout value. This means we will retry SSH connections
> +        untill we connect or untill the timeout is reached.

s/untill/until

That comment doesn't seem to mach the ssh logic above which will only retry 3 times.

> +
>          @param wait_time: maximum time to wait
>          @return_value: None, may raise OSError if wait_time exceeded
>          """
> @@ -128,18 +137,26 @@ class Instance(TargetBase):
>              return ' '.join(l for l in test.strip().splitlines()
>                              if not l.lstrip().startswith('#'))
>  
> -        time = self.config['boot_timeout']
> +        boot_timeout = self.config['boot_timeout']
>          tests = [self.config['system_ready_script']]
>          if wait_for_cloud_init:
>              tests.append(self.config['cloud_init_ready_script'])
>  
>          formatted_tests = ' && '.join(clean_test(t) for t in tests)
>          cmd = ('i=0; while [ $i -lt {time} ] && i=$(($i+1)); do {test} && '
> -               'exit 0; sleep 1; done; exit 1').format(time=time,
> +               'exit 0; sleep 1; done; exit 1').format(time=boot_timeout,
>                                                         test=formatted_tests)
>  
> -        if self.execute(cmd, rcs=(0, 1))[-1] != 0:
> -            raise OSError('timeout: after {}s system not started'.format(time))
> -
> +        end_time = time.time() + boot_timeout
> +        while time.time() < end_time:

I would suggest a slightly different structure to ensure that we always execute the test at least once (note that if for some reason the system sleeps between end_time and the next time.time() we'll never run self.execute.

end_time = time.time() + boot_timeout
while True:
    try:
        if self.execute()[-1] == 0:
            break # success, we can exit the while loop
        else:
            raise OSError
    except util.InTargetExecuteError as e:
        log.error("Encountered error: %s", e)

    if time.time() < end_time:
        time.sleep(3)
    else:
        raise OSError

Hrm, looking at that, I'm not sure why we'd raise and OSerror with "timeout", can't execute return 1 and not be related to timing out?

> +            try:
> +                if self.execute(cmd, rcs=(0, 1),
> +                                description='wait for instance')[-1] != 0:
> +                    raise OSError('timeout: after %ss system not started' %
> +                                  boot_timeout)
> +                else:
> +                    break
> +            except util.InTargetExecuteError:
> +                time.sleep(3)
>  
>  # vi: ts=4 expandtab


-- 
https://code.launchpad.net/~powersj/cloud-init/+git/cloud-init/+merge/342010
Your team cloud-init commiters is requested to review the proposed merge of ~powersj/cloud-init:cii-restructure-ssh into cloud-init:master.


References