cloud-init-dev team mailing list archive

Re: [Merge] ~chad.smith/cloud-init:feature/azure-network-on-boot into cloud-init:master

 

This is coming together.

Diff comments:

> diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
> index 7007d9e..87f15b2 100644
> --- a/cloudinit/sources/DataSourceAzure.py
> +++ b/cloudinit/sources/DataSourceAzure.py
> @@ -252,6 +263,10 @@ class DataSourceAzure(sources.DataSource):
>  
>      dsname = 'Azure'
>      _negotiated = False
> +    _metadata_imds = sources.UNSET
> +
> +    # Regenerate network config new_instance boot and every boot
> +    update_events = {'network': [EventType.BOOT_NEW_INSTANCE, EventType.BOOT]}

The prototype YAML included a version and a 'when' attribute which accepted the list of events.
Is this update_events dict meant to model that?

updates:
  policy-version: [1]  # default to 1
  network:
    when: [never, per-instance, boot, boot-change, always]
    watch-url: http://..../
  storage:
    when:

>  
>      def __init__(self, sys_cfg, distro, paths):
>          sources.DataSource.__init__(self, sys_cfg, distro, paths)
> @@ -373,41 +390,86 @@ class DataSourceAzure(sources.DataSource):
>              except NonAzureDataSource:
>                  continue
>              except BrokenAzureDataSource as exc:
> -                raise exc
> +                msg = 'BrokenAzureDataSource: %s' % exc
> +                raise sources.InvalidMetaDataException(msg)
>              except util.MountFailedError:
>                  LOG.warning("%s was not mountable", cdev)
>                  continue
>  
>              if reprovision or self._should_reprovision(ret):
>                  ret = self._reprovision()
> -            (md, self.userdata_raw, cfg, files) = ret
> +            imds_md = get_metadata_from_imds(
> +                self.fallback_interface, retries=3)
> +            (md, userdata_raw, cfg, files) = ret
>              self.seed = cdev
> -            self.metadata = util.mergemanydict([md, DEFAULT_METADATA])
> -            self.cfg = util.mergemanydict([cfg, BUILTIN_CLOUD_CONFIG])
> +            crawled_data.update({
> +                'cfg': cfg,
> +                'files': files,
> +                'metadata': util.mergemanydict(
> +                    [md, {'imds': imds_md}]),
> +                'userdata_raw': userdata_raw})
>              found = cdev
>  
>              LOG.debug("found datasource in %s", cdev)
>              break
>  
>          if not found:
> -            return False
> +            raise sources.InvalidMetaDataException('No Azure metadata found')
>  
>          if found == ddir:
>              LOG.debug("using files cached in %s", ddir)
>  
>          seed = _get_random_seed()
>          if seed:
> -            self.metadata['random_seed'] = seed
> +            crawled_data['metadata']['random_seed'] = seed
> +        crawled_data['metadata']['instance-id'] = util.read_dmi_data(
> +            'system-uuid')
> +        return crawled_data
> +
> +    def is_platform_viable(self):

check_platform_id()?  We already have a check_instance_id().

> +        """Check platform environment to report if this datasource may run."""
> +        asset_tag = util.read_dmi_data('chassis-asset-tag')
> +        if asset_tag != AZURE_CHASSIS_ASSET_TAG:
> +            LOG.debug("Non-Azure DMI asset tag '%s' discovered.", asset_tag)
> +            return False
> +        return True
> +
> +    def clear_cached_attrs(self):
> +        """Reset any cached class attributes to defaults."""
> +        super(DataSourceAzure, self).clear_cached_attrs()
> +        self._metadata_imds = sources.UNSET
> +

Do you think we should deprecate the get_data() entry point so callers can be notified to change?
That is, I think we want to stop exposing only get_data(), since we're breaking this into the
various stages (crawl, process, cache), right?
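
Something roughly like this on the base class, perhaps (the names below are just a sketch of the idea, not a concrete API proposal):

    # sketch only: split the monolithic get_data() into explicit stages
    from cloudinit import sources

    class StagedDataSource(sources.DataSource):

        def crawl_metadata(self):
            """Fetch raw platform data; raise InvalidMetaDataException on error."""
            raise NotImplementedError

        def _process_crawled_data(self, crawled_data):
            """Merge crawled data with defaults onto metadata/userdata/cfg attrs."""
            raise NotImplementedError

        def _get_data(self):
            # compatibility path; a deprecation warning for callers still using
            # get_data() directly could be emitted here
            try:
                crawled = self.crawl_metadata()
            except sources.InvalidMetaDataException:
                return False
            self._process_crawled_data(crawled)
            return True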

> +    def _get_data(self):
> +        """Crawl and process datasource metadata caching metadata as attrs.
> +
> +        @return: True on success, False on error, invalid or disabled
> +            datasource.
> +        """
> +        if not self.is_platform_viable():
> +            return False
> +        try:
> +            crawled_data = util.log_time(
> +                        logfunc=LOG.debug, msg='Crawl of metadata service',
> +                        func=self.crawl_metadata)
> +        except sources.InvalidMetaDataException as e:
> +            LOG.warning('Could not crawl Azure metadata: %s', e)
> +            return False
> +
> +        # Process crawled data and augment with various config defaults

This looks like it should be a method on the class that takes the crawled data, right?
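
e.g. pull those assignments out into something like this (method name is only a suggestion; the body is just the statements from this hunk):

    def _process_crawled_data(self, crawled_data):
        """Populate cached attributes from the dict crawl_metadata() returns."""
        self.cfg = util.mergemanydict(
            [crawled_data['cfg'], BUILTIN_CLOUD_CONFIG])
        self._metadata_imds = crawled_data['metadata']['imds']
        self.metadata = util.mergemanydict(
            [crawled_data['metadata'], DEFAULT_METADATA])
        self.userdata_raw = crawled_data['userdata_raw']

Then _get_data() stays a thin crawl-then-process wrapper, and a unit test can exercise the merge logic with a hand-built crawled_data dict.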

> +        self.cfg = util.mergemanydict(
> +            [crawled_data['cfg'], BUILTIN_CLOUD_CONFIG])
> +        self._metadata_imds = crawled_data['metadata']['imds']
> +        self.metadata = util.mergemanydict(
> +            [crawled_data['metadata'], DEFAULT_METADATA])
> +        self.userdata_raw = crawled_data['userdata_raw']
>  
>          user_ds_cfg = util.get_cfg_by_path(self.cfg, DS_CFG_PATH, {})
>          self.ds_cfg = util.mergemanydict([user_ds_cfg, self.ds_cfg])
>  
>          # walinux agent writes files world readable, but expects
>          # the directory to be protected.
> -        write_files(ddir, files, dirmode=0o700)
> -
> -        self.metadata['instance-id'] = util.read_dmi_data('system-uuid')
> -
> +        write_files(
> +            self.ds_cfg['data_dir'], crawled_data['files'], dirmode=0o700)
>          return True
>  
>      def device_name_to_device(self, name):
> @@ -543,22 +605,56 @@ class DataSourceAzure(sources.DataSource):
>      @property
>      def network_config(self):
>          """Generate a network config like net.generate_fallback_network() with
> -           the following execptions.
> +           the following execeptions.

heh, exceptions

>  
>             1. Probe the drivers of the net-devices present and inject them in
>                the network configuration under params: driver: <driver> value
>             2. Generate a fallback network config that does not include any of
>                the blacklisted devices.
>          """
> -        blacklist = ['mlx4_core']
>          if not self._network_config:
> -            LOG.debug('Azure: generating fallback configuration')
> -            # generate a network config, blacklist picking any mlx4_core devs
> -            netconfig = net.generate_fallback_config(
> -                blacklist_drivers=blacklist, config_driver=True)
> +            if self._metadata_imds != sources.UNSET and self._metadata_imds:
> +                netconfig = {'version': 2, 'ethernets': {}}

This needs to be a function for easier testing.
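
Maybe something along these lines (name hypothetical), so a unit test can feed an IMDS fixture straight in:

    # sketch only: module-level helper so the translation is testable on its own
    def _generate_network_config_from_imds_metadata(imds_metadata):
        """Translate IMDS network metadata into a netplan v2 dict."""
        netconfig = {'version': 2, 'ethernets': {}}
        for idx, intf in enumerate(imds_metadata['network']['interface']):
            # ... same per-interface dhcp4/addresses/dhcp6 logic as this hunk ...
            pass
        return netconfig

The network_config property would then reduce to calling that helper with self._metadata_imds, and tests can assert on its return value without touching the property machinery.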

> +                LOG.debug('Azure: generating network configuration from IMDS')
> +                if self.distro and self.distro.name == 'ubuntu':
> +                    remove_ubuntu_network_config_scripts()
> +                network_metadata = self._metadata_imds['network']
> +                for idx, intf in enumerate(network_metadata['interface']):
> +                    nicname = 'eth{idx}'.format(idx=idx)
> +                    dev_config = {}
> +                    for addr4 in intf['ipv4']['ipAddress']:
> +                        privateIpv4 = addr4['privateIpAddress']
> +                        if privateIpv4:
> +                            if dev_config.get('dhcp4', False):
> +                                # Append static address config for nic > 1
> +                                netPrefix = intf['ipv4']['subnet'][0].get(
> +                                    'prefix', '24')
> +                                if not dev_config.get('addresses'):
> +                                    dev_config['addresses'] = []
> +                                dev_config['addresses'].append(
> +                                    '{ip}/{prefix}'.format(
> +                                        ip=privateIpv4, prefix=netPrefix))
> +                            else:
> +                                dev_config['dhcp4'] = True
> +                    for addr6 in intf['ipv6']['ipAddress']:
> +                        privateIpv6 = addr6['privateIpAddress']
> +                        if privateIpv6:
> +                            dev_config['dhcp6'] = True
> +                            break
> +                    if dev_config:
> +                        mac = ':'.join(re.findall(r'..', intf['macAddress']))
> +                        dev_config.update(
> +                            {'match': {'macaddress': mac.lower()},
> +                             'set-name': nicname})
> +                        netconfig['ethernets'][nicname] = dev_config
> +            else:
> +                blacklist = ['mlx4_core']
> +                LOG.debug('Azure: generating fallback configuration')
> +                # generate a network config, blacklist picking mlx4_core devs
> +                netconfig = net.generate_fallback_config(
> +                    blacklist_drivers=blacklist, config_driver=True)
>  
>              self._network_config = netconfig
> -
>          return self._network_config
>  
>  
> @@ -1025,6 +1121,89 @@ def load_azure_ds_dir(source_dir):
>      return (md, ud, cfg, {'ovf-env.xml': contents})
>  
>  
> +def get_metadata_from_imds(fallback_nic, retries):
> +    """Query Azure's network metadata service, returning a dictionary.
> +
> +    If network is not up, setup ephemeral dhcp on fallback_nic to talk to the
> +    IMDS. For more info on IMDS:
> +        https://docs.microsoft.com/en-us/azure/virtual-machines/windows/instance-metadata-service
> +
> +    @param fallback_nic: String. The name of the nic which requires active
> +        networ in order to query IMDS.
> +    @param retries: The number of retries of the IMDS_URL.
> +
> +    @return: A dict of instance metadata containing compute and network
> +        info.
> +    """
> +    if net.is_up(fallback_nic):
> +        return util.log_time(
> +            logfunc=LOG.debug,
> +            msg='Crawl of Azure Instance Metadata Service (IMDS)',
> +            func=_get_metadata_from_imds, args=(retries,))
> +    else:
> +        with EphemeralDHCPv4(fallback_nic):
> +            return util.log_time(
> +                logfunc=LOG.debug,
> +                msg='Crawl of Azure Instance Metadata Service (IMDS)',
> +                func=_get_metadata_from_imds, args=(retries,))
> +
> +
> +def _get_metadata_from_imds(retries):
> +
> +    def retry_on_url_error(msg, exception):
> +        if isinstance(exception, UrlError) and exception.code == 404:
> +            return True  # Continue retries
> +        return False  # Stop retries on all other exceptions, including 404s
> +
> +    url = IMDS_URL + "instance?api-version=2017-12-01"

I see at least two API version values. Can we centralize these into variables that
explain what the data means or the expected format of this version?
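
e.g. something like (constant name is only a suggestion):

    # sketch: one named constant documenting the IMDS API/wire format we request
    IMDS_API_VERSION = '2017-12-01'

    url = IMDS_URL + 'instance?api-version=' + IMDS_API_VERSION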

> +    headers = {"Metadata": "true"}
> +    try:
> +        response = readurl(
> +            url, timeout=1, headers=headers, retries=retries,

timeout=1?  Is the default timeout too long? Too short?

> +            exception_cb=retry_on_url_error)
> +    except Exception as e:
> +        LOG.debug('Ignoring IMDS instance metadata: %s', e)
> +        return {}
> +    try:
> +        return util.load_json(str(response))
> +    except json.decoder.JSONDecodeError:
> +        LOG.warning(
> +            'Ignoring non-json IMDS instance metadata: %s', str(response))
> +    return {}
> +
> +
> +def remove_ubuntu_network_config_scripts(paths=None):

We typically prepend maybe_ to names like this, since it's possible that in new images we won't need to do this.
Would it be useful to set a flag on the ds object to indicate whether we've removed those or not?
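
e.g. something like (wrapper and flag names are hypothetical):

    # sketch: only attempt the cleanup once per datasource instance
    def maybe_remove_ubuntu_network_config_scripts(self):
        if getattr(self, '_ubuntu_network_scripts_removed', False):
            return
        remove_ubuntu_network_config_scripts()
        self._ubuntu_network_scripts_removed = True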

> +    """Remove Azure-specific ubuntu network config for non-primary nics.
> +
> +    @param paths: List of networking scripts or directories to remove when
> +        present.
> +
> +    In certain supported ubuntu images, static udev rules or netplan yaml
> +    config is delivered in the base ubuntu image to support dhcp on any
> +    additional interfaces which get attached by a customer at some point
> +    after initial boot. Since the Azure datasource can now regenerate
> +    network configuration as metadata reports these new devices, we no longer
> +    want the udev rules or netplan's 90-azure-hotplug.yaml to configure
> +    networking on eth1 or greater as it might collide with cloud-init's
> +    configuration.
> +
> +    Remove the any existing extended network scripts if the datasource is
> +    enabled to write network per-boot.
> +    """
> +    if not paths:
> +        paths = UBUNTU_EXTENDED_NETWORK_SCRIPTS
> +    LOG.debug(
> +        'Removing Ubuntu extended network scripts because cloud-init updates'
> +        ' Azure network configuration on the following event: %s.',
> +        EventType.BOOT)
> +    for path in paths:
> +        if os.path.exists(path):
> +            if os.path.isdir(path):
> +                util.del_dir(path)
> +            else:
> +                util.del_file(path)
> +
> +
>  class BrokenAzureDataSource(Exception):
>      pass
>  
> diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py
> index e82716e..fbc9d2d 100644
> --- a/tests/unittests/test_datasource/test_azure.py
> +++ b/tests/unittests/test_datasource/test_azure.py
> @@ -77,9 +83,110 @@ def construct_valid_ovf_env(data=None, pubkeys=None,
>      return content
>  
>  
> +NETWORK_METADATA = {

What URL path and api-version is this?

> +    "network": {
> +        "interface": [
> +            {
> +                "macAddress": "000D3A047598",
> +                "ipv6": {
> +                    "ipAddress": []
> +                },
> +                "ipv4": {
> +                    "subnet": [
> +                        {
> +                           "prefix": "24",
> +                           "address": "10.0.0.0"
> +                        }
> +                    ],
> +                    "ipAddress": [
> +                        {
> +                           "privateIpAddress": "10.0.0.4",
> +                           "publicIpAddress": "104.46.124.81"
> +                        }
> +                    ]
> +                }
> +            }
> +        ]
> +    }
> +}
> +
> +
> +class TestGetMetadataFromIMDS(HttprettyTestCase):
> +
> +    with_logs = True
> +
> +    def setUp(self):
> +        super(TestGetMetadataFromIMDS, self).setUp()
> +        self.network_md_url = dsaz.IMDS_URL + "instance?api-version=2017-12-01"
> +
> +    @mock.patch('cloudinit.sources.DataSourceAzure.readurl')
> +    @mock.patch('cloudinit.sources.DataSourceAzure.EphemeralDHCPv4')
> +    @mock.patch('cloudinit.sources.DataSourceAzure.net.is_up')
> +    def test_get_metadata_does_not_dhcp_if_network_is_up(
> +            self, m_net_is_up, m_dhcp, m_readurl):
> +        """Do not perform DHCP setup when nic is already up."""
> +        m_net_is_up.return_value = True
> +        m_readurl.return_value = url_helper.StringResponse(
> +            json.dumps(NETWORK_METADATA).encode('utf-8'))
> +        self.assertEqual(
> +            NETWORK_METADATA,
> +            dsaz.get_metadata_from_imds('eth9', retries=3))
> +
> +        m_net_is_up.assert_called_with('eth9')
> +        m_dhcp.assert_not_called()
> +        self.assertIn(
> +            "Crawl of Azure Instance Metadata Service (IMDS) took",  # log_time
> +            self.logs.getvalue())
> +
> +    @mock.patch('cloudinit.sources.DataSourceAzure.readurl')
> +    @mock.patch('cloudinit.sources.DataSourceAzure.EphemeralDHCPv4')
> +    @mock.patch('cloudinit.sources.DataSourceAzure.net.is_up')
> +    def test_get_metadata_performs_dhcp_when_network_is_down(
> +            self, m_net_is_up, m_dhcp, m_readurl):
> +        """Do not perform DHCP setup when nic is already up."""

The docstring doesn't match the test case.

> +        m_net_is_up.return_value = False
> +        m_readurl.return_value = url_helper.StringResponse(
> +            json.dumps(NETWORK_METADATA).encode('utf-8'))
> +
> +        self.assertEqual(
> +            NETWORK_METADATA,
> +            dsaz.get_metadata_from_imds('eth9', retries=2))
> +
> +        m_net_is_up.assert_called_with('eth9')
> +        m_dhcp.assert_called_with('eth9')
> +        self.assertIn(
> +            "Crawl of Azure Instance Metadata Service (IMDS) took",  # log_time
> +            self.logs.getvalue())
> +
> +        m_readurl.assert_called_with(
> +            self.network_md_url, exception_cb=mock.ANY,
> +            headers={'Metadata': 'true'}, retries=2, timeout=1)
> +
> +    @mock.patch('cloudinit.url_helper.time.sleep')
> +    @mock.patch('cloudinit.sources.DataSourceAzure.net.is_up')
> +    def test_get_metadata_from_imds_empty_when_no_imds_present(
> +            self, m_net_is_up, m_sleep):
> +        """Return empty dict when IMDS network metadata is absent."""
> +        httpretty.register_uri(
> +            httpretty.GET,
> +            dsaz.IMDS_URL + 'instance?api-version=2017-12-01',
> +            body={}, status=404)
> +
> +        m_net_is_up.return_value = True  # skips dhcp
> +
> +        self.assertEqual({}, dsaz.get_metadata_from_imds('eth9', retries=2))
> +
> +        m_net_is_up.assert_called_with('eth9')
> +        self.assertEqual([mock.call(1), mock.call(1)], m_sleep.call_args_list)
> +        self.assertIn(
> +            "Crawl of Azure Instance Metadata Service (IMDS) took",  # log_time
> +            self.logs.getvalue())
> +
> +
>  class TestAzureDataSource(CiTestCase):
>  
>      with_logs = True
> +    maxDiff = None

Should this go in CiTestCase by default?

>  
>      def setUp(self):
>          super(TestAzureDataSource, self).setUp()
> @@ -1188,6 +1430,24 @@ class TestCanDevBeReformatted(CiTestCase):
>                        "(datasource.Azure.never_destroy_ntfs)", msg)
>  
>  
> +class TestClearCachedData(CiTestCase):
> +
> +    def test_clear_cached_attrs_clears_imds(self):
> +        """All class attributes are reset to defaults, including imds data."""
> +        tmp = self.tmp_dir()
> +        paths = helpers.Paths(
> +            {'cloud_dir': tmp, 'run_dir': tmp})
> +        dsrc = dsaz.DataSourceAzure({}, distro=None, paths=paths)
> +        dsrc.metadata = 'md'
> +        dsrc.userdata = 'ud'
> +        dsrc._metadata_imds = 'imds'
> +        dsrc._dirty_cache = True
> +        dsrc.clear_cached_attrs()
> +        self.assertEqual({}, dsrc.metadata)
> +        self.assertEqual(None, dsrc.userdata)
> +        self.assertEqual(UNSET, dsrc._metadata_imds)

I'm not super happy that we have *three* different values as the default.
I'd like to not have to know this in the unit test.
Can we instantiate a new ds instance, deep_copy it, and then assert
that the three fields are equal after clear_cached_attrs() instead?
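
Rough sketch of that (with an 'import copy' at module level; attribute names taken from this branch):

    def test_clear_cached_attrs_matches_a_fresh_instance(self):
        tmp = self.tmp_dir()
        paths = helpers.Paths({'cloud_dir': tmp, 'run_dir': tmp})
        fresh = dsaz.DataSourceAzure({}, distro=None, paths=paths)
        expected = copy.deepcopy(
            (fresh.metadata, fresh.userdata, fresh._metadata_imds))

        dsrc = dsaz.DataSourceAzure({}, distro=None, paths=paths)
        dsrc.metadata = 'md'
        dsrc.userdata = 'ud'
        dsrc._metadata_imds = 'imds'
        dsrc._dirty_cache = True
        dsrc.clear_cached_attrs()

        self.assertEqual(
            expected, (dsrc.metadata, dsrc.userdata, dsrc._metadata_imds))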

> +
> +
>  class TestAzureNetExists(CiTestCase):
>  
>      def test_azure_net_must_exist_for_legacy_objpkl(self):
> @@ -1398,4 +1658,52 @@ class TestAzureDataSourcePreprovisioning(CiTestCase):
>          self.assertEqual(m_net.call_count, 1)
>  
>  
> +class TestRemoveUbuntuNetworkConfigScripts(CiTestCase):
> +
> +    with_logs = True
> +
> +    def setUp(self):
> +        super(TestRemoveUbuntuNetworkConfigScripts, self).setUp()
> +        self.tmp = self.tmp_dir()
> +
> +    def test_remove_network_scripts_removes_both_files_and_directories(self):
> +        """Any files or directories in paths are removed when present."""
> +        file1 = self.tmp_path('file1', dir=self.tmp)
> +        subdir = self.tmp_path('sub1', dir=self.tmp)
> +        subfile = self.tmp_path('leaf1', dir=subdir)
> +        write_file(file1, 'file1content')

Can we instead mock os.path.exists and friends here, rather than writing real files?
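
The function only consults os.path.exists/os.path.isdir before calling util.del_dir/util.del_file, so roughly:

    @mock.patch('cloudinit.sources.DataSourceAzure.util.del_dir')
    @mock.patch('cloudinit.sources.DataSourceAzure.util.del_file')
    @mock.patch('cloudinit.sources.DataSourceAzure.os.path.isdir')
    @mock.patch('cloudinit.sources.DataSourceAzure.os.path.exists')
    def test_removes_both_files_and_directories(
            self, m_exists, m_isdir, m_del_file, m_del_dir):
        m_exists.return_value = True
        m_isdir.side_effect = [True, False]  # first path a dir, second a file
        dsaz.remove_ubuntu_network_config_scripts(
            paths=['/some/dir', '/some/file'])
        m_del_dir.assert_called_once_with('/some/dir')
        m_del_file.assert_called_once_with('/some/file')

That would also keep the log assertions independent of the tmp paths.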

> +        write_file(subfile, 'leafcontent')
> +        dsaz.remove_ubuntu_network_config_scripts(paths=[subdir, file1])
> +
> +        for path in (file1, subdir, subfile):
> +            self.assertFalse(os.path.exists(path),
> +                             'Found unremoved: %s' % path)
> +
> +        expected_logs = [
> +            'Removing Ubuntu extended network scripts because cloud-init'
> +            ' updates Azure network configuration on the following event:'
> +            ' System boot.',
> +            'Recursively deleting %s' % subdir,
> +            'Attempting to remove %s' % file1]
> +        for log in expected_logs:
> +            self.assertIn(log, self.logs.getvalue())
> +
> +    def test_remove_network_scripts_only_attemts_removal_if_path_exists(self):
> +        """Any files or directories absent are skipped without error."""
> +        dsaz.remove_ubuntu_network_config_scripts(paths=[
> +            '/not/a/dir/', '/not/a/file'])
> +        self.assertNotIn('/not/a', self.logs.getvalue())  # No delete logs
> +
> +    @mock.patch('cloudinit.sources.DataSourceAzure.os.path.exists')
> +    def test_remove_network_scripts_default_removes_stock_scripts(self,
> +                                                                  m_exists):
> +        """Azure's stock ubuntu image scripts and artifacts are removed."""
> +        # Report path absent on all to avoid delete operation
> +        m_exists.return_value = False
> +        dsaz.remove_ubuntu_network_config_scripts()
> +        calls = m_exists.call_args_list
> +        for path in dsaz.UBUNTU_EXTENDED_NETWORK_SCRIPTS:
> +            self.assertIn(mock.call(path), calls)
> +
> +
>  # vi: ts=4 expandtab


-- 
https://code.launchpad.net/~chad.smith/cloud-init/+git/cloud-init/+merge/348704
Your team cloud-init commiters is requested to review the proposed merge of ~chad.smith/cloud-init:feature/azure-network-on-boot into cloud-init:master.

