cloud-init-dev team mailing list archive

[Merge] ~vtqanh/cloud-init:ProvisioningTelemetry into cloud-init:master

Anh Vo (MSFT) has proposed merging ~vtqanh/cloud-init:ProvisioningTelemetry into cloud-init:master.

Commit message:
Azure: Record boot timestamps, system information, and diagnostic events

Collect and record the following information through KVP:
 + timestamps related to kernel initialization and systemd activation
   of cloud-init services
 + system information including cloud-init version, kernel version,
   distro version, and Python version
 + diagnostic events for the most common provisioning errors, such as an
   empty DHCP lease or a corrupted ovf-env.xml (the reporting path is
   sketched below)

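Roughly, a diagnostic message reaches KVP as follows. This is a minimal,
illustrative sketch that mirrors the report_diagnostic_event() helper added
in cloudinit/sources/helpers/azure.py in this branch; it assumes a
KVP-capable reporting handler (such as cloud-init's Hyper-V KVP handler) is
configured so that events.report_event() ends up writing to KVP:

    # Illustrative sketch only; mirrors the helper added in this branch.
    from cloudinit.reporting import events

    DIAGNOSTIC_EVENT_TYPE = 'diagnostic'

    def report_diagnostic_event(msg):
        """Wrap a message in a ReportingEvent and hand it to the
        configured reporting handlers (KVP on Azure)."""
        evt = events.ReportingEvent(
            DIAGNOSTIC_EVENT_TYPE, 'diagnostic message',
            msg, events.DEFAULT_EVENT_ORIGIN)
        events.report_event(evt)

    # For example, on a common provisioning failure:
    report_diagnostic_event("No Azure endpoint from dhcp options")

The boot timestamps and system information take the same route, emitted as
'boot-telemetry' and 'system-info' events respectively.
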
Requested reviews:
  cloud-init commiters (cloud-init-dev)

For more details, see:
https://code.launchpad.net/~vtqanh/cloud-init/+git/cloud-init/+merge/369785
-- 
Your team cloud-init commiters is requested to review the proposed merge of ~vtqanh/cloud-init:ProvisioningTelemetry into cloud-init:master.
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
index d2fad9b..f83b27e 100755
--- a/cloudinit/sources/DataSourceAzure.py
+++ b/cloudinit/sources/DataSourceAzure.py
@@ -28,7 +28,10 @@ from cloudinit.reporting import events
 
 from cloudinit.sources.helpers.azure import (azure_ds_reporter,
                                              azure_ds_telemetry_reporter,
-                                             get_metadata_from_fabric)
+                                             get_metadata_from_fabric,
+                                             get_boot_telemetry,
+                                             get_system_info,
+                                             report_diagnostic_event)
 
 LOG = logging.getLogger(__name__)
 
@@ -354,7 +357,7 @@ class DataSourceAzure(sources.DataSource):
                 bname = str(pk['fingerprint'] + ".crt")
                 fp_files += [os.path.join(ddir, bname)]
                 LOG.debug("ssh authentication: "
-                          "using fingerprint from fabirc")
+                          "using fingerprint from fabric")
 
         with events.ReportEventStack(
                 name="waiting-for-ssh-public-key",
@@ -419,12 +422,17 @@ class DataSourceAzure(sources.DataSource):
                     ret = load_azure_ds_dir(cdev)
 
             except NonAzureDataSource:
+                report_diagnostic_event(
+                    "Did not find Azure data source in %s" % cdev)
                 continue
             except BrokenAzureDataSource as exc:
                 msg = 'BrokenAzureDataSource: %s' % exc
+                report_diagnostic_event(msg)
                 raise sources.InvalidMetaDataException(msg)
             except util.MountFailedError:
-                LOG.warning("%s was not mountable", cdev)
+                msg = '%s was not mountable' % cdev
+                report_diagnostic_event(msg)
+                LOG.warning(msg)
                 continue
 
             perform_reprovision = reprovision or self._should_reprovision(ret)
@@ -432,6 +440,7 @@ class DataSourceAzure(sources.DataSource):
                 if util.is_FreeBSD():
                     msg = "Free BSD is not supported for PPS VMs"
                     LOG.error(msg)
+                    report_diagnostic_event(msg)
                     raise sources.InvalidMetaDataException(msg)
                 ret = self._reprovision()
             imds_md = get_metadata_from_imds(
@@ -450,7 +459,9 @@ class DataSourceAzure(sources.DataSource):
             break
 
         if not found:
-            raise sources.InvalidMetaDataException('No Azure metadata found')
+            msg = 'No Azure metadata found'
+            report_diagnostic_event(msg)
+            raise sources.InvalidMetaDataException(msg)
 
         if found == ddir:
             LOG.debug("using files cached in %s", ddir)
@@ -469,9 +480,13 @@ class DataSourceAzure(sources.DataSource):
                 self._report_ready(lease=self._ephemeral_dhcp_ctx.lease)
                 self._ephemeral_dhcp_ctx.clean_network()  # Teardown ephemeral
             else:
-                with EphemeralDHCPv4() as lease:
-                    self._report_ready(lease=lease)
-
+                try:
+                    with EphemeralDHCPv4() as lease:
+                        self._report_ready(lease=lease)
+                except Exception as e:
+                    report_diagnostic_event(
+                        "exception while reporting ready: %s" % e)
+                    raise
         return crawled_data
 
     def _is_platform_viable(self):
@@ -493,6 +508,16 @@ class DataSourceAzure(sources.DataSource):
         if not self._is_platform_viable():
             return False
         try:
+            get_boot_telemetry(self.distro)
+        except Exception as e:
+            LOG.warning("Failed to get boot telemetry: %s", e)
+
+        try:
+            get_system_info()
+        except Exception as e:
+            LOG.warning("Failed to get system information: %s", e)
+
+        try:
             crawled_data = util.log_time(
                         logfunc=LOG.debug, msg='Crawl of metadata service',
                         func=self.crawl_metadata)
@@ -554,24 +579,50 @@ class DataSourceAzure(sources.DataSource):
 
         def exc_cb(msg, exception):
             if isinstance(exception, UrlError) and exception.code == 404:
+                if self.imds_poll_counter == self.imds_logging_threshold:
+                    # Reducing the logging frequency as we are polling IMDS
+                    self.imds_logging_threshold *= 2
+                    LOG.debug("Call to IMDS with arguments %s failed "
+                              "with status code %s after %s retries",
+                              msg, exception.code, self.imds_poll_counter)
+                self.imds_poll_counter += 1
                 return True
+
             # If we get an exception while trying to call IMDS, we
             # call DHCP and setup the ephemeral network to acquire the new IP.
+            LOG.debug("Call to IMDS with arguments %s failed with "
+                      "status code %s", msg, exception.code)
+            report_diagnostic_event("polling IMDS failed with exception %s"
+                                    % exception.code)
             return False
 
         LOG.debug("Wait for vnetswitch to happen")
+        self.imds_logging_threshold = 1
+        self.imds_poll_counter = 1
+        dhcp_attempts = 0
+        vnet_switched = False
         while True:
             try:
-                # Save our EphemeralDHCPv4 context so we avoid repeated dhcp
-                self._ephemeral_dhcp_ctx = EphemeralDHCPv4()
-                lease = self._ephemeral_dhcp_ctx.obtain_lease()
+                # Save our EphemeralDHCPv4 context to avoid repeated dhcp
+                with events.ReportEventStack(
+                        name="obtain-dhcp-lease",
+                        description="obtain dhcp lease",
+                        parent=azure_ds_reporter):
+                    self._ephemeral_dhcp_ctx = EphemeralDHCPv4()
+                    lease = self._ephemeral_dhcp_ctx.obtain_lease()
+
+                if vnet_switched:
+                    dhcp_attempts += 1
                 if report_ready:
                     try:
                         nl_sock = netlink.create_bound_netlink_socket()
                     except netlink.NetlinkCreateSocketError as e:
+                        report_diagnostic_event(e)
                         LOG.warning(e)
                         self._ephemeral_dhcp_ctx.clean_network()
-                        return
+                        return_val = None
+                        break
+
                     path = REPORTED_READY_MARKER_FILE
                     LOG.info(
                         "Creating a marker file to report ready: %s", path)
@@ -579,17 +630,34 @@ class DataSourceAzure(sources.DataSource):
                         pid=os.getpid(), time=time()))
                     self._report_ready(lease=lease)
                     report_ready = False
-                    try:
-                        netlink.wait_for_media_disconnect_connect(
-                            nl_sock, lease['interface'])
-                    except AssertionError as error:
-                        LOG.error(error)
-                        return
+
+                    with events.ReportEventStack(
+                            name="wait-for-media-disconnect-connect",
+                            description="wait for vnet switch",
+                            parent=azure_ds_reporter):
+                        try:
+                            netlink.wait_for_media_disconnect_connect(
+                                nl_sock, lease['interface'])
+                        except AssertionError as error:
+                            report_diagnostic_event(error)
+                            LOG.error(error)
+                            return_val = None
+                            break
+
+                    vnet_switched = True
                     self._ephemeral_dhcp_ctx.clean_network()
                 else:
-                    return readurl(url, timeout=IMDS_TIMEOUT_IN_SECONDS,
-                                   headers=headers, exception_cb=exc_cb,
-                                   infinite=True, log_req_resp=False).contents
+                    with events.ReportEventStack(
+                            name="get-reprovision-data-from-imds",
+                            description="get reprovision data from imds",
+                            parent=azure_ds_reporter):
+                        return_val = readurl(url,
+                                             timeout=IMDS_TIMEOUT_IN_SECONDS,
+                                             headers=headers,
+                                             exception_cb=exc_cb,
+                                             infinite=True,
+                                             log_req_resp=False).contents
+                    break
             except UrlError:
                 # Teardown our EphemeralDHCPv4 context on failure as we retry
                 self._ephemeral_dhcp_ctx.clean_network()
@@ -598,6 +666,14 @@ class DataSourceAzure(sources.DataSource):
                 if nl_sock:
                     nl_sock.close()
 
+        if vnet_switched:
+            report_diagnostic_event("attempted dhcp %d times after reuse" %
+                                    dhcp_attempts)
+            report_diagnostic_event("polled imds %d times after reuse" %
+                                    self.imds_poll_counter)
+
+        return return_val
+
     @azure_ds_telemetry_reporter
     def _report_ready(self, lease):
         """Tells the fabric provisioning has completed """
@@ -666,7 +742,9 @@ class DataSourceAzure(sources.DataSource):
                   self.ds_cfg['agent_command'])
         try:
             fabric_data = metadata_func()
-        except Exception:
+        except Exception as e:
+            report_diagnostic_event(
+                "Error communicating with Azure fabric: %s" % e)
             LOG.warning(
                 "Error communicating with Azure fabric; You may experience."
                 "connectivity issues.", exc_info=True)
@@ -1027,7 +1105,9 @@ def read_azure_ovf(contents):
     try:
         dom = minidom.parseString(contents)
     except Exception as e:
-        raise BrokenAzureDataSource("Invalid ovf-env.xml: %s" % e)
+        error_str = "Invalid ovf-env.xml: %s" % e
+        report_diagnostic_event(error_str)
+        raise BrokenAzureDataSource(error_str)
 
     results = find_child(dom.documentElement,
                          lambda n: n.localName == "ProvisioningSection")
@@ -1294,8 +1374,12 @@ def get_metadata_from_imds(fallback_nic, retries):
     if net.is_up(fallback_nic):
         return util.log_time(**kwargs)
     else:
-        with EphemeralDHCPv4(fallback_nic):
-            return util.log_time(**kwargs)
+        try:
+            with EphemeralDHCPv4(fallback_nic):
+                return util.log_time(**kwargs)
+        except Exception as e:
+            report_diagnostic_event("exception while getting metadata: %s" % e)
+            raise
 
 
 @azure_ds_telemetry_reporter
@@ -1308,11 +1392,14 @@ def _get_metadata_from_imds(retries):
             url, timeout=IMDS_TIMEOUT_IN_SECONDS, headers=headers,
             retries=retries, exception_cb=retry_on_url_exc)
     except Exception as e:
-        LOG.debug('Ignoring IMDS instance metadata: %s', e)
+        msg = 'Ignoring IMDS instance metadata: %s' % e
+        report_diagnostic_event(msg)
+        LOG.debug(msg)
         return {}
     try:
         return util.load_json(str(response))
-    except json.decoder.JSONDecodeError:
+    except json.decoder.JSONDecodeError as e:
+        report_diagnostic_event('non-json imds response: %s' % e)
         LOG.warning(
             'Ignoring non-json IMDS instance metadata: %s', str(response))
     return {}
@@ -1355,21 +1442,18 @@ def maybe_remove_ubuntu_network_config_scripts(paths=None):
                 util.del_file(path)
 
 
+@azure_ds_telemetry_reporter
 def _is_platform_viable(seed_dir):
-    with events.ReportEventStack(
-                name="check-platform-viability",
-                description="found azure asset tag",
-                parent=azure_ds_reporter) as evt:
-
-        """Check platform environment to report if this datasource may run."""
-        asset_tag = util.read_dmi_data('chassis-asset-tag')
-        if asset_tag == AZURE_CHASSIS_ASSET_TAG:
-            return True
-        LOG.debug("Non-Azure DMI asset tag '%s' discovered.", asset_tag)
-        evt.description = "Non-Azure DMI asset tag '%s' discovered.", asset_tag
-        if os.path.exists(os.path.join(seed_dir, 'ovf-env.xml')):
-            return True
-        return False
+    """Check platform environment to report if this datasource may run."""
+    asset_tag = util.read_dmi_data('chassis-asset-tag')
+    if asset_tag == AZURE_CHASSIS_ASSET_TAG:
+        return True
+    msg = "Non-Azure DMI asset tag '%s' discovered." % asset_tag
+    LOG.debug(msg)
+    report_diagnostic_event(msg)
+    if os.path.exists(os.path.join(seed_dir, 'ovf-env.xml')):
+        return True
+    return False
 
 
 class BrokenAzureDataSource(Exception):
diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py
index 82c4c8c..3e7da49 100755
--- a/cloudinit/sources/helpers/azure.py
+++ b/cloudinit/sources/helpers/azure.py
@@ -16,7 +16,9 @@ from xml.etree import ElementTree
 
 from cloudinit import url_helper
 from cloudinit import util
+from cloudinit import version
 from cloudinit.reporting import events
+from datetime import datetime
 
 LOG = logging.getLogger(__name__)
 
@@ -24,6 +26,10 @@ LOG = logging.getLogger(__name__)
 # value is applied if the endpoint can't be found within a lease file
 DEFAULT_WIRESERVER_ENDPOINT = "a8:3f:81:10"
 
+BOOT_EVENT_TYPE = 'boot-telemetry'
+SYSTEMINFO_EVENT_TYPE = 'system-info'
+DIAGNOSTIC_EVENT_TYPE = 'diagnostic'
+
 azure_ds_reporter = events.ReportEventStack(
     name="azure-ds",
     description="initialize reporter for azure ds",
@@ -40,6 +46,73 @@ def azure_ds_telemetry_reporter(func):
     return impl
 
 
+@azure_ds_telemetry_reporter
+def get_boot_telemetry(distro):
+    """Report timestamps related to kernel initialization and systemd
+       activation of cloud-init"""
+    if not distro.uses_systemd():
+        LOG.debug("%s not using systemd, skipping boot telemetry collection",
+                  distro.name)
+        return False
+    else:
+        LOG.debug("Collecting boot telemetry for %s", distro.name)
+
+        kernel_start = float(time.time()) - float(util.uptime())
+        out, err = util.subp(['/bin/systemctl',
+                              'show', '-p',
+                              'UserspaceTimestampMonotonic'],
+                             capture=True)
+    if len(err):
+        LOG.debug("Gathering boot telemetry output %s:%s", out, err)
+        return False
+
+    tsm = out.split("=")[1]
+    user_start = kernel_start + (float(tsm) / 1000000)
+
+    out, err = util.subp(['/bin/systemctl', 'show',
+                          'cloud-init-local', '-p',
+                          'InactiveExitTimestampMonotonic'],
+                         capture=True)
+    if len(err):
+        LOG.debug("Gathering boot telemetry output %s:%s", out, err)
+        return False
+
+    tsm = out.split("=")[1]
+    cloudinit_activation = kernel_start + (float(tsm) / 1000000)
+    evt = events.ReportingEvent(
+        BOOT_EVENT_TYPE, 'boot-telemetry',
+        "kernel_start=%s user_start=%s cloudinit_activation=%s" %
+        (datetime.utcfromtimestamp(kernel_start).isoformat() + 'Z',
+         datetime.utcfromtimestamp(user_start).isoformat() + 'Z',
+         datetime.utcfromtimestamp(cloudinit_activation).isoformat() + 'Z'),
+        events.DEFAULT_EVENT_ORIGIN)
+    events.report_event(evt)
+    return True
+
+
+@azure_ds_telemetry_reporter
+def get_system_info():
+    """Collect and report system information"""
+    info = util.system_info()
+    evt = events.ReportingEvent(
+        SYSTEMINFO_EVENT_TYPE, 'system information',
+        "cloudinit_version=%s, kernel_version=%s, variant=%s, "
+        "distro_name=%s, distro_version=%s, flavor=%s, "
+        "python_version=%s" %
+        (version.version_string(), info['release'], info['variant'],
+         info['dist'][0], info['dist'][1], info['dist'][2],
+         info['python']), events.DEFAULT_EVENT_ORIGIN)
+    events.report_event(evt)
+
+
+def report_diagnostic_event(msg):
+    """Report a diagnostic event"""
+    evt = events.ReportingEvent(
+        DIAGNOSTIC_EVENT_TYPE, 'diagnostic message',
+        msg, events.DEFAULT_EVENT_ORIGIN)
+    events.report_event(evt)
+
+
 @contextmanager
 def cd(newdir):
     prevdir = os.getcwd()
@@ -360,16 +433,19 @@ class WALinuxAgentShim(object):
             value = dhcp245
             LOG.debug("Using Azure Endpoint from dhcp options")
         if value is None:
+            report_diagnostic_event("No Azure endpoint from dhcp options")
             LOG.debug('Finding Azure endpoint from networkd...')
             value = WALinuxAgentShim._networkd_get_value_from_leases()
         if value is None:
             # Option-245 stored in /run/cloud-init/dhclient.hooks/<ifc>.json
             # a dhclient exit hook that calls cloud-init-dhclient-hook
+            report_diagnostic_event("No Azure endpoint from networkd")
             LOG.debug('Finding Azure endpoint from hook json...')
             dhcp_options = WALinuxAgentShim._load_dhclient_json()
             value = WALinuxAgentShim._get_value_from_dhcpoptions(dhcp_options)
         if value is None:
             # Fallback and check the leases file if unsuccessful
+            report_diagnostic_event("No Azure endpoint from dhclient logs")
             LOG.debug("Unable to find endpoint in dhclient logs. "
                       " Falling back to check lease files")
             if fallback_lease_file is None:
@@ -381,11 +457,15 @@ class WALinuxAgentShim(object):
                 value = WALinuxAgentShim._get_value_from_leases_file(
                     fallback_lease_file)
         if value is None:
-            LOG.warning("No lease found; using default endpoint")
+            msg = "No lease found; using default endpoint"
+            report_diagnostic_event(msg)
+            LOG.warning(msg)
             value = DEFAULT_WIRESERVER_ENDPOINT
 
         endpoint_ip_address = WALinuxAgentShim.get_ip_from_lease_value(value)
-        LOG.debug('Azure endpoint found at %s', endpoint_ip_address)
+        msg = 'Azure endpoint found at %s' % endpoint_ip_address
+        report_diagnostic_event(msg)
+        LOG.debug(msg)
         return endpoint_ip_address
 
     @azure_ds_telemetry_reporter
@@ -399,16 +479,19 @@ class WALinuxAgentShim(object):
             try:
                 response = http_client.get(
                     'http://{0}/machine/?comp=goalstate'.format(self.endpoint))
-            except Exception:
+            except Exception as e:
                 if attempts < 10:
                     time.sleep(attempts + 1)
                 else:
+                    report_diagnostic_event(
+                        "failed to register with Azure: %s" % e)
                     raise
             else:
                 break
             attempts += 1
         LOG.debug('Successfully fetched GoalState XML.')
         goal_state = GoalState(response.contents, http_client)
+        report_diagnostic_event("container_id %s" % goal_state.container_id)
         ssh_keys = []
         if goal_state.certificates_xml is not None and pubkey_info is not None:
             LOG.debug('Certificate XML found; parsing out public keys.')
@@ -449,11 +532,20 @@ class WALinuxAgentShim(object):
             container_id=goal_state.container_id,
             instance_id=goal_state.instance_id,
         )
-        http_client.post(
-            "http://{0}/machine?comp=health".format(self.endpoint),
-            data=document,
-            extra_headers={'Content-Type': 'text/xml; charset=utf-8'},
-        )
+        # The host will collect KVPs when cloud-init reports ready.
+        # Some KVPs might still be in the queue. We yield the scheduler
+        # to make sure we process all KVPs up to this point.
+        time.sleep(0)
+        try:
+            http_client.post(
+                "http://{0}/machine?comp=health".format(self.endpoint),
+                data=document,
+                extra_headers={'Content-Type': 'text/xml; charset=utf-8'},
+            )
+        except Exception as e:
+            report_diagnostic_event("exception while reporting ready: %s" % e)
+            raise
+
         LOG.info('Reported ready to Azure fabric.')
 
 
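For reviewers, the boot timestamps recorded by get_boot_telemetry() above are
wall-clock times reconstructed from systemd's monotonic clocks. A minimal
sketch of that arithmetic, with hypothetical values substituted for the
systemctl queries:

    # Hypothetical values standing in for the systemctl output parsed in
    # get_boot_telemetry(); the arithmetic matches the helper above.
    import time
    from cloudinit import util

    # wall-clock moment the kernel started
    kernel_start = float(time.time()) - float(util.uptime())

    # systemd monotonic timestamps are microseconds since kernel start
    tsm = "2500000"    # hypothetical UserspaceTimestampMonotonic
    user_start = kernel_start + (float(tsm) / 1000000)

    tsm = "8100000"    # hypothetical InactiveExitTimestampMonotonic
                       # of cloud-init-local
    cloudinit_activation = kernel_start + (float(tsm) / 1000000)

These values are then formatted as UTC ISO-8601 strings and emitted in a
single 'boot-telemetry' event.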
