cloud-init-dev team mailing list archive
-
cloud-init-dev team
-
Mailing list archive
-
Message #06382
[Merge] ~vtqanh/cloud-init:ProvisioningTelemetry into cloud-init:master
Anh Vo (MSFT) has proposed merging ~vtqanh/cloud-init:ProvisioningTelemetry into cloud-init:master.
Commit message:
Azure: Record boot timestamps, system information, and diagnostic events
Collect and record the following information through KVP:
+ timestamps related to kernel initialization and systemd activation
of cloud-init services
+ system information including cloud-init version, kernel version,
distro version, and python version
+ diagnostic events for the most common provisioning error issues
such as empty dhcp lease, corrupted ovf-env.xml, etc.
Requested reviews:
cloud-init commiters (cloud-init-dev)
For more details, see:
https://code.launchpad.net/~vtqanh/cloud-init/+git/cloud-init/+merge/369785
--
Your team cloud-init commiters is requested to review the proposed merge of ~vtqanh/cloud-init:ProvisioningTelemetry into cloud-init:master.
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
index d2fad9b..f83b27e 100755
--- a/cloudinit/sources/DataSourceAzure.py
+++ b/cloudinit/sources/DataSourceAzure.py
@@ -28,7 +28,10 @@ from cloudinit.reporting import events
from cloudinit.sources.helpers.azure import (azure_ds_reporter,
azure_ds_telemetry_reporter,
- get_metadata_from_fabric)
+ get_metadata_from_fabric,
+ get_boot_telemetry,
+ get_system_info,
+ report_diagnostic_event)
LOG = logging.getLogger(__name__)
@@ -354,7 +357,7 @@ class DataSourceAzure(sources.DataSource):
bname = str(pk['fingerprint'] + ".crt")
fp_files += [os.path.join(ddir, bname)]
LOG.debug("ssh authentication: "
- "using fingerprint from fabirc")
+ "using fingerprint from fabric")
with events.ReportEventStack(
name="waiting-for-ssh-public-key",
@@ -419,12 +422,17 @@ class DataSourceAzure(sources.DataSource):
ret = load_azure_ds_dir(cdev)
except NonAzureDataSource:
+ report_diagnostic_event(
+ "Did not find Azure data source in %s" % cdev)
continue
except BrokenAzureDataSource as exc:
msg = 'BrokenAzureDataSource: %s' % exc
+ report_diagnostic_event(msg)
raise sources.InvalidMetaDataException(msg)
except util.MountFailedError:
- LOG.warning("%s was not mountable", cdev)
+ msg = '%s was not mountable' % cdev
+ report_diagnostic_event(msg)
+ LOG.warning(msg)
continue
perform_reprovision = reprovision or self._should_reprovision(ret)
@@ -432,6 +440,7 @@ class DataSourceAzure(sources.DataSource):
if util.is_FreeBSD():
msg = "Free BSD is not supported for PPS VMs"
LOG.error(msg)
+ report_diagnostic_event(msg)
raise sources.InvalidMetaDataException(msg)
ret = self._reprovision()
imds_md = get_metadata_from_imds(
@@ -450,7 +459,9 @@ class DataSourceAzure(sources.DataSource):
break
if not found:
- raise sources.InvalidMetaDataException('No Azure metadata found')
+ msg = 'No Azure metadata found'
+ report_diagnostic_event(msg)
+ raise sources.InvalidMetaDataException(msg)
if found == ddir:
LOG.debug("using files cached in %s", ddir)
@@ -469,9 +480,13 @@ class DataSourceAzure(sources.DataSource):
self._report_ready(lease=self._ephemeral_dhcp_ctx.lease)
self._ephemeral_dhcp_ctx.clean_network() # Teardown ephemeral
else:
- with EphemeralDHCPv4() as lease:
- self._report_ready(lease=lease)
-
+ try:
+ with EphemeralDHCPv4() as lease:
+ self._report_ready(lease=lease)
+ except Exception as e:
+ report_diagnostic_event(
+ "exception while reporting ready: %s" % e)
+ raise
return crawled_data
def _is_platform_viable(self):
@@ -493,6 +508,16 @@ class DataSourceAzure(sources.DataSource):
if not self._is_platform_viable():
return False
try:
+ get_boot_telemetry(self.distro)
+ except Exception as e:
+ LOG.warning("Failed to get boot telemetry: %s", e)
+
+ try:
+ get_system_info()
+ except Exception as e:
+ LOG.warning("Failed to get system information: %s", e)
+
+ try:
crawled_data = util.log_time(
logfunc=LOG.debug, msg='Crawl of metadata service',
func=self.crawl_metadata)
@@ -554,24 +579,50 @@ class DataSourceAzure(sources.DataSource):
def exc_cb(msg, exception):
if isinstance(exception, UrlError) and exception.code == 404:
+ if self.imds_poll_counter == self.imds_logging_threshold:
+ # Reducing the logging frequency as we are polling IMDS
+ self.imds_logging_threshold *= 2
+ LOG.debug("Call to IMDS with arguments %s failed "
+ "with status code %s after %s retries",
+ msg, exception.code, self.imds_poll_counter)
+ self.imds_poll_counter += 1
return True
+
# If we get an exception while trying to call IMDS, we
# call DHCP and setup the ephemeral network to acquire the new IP.
+ LOG.debug("Call to IMDS with arguments %s failed with "
+ "status code %s", msg, exception.code)
+ report_diagnostic_event("polling IMDS failed with exception %s"
+ % exception.code)
return False
LOG.debug("Wait for vnetswitch to happen")
+ self.imds_logging_threshold = 1
+ self.imds_poll_counter = 1
+ dhcp_attempts = 0
+ vnet_switched = False
while True:
try:
- # Save our EphemeralDHCPv4 context so we avoid repeated dhcp
- self._ephemeral_dhcp_ctx = EphemeralDHCPv4()
- lease = self._ephemeral_dhcp_ctx.obtain_lease()
+ # Save our EphemeralDHCPv4 context to avoid repeated dhcp
+ with events.ReportEventStack(
+ name="obtain-dhcp-lease",
+ description="obtain dhcp lease",
+ parent=azure_ds_reporter):
+ self._ephemeral_dhcp_ctx = EphemeralDHCPv4()
+ lease = self._ephemeral_dhcp_ctx.obtain_lease()
+
+ if vnet_switched:
+ dhcp_attempts += 1
if report_ready:
try:
nl_sock = netlink.create_bound_netlink_socket()
except netlink.NetlinkCreateSocketError as e:
+ report_diagnostic_event(e)
LOG.warning(e)
self._ephemeral_dhcp_ctx.clean_network()
- return
+ return_val = None
+ break
+
path = REPORTED_READY_MARKER_FILE
LOG.info(
"Creating a marker file to report ready: %s", path)
@@ -579,17 +630,34 @@ class DataSourceAzure(sources.DataSource):
pid=os.getpid(), time=time()))
self._report_ready(lease=lease)
report_ready = False
- try:
- netlink.wait_for_media_disconnect_connect(
- nl_sock, lease['interface'])
- except AssertionError as error:
- LOG.error(error)
- return
+
+ with events.ReportEventStack(
+ name="wait-for-media-disconnect-connect",
+ description="wait for vnet switch",
+ parent=azure_ds_reporter):
+ try:
+ netlink.wait_for_media_disconnect_connect(
+ nl_sock, lease['interface'])
+ except AssertionError as error:
+ report_diagnostic_event(error)
+ LOG.error(error)
+ return_val = None
+ break
+
+ vnet_switched = True
self._ephemeral_dhcp_ctx.clean_network()
else:
- return readurl(url, timeout=IMDS_TIMEOUT_IN_SECONDS,
- headers=headers, exception_cb=exc_cb,
- infinite=True, log_req_resp=False).contents
+ with events.ReportEventStack(
+ name="get-reprovision-data-from-imds",
+ description="get reprovision data from imds",
+ parent=azure_ds_reporter):
+ return_val = readurl(url,
+ timeout=IMDS_TIMEOUT_IN_SECONDS,
+ headers=headers,
+ exception_cb=exc_cb,
+ infinite=True,
+ log_req_resp=False).contents
+ break
except UrlError:
# Teardown our EphemeralDHCPv4 context on failure as we retry
self._ephemeral_dhcp_ctx.clean_network()
@@ -598,6 +666,14 @@ class DataSourceAzure(sources.DataSource):
if nl_sock:
nl_sock.close()
+ if vnet_switched:
+ report_diagnostic_event("attempted dhcp %d times after reuse" %
+ dhcp_attempts)
+ report_diagnostic_event("polled imds %d times after reuse" %
+ self.imds_poll_counter)
+
+ return return_val
+
@azure_ds_telemetry_reporter
def _report_ready(self, lease):
"""Tells the fabric provisioning has completed """
@@ -666,7 +742,9 @@ class DataSourceAzure(sources.DataSource):
self.ds_cfg['agent_command'])
try:
fabric_data = metadata_func()
- except Exception:
+ except Exception as e:
+ report_diagnostic_event(
+ "Error communicating with Azure fabric: %s" % e)
LOG.warning(
"Error communicating with Azure fabric; You may experience."
"connectivity issues.", exc_info=True)
@@ -1027,7 +1105,9 @@ def read_azure_ovf(contents):
try:
dom = minidom.parseString(contents)
except Exception as e:
- raise BrokenAzureDataSource("Invalid ovf-env.xml: %s" % e)
+ error_str = "Invalid ovf-env.xml: %s" % e
+ report_diagnostic_event(error_str)
+ raise BrokenAzureDataSource(error_str)
results = find_child(dom.documentElement,
lambda n: n.localName == "ProvisioningSection")
@@ -1294,8 +1374,12 @@ def get_metadata_from_imds(fallback_nic, retries):
if net.is_up(fallback_nic):
return util.log_time(**kwargs)
else:
- with EphemeralDHCPv4(fallback_nic):
- return util.log_time(**kwargs)
+ try:
+ with EphemeralDHCPv4(fallback_nic):
+ return util.log_time(**kwargs)
+ except Exception as e:
+ report_diagnostic_event("exception while getting metadata: %s" % e)
+ raise
@azure_ds_telemetry_reporter
@@ -1308,11 +1392,14 @@ def _get_metadata_from_imds(retries):
url, timeout=IMDS_TIMEOUT_IN_SECONDS, headers=headers,
retries=retries, exception_cb=retry_on_url_exc)
except Exception as e:
- LOG.debug('Ignoring IMDS instance metadata: %s', e)
+ msg = 'Ignoring IMDS instance metadata: %s' % e
+ report_diagnostic_event(msg)
+ LOG.debug(msg)
return {}
try:
return util.load_json(str(response))
- except json.decoder.JSONDecodeError:
+ except json.decoder.JSONDecodeError as e:
+ report_diagnostic_event('non-json imds response' % e)
LOG.warning(
'Ignoring non-json IMDS instance metadata: %s', str(response))
return {}
@@ -1355,21 +1442,18 @@ def maybe_remove_ubuntu_network_config_scripts(paths=None):
util.del_file(path)
+@azure_ds_telemetry_reporter
def _is_platform_viable(seed_dir):
- with events.ReportEventStack(
- name="check-platform-viability",
- description="found azure asset tag",
- parent=azure_ds_reporter) as evt:
-
- """Check platform environment to report if this datasource may run."""
- asset_tag = util.read_dmi_data('chassis-asset-tag')
- if asset_tag == AZURE_CHASSIS_ASSET_TAG:
- return True
- LOG.debug("Non-Azure DMI asset tag '%s' discovered.", asset_tag)
- evt.description = "Non-Azure DMI asset tag '%s' discovered.", asset_tag
- if os.path.exists(os.path.join(seed_dir, 'ovf-env.xml')):
- return True
- return False
+ """Check platform environment to report if this datasource may run."""
+ asset_tag = util.read_dmi_data('chassis-asset-tag')
+ if asset_tag == AZURE_CHASSIS_ASSET_TAG:
+ return True
+ msg = "Non-Azure DMI asset tag '%s' discovered." % asset_tag
+ LOG.debug(msg)
+ report_diagnostic_event(msg)
+ if os.path.exists(os.path.join(seed_dir, 'ovf-env.xml')):
+ return True
+ return False
class BrokenAzureDataSource(Exception):
diff --git a/cloudinit/sources/helpers/azure.py b/cloudinit/sources/helpers/azure.py
index 82c4c8c..3e7da49 100755
--- a/cloudinit/sources/helpers/azure.py
+++ b/cloudinit/sources/helpers/azure.py
@@ -16,7 +16,9 @@ from xml.etree import ElementTree
from cloudinit import url_helper
from cloudinit import util
+from cloudinit import version
from cloudinit.reporting import events
+from datetime import datetime
LOG = logging.getLogger(__name__)
@@ -24,6 +26,10 @@ LOG = logging.getLogger(__name__)
# value is applied if the endpoint can't be found within a lease file
DEFAULT_WIRESERVER_ENDPOINT = "a8:3f:81:10"
+BOOT_EVENT_TYPE = 'boot-telemetry'
+SYSTEMINFO_EVENT_TYPE = 'system-info'
+DIAGNOSTIC_EVENT_TYPE = 'diagnostic'
+
azure_ds_reporter = events.ReportEventStack(
name="azure-ds",
description="initialize reporter for azure ds",
@@ -40,6 +46,73 @@ def azure_ds_telemetry_reporter(func):
return impl
+@azure_ds_telemetry_reporter
+def get_boot_telemetry(distro):
+ """Report timestamps related to kernel initialization and systemd
+ activation of cloud-init"""
+ if not distro.uses_systemd():
+ LOG.debug("%s not using systemd, skipping boot telemetry collection",
+ distro.name)
+ return False
+ else:
+ LOG.debug("Collecting boot telemetry for %s", distro.name)
+
+ kernel_start = float(time.time()) - float(util.uptime())
+ out, err = util.subp(['/bin/systemctl',
+ 'show', '-p',
+ 'UserspaceTimestampMonotonic'],
+ capture=True)
+ if len(err):
+ LOG.debug("Gathering boot telemetry output %s:%s", out, err)
+ return False
+
+ tsm = out.split("=")[1]
+ user_start = kernel_start + (float(tsm) / 1000000)
+
+ out, err = util.subp(['/bin/systemctl', 'show',
+ 'cloud-init-local', '-p',
+ 'InactiveExitTimestampMonotonic'],
+ capture=True)
+ if len(err):
+ LOG.debug("Gathering boot telemetry output %s:%s", out, err)
+ return False
+
+ tsm = out.split("=")[1]
+ cloudinit_activation = kernel_start + (float(tsm) / 1000000)
+ evt = events.ReportingEvent(
+ BOOT_EVENT_TYPE, 'boot-telemetry',
+ "kernel_start=%s user_start=%s cloudinit_activation=%s" %
+ (datetime.utcfromtimestamp(kernel_start).isoformat() + 'Z',
+ datetime.utcfromtimestamp(user_start).isoformat() + 'Z',
+ datetime.utcfromtimestamp(cloudinit_activation).isoformat() + 'Z'),
+ events.DEFAULT_EVENT_ORIGIN)
+ events.report_event(evt)
+ return True
+
+
+@azure_ds_telemetry_reporter
+def get_system_info():
+ """Collect and report system information"""
+ info = util.system_info()
+ evt = events.ReportingEvent(
+ SYSTEMINFO_EVENT_TYPE, 'system information',
+ "cloudinit_version=%s, kernel_version=%s, variant=%s, "
+ "distro_name=%s, distro_version=%s, flavor=%s, "
+ "python_version=%s" %
+ (version.version_string(), info['release'], info['variant'],
+ info['dist'][0], info['dist'][1], info['dist'][2],
+ info['python']), events.DEFAULT_EVENT_ORIGIN)
+ events.report_event(evt)
+
+
+def report_diagnostic_event(str):
+ """Report a diagnostic event"""
+ evt = events.ReportingEvent(
+ DIAGNOSTIC_EVENT_TYPE, 'diagnostic message',
+ str, events.DEFAULT_EVENT_ORIGIN)
+ events.report_event(evt)
+
+
@contextmanager
def cd(newdir):
prevdir = os.getcwd()
@@ -360,16 +433,19 @@ class WALinuxAgentShim(object):
value = dhcp245
LOG.debug("Using Azure Endpoint from dhcp options")
if value is None:
+ report_diagnostic_event("No Azure endpoint from dhcp options")
LOG.debug('Finding Azure endpoint from networkd...')
value = WALinuxAgentShim._networkd_get_value_from_leases()
if value is None:
# Option-245 stored in /run/cloud-init/dhclient.hooks/<ifc>.json
# a dhclient exit hook that calls cloud-init-dhclient-hook
+ report_diagnostic_event("No Azure endpoint from networkd")
LOG.debug('Finding Azure endpoint from hook json...')
dhcp_options = WALinuxAgentShim._load_dhclient_json()
value = WALinuxAgentShim._get_value_from_dhcpoptions(dhcp_options)
if value is None:
# Fallback and check the leases file if unsuccessful
+ report_diagnostic_event("No Azure endpoint from dhclient logs")
LOG.debug("Unable to find endpoint in dhclient logs. "
" Falling back to check lease files")
if fallback_lease_file is None:
@@ -381,11 +457,15 @@ class WALinuxAgentShim(object):
value = WALinuxAgentShim._get_value_from_leases_file(
fallback_lease_file)
if value is None:
- LOG.warning("No lease found; using default endpoint")
+ msg = "No lease found; using default endpoint"
+ report_diagnostic_event(msg)
+ LOG.warning(msg)
value = DEFAULT_WIRESERVER_ENDPOINT
endpoint_ip_address = WALinuxAgentShim.get_ip_from_lease_value(value)
- LOG.debug('Azure endpoint found at %s', endpoint_ip_address)
+ msg = 'Azure endpoint found at %s' % endpoint_ip_address
+ report_diagnostic_event(msg)
+ LOG.debug(msg)
return endpoint_ip_address
@azure_ds_telemetry_reporter
@@ -399,16 +479,19 @@ class WALinuxAgentShim(object):
try:
response = http_client.get(
'http://{0}/machine/?comp=goalstate'.format(self.endpoint))
- except Exception:
+ except Exception as e:
if attempts < 10:
time.sleep(attempts + 1)
else:
+ report_diagnostic_event(
+ "failed to register with Azure: %s" % e)
raise
else:
break
attempts += 1
LOG.debug('Successfully fetched GoalState XML.')
goal_state = GoalState(response.contents, http_client)
+ report_diagnostic_event("container_id %s" % goal_state.container_id)
ssh_keys = []
if goal_state.certificates_xml is not None and pubkey_info is not None:
LOG.debug('Certificate XML found; parsing out public keys.')
@@ -449,11 +532,20 @@ class WALinuxAgentShim(object):
container_id=goal_state.container_id,
instance_id=goal_state.instance_id,
)
- http_client.post(
- "http://{0}/machine?comp=health".format(self.endpoint),
- data=document,
- extra_headers={'Content-Type': 'text/xml; charset=utf-8'},
- )
+ # Host will collect kvps when cloud-init reports ready.
+ # some kvps might still be in the queue. We yield the scheduler
+ # to make sure we process all kvps up till this point.
+ time.sleep(0)
+ try:
+ http_client.post(
+ "http://{0}/machine?comp=health".format(self.endpoint),
+ data=document,
+ extra_headers={'Content-Type': 'text/xml; charset=utf-8'},
+ )
+ except Exception as e:
+ report_diagnostic_event("exception while reporting ready: %s" % e)
+ raise
+
LOG.info('Reported ready to Azure fabric.')
Follow ups