← Back to team overview

cloud-init-dev team mailing list archive

[Merge] ~chad.smith/cloud-init:bug/1800223-retry-imds-on-timeout into cloud-init:master

 

Chad Smith has proposed merging ~chad.smith/cloud-init:bug/1800223-retry-imds-on-timeout into cloud-init:master.

Commit message:
azure: retry imds polling on requests.Timeout

There is an infrequent race in which the booting instance hits the IMDS
service before it is fully available. This results in a
requests.ConnectTimeout being raised.
Azure's retry_callback logic now retries on both 404s and
requests.Timeout errors.

LP:1800223

Requested reviews:
  Server Team CI bot (server-team-bot): continuous-integration
  cloud-init commiters (cloud-init-dev)

For more details, see:
https://code.launchpad.net/~chad.smith/cloud-init/+git/cloud-init/+merge/358112
-- 
Your team cloud-init commiters is requested to review the proposed merge of ~chad.smith/cloud-init:bug/1800223-retry-imds-on-timeout into cloud-init:master.
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
index d0358e9..aaa705c 100644
--- a/cloudinit/sources/DataSourceAzure.py
+++ b/cloudinit/sources/DataSourceAzure.py
@@ -12,6 +12,7 @@ import json
 import os
 import os.path
 import re
+import requests
 from time import time
 from xml.dom import minidom
 import xml.etree.ElementTree as ET
@@ -514,7 +515,10 @@ class DataSourceAzure(sources.DataSource):
 
         def exc_cb(msg, exception):
             if isinstance(exception, UrlError) and exception.code == 404:
-                return True
+                return True  # Continue retries
+            cause = exception.cause
+            if cause and isinstance(cause, requests.Timeout):
+                return True  # Continue retries
             # If we get an exception while trying to call IMDS, we
             # call DHCP and setup the ephemeral network to acquire the new IP.
             return False
@@ -1170,8 +1174,12 @@ def get_metadata_from_imds(fallback_nic, retries):
 def _get_metadata_from_imds(retries):
 
     def retry_on_url_error(msg, exception):
-        if isinstance(exception, UrlError) and exception.code == 404:
-            return True  # Continue retries
+        if isinstance(exception, UrlError):
+            if exception.code == 404:
+                return True  # Continue retries
+            cause = exception.cause
+            if cause and isinstance(cause, requests.Timeout):
+                return True  # Continue retries
         return False  # Stop retries on all other exceptions
 
     url = IMDS_URL + "instance?api-version=2017-12-01"
diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py
index cd6e7e7..929aa2e 100644
--- a/tests/unittests/test_datasource/test_azure.py
+++ b/tests/unittests/test_datasource/test_azure.py
@@ -17,6 +17,7 @@ import crypt
 import httpretty
 import json
 import os
+import requests
 import stat
 import xml.etree.ElementTree as ET
 import yaml
@@ -184,6 +185,35 @@ class TestGetMetadataFromIMDS(HttprettyTestCase):
             "Crawl of Azure Instance Metadata Service (IMDS) took",  # log_time
             self.logs.getvalue())
 
+    @mock.patch('requests.Session.request')
+    @mock.patch('cloudinit.url_helper.time.sleep')
+    @mock.patch(MOCKPATH + 'net.is_up')
+    def test_get_metadata_from_imds_retries_on_timeout(
+            self, m_net_is_up, m_sleep, m_request):
+        """Retry IMDS network metadata on timeout errors."""
+
+        self.attempt = 0
+        m_request.side_effect = requests.Timeout('Fake Connection Timeout')
+
+        def retry_callback(request, uri, headers):
+            self.attempt += 1
+            raise requests.Timeout('Fake connection timeout')
+
+        httpretty.register_uri(
+            httpretty.GET,
+            dsaz.IMDS_URL + 'instance?api-version=2017-12-01',
+            body=retry_callback)
+
+        m_net_is_up.return_value = True  # skips dhcp
+
+        self.assertEqual({}, dsaz.get_metadata_from_imds('eth9', retries=3))
+
+        m_net_is_up.assert_called_with('eth9')
+        self.assertEqual([mock.call(1)]*3, m_sleep.call_args_list)
+        self.assertIn(
+            "Crawl of Azure Instance Metadata Service (IMDS) took",  # log_time
+            self.logs.getvalue())
+
 
 class TestAzureDataSource(CiTestCase):