← Back to team overview

cloud-init-dev team mailing list archive

[Merge] ~chad.smith/cloud-init:fix-dhclient-absent-pidfile into cloud-init:master

 

Chad Smith has proposed merging ~chad.smith/cloud-init:fix-dhclient-absent-pidfile into cloud-init:master.

Requested reviews:
  cloud-init commiters (cloud-init-dev)
Related bugs:
  Bug #1735331 in cloud-init: "ec2: zesty tempfile sandbox dhclient.pid file can't be created"
  https://bugs.launchpad.net/cloud-init/+bug/1735331

For more details, see:
https://code.launchpad.net/~chad.smith/cloud-init/+git/cloud-init/+merge/334543

ec2: Fix sandboxed dhclient background process cleanup.

There is a race condition where our sandboxed dhclient properly writes a lease file but has not yet written a pid file. If the sandbox temporary directory is torn down before the dhclient subprocess writes a pidfile DataSourceEc2Local gets a traceback and the instance will fallback to DataSourceEc2 in the init-network stage. This wastes boot cycles we'd rather not spend.

Fix handling of sandboxed dhclient to wait for both pidfile and leasefile before proceding. If either file doesn't show in 5 seconds, log a warning and return empty lease results {}.

LP: #1735331

-- 
Your team cloud-init commiters is requested to review the proposed merge of ~chad.smith/cloud-init:fix-dhclient-absent-pidfile into cloud-init:master.
diff --git a/cloudinit/net/dhcp.py b/cloudinit/net/dhcp.py
index d8624d8..3be8869 100644
--- a/cloudinit/net/dhcp.py
+++ b/cloudinit/net/dhcp.py
@@ -119,13 +119,23 @@ def dhcp_discovery(dhclient_cmd_path, interface, cleandir):
     cmd = [sandbox_dhclient_cmd, '-1', '-v', '-lf', lease_file,
            '-pf', pid_file, interface, '-sf', '/bin/true']
     util.subp(cmd, capture=True)
-    pid = None
+
+    # dhclient doesn't write a pid file until after it forks when it gets a
+    # proper lease response. Since cleandir is a tempdirectory that gets
+    # removed, we need to wait for that pidfile creation before the
+    # cleandir is removed, otherwise we get FileNotFound errors.
+    if util.wait_for_files([pid_file, lease_file], maxwait=5, naplen=0.01):
+        LOG.warning("dhclient did not write a lease or pid file in 5 seconds")
+        return {}
+    pid_content = util.load_file(pid_file).strip()
     try:
-        pid = int(util.load_file(pid_file).strip())
-        return parse_dhcp_lease_file(lease_file)
-    finally:
-        if pid:
-            os.kill(pid, signal.SIGKILL)
+        pid = int(pid_content)
+    except ValueError:
+        LOG.debug(
+            "pid file contains non-integer content '%s'", pid_content)
+    else:
+        os.kill(pid, signal.SIGKILL)
+    return parse_dhcp_lease_file(lease_file)
 
 
 def networkd_parse_lease(content):
diff --git a/cloudinit/net/tests/test_dhcp.py b/cloudinit/net/tests/test_dhcp.py
index 3d8e15c..71e9aec 100644
--- a/cloudinit/net/tests/test_dhcp.py
+++ b/cloudinit/net/tests/test_dhcp.py
@@ -1,6 +1,5 @@
 # This file is part of cloud-init. See LICENSE file for license information.
 
-import mock
 import os
 import signal
 from textwrap import dedent
@@ -9,7 +8,8 @@ from cloudinit.net.dhcp import (
     InvalidDHCPLeaseFileError, maybe_perform_dhcp_discovery,
     parse_dhcp_lease_file, dhcp_discovery, networkd_load_leases)
 from cloudinit.util import ensure_file, write_file
-from cloudinit.tests.helpers import CiTestCase, wrap_and_call, populate_dir
+from cloudinit.tests.helpers import (
+    CiTestCase, mock, populate_dir, wrap_and_call)
 
 
 class TestParseDHCPLeasesFile(CiTestCase):
@@ -117,6 +117,62 @@ class TestDHCPDiscoveryClean(CiTestCase):
 
     @mock.patch('cloudinit.net.dhcp.os.kill')
     @mock.patch('cloudinit.net.dhcp.util.subp')
+    def test_dhcp_discovery_run_in_sandbox_warns_invalid_pid(self, m_subp,
+                                                             m_kill):
+        """dhcp_discovery logs a warning when pidfile contains invalid content.
+
+        Lease processing still occurs and no proc kill is attempted.
+        """
+        tmpdir = self.tmp_dir()
+        dhclient_script = os.path.join(tmpdir, 'dhclient.orig')
+        script_content = '#!/bin/bash\necho fake-dhclient'
+        write_file(dhclient_script, script_content, mode=0o755)
+        write_file(self.tmp_path('dhclient.pid', tmpdir), '')  # Empty pid ''
+        lease_content = dedent("""
+            lease {
+              interface "eth9";
+              fixed-address 192.168.2.74;
+              option subnet-mask 255.255.255.0;
+              option routers 192.168.2.1;
+            }
+        """)
+        write_file(self.tmp_path('dhcp.leases', tmpdir), lease_content)
+
+        self.assertItemsEqual(
+            [{'interface': 'eth9', 'fixed-address': '192.168.2.74',
+              'subnet-mask': '255.255.255.0', 'routers': '192.168.2.1'}],
+            dhcp_discovery(dhclient_script, 'eth9', tmpdir))
+        self.assertIn(
+            "pid file contains non-integer content ''", self.logs.getvalue())
+        m_kill.assert_not_called()
+
+    @mock.patch('cloudinit.net.dhcp.os.kill')
+    @mock.patch('cloudinit.net.dhcp.util.wait_for_files')
+    @mock.patch('cloudinit.net.dhcp.util.subp')
+    def test_dhcp_discovery_run_in_sandbox_waits_on_lease_and_pid(self,
+                                                                  m_subp,
+                                                                  m_wait,
+                                                                  m_kill):
+        """dhcp_discovery waits for the presence of pidfile and dhcp.leases."""
+        tmpdir = self.tmp_dir()
+        dhclient_script = os.path.join(tmpdir, 'dhclient.orig')
+        script_content = '#!/bin/bash\necho fake-dhclient'
+        write_file(dhclient_script, script_content, mode=0o755)
+        # Don't create pid or leases file
+        pidfile = self.tmp_path('dhclient.pid', tmpdir)
+        leasefile = self.tmp_path('dhcp.leases', tmpdir)
+        m_wait.return_value = [pidfile]  # Return the missing pidfile wait for
+        self.assertEqual({}, dhcp_discovery(dhclient_script, 'eth9', tmpdir))
+        self.assertEqual(
+            mock.call([pidfile, leasefile], maxwait=5, naplen=0.01),
+            m_wait.call_args_list[0])
+        self.assertIn(
+            'WARNING: dhclient did not write a lease or pid file in 5 seconds',
+            self.logs.getvalue())
+        m_kill.assert_not_called()
+
+    @mock.patch('cloudinit.net.dhcp.os.kill')
+    @mock.patch('cloudinit.net.dhcp.util.subp')
     def test_dhcp_discovery_run_in_sandbox(self, m_subp, m_kill):
         """dhcp_discovery brings up the interface and runs dhclient.
 
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
index 8c3492d..14367e9 100644
--- a/cloudinit/sources/DataSourceAzure.py
+++ b/cloudinit/sources/DataSourceAzure.py
@@ -11,7 +11,6 @@ from functools import partial
 import os
 import os.path
 import re
-import time
 from xml.dom import minidom
 import xml.etree.ElementTree as ET
 
@@ -321,7 +320,7 @@ class DataSourceAzure(sources.DataSource):
         # https://bugs.launchpad.net/cloud-init/+bug/1717611
         missing = util.log_time(logfunc=LOG.debug,
                                 msg="waiting for SSH public key files",
-                                func=wait_for_files,
+                                func=util.wait_for_files,
                                 args=(fp_files, 900))
 
         if len(missing):
@@ -556,8 +555,8 @@ def address_ephemeral_resize(devpath=RESOURCE_DISK_PATH, maxwait=120,
                              is_new_instance=False):
     # wait for ephemeral disk to come up
     naplen = .2
-    missing = wait_for_files([devpath], maxwait=maxwait, naplen=naplen,
-                             log_pre="Azure ephemeral disk: ")
+    missing = util.wait_for_files([devpath], maxwait=maxwait, naplen=naplen,
+                                  log_pre="Azure ephemeral disk: ")
 
     if missing:
         LOG.warning("ephemeral device '%s' did not appear after %d seconds.",
@@ -639,28 +638,6 @@ def pubkeys_from_crt_files(flist):
     return pubkeys
 
 
-def wait_for_files(flist, maxwait, naplen=.5, log_pre=""):
-    need = set(flist)
-    waited = 0
-    while True:
-        need -= set([f for f in need if os.path.exists(f)])
-        if len(need) == 0:
-            LOG.debug("%sAll files appeared after %s seconds: %s",
-                      log_pre, waited, flist)
-            return []
-        if waited == 0:
-            LOG.info("%sWaiting up to %s seconds for the following files: %s",
-                     log_pre, maxwait, flist)
-        if waited + naplen > maxwait:
-            break
-        time.sleep(naplen)
-        waited += naplen
-
-    LOG.warning("%sStill missing files after %s seconds: %s",
-                log_pre, maxwait, need)
-    return need
-
-
 def write_files(datadir, files, dirmode=None):
 
     def _redact_password(cnt, fname):
diff --git a/cloudinit/util.py b/cloudinit/util.py
index e1290aa..17462a5 100644
--- a/cloudinit/util.py
+++ b/cloudinit/util.py
@@ -2541,4 +2541,26 @@ def load_shell_content(content, add_empty=False, empty_val=None):
     return data
 
 
+def wait_for_files(flist, maxwait, naplen=.5, log_pre=""):
+    need = set(flist)
+    waited = 0
+    while True:
+        need -= set([f for f in need if os.path.exists(f)])
+        if len(need) == 0:
+            LOG.debug("%sAll files appeared after %s seconds: %s",
+                      log_pre, waited, flist)
+            return []
+        if waited == 0:
+            LOG.info("%sWaiting up to %s seconds for the following files: %s",
+                     log_pre, maxwait, flist)
+        if waited + naplen > maxwait:
+            break
+        time.sleep(naplen)
+        waited += naplen
+
+    LOG.warning("%sStill missing files after %s seconds: %s",
+                log_pre, maxwait, need)
+    return need
+
+
 # vi: ts=4 expandtab
diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py
index 0a11777..7cb1812 100644
--- a/tests/unittests/test_datasource/test_azure.py
+++ b/tests/unittests/test_datasource/test_azure.py
@@ -171,7 +171,6 @@ scbus-1 on xpt0 bus 0
         self.apply_patches([
             (dsaz, 'list_possible_azure_ds_devs', dsdevs),
             (dsaz, 'invoke_agent', _invoke_agent),
-            (dsaz, 'wait_for_files', _wait_for_files),
             (dsaz, 'pubkeys_from_crt_files', _pubkeys_from_crt_files),
             (dsaz, 'perform_hostname_bounce', mock.MagicMock()),
             (dsaz, 'get_hostname', mock.MagicMock()),
@@ -179,6 +178,8 @@ scbus-1 on xpt0 bus 0
             (dsaz, 'get_metadata_from_fabric', self.get_metadata_from_fabric),
             (dsaz.util, 'read_dmi_data', mock.MagicMock(
                 side_effect=_dmi_mocks)),
+            (dsaz.util, 'wait_for_files', mock.MagicMock(
+                side_effect=_wait_for_files)),
         ])
 
         dsrc = dsaz.DataSourceAzure(
@@ -647,7 +648,7 @@ class TestAzureBounce(TestCase):
         self.patches.enter_context(
             mock.patch.object(dsaz, 'invoke_agent'))
         self.patches.enter_context(
-            mock.patch.object(dsaz, 'wait_for_files'))
+            mock.patch.object(dsaz.util, 'wait_for_files'))
         self.patches.enter_context(
             mock.patch.object(dsaz, 'list_possible_azure_ds_devs',
                               mock.MagicMock(return_value=[])))

Follow ups