← Back to team overview

cloud-init-dev team mailing list archive

[Merge] ~jasonzio/cloud-init:master into cloud-init:master

 

Jason Zions has proposed merging ~jasonzio/cloud-init:master into cloud-init:master.

Commit message:
Address race between dhcp_discovery() and dhclient on RHEL

Wait for the pid named in dhclient's pidfile to have a parent pid of 1 before killing dhclient and removing the temp directory.

Address some minor differences between RHEL and SLES 15 when Azure data source checks to see if an ntfs-formatted ephemeral disk can be reformatted.

Requested reviews:
  cloud-init commiters (cloud-init-dev)

For more details, see:
https://code.launchpad.net/~jasonzio/cloud-init/+git/cloud-init/+merge/357427

Fixes https://bugs.launchpad.net/cloud-init/+bug/1794399

The race reproduces only in "preprovisioned" VMs, which Azure publicly supports only for Ubuntu. Testing of RHEL preprovisioned VMs in Azure reliably repro's the issue.

The RHEL vs SLES ephemeral disk issue can be repro'd with SLES 15 by provisioning a VM using cloud-init, then deallocating and restarting the VM. If the bug is present, the ephemeral volume (/dev/sdb1) will not be mounted on /mnt, and attempts to mount it will fail, reporting the volume is formatted as "ntfs". With this fix, the volume will be properly mounted after the VM is restarted.
-- 
Your team cloud-init commiters is requested to review the proposed merge of ~jasonzio/cloud-init:master into cloud-init:master.
diff --git a/cloudinit/net/dhcp.py b/cloudinit/net/dhcp.py
index 12cf509..9bed145 100644
--- a/cloudinit/net/dhcp.py
+++ b/cloudinit/net/dhcp.py
@@ -9,6 +9,7 @@ import logging
 import os
 import re
 import signal
+import time
 
 from cloudinit.net import (
     EphemeralIPv4Network, find_fallback_nic, get_devicelist)
@@ -162,9 +163,12 @@ def dhcp_discovery(dhclient_cmd_path, interface, cleandir):
            '-pf', pid_file, interface, '-sf', '/bin/true']
     util.subp(cmd, capture=True)
 
-    # dhclient doesn't write a pid file until after it forks when it gets a
-    # proper lease response. Since cleandir is a temp directory that gets
-    # removed, we need to wait for that pidfile creation before the
+    # On some distros, dhclient doesn't write a pid file until after it forks
+    # when it gets a proper lease response; on others, it writes the pid file
+    # even before it forks, and the child process rewrites it with its own
+    # pid. Since cleandir is a temp directory that gets removed, we need to
+    # wait for the process named in the pidfile to have a parent pid of 1,
+    # i.e. to have completely daemonized, before
     # cleandir is removed, otherwise we get FileNotFound errors.
     missing = util.wait_for_files(
         [pid_file, lease_file], maxwait=5, naplen=0.01)
@@ -172,14 +176,29 @@ def dhcp_discovery(dhclient_cmd_path, interface, cleandir):
         LOG.warning("dhclient did not produce expected files: %s",
                     ', '.join(os.path.basename(f) for f in missing))
         return []
-    pid_content = util.load_file(pid_file).strip()
-    try:
-        pid = int(pid_content)
-    except ValueError:
-        LOG.debug(
-            "pid file contains non-integer content '%s'", pid_content)
-    else:
-        os.kill(pid, signal.SIGKILL)
+
+    for attempt in range(0, 1000):
+        pid_content = util.load_file(pid_file).strip()
+        try:
+            pid = int(pid_content)
+        except ValueError:
+            LOG.debug(
+                "pid file contains non-integer content '%s'", pid_content)
+        else:
+            LOG.debug("dhclient has pid=%d, checking if has forked already",
+                      pid)
+            ppid = util.get_proc_ppid(pid)
+            LOG.debug("dhclient has ppid=%d", ppid)
+            if not (ppid == 1):
+                LOG.debug("this is not yet the daemon process, attempt=%d",
+                          attempt)
+                time.sleep(0.01)
+                continue
+            LOG.debug("killing dhclient with pid=%d", pid)
+            os.kill(pid, signal.SIGKILL)
+        return parse_dhcp_lease_file(lease_file)
+
+    LOG.error("dhclient daemon not killed, rm of temp directory might fail")
     return parse_dhcp_lease_file(lease_file)
 
 
diff --git a/cloudinit/net/tests/test_dhcp.py b/cloudinit/net/tests/test_dhcp.py
index db25b6f..a476e6d 100644
--- a/cloudinit/net/tests/test_dhcp.py
+++ b/cloudinit/net/tests/test_dhcp.py
@@ -146,13 +146,15 @@ class TestDHCPDiscoveryClean(CiTestCase):
             "pid file contains non-integer content ''", self.logs.getvalue())
         m_kill.assert_not_called()
 
+    @mock.patch('cloudinit.net.dhcp.util.get_proc_ppid')
     @mock.patch('cloudinit.net.dhcp.os.kill')
     @mock.patch('cloudinit.net.dhcp.util.wait_for_files')
     @mock.patch('cloudinit.net.dhcp.util.subp')
     def test_dhcp_discovery_run_in_sandbox_waits_on_lease_and_pid(self,
                                                                   m_subp,
                                                                   m_wait,
-                                                                  m_kill):
+                                                                  m_kill,
+                                                                  m_getppid):
         """dhcp_discovery waits for the presence of pidfile and dhcp.leases."""
         tmpdir = self.tmp_dir()
         dhclient_script = os.path.join(tmpdir, 'dhclient.orig')
@@ -162,6 +164,7 @@ class TestDHCPDiscoveryClean(CiTestCase):
         pidfile = self.tmp_path('dhclient.pid', tmpdir)
         leasefile = self.tmp_path('dhcp.leases', tmpdir)
         m_wait.return_value = [pidfile]  # Return the missing pidfile wait for
+        m_getppid.return_value = 1  # Indicate that dhclient has daemonized
         self.assertEqual([], dhcp_discovery(dhclient_script, 'eth9', tmpdir))
         self.assertEqual(
             mock.call([pidfile, leasefile], maxwait=5, naplen=0.01),
@@ -171,9 +174,10 @@ class TestDHCPDiscoveryClean(CiTestCase):
             self.logs.getvalue())
         m_kill.assert_not_called()
 
+    @mock.patch('cloudinit.net.dhcp.util.get_proc_ppid')
     @mock.patch('cloudinit.net.dhcp.os.kill')
     @mock.patch('cloudinit.net.dhcp.util.subp')
-    def test_dhcp_discovery_run_in_sandbox(self, m_subp, m_kill):
+    def test_dhcp_discovery_run_in_sandbox(self, m_subp, m_kill, m_getppid):
         """dhcp_discovery brings up the interface and runs dhclient.
 
         It also returns the parsed dhcp.leases file generated in the sandbox.
@@ -195,6 +199,7 @@ class TestDHCPDiscoveryClean(CiTestCase):
         pid_file = os.path.join(tmpdir, 'dhclient.pid')
         my_pid = 1
         write_file(pid_file, "%d\n" % my_pid)
+        m_getppid.return_value = 1  # Indicate that dhclient has daemonized
 
         self.assertItemsEqual(
             [{'interface': 'eth9', 'fixed-address': '192.168.2.74',
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
index d0358e9..e9793cf 100644
--- a/cloudinit/sources/DataSourceAzure.py
+++ b/cloudinit/sources/DataSourceAzure.py
@@ -707,7 +707,7 @@ def can_dev_be_reformatted(devpath, preserve_ntfs):
         file_count = util.mount_cb(cand_path, count_files, mtype="ntfs",
                                    update_env_for_mount={'LANG': 'C'})
     except util.MountFailedError as e:
-        if "mount: unknown filesystem type 'ntfs'" in str(e):
+        if "unknown filesystem type 'ntfs'" in str(e):
             return True, (bmsg + ' but this system cannot mount NTFS,'
                           ' assuming there are no important files.'
                           ' Formatting allowed.')
diff --git a/cloudinit/util.py b/cloudinit/util.py
index c67d6be..8624a77 100644
--- a/cloudinit/util.py
+++ b/cloudinit/util.py
@@ -2875,4 +2875,17 @@ def udevadm_settle(exists=None, timeout=None):
     return subp(settle_cmd)
 
 
+def get_proc_ppid(pid):
+    """
+    Return the parent pid of a process.
+    """
+    ppid = 0
+    contents = load_file("/proc/%d/stat" % int(pid), quiet=True)
+    if contents:
+        parts = contents.split(" ", 4)
+        # man proc says
+        #  ppid %d     (4) The PID of the parent.
+        ppid = int(parts[3])
+    return ppid
+
 # vi: ts=4 expandtab
diff --git a/packages/suse/cloud-init.spec.in b/packages/suse/cloud-init.spec.in
index e781d74..26894b3 100644
--- a/packages/suse/cloud-init.spec.in
+++ b/packages/suse/cloud-init.spec.in
@@ -93,6 +93,7 @@ version_pys=$(cd "%{buildroot}" && find . -name version.py -type f)
 
 # Program binaries
 %{_bindir}/cloud-init*
+%{_bindir}/cloud-id*
 
 # systemd files
 /usr/lib/systemd/system-generators/*
diff --git a/tests/unittests/test_util.py b/tests/unittests/test_util.py
index 5a14479..6c23eae 100644
--- a/tests/unittests/test_util.py
+++ b/tests/unittests/test_util.py
@@ -1171,4 +1171,9 @@ class TestGetProcEnv(helpers.TestCase):
         self.assertEqual({}, util.get_proc_env(1))
         self.assertEqual(1, m_load_file.call_count)
 
+    def test_get_proc_ppid(self):
+	my_pid = os.getpid()
+	my_ppid = os.getppid()
+	self.assertEqual(my_ppid, util.get_proc_ppid(my_pid))
+
 # vi: ts=4 expandtab