cloud-init-dev team mailing list archive
-
cloud-init-dev team
-
Mailing list archive
-
Message #03863
[Merge] ~chad.smith/cloud-init:fix-dhclient-absent-pidfile into cloud-init:master
Chad Smith has proposed merging ~chad.smith/cloud-init:fix-dhclient-absent-pidfile into cloud-init:master.
Requested reviews:
cloud-init commiters (cloud-init-dev)
Related bugs:
Bug #1735331 in cloud-init: "ec2: zesty tempfile sandbox dhclient.pid file can't be created"
https://bugs.launchpad.net/cloud-init/+bug/1735331
For more details, see:
https://code.launchpad.net/~chad.smith/cloud-init/+git/cloud-init/+merge/334543
ec2: Fix sandboxed dhclient background process cleanup.
There is a race condition where our sandboxed dhclient properly writes a lease file but has not yet written a pid file. If the sandbox temporary directory is torn down before the dhclient subprocess writes a pidfile DataSourceEc2Local gets a traceback and the instance will fallback to DataSourceEc2 in the init-network stage. This wastes boot cycles we'd rather not spend.
Fix handling of sandboxed dhclient to wait for both pidfile and leasefile before proceding. If either file doesn't show in 5 seconds, log a warning and return empty lease results {}.
LP: #1735331
--
Your team cloud-init commiters is requested to review the proposed merge of ~chad.smith/cloud-init:fix-dhclient-absent-pidfile into cloud-init:master.
diff --git a/cloudinit/net/dhcp.py b/cloudinit/net/dhcp.py
index d8624d8..3be8869 100644
--- a/cloudinit/net/dhcp.py
+++ b/cloudinit/net/dhcp.py
@@ -119,13 +119,23 @@ def dhcp_discovery(dhclient_cmd_path, interface, cleandir):
cmd = [sandbox_dhclient_cmd, '-1', '-v', '-lf', lease_file,
'-pf', pid_file, interface, '-sf', '/bin/true']
util.subp(cmd, capture=True)
- pid = None
+
+ # dhclient doesn't write a pid file until after it forks when it gets a
+ # proper lease response. Since cleandir is a tempdirectory that gets
+ # removed, we need to wait for that pidfile creation before the
+ # cleandir is removed, otherwise we get FileNotFound errors.
+ if util.wait_for_files([pid_file, lease_file], maxwait=5, naplen=0.01):
+ LOG.warning("dhclient did not write a lease or pid file in 5 seconds")
+ return {}
+ pid_content = util.load_file(pid_file).strip()
try:
- pid = int(util.load_file(pid_file).strip())
- return parse_dhcp_lease_file(lease_file)
- finally:
- if pid:
- os.kill(pid, signal.SIGKILL)
+ pid = int(pid_content)
+ except ValueError:
+ LOG.debug(
+ "pid file contains non-integer content '%s'", pid_content)
+ else:
+ os.kill(pid, signal.SIGKILL)
+ return parse_dhcp_lease_file(lease_file)
def networkd_parse_lease(content):
diff --git a/cloudinit/net/tests/test_dhcp.py b/cloudinit/net/tests/test_dhcp.py
index 3d8e15c..71e9aec 100644
--- a/cloudinit/net/tests/test_dhcp.py
+++ b/cloudinit/net/tests/test_dhcp.py
@@ -1,6 +1,5 @@
# This file is part of cloud-init. See LICENSE file for license information.
-import mock
import os
import signal
from textwrap import dedent
@@ -9,7 +8,8 @@ from cloudinit.net.dhcp import (
InvalidDHCPLeaseFileError, maybe_perform_dhcp_discovery,
parse_dhcp_lease_file, dhcp_discovery, networkd_load_leases)
from cloudinit.util import ensure_file, write_file
-from cloudinit.tests.helpers import CiTestCase, wrap_and_call, populate_dir
+from cloudinit.tests.helpers import (
+ CiTestCase, mock, populate_dir, wrap_and_call)
class TestParseDHCPLeasesFile(CiTestCase):
@@ -117,6 +117,62 @@ class TestDHCPDiscoveryClean(CiTestCase):
@mock.patch('cloudinit.net.dhcp.os.kill')
@mock.patch('cloudinit.net.dhcp.util.subp')
+ def test_dhcp_discovery_run_in_sandbox_warns_invalid_pid(self, m_subp,
+ m_kill):
+ """dhcp_discovery logs a warning when pidfile contains invalid content.
+
+ Lease processing still occurs and no proc kill is attempted.
+ """
+ tmpdir = self.tmp_dir()
+ dhclient_script = os.path.join(tmpdir, 'dhclient.orig')
+ script_content = '#!/bin/bash\necho fake-dhclient'
+ write_file(dhclient_script, script_content, mode=0o755)
+ write_file(self.tmp_path('dhclient.pid', tmpdir), '') # Empty pid ''
+ lease_content = dedent("""
+ lease {
+ interface "eth9";
+ fixed-address 192.168.2.74;
+ option subnet-mask 255.255.255.0;
+ option routers 192.168.2.1;
+ }
+ """)
+ write_file(self.tmp_path('dhcp.leases', tmpdir), lease_content)
+
+ self.assertItemsEqual(
+ [{'interface': 'eth9', 'fixed-address': '192.168.2.74',
+ 'subnet-mask': '255.255.255.0', 'routers': '192.168.2.1'}],
+ dhcp_discovery(dhclient_script, 'eth9', tmpdir))
+ self.assertIn(
+ "pid file contains non-integer content ''", self.logs.getvalue())
+ m_kill.assert_not_called()
+
+ @mock.patch('cloudinit.net.dhcp.os.kill')
+ @mock.patch('cloudinit.net.dhcp.util.wait_for_files')
+ @mock.patch('cloudinit.net.dhcp.util.subp')
+ def test_dhcp_discovery_run_in_sandbox_waits_on_lease_and_pid(self,
+ m_subp,
+ m_wait,
+ m_kill):
+ """dhcp_discovery waits for the presence of pidfile and dhcp.leases."""
+ tmpdir = self.tmp_dir()
+ dhclient_script = os.path.join(tmpdir, 'dhclient.orig')
+ script_content = '#!/bin/bash\necho fake-dhclient'
+ write_file(dhclient_script, script_content, mode=0o755)
+ # Don't create pid or leases file
+ pidfile = self.tmp_path('dhclient.pid', tmpdir)
+ leasefile = self.tmp_path('dhcp.leases', tmpdir)
+ m_wait.return_value = [pidfile] # Return the missing pidfile wait for
+ self.assertEqual({}, dhcp_discovery(dhclient_script, 'eth9', tmpdir))
+ self.assertEqual(
+ mock.call([pidfile, leasefile], maxwait=5, naplen=0.01),
+ m_wait.call_args_list[0])
+ self.assertIn(
+ 'WARNING: dhclient did not write a lease or pid file in 5 seconds',
+ self.logs.getvalue())
+ m_kill.assert_not_called()
+
+ @mock.patch('cloudinit.net.dhcp.os.kill')
+ @mock.patch('cloudinit.net.dhcp.util.subp')
def test_dhcp_discovery_run_in_sandbox(self, m_subp, m_kill):
"""dhcp_discovery brings up the interface and runs dhclient.
diff --git a/cloudinit/sources/DataSourceAzure.py b/cloudinit/sources/DataSourceAzure.py
index 8c3492d..14367e9 100644
--- a/cloudinit/sources/DataSourceAzure.py
+++ b/cloudinit/sources/DataSourceAzure.py
@@ -11,7 +11,6 @@ from functools import partial
import os
import os.path
import re
-import time
from xml.dom import minidom
import xml.etree.ElementTree as ET
@@ -321,7 +320,7 @@ class DataSourceAzure(sources.DataSource):
# https://bugs.launchpad.net/cloud-init/+bug/1717611
missing = util.log_time(logfunc=LOG.debug,
msg="waiting for SSH public key files",
- func=wait_for_files,
+ func=util.wait_for_files,
args=(fp_files, 900))
if len(missing):
@@ -556,8 +555,8 @@ def address_ephemeral_resize(devpath=RESOURCE_DISK_PATH, maxwait=120,
is_new_instance=False):
# wait for ephemeral disk to come up
naplen = .2
- missing = wait_for_files([devpath], maxwait=maxwait, naplen=naplen,
- log_pre="Azure ephemeral disk: ")
+ missing = util.wait_for_files([devpath], maxwait=maxwait, naplen=naplen,
+ log_pre="Azure ephemeral disk: ")
if missing:
LOG.warning("ephemeral device '%s' did not appear after %d seconds.",
@@ -639,28 +638,6 @@ def pubkeys_from_crt_files(flist):
return pubkeys
-def wait_for_files(flist, maxwait, naplen=.5, log_pre=""):
- need = set(flist)
- waited = 0
- while True:
- need -= set([f for f in need if os.path.exists(f)])
- if len(need) == 0:
- LOG.debug("%sAll files appeared after %s seconds: %s",
- log_pre, waited, flist)
- return []
- if waited == 0:
- LOG.info("%sWaiting up to %s seconds for the following files: %s",
- log_pre, maxwait, flist)
- if waited + naplen > maxwait:
- break
- time.sleep(naplen)
- waited += naplen
-
- LOG.warning("%sStill missing files after %s seconds: %s",
- log_pre, maxwait, need)
- return need
-
-
def write_files(datadir, files, dirmode=None):
def _redact_password(cnt, fname):
diff --git a/cloudinit/util.py b/cloudinit/util.py
index e1290aa..17462a5 100644
--- a/cloudinit/util.py
+++ b/cloudinit/util.py
@@ -2541,4 +2541,26 @@ def load_shell_content(content, add_empty=False, empty_val=None):
return data
+def wait_for_files(flist, maxwait, naplen=.5, log_pre=""):
+ need = set(flist)
+ waited = 0
+ while True:
+ need -= set([f for f in need if os.path.exists(f)])
+ if len(need) == 0:
+ LOG.debug("%sAll files appeared after %s seconds: %s",
+ log_pre, waited, flist)
+ return []
+ if waited == 0:
+ LOG.info("%sWaiting up to %s seconds for the following files: %s",
+ log_pre, maxwait, flist)
+ if waited + naplen > maxwait:
+ break
+ time.sleep(naplen)
+ waited += naplen
+
+ LOG.warning("%sStill missing files after %s seconds: %s",
+ log_pre, maxwait, need)
+ return need
+
+
# vi: ts=4 expandtab
diff --git a/tests/unittests/test_datasource/test_azure.py b/tests/unittests/test_datasource/test_azure.py
index 0a11777..7cb1812 100644
--- a/tests/unittests/test_datasource/test_azure.py
+++ b/tests/unittests/test_datasource/test_azure.py
@@ -171,7 +171,6 @@ scbus-1 on xpt0 bus 0
self.apply_patches([
(dsaz, 'list_possible_azure_ds_devs', dsdevs),
(dsaz, 'invoke_agent', _invoke_agent),
- (dsaz, 'wait_for_files', _wait_for_files),
(dsaz, 'pubkeys_from_crt_files', _pubkeys_from_crt_files),
(dsaz, 'perform_hostname_bounce', mock.MagicMock()),
(dsaz, 'get_hostname', mock.MagicMock()),
@@ -179,6 +178,8 @@ scbus-1 on xpt0 bus 0
(dsaz, 'get_metadata_from_fabric', self.get_metadata_from_fabric),
(dsaz.util, 'read_dmi_data', mock.MagicMock(
side_effect=_dmi_mocks)),
+ (dsaz.util, 'wait_for_files', mock.MagicMock(
+ side_effect=_wait_for_files)),
])
dsrc = dsaz.DataSourceAzure(
@@ -647,7 +648,7 @@ class TestAzureBounce(TestCase):
self.patches.enter_context(
mock.patch.object(dsaz, 'invoke_agent'))
self.patches.enter_context(
- mock.patch.object(dsaz, 'wait_for_files'))
+ mock.patch.object(dsaz.util, 'wait_for_files'))
self.patches.enter_context(
mock.patch.object(dsaz, 'list_possible_azure_ds_devs',
mock.MagicMock(return_value=[])))
Follow ups