cloud-init-dev team mailing list archive
-
cloud-init-dev team
-
Mailing list archive
-
Message #04165
[Merge] ~mgerdts/cloud-init:bug-1746605 into cloud-init:master
Mike Gerdts has proposed merging ~mgerdts/cloud-init:bug-1746605 into cloud-init:master.
Requested reviews:
cloud-init commiters (cloud-init-dev)
Related bugs:
Bug #1746605 in cloud-init: "DataSourceSmartOS needs locking and retries"
https://bugs.launchpad.net/cloud-init/+bug/1746605
For more details, see:
https://code.launchpad.net/~mgerdts/cloud-init/+git/cloud-init/+merge/337271
Fixing a race between cloud-init and mdata-get (called from rc.local) which results in cross talk on the serial port. The locking strategy used by mdata-get is used for reasons discussed at
https://github.com/joyent/mdata-client/issues/11#issuecomment-362490774
Additionally, a retry is performed to help when 1) something else had the port open but not locked before cloud-init starts and 2) the metadata server in the host restarts.
--
Your team cloud-init commiters is requested to review the proposed merge of ~mgerdts/cloud-init:bug-1746605 into cloud-init:master.
diff --git a/cloudinit/sources/DataSourceSmartOS.py b/cloudinit/sources/DataSourceSmartOS.py
index 86bfa5d..24187d6 100644
--- a/cloudinit/sources/DataSourceSmartOS.py
+++ b/cloudinit/sources/DataSourceSmartOS.py
@@ -1,4 +1,5 @@
# Copyright (C) 2013 Canonical Ltd.
+# Copyright (c) 2018, Joyent, Inc.
#
# Author: Ben Howard <ben.howard@xxxxxxxxxxxxx>
#
@@ -21,6 +22,8 @@
import base64
import binascii
+import errno
+import fcntl
import json
import os
import random
@@ -339,7 +342,11 @@ class JoyentMetadataClient(object):
binascii.crc32(body.encode('utf-8')) & 0xffffffff)
def _get_value_from_frame(self, expected_request_id, frame):
- frame_data = self.line_regex.match(frame).groupdict()
+ match = self.line_regex.match(frame)
+ if match is None:
+ raise JoyentMetadataFetchException(
+ 'No regex match for frame "%s"' % frame)
+ frame_data = match.groupdict()
if int(frame_data['length']) != len(frame_data['body']):
raise JoyentMetadataFetchException(
'Incorrect frame length given ({0} != {1}).'.format(
@@ -378,9 +385,16 @@ class JoyentMetadataClient(object):
self.fp.flush()
response = bytearray()
- response.extend(self.fp.read(1))
- while response[-1:] != b'\n':
- response.extend(self.fp.read(1))
+ while True:
+ try:
+ response.extend(self.fp.read(1))
+ if response[-1:] == b'\n':
+ break
+ except OSError as exc:
+ if exc.errno == errno.EAGAIN:
+ raise JoyentMetadataFetchException(
+ "Timeout during read. Partial response: b'%s'" %
+ response)
if need_close:
self.close_transport()
@@ -395,12 +409,21 @@ class JoyentMetadataClient(object):
return value
def get(self, key, default=None, strip=False):
- result = self.request(rtype='GET', param=key)
- if result is None:
- return default
- if result and strip:
- result = result.strip()
- return result
+ # Do a couple tries in case someone else has the serial port open
+ # before this process opened it. This also helps in the event that
+ # the metadata server goes away in middle of a conversation.
+ for tries in [1, 2]:
+ try:
+ result = self.request(rtype='GET', param=key)
+ if result is None:
+ return default
+ if result and strip:
+ result = result.strip()
+ return result
+ except JoyentMetadataFetchException as exc:
+ last_exc = exc
+ pass
+ raise(last_exc)
def get_json(self, key, default=None):
result = self.get(key, default=default)
@@ -471,6 +494,7 @@ class JoyentMetadataSerialClient(JoyentMetadataClient):
ser = serial.Serial(self.device, timeout=self.timeout)
if not ser.isOpen():
raise SystemError("Unable to open %s" % self.device)
+ fcntl.lockf(ser, fcntl.LOCK_EX)
self.fp = ser
def __repr__(self):
diff --git a/packages/bddeb b/packages/bddeb
index 4f2e2dd..7e96130 100755
--- a/packages/bddeb
+++ b/packages/bddeb
@@ -68,7 +68,8 @@ def write_debian_folder(root, templ_data, is_python2, cloud_util_deps):
test_reqs = run_helper(
'read-dependencies',
['--requirements-file', 'test-requirements.txt',
- '--system-pkg-names', '--python-version', pyver]).splitlines()
+ '--system-pkg-names', '--python-version', pyver,
+ '-d', 'debian']).splitlines()
requires = ['cloud-utils | cloud-guest-utils'] if cloud_util_deps else []
# We consolidate all deps as Build-Depends as our package build runs all
diff --git a/test-requirements.txt b/test-requirements.txt
index d9d41b5..548132c 100644
--- a/test-requirements.txt
+++ b/test-requirements.txt
@@ -11,3 +11,5 @@ coverage
# Only really needed on older versions of python
contextlib2
setuptools
+
+pyserial
diff --git a/tests/unittests/test_datasource/test_smartos.py b/tests/unittests/test_datasource/test_smartos.py
index 88bae5f..4d9f9ab 100644
--- a/tests/unittests/test_datasource/test_smartos.py
+++ b/tests/unittests/test_datasource/test_smartos.py
@@ -1,4 +1,5 @@
# Copyright (C) 2013 Canonical Ltd.
+# Copyright (c) 2018, Joyent, Inc.
#
# Author: Ben Howard <ben.howard@xxxxxxxxxxxxx>
#
@@ -15,12 +16,16 @@ from __future__ import print_function
from binascii import crc32
import json
+import multiprocessing
import os
import os.path
import re
import shutil
+import signal
import stat
+import subprocess
import tempfile
+import unittest
import uuid
from cloudinit import serial
@@ -872,4 +877,67 @@ class TestNetworkConversion(TestCase):
found = convert_net(SDC_NICS_SINGLE_GATEWAY)
self.assertEqual(expected, found)
+
+@unittest.skipUnless(DataSourceSmartOS.get_smartos_environ() ==
+ DataSourceSmartOS.SMARTOS_ENV_KVM,
+ "Only supported on KVM and bhyve guests under SmartOS")
+@unittest.skipUnless(os.access(DataSourceSmartOS.SERIAL_DEVICE, os.W_OK),
+ "Requires write access to " +
+ DataSourceSmartOS.SERIAL_DEVICE)
+class TestSerialConcurrency(TestCase):
+ """
+ This class tests locking on an actual serial port, and as such can only
+ be run in a kvm or bhyve guest running on a SmartOS host. A test run on
+ a metadata socket will not be valid because a metadata socket ensures
+ there is only one session over a connection. In contrast, in the
+ absence of proper locking multiple processes opening the same serial
+ port can corrupt each others' exchanges with the metadata server.
+ """
+ def setUp(self):
+ self.mdata_proc = multiprocessing.Process(target=self.start_mdata_loop)
+ self.mdata_proc.start()
+ super(TestSerialConcurrency, self).setUp()
+
+ def tearDown(self):
+ # os.kill() rather than mdata_proc.terminate() to avoid console spam.
+ os.kill(self.mdata_proc.pid, signal.SIGKILL)
+ self.mdata_proc.join()
+ super(TestSerialConcurrency, self).tearDown()
+
+ def start_mdata_loop(self):
+ """
+ The mdata-get command is repeatedly run in a separate process so
+ that it may try to race with metadata operations performed in the
+ main test process. Use of mdata-get is better than two processes
+ using the protocol implementation in DataSourceSmartOS because we
+ are testing to be sure that cloud-init and mdata-get respect each
+ others locks.
+ """
+ while True:
+ try:
+ subprocess.check_output(['/usr/sbin/mdata-get', 'sdc:routes'])
+ except subprocess.CalledProcessError:
+ pass
+
+ def test_all_keys(self):
+ self.assertIsNotNone(self.mdata_proc.pid)
+ ds = DataSourceSmartOS
+ keys = [tup[0] for tup in ds.SMARTOS_ATTRIB_MAP.values()]
+ keys.extend(ds.SMARTOS_ATTRIB_JSON.values())
+
+ client = ds.jmc_client_factory()
+ self.assertIsNotNone(client)
+
+ # The behavior that we are testing for was observed mdata-get running
+ # 10 times at roughly the same time as cloud-init fetched each key
+ # once. cloud-init would regularly see failures before making it
+ # through all keys once.
+ for it in range(0, 3):
+ for key in keys:
+ # We don't care about the return value, just that it doesn't
+ # thrown any exceptions.
+ client.get(key)
+
+ self.assertIsNone(self.mdata_proc.exitcode)
+
# vi: ts=4 expandtab
Follow ups