← Back to team overview

cloud-init-dev team mailing list archive

[Merge] ~mgerdts/cloud-init:bug-1746605 into cloud-init:master

 

Mike Gerdts has proposed merging ~mgerdts/cloud-init:bug-1746605 into cloud-init:master.

Requested reviews:
  cloud-init commiters (cloud-init-dev)
Related bugs:
  Bug #1746605 in cloud-init: "DataSourceSmartOS needs locking and retries"
  https://bugs.launchpad.net/cloud-init/+bug/1746605

For more details, see:
https://code.launchpad.net/~mgerdts/cloud-init/+git/cloud-init/+merge/337271

Fixing a race between cloud-init and mdata-get (called from rc.local) which results in cross talk on the serial port. The locking strategy used by mdata-get is used for reasons discussed at

https://github.com/joyent/mdata-client/issues/11#issuecomment-362490774

Additionally, a retry is performed to help when 1) something else had the port open but not locked before cloud-init starts and 2) the metadata server in the host restarts.

-- 
Your team cloud-init commiters is requested to review the proposed merge of ~mgerdts/cloud-init:bug-1746605 into cloud-init:master.
diff --git a/cloudinit/sources/DataSourceSmartOS.py b/cloudinit/sources/DataSourceSmartOS.py
index 86bfa5d..24187d6 100644
--- a/cloudinit/sources/DataSourceSmartOS.py
+++ b/cloudinit/sources/DataSourceSmartOS.py
@@ -1,4 +1,5 @@
 # Copyright (C) 2013 Canonical Ltd.
+# Copyright (c) 2018, Joyent, Inc.
 #
 # Author: Ben Howard <ben.howard@xxxxxxxxxxxxx>
 #
@@ -21,6 +22,8 @@
 
 import base64
 import binascii
+import errno
+import fcntl
 import json
 import os
 import random
@@ -339,7 +342,11 @@ class JoyentMetadataClient(object):
             binascii.crc32(body.encode('utf-8')) & 0xffffffff)
 
     def _get_value_from_frame(self, expected_request_id, frame):
-        frame_data = self.line_regex.match(frame).groupdict()
+        match = self.line_regex.match(frame)
+        if match is None:
+            raise JoyentMetadataFetchException(
+                'No regex match for frame "%s"' % frame)
+        frame_data = match.groupdict()
         if int(frame_data['length']) != len(frame_data['body']):
             raise JoyentMetadataFetchException(
                 'Incorrect frame length given ({0} != {1}).'.format(
@@ -378,9 +385,16 @@ class JoyentMetadataClient(object):
         self.fp.flush()
 
         response = bytearray()
-        response.extend(self.fp.read(1))
-        while response[-1:] != b'\n':
-            response.extend(self.fp.read(1))
+        while True:
+            try:
+                response.extend(self.fp.read(1))
+                if response[-1:] == b'\n':
+                    break
+            except OSError as exc:
+                if exc.errno == errno.EAGAIN:
+                    raise JoyentMetadataFetchException(
+                        "Timeout during read. Partial response: b'%s'" %
+                        response)
 
         if need_close:
             self.close_transport()
@@ -395,12 +409,21 @@ class JoyentMetadataClient(object):
         return value
 
     def get(self, key, default=None, strip=False):
-        result = self.request(rtype='GET', param=key)
-        if result is None:
-            return default
-        if result and strip:
-            result = result.strip()
-        return result
+        # Do a couple tries in case someone else has the serial port open
+        # before this process opened it.  This also helps in the event that
+        # the metadata server goes away in middle of a conversation.
+        for tries in [1, 2]:
+            try:
+                result = self.request(rtype='GET', param=key)
+                if result is None:
+                    return default
+                if result and strip:
+                    result = result.strip()
+                return result
+            except JoyentMetadataFetchException as exc:
+                last_exc = exc
+                pass
+        raise(last_exc)
 
     def get_json(self, key, default=None):
         result = self.get(key, default=default)
@@ -471,6 +494,7 @@ class JoyentMetadataSerialClient(JoyentMetadataClient):
         ser = serial.Serial(self.device, timeout=self.timeout)
         if not ser.isOpen():
             raise SystemError("Unable to open %s" % self.device)
+        fcntl.lockf(ser, fcntl.LOCK_EX)
         self.fp = ser
 
     def __repr__(self):
diff --git a/packages/bddeb b/packages/bddeb
index 4f2e2dd..7e96130 100755
--- a/packages/bddeb
+++ b/packages/bddeb
@@ -68,7 +68,8 @@ def write_debian_folder(root, templ_data, is_python2, cloud_util_deps):
     test_reqs = run_helper(
         'read-dependencies',
         ['--requirements-file', 'test-requirements.txt',
-         '--system-pkg-names', '--python-version', pyver]).splitlines()
+         '--system-pkg-names', '--python-version', pyver,
+         '-d', 'debian']).splitlines()
 
     requires = ['cloud-utils | cloud-guest-utils'] if cloud_util_deps else []
     # We consolidate all deps as Build-Depends as our package build runs all
diff --git a/test-requirements.txt b/test-requirements.txt
index d9d41b5..548132c 100644
--- a/test-requirements.txt
+++ b/test-requirements.txt
@@ -11,3 +11,5 @@ coverage
 # Only really needed on older versions of python
 contextlib2
 setuptools
+
+pyserial
diff --git a/tests/unittests/test_datasource/test_smartos.py b/tests/unittests/test_datasource/test_smartos.py
index 88bae5f..4d9f9ab 100644
--- a/tests/unittests/test_datasource/test_smartos.py
+++ b/tests/unittests/test_datasource/test_smartos.py
@@ -1,4 +1,5 @@
 # Copyright (C) 2013 Canonical Ltd.
+# Copyright (c) 2018, Joyent, Inc.
 #
 # Author: Ben Howard <ben.howard@xxxxxxxxxxxxx>
 #
@@ -15,12 +16,16 @@ from __future__ import print_function
 
 from binascii import crc32
 import json
+import multiprocessing
 import os
 import os.path
 import re
 import shutil
+import signal
 import stat
+import subprocess
 import tempfile
+import unittest
 import uuid
 
 from cloudinit import serial
@@ -872,4 +877,67 @@ class TestNetworkConversion(TestCase):
         found = convert_net(SDC_NICS_SINGLE_GATEWAY)
         self.assertEqual(expected, found)
 
+
+@unittest.skipUnless(DataSourceSmartOS.get_smartos_environ() ==
+                     DataSourceSmartOS.SMARTOS_ENV_KVM,
+                     "Only supported on KVM and bhyve guests under SmartOS")
+@unittest.skipUnless(os.access(DataSourceSmartOS.SERIAL_DEVICE, os.W_OK),
+                     "Requires write access to " +
+                     DataSourceSmartOS.SERIAL_DEVICE)
+class TestSerialConcurrency(TestCase):
+    """
+       This class tests locking on an actual serial port, and as such can only
+       be run in a kvm or bhyve guest running on a SmartOS host.  A test run on
+       a metadata socket will not be valid because a metadata socket ensures
+       there is only one session over a connection.  In contrast, in the
+       absence of proper locking multiple processes opening the same serial
+       port can corrupt each others' exchanges with the metadata server.
+    """
+    def setUp(self):
+        self.mdata_proc = multiprocessing.Process(target=self.start_mdata_loop)
+        self.mdata_proc.start()
+        super(TestSerialConcurrency, self).setUp()
+
+    def tearDown(self):
+        # os.kill() rather than mdata_proc.terminate() to avoid console spam.
+        os.kill(self.mdata_proc.pid, signal.SIGKILL)
+        self.mdata_proc.join()
+        super(TestSerialConcurrency, self).tearDown()
+
+    def start_mdata_loop(self):
+        """
+           The mdata-get command is repeatedly run in a separate process so
+           that it may try to race with metadata operations performed in the
+           main test process.  Use of mdata-get is better than two processes
+           using the protocol implementation in DataSourceSmartOS because we
+           are testing to be sure that cloud-init and mdata-get respect each
+           others locks.
+        """
+        while True:
+            try:
+                subprocess.check_output(['/usr/sbin/mdata-get', 'sdc:routes'])
+            except subprocess.CalledProcessError:
+                pass
+
+    def test_all_keys(self):
+        self.assertIsNotNone(self.mdata_proc.pid)
+        ds = DataSourceSmartOS
+        keys = [tup[0] for tup in ds.SMARTOS_ATTRIB_MAP.values()]
+        keys.extend(ds.SMARTOS_ATTRIB_JSON.values())
+
+        client = ds.jmc_client_factory()
+        self.assertIsNotNone(client)
+
+        # The behavior that we are testing for was observed mdata-get running
+        # 10 times at roughly the same time as cloud-init fetched each key
+        # once.  cloud-init would regularly see failures before making it
+        # through all keys once.
+        for it in range(0, 3):
+            for key in keys:
+                # We don't care about the return value, just that it doesn't
+                # thrown any exceptions.
+                client.get(key)
+
+        self.assertIsNone(self.mdata_proc.exitcode)
+
 # vi: ts=4 expandtab

Follow ups