nagios-charmers team mailing list archive
-
nagios-charmers team
-
Mailing list archive
-
Message #00299
[Merge] ~xavpaice/hw-health-charm:add_smartctl into hw-health-charm:master
Xav Paice has proposed merging ~xavpaice/hw-health-charm:add_smartctl into hw-health-charm:master.
Requested reviews:
Nagios Charm developers (nagios-charmers)
For more details, see:
https://code.launchpad.net/~xavpaice/hw-health-charm/+git/hw-health-charm/+merge/363833
--
Your team Nagios Charm developers is requested to review the proposed merge of ~xavpaice/hw-health-charm:add_smartctl into hw-health-charm:master.
diff --git a/Makefile b/Makefile
index 20a6f9b..8f9d4fb 100644
--- a/Makefile
+++ b/Makefile
@@ -21,8 +21,12 @@ help:
lint:
@echo "Running flake8"
+<<<<<<< Makefile
cd src && tox -e pep8
+=======
+ @flake8 --max-line-length=120 ./src
+>>>>>>> Makefile
test: unittest lint
diff --git a/src/actions/actions.py b/src/actions/actions.py
index 9e27f0c..a851b2a 100755
--- a/src/actions/actions.py
+++ b/src/actions/actions.py
@@ -18,7 +18,13 @@ import os
import subprocess
import sys
+<<<<<<< src/actions/actions.py
from charmhelpers.core.hookenv import action_set, action_fail, log
+=======
+from charms.layer import basic # noqa E402
+basic.bootstrap_charm_deps()
+from charmhelpers.core.hookenv import action_set, action_fail, log # noqa E402
+>>>>>>> src/actions/actions.py
def clear_sel(args):
diff --git a/src/config.yaml b/src/config.yaml
index 1a5bcfd..9dbf3e8 100644
--- a/src/config.yaml
+++ b/src/config.yaml
@@ -16,4 +16,8 @@ options:
enable_ipmi:
type: boolean
default: True
- description: Enable the use of freeipmi tools to monitor hardware status.
\ No newline at end of file
+ description: Enable the use of freeipmi tools to monitor hardware status.
+ enable_smart:
+ type: boolean
+ default: True
+ description: Enable SMART monitoring of HDDs.
\ No newline at end of file
diff --git a/src/files/smart/check_smart.py b/src/files/smart/check_smart.py
new file mode 100755
index 0000000..a931b57
--- /dev/null
+++ b/src/files/smart/check_smart.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+# -*- coding: us-ascii -*-
+
+import subprocess
+from nagios_plugin3 import CriticalError, UnknownError, try_check
+
+
+def check_smartctl():
+ """Check the status of disk devices using SMART
+
+ Use smartctl to collect a list of supported devices, and check the status of each.
+ See the smartctl(8) manpage for a description of the return codes and what they mean.
+ :return: either a string to indicate the devices are OK, or raise an exception.
+ """
+ devices = smartctl_scan()
+ errors = []
+ for device in devices:
+ command = ['sudo', 'smartctl', '-H', device, '--quietmode=silent']
+ try:
+ status_out = subprocess.run(command)
+ except subprocess.CalledProcessError:
+ # smartctl returned a non-OK state, but it's silent so won't output any stdout nor stderr for that
+ errors.append("{} RC: {}".format(device, status_out.returncode))
+ if len(errors) > 0:
+ raise CriticalError("Smartctl found drives not OK, {}".format(",".join(errors)))
+ msg = "OK: All SMART devices report OK"
+ print(msg)
+ return "check_smartctl returned OK"
+
+
+def smartctl_scan():
+ """Run smartctl --scan to get a list of devices
+
+ :return: a list of devices that are capable of smartctl checks
+ """
+ command = ['smartctl', '--scan']
+ try:
+ raw_devices = subprocess.check_output(command).decode()
+ except subprocess.CalledProcessError as e:
+ raise UnknownError("smartctl failed to scan with error {}".format(e))
+ raw_devices = raw_devices.strip()
+ devices = [dev.split(' ')[0] for dev in raw_devices.split('\n')]
+ return devices
+
+
+def main():
+ try_check(check_smartctl)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/src/files/smart/check_smart_sudoer b/src/files/smart/check_smart_sudoer
new file mode 100644
index 0000000..989729a
--- /dev/null
+++ b/src/files/smart/check_smart_sudoer
@@ -0,0 +1,2 @@
+nagios ALL=(root) NOPASSWD: /usr/sbin/smartctl
+
diff --git a/src/layer.yaml b/src/layer.yaml
index 054b43e..a3c8c45 100644
--- a/src/layer.yaml
+++ b/src/layer.yaml
@@ -12,5 +12,7 @@ options:
packages:
- "freeipmi"
- "libipc-run-perl"
+ - "smartmontools"
+ - "lm-sensors"
repo: "lp:hw-health-charm"
is: "hw-health"
diff --git a/src/lib/utils/hwdiscovery.py b/src/lib/utils/hwdiscovery.py
index c144fa4..9da532f 100644
--- a/src/lib/utils/hwdiscovery.py
+++ b/src/lib/utils/hwdiscovery.py
@@ -33,14 +33,16 @@ def get_tools():
# LSI3008-IR
tools.add('sas3ircu')
- # HPE (Gen10 and newer) or HP (older)
hw_vendor = dmidecode('system-manufacturer')
- if hw_vendor == 'HPE':
+ if hw_vendor == 'Supermicro':
+ tools.add('sas3ircu')
+ """ TODO implement HP/HPE monitoring
+ # HPE (Gen10 and newer) or HP (older)
+ elif hw_vendor == 'HPE':
tools.add('hpe') # ['hponcfg', 'ilorest']
elif hw_vendor == 'HP':
tools.add('hp') # ['hponcfg', 'hp-health', 'hpssacli']
- elif hw_vendor == 'Supermicro':
- tools.add('sas3ircu')
+ """
# SW RAID?
if _supports_mdadm():
@@ -49,6 +51,9 @@ def get_tools():
if hookenv.config('enable_ipmi'):
tools.add('ipmi')
+ if hookenv.config('enable_smart'):
+ tools.add('smart')
+
return tools
diff --git a/src/lib/utils/tooling.py b/src/lib/utils/tooling.py
index bef4b4f..7d93792 100644
--- a/src/lib/utils/tooling.py
+++ b/src/lib/utils/tooling.py
@@ -13,17 +13,20 @@ TOOLING = {
'megacli': {
'snap': 'megacli',
'filename': 'megaraid/check_megacli.py',
- 'cronjob': 'megaraid/check_megacli.sh'
+ 'cronjob': 'megaraid/check_megacli.sh',
+ 'install': 'resource'
},
'sas2ircu': {
'snap': 'sas2ircu',
'filename': 'mpt/check_sas2ircu.py',
- 'cronjob': 'mpt/check_sas2ircu.sh'
+ 'cronjob': 'mpt/check_sas2ircu.sh',
+ 'install': 'resource'
},
'sas3ircu': {
'snap': 'sas3ircu',
'filename': 'mpt/check_sas3ircu.py',
- 'cronjob': 'mpt/check_sas3ircu.sh'
+ 'cronjob': 'mpt/check_sas3ircu.sh',
+ 'install': 'resource'
},
'mdadm': {
'filename': 'check_mdadm.py',
@@ -33,6 +36,10 @@ TOOLING = {
'filename': 'ipmi/check_ipmi_sensor',
'sudoers': 'ipmi/check_ipmi_sensor_sudoer'
},
+ 'smart': {
+ 'filename': 'smart/check_smart.py',
+ 'sudoers': 'smart/check_smart_sudoer'
+ }
}
PLUGINS_DIR = '/usr/local/lib/nagios/plugins/'
@@ -44,10 +51,17 @@ SUDOERS_DIR = '/etc/sudoers.d'
def install_resource(tools, resource):
"""Install hardware diagnostic tools from a charm resource
+ Install hardware diagnostic tools from a charm resource. We currently exclude a set of tools which are
+ installed by packages elsewhere.
:param tools: set of tools to include
:param resource: resource object
:return: boolean, status of installation
"""
+ exclude_tools = set()
+ for tool in TOOLING.keys():
+ if not TOOLING[tool].get('install') == 'resource':
+ # any tool that isn't marked to be installed by resource needs excluding further down
+ exclude_tools.add(tool)
if not isinstance(tools, set):
try:
tools = set(tools)
@@ -56,8 +70,8 @@ def install_resource(tools, resource):
return False
if not tools:
return False
- elif tools & {'mdadm', 'ipmi'}:
- tools = tools - {'mdadm', 'ipmi'}
+ elif tools & exclude_tools: # if the two sets intersect
+ tools = tools - exclude_tools
if not tools:
return True
@@ -196,7 +210,10 @@ def configure_nrpe_checks(tools):
hookenv.log(
'Cronjob file [{}] removed'.format(cronjob_file),
hookenv.DEBUG)
-
+ if os.path.exists('/usr/bin/sensors'):
+ nrpe_setup.add_check('check_sensors', "lmsensors status", '/usr/lib/nagios/plugins/check_sensors')
+ nrpe_setup.write()
+ count += 1
return count > 0
@@ -219,6 +236,7 @@ def remove_nrpe_checks(tools):
hookenv.log('Check removed from nrpe: {}'.format(tool))
count += 1
+ # TODO remove nrpe check for check_sensors
if count > 0:
nrpe_setup.write()
return True
diff --git a/src/reactive/hw_health.py b/src/reactive/hw_health.py
index 29cea0f..1890eb5 100644
--- a/src/reactive/hw_health.py
+++ b/src/reactive/hw_health.py
@@ -14,6 +14,7 @@ from utils.tooling import (
configure_nrpe_checks,
install_resource,
remove_nrpe_checks,
+ TOOLING
)
@@ -36,16 +37,21 @@ def install():
status.blocked('Hardware not supported')
else:
resource = hookenv.resource_get('tools')
+ resource_needed_tools = [tool for tool in tools if TOOLING[tool].get('install') == 'resource']
if resource:
hookenv.log('Installing from resource')
status.waiting('Installing from attached resource')
installed = install_resource(tools, resource)
else:
- hookenv.log(
- 'Missing Juju resource: tools - alternative method is not'
- ' available yet', hookenv.ERROR)
- status.blocked('Missing Juju resource: tools')
- return
+ if resource_needed_tools:
+ # we know that this tool needs to be provided by a resource but there's no resource available
+ hookenv.log(
+ 'Missing Juju resource: tools - alternative method is not'
+ ' available yet', hookenv.ERROR)
+ status.blocked('Missing Juju resource: tools')
+ return
+ else:
+ installed = True
# TODO(aluria): install vendor utilities via snaps
# https://forum.snapcraft.io/t/request-for-classic-confinement-sas2ircu/9023
# Snap interface for hw tooling is not supported
@@ -62,8 +68,8 @@ def install():
set_flag('hw-health.installed')
status.waiting('Preparing scripts installation')
else:
- missing_tools = tools - {'mdadm', 'ipmi'}
- status.blocked('Tools not found: {}'.format(', '.join(missing_tools)))
+ status.blocked('Not all required tools found in tools resource: {}'.format(
+ ', '.join(resource_needed_tools)))
@when('hw-health.installed')
diff --git a/src/tests/charms/__init__.py b/src/tests/charms/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/tests/charms/__init__.py
diff --git a/src/tests/charms/layer/__init__.py b/src/tests/charms/layer/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/tests/charms/layer/__init__.py
diff --git a/src/tests/charms/layer/basic.py b/src/tests/charms/layer/basic.py
new file mode 100644
index 0000000..5434905
--- /dev/null
+++ b/src/tests/charms/layer/basic.py
@@ -0,0 +1,2 @@
+def bootstrap_charm_deps():
+ pass
diff --git a/src/tests/unit/test_actions.py b/src/tests/unit/test_actions.py
index 3df07a3..bb5a492 100644
--- a/src/tests/unit/test_actions.py
+++ b/src/tests/unit/test_actions.py
@@ -12,10 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import os
+import sys
import unittest
import unittest.mock as mock
-from actions.actions import clear_sel
+sys.path.insert(0, '{}/{}'.format(os.getcwd(), 'tests'))
+from actions.actions import clear_sel # noqa: E402
class ClearSelTestCase(unittest.TestCase):
@@ -27,5 +30,9 @@ class ClearSelTestCase(unittest.TestCase):
sel_output = "Unittest system event log output"
mock_subprocess.return_value = sel_output
mock_check_call.return_value = None
+<<<<<<< src/tests/unit/test_actions.py
clear_sel('dummy_arg')
+=======
+ clear_sel('unittest')
+>>>>>>> src/tests/unit/test_actions.py
mock_check_call.assert_called_once_with(['action-set', "cleared_log={}".format(sel_output)])
diff --git a/src/tests/unit/test_check_smart.py b/src/tests/unit/test_check_smart.py
new file mode 100644
index 0000000..f3556d1
--- /dev/null
+++ b/src/tests/unit/test_check_smart.py
@@ -0,0 +1,31 @@
+import sys
+import unittest
+import unittest.mock as mock
+
+sys.path.append('files/smart')
+import check_smart # noqa E402
+
+
+class TestCheckSmart(unittest.TestCase):
+
+ @mock.patch('subprocess.check_output')
+ def test_smartctl_scan(self, mock_smartctl):
+ mock_smartctl.return_value = b"""/dev/sda -d scsi # /dev/sda, SCSI device
+/dev/sdb -d scsi # /dev/sdb, SCSI device
+/dev/sdc -d scsi # /dev/sdc, SCSI device
+/dev/sdd -d scsi # /dev/sdd, SCSI device
+/dev/sde -d scsi # /dev/sde, SCSI device
+
+ """
+ actual = check_smart.smartctl_scan()
+ expected = ['/dev/sda', '/dev/sdb', '/dev/sdc', '/dev/sdd', '/dev/sde']
+ self.assertEqual(actual, expected)
+
+ @mock.patch('subprocess.run')
+ @mock.patch('check_smart.smartctl_scan')
+ def test_check_smartctl(self, mock_devices, mock_smartctl):
+ mock_smartctl.return_code = 0
+ mock_devices.return_value = ['/dev/sda']
+ actual = check_smart.check_smartctl()
+ expected = "check_smartctl returned OK"
+ self.assertEqual(actual, expected)
diff --git a/src/tests/unit/test_hwdiscovery.py b/src/tests/unit/test_hwdiscovery.py
index 7cb516e..5fc3342 100644
--- a/src/tests/unit/test_hwdiscovery.py
+++ b/src/tests/unit/test_hwdiscovery.py
@@ -1,6 +1,4 @@
import io
-import os # noqa: F401
-import subprocess # noqa: F401
import sys
import unittest
import unittest.mock as mock
@@ -26,7 +24,7 @@ class TestHWDiscovery(unittest.TestCase):
driver.side_effect = side_effect_glob
mock_hookenv.return_value = True
real = hwdiscovery.get_tools()
- expected = set(['megacli', 'ipmi'])
+ expected = set(['megacli', 'ipmi', 'smart'])
self.assertEqual(real, expected)
@mock.patch('hwdiscovery._supports_mdadm')
@@ -42,10 +40,10 @@ class TestHWDiscovery(unittest.TestCase):
supports_mdadm.return_value = False
dmidecode.return_value = 'unittest'
- mock_hookenv.return_value = True
+ mock_hookenv.return_value = False
driver.side_effect = side_effect_glob
real = hwdiscovery.get_tools()
- expected = set(['sas2ircu', 'ipmi'])
+ expected = set(['sas2ircu'])
self.assertEqual(real, expected)
@mock.patch('hwdiscovery._supports_mdadm')
@@ -68,6 +66,7 @@ class TestHWDiscovery(unittest.TestCase):
expected = set(['sas3ircu'])
self.assertEqual(real, expected)
+ """ TODO implement HP
@mock.patch('hwdiscovery._supports_mdadm')
@mock.patch('hwdiscovery.dmidecode')
@mock.patch('glob.glob')
@@ -93,6 +92,7 @@ class TestHWDiscovery(unittest.TestCase):
real = hwdiscovery.get_tools()
expected = set(['hp'])
self.assertEqual(real, expected)
+ """
@mock.patch('hwdiscovery._supports_mdadm')
@mock.patch('hwdiscovery.dmidecode')
diff --git a/src/tests/unit/test_tooling.py b/src/tests/unit/test_tooling.py
index 5236f6e..9d55b83 100644
--- a/src/tests/unit/test_tooling.py
+++ b/src/tests/unit/test_tooling.py
@@ -131,3 +131,7 @@ class TestTooling(unittest.TestCase):
def test_configure_tools_unsupported_tools(self):
self.assertEqual(
tooling.configure_tools(['unittest1', 'unittest2']), False)
+
+ def test_configure_nrpe_checks(self):
+ # TODO write this test
+ pass