nagios-charmers team mailing list archive
-
nagios-charmers team
-
Mailing list archive
-
Message #00301
Re: [Merge] ~xavpaice/hw-health-charm:add_smart into hw-health-charm:master
Review: Needs Fixing
Some comments / notes inline. One issue with smartctl checks in general is that they won't work well with some types of RAID cards.
I think it'd be good to have the README updated for the smartctl checks, suggest to add a warning about smartctl and RAID controllers there.
For future work, could expand this check for Dell megaraid cards. Some howto:
https://www.thomas-krenn.com/en/wiki/Smartmontools_with_MegaRAID_Controller
This will required the storcli binary, see here: https://discourse.maas.io/t/running-smart-tests-against-megaraid-controllers/185
Diff comments:
> diff --git a/src/files/smart/check_smart.py b/src/files/smart/check_smart.py
> new file mode 100755
> index 0000000..a931b57
> --- /dev/null
> +++ b/src/files/smart/check_smart.py
> @@ -0,0 +1,51 @@
> +#!/usr/bin/env python3
> +# -*- coding: us-ascii -*-
> +
> +import subprocess
> +from nagios_plugin3 import CriticalError, UnknownError, try_check
> +
> +
> +def check_smartctl():
> + """Check the status of disk devices using SMART
> +
> + Use smartctl to collect a list of supported devices, and check the status of each.
> + See the smartctl(8) manpage for a description of the return codes and what they mean.
> + :return: either a string to indicate the devices are OK, or raise an exception.
> + """
> + devices = smartctl_scan()
> + errors = []
> + for device in devices:
> + command = ['sudo', 'smartctl', '-H', device, '--quietmode=silent']
This won't work with raid cards. E.g. Dell megaraid will return
sudo smartctl -H /dev/sda
smartctl 6.5 2016-01-24 r4214 [x86_64-linux-4.15.0-45-generic] (local build)
Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org
Smartctl open device: /dev/sda failed: DELL or MegaRaid controller, please try adding '-d megaraid,N'
Could we use '--quietmode=errorsonly' instead and have some addtl error info right away?
> + try:
> + status_out = subprocess.run(command)
> + except subprocess.CalledProcessError:
> + # smartctl returned a non-OK state, but it's silent so won't output any stdout nor stderr for that
> + errors.append("{} RC: {}".format(device, status_out.returncode))
... and include the errorsonly output from ^^, and also print them out?
> + if len(errors) > 0:
Nit: could get rid of superfluous len() and use
if errors:
...
> + raise CriticalError("Smartctl found drives not OK, {}".format(",".join(errors)))
> + msg = "OK: All SMART devices report OK"
> + print(msg)
> + return "check_smartctl returned OK"
> +
> +
> +def smartctl_scan():
> + """Run smartctl --scan to get a list of devices
> +
> + :return: a list of devices that are capable of smartctl checks
> + """
> + command = ['smartctl', '--scan']
> + try:
> + raw_devices = subprocess.check_output(command).decode()
> + except subprocess.CalledProcessError as e:
> + raise UnknownError("smartctl failed to scan with error {}".format(e))
> + raw_devices = raw_devices.strip()
> + devices = [dev.split(' ')[0] for dev in raw_devices.split('\n')]
Nit, maybe use
raw_devices.splitlines()
> + return devices
> +
> +
> +def main():
> + try_check(check_smartctl)
> +
> +
> +if __name__ == '__main__':
> + main()
> diff --git a/src/lib/utils/hwdiscovery.py b/src/lib/utils/hwdiscovery.py
> index c144fa4..9da532f 100644
> --- a/src/lib/utils/hwdiscovery.py
> +++ b/src/lib/utils/hwdiscovery.py
> @@ -33,14 +33,16 @@ def get_tools():
> # LSI3008-IR
> tools.add('sas3ircu')
>
> - # HPE (Gen10 and newer) or HP (older)
> hw_vendor = dmidecode('system-manufacturer')
> - if hw_vendor == 'HPE':
> + if hw_vendor == 'Supermicro':
> + tools.add('sas3ircu')
This seems to be a different topic than smartctl chks, is that on purpose? Maybe note in commit message
> + """ TODO implement HP/HPE monitoring
> + # HPE (Gen10 and newer) or HP (older)
> + elif hw_vendor == 'HPE':
> tools.add('hpe') # ['hponcfg', 'ilorest']
> elif hw_vendor == 'HP':
> tools.add('hp') # ['hponcfg', 'hp-health', 'hpssacli']
> - elif hw_vendor == 'Supermicro':
> - tools.add('sas3ircu')
> + """
>
> # SW RAID?
> if _supports_mdadm():
> diff --git a/src/lib/utils/tooling.py b/src/lib/utils/tooling.py
> index bef4b4f..7d93792 100644
> --- a/src/lib/utils/tooling.py
> +++ b/src/lib/utils/tooling.py
> @@ -196,7 +210,10 @@ def configure_nrpe_checks(tools):
> hookenv.log(
> 'Cronjob file [{}] removed'.format(cronjob_file),
> hookenv.DEBUG)
> -
> + if os.path.exists('/usr/bin/sensors'):
Also this seems to be a different topic than smartctl chks, is that on purpose? Maybe note in commit message
> + nrpe_setup.add_check('check_sensors', "lmsensors status", '/usr/lib/nagios/plugins/check_sensors')
> + nrpe_setup.write()
> + count += 1
> return count > 0
>
>
> diff --git a/src/reactive/hw_health.py b/src/reactive/hw_health.py
> index 29cea0f..1890eb5 100644
> --- a/src/reactive/hw_health.py
> +++ b/src/reactive/hw_health.py
> @@ -36,16 +37,21 @@ def install():
> status.blocked('Hardware not supported')
> else:
> resource = hookenv.resource_get('tools')
> + resource_needed_tools = [tool for tool in tools if TOOLING[tool].get('install') == 'resource']
> if resource:
> hookenv.log('Installing from resource')
> status.waiting('Installing from attached resource')
> installed = install_resource(tools, resource)
> else:
> - hookenv.log(
> - 'Missing Juju resource: tools - alternative method is not'
> - ' available yet', hookenv.ERROR)
> - status.blocked('Missing Juju resource: tools')
> - return
> + if resource_needed_tools:
> + # we know that this tool needs to be provided by a resource but there's no resource available
> + hookenv.log(
> + 'Missing Juju resource: tools - alternative method is not'
> + ' available yet', hookenv.ERROR)
Wonder if this might be misleading, as in "not available yet but this process will keep searching", poss. just say "alternative method is not available"
> + status.blocked('Missing Juju resource: tools')
> + return
> + else:
> + installed = True
> # TODO(aluria): install vendor utilities via snaps
> # https://forum.snapcraft.io/t/request-for-classic-confinement-sas2ircu/9023
> # Snap interface for hw tooling is not supported
> diff --git a/src/tests/unit/test_check_smart.py b/src/tests/unit/test_check_smart.py
> new file mode 100644
> index 0000000..f3556d1
> --- /dev/null
> +++ b/src/tests/unit/test_check_smart.py
> @@ -0,0 +1,31 @@
> +import sys
> +import unittest
> +import unittest.mock as mock
> +
> +sys.path.append('files/smart')
> +import check_smart # noqa E402
> +
> +
> +class TestCheckSmart(unittest.TestCase):
> +
> + @mock.patch('subprocess.check_output')
> + def test_smartctl_scan(self, mock_smartctl):
> + mock_smartctl.return_value = b"""/dev/sda -d scsi # /dev/sda, SCSI device
> +/dev/sdb -d scsi # /dev/sdb, SCSI device
> +/dev/sdc -d scsi # /dev/sdc, SCSI device
> +/dev/sdd -d scsi # /dev/sdd, SCSI device
> +/dev/sde -d scsi # /dev/sde, SCSI device
> +
> + """
> + actual = check_smart.smartctl_scan()
> + expected = ['/dev/sda', '/dev/sdb', '/dev/sdc', '/dev/sdd', '/dev/sde']
> + self.assertEqual(actual, expected)
> +
> + @mock.patch('subprocess.run')
> + @mock.patch('check_smart.smartctl_scan')
> + def test_check_smartctl(self, mock_devices, mock_smartctl):
> + mock_smartctl.return_code = 0
> + mock_devices.return_value = ['/dev/sda']
> + actual = check_smart.check_smartctl()
> + expected = "check_smartctl returned OK"
> + self.assertEqual(actual, expected)
Would be great to have a test for a failing drive as well
--
https://code.launchpad.net/~xavpaice/hw-health-charm/+git/hw-health-charm/+merge/363834
Your team Nagios Charm developers is subscribed to branch hw-health-charm:master.
References