← Back to team overview

nagios-charmers team mailing list archive

Re: [Merge] ~xavpaice/hw-health-charm:add_smart into hw-health-charm:master

 

Review: Needs Fixing

Some comments / notes inline. One issue with smartctl checks in general is that they won't work well with some types of RAID cards.

I think it'd be good to have the README updated for the smartctl checks, suggest to add a warning about smartctl and RAID controllers there.

For future work, could expand this check for Dell megaraid cards. Some howto:
https://www.thomas-krenn.com/en/wiki/Smartmontools_with_MegaRAID_Controller

This will required the storcli binary, see here: https://discourse.maas.io/t/running-smart-tests-against-megaraid-controllers/185



Diff comments:

> diff --git a/src/files/smart/check_smart.py b/src/files/smart/check_smart.py
> new file mode 100755
> index 0000000..a931b57
> --- /dev/null
> +++ b/src/files/smart/check_smart.py
> @@ -0,0 +1,51 @@
> +#!/usr/bin/env python3
> +# -*- coding: us-ascii -*-
> +
> +import subprocess
> +from nagios_plugin3 import CriticalError, UnknownError, try_check
> +
> +
> +def check_smartctl():
> +    """Check the status of disk devices using SMART
> +
> +    Use smartctl to collect a list of supported devices, and check the status of each.
> +    See the smartctl(8) manpage for a description of the return codes and what they mean.
> +    :return: either a string to indicate the devices are OK, or raise an exception.
> +    """
> +    devices = smartctl_scan()
> +    errors = []
> +    for device in devices:
> +        command = ['sudo', 'smartctl', '-H', device, '--quietmode=silent']

This won't work with raid cards. E.g. Dell megaraid will return
sudo smartctl -H /dev/sda
smartctl 6.5 2016-01-24 r4214 [x86_64-linux-4.15.0-45-generic] (local build)
Copyright (C) 2002-16, Bruce Allen, Christian Franke, www.smartmontools.org

Smartctl open device: /dev/sda failed: DELL or MegaRaid controller, please try adding '-d megaraid,N'


Could we use '--quietmode=errorsonly' instead and have some addtl error info right away?

> +        try:
> +            status_out = subprocess.run(command)
> +        except subprocess.CalledProcessError:
> +            # smartctl returned a non-OK state, but it's silent so won't output any stdout nor stderr for that
> +            errors.append("{} RC: {}".format(device, status_out.returncode))

... and include the errorsonly output from ^^, and also print them out?

> +    if len(errors) > 0:

Nit: could get rid of superfluous len() and use
if errors:
   ...

> +        raise CriticalError("Smartctl found drives not OK, {}".format(",".join(errors)))
> +    msg = "OK: All SMART devices report OK"
> +    print(msg)
> +    return "check_smartctl returned OK"
> +
> +
> +def smartctl_scan():
> +    """Run smartctl --scan to get a list of devices
> +
> +    :return: a list of devices that are capable of smartctl checks
> +    """
> +    command = ['smartctl', '--scan']
> +    try:
> +        raw_devices = subprocess.check_output(command).decode()
> +    except subprocess.CalledProcessError as e:
> +        raise UnknownError("smartctl failed to scan with error {}".format(e))
> +    raw_devices = raw_devices.strip()
> +    devices = [dev.split(' ')[0] for dev in raw_devices.split('\n')]

Nit, maybe use 
raw_devices.splitlines()

> +    return devices
> +
> +
> +def main():
> +    try_check(check_smartctl)
> +
> +
> +if __name__ == '__main__':
> +    main()
> diff --git a/src/lib/utils/hwdiscovery.py b/src/lib/utils/hwdiscovery.py
> index c144fa4..9da532f 100644
> --- a/src/lib/utils/hwdiscovery.py
> +++ b/src/lib/utils/hwdiscovery.py
> @@ -33,14 +33,16 @@ def get_tools():
>                      # LSI3008-IR
>                      tools.add('sas3ircu')
>  
> -    # HPE (Gen10 and newer) or HP (older)
>      hw_vendor = dmidecode('system-manufacturer')
> -    if hw_vendor == 'HPE':
> +    if hw_vendor == 'Supermicro':
> +        tools.add('sas3ircu')

This seems to be a different topic than smartctl chks, is that on purpose? Maybe note in commit message

> +    """ TODO implement HP/HPE monitoring
> +    # HPE (Gen10 and newer) or HP (older)
> +    elif hw_vendor == 'HPE':
>          tools.add('hpe')  # ['hponcfg', 'ilorest']
>      elif hw_vendor == 'HP':
>          tools.add('hp')  # ['hponcfg', 'hp-health', 'hpssacli']
> -    elif hw_vendor == 'Supermicro':
> -        tools.add('sas3ircu')
> +    """
>  
>      # SW RAID?
>      if _supports_mdadm():
> diff --git a/src/lib/utils/tooling.py b/src/lib/utils/tooling.py
> index bef4b4f..7d93792 100644
> --- a/src/lib/utils/tooling.py
> +++ b/src/lib/utils/tooling.py
> @@ -196,7 +210,10 @@ def configure_nrpe_checks(tools):
>                  hookenv.log(
>                      'Cronjob file [{}] removed'.format(cronjob_file),
>                      hookenv.DEBUG)
> -
> +    if os.path.exists('/usr/bin/sensors'):

Also this seems to be a different topic than smartctl chks, is that on purpose? Maybe note in commit message

> +        nrpe_setup.add_check('check_sensors', "lmsensors status", '/usr/lib/nagios/plugins/check_sensors')
> +        nrpe_setup.write()
> +        count += 1
>      return count > 0
>  
>  
> diff --git a/src/reactive/hw_health.py b/src/reactive/hw_health.py
> index 29cea0f..1890eb5 100644
> --- a/src/reactive/hw_health.py
> +++ b/src/reactive/hw_health.py
> @@ -36,16 +37,21 @@ def install():
>          status.blocked('Hardware not supported')
>      else:
>          resource = hookenv.resource_get('tools')
> +        resource_needed_tools = [tool for tool in tools if TOOLING[tool].get('install') == 'resource']
>          if resource:
>              hookenv.log('Installing from resource')
>              status.waiting('Installing from attached resource')
>              installed = install_resource(tools, resource)
>          else:
> -            hookenv.log(
> -                'Missing Juju resource: tools - alternative method is not'
> -                ' available yet', hookenv.ERROR)
> -            status.blocked('Missing Juju resource: tools')
> -            return
> +            if resource_needed_tools:
> +                # we know that this tool needs to be provided by a resource but there's no resource available
> +                hookenv.log(
> +                    'Missing Juju resource: tools - alternative method is not'
> +                    ' available yet', hookenv.ERROR)

Wonder if this might be misleading, as in "not available yet but this process will keep searching", poss. just say "alternative method is not available"

> +                status.blocked('Missing Juju resource: tools')
> +                return
> +            else:
> +                installed = True
>              # TODO(aluria): install vendor utilities via snaps
>              # https://forum.snapcraft.io/t/request-for-classic-confinement-sas2ircu/9023
>              # Snap interface for hw tooling is not supported
> diff --git a/src/tests/unit/test_check_smart.py b/src/tests/unit/test_check_smart.py
> new file mode 100644
> index 0000000..f3556d1
> --- /dev/null
> +++ b/src/tests/unit/test_check_smart.py
> @@ -0,0 +1,31 @@
> +import sys
> +import unittest
> +import unittest.mock as mock
> +
> +sys.path.append('files/smart')
> +import check_smart  # noqa E402
> +
> +
> +class TestCheckSmart(unittest.TestCase):
> +
> +    @mock.patch('subprocess.check_output')
> +    def test_smartctl_scan(self, mock_smartctl):
> +        mock_smartctl.return_value = b"""/dev/sda -d scsi # /dev/sda, SCSI device
> +/dev/sdb -d scsi # /dev/sdb, SCSI device
> +/dev/sdc -d scsi # /dev/sdc, SCSI device
> +/dev/sdd -d scsi # /dev/sdd, SCSI device
> +/dev/sde -d scsi # /dev/sde, SCSI device
> +
> +        """
> +        actual = check_smart.smartctl_scan()
> +        expected = ['/dev/sda', '/dev/sdb', '/dev/sdc', '/dev/sdd', '/dev/sde']
> +        self.assertEqual(actual, expected)
> +
> +    @mock.patch('subprocess.run')
> +    @mock.patch('check_smart.smartctl_scan')
> +    def test_check_smartctl(self, mock_devices, mock_smartctl):
> +        mock_smartctl.return_code = 0
> +        mock_devices.return_value = ['/dev/sda']
> +        actual = check_smart.check_smartctl()
> +        expected = "check_smartctl returned OK"
> +        self.assertEqual(actual, expected)

Would be great to have a test for a failing drive as well



-- 
https://code.launchpad.net/~xavpaice/hw-health-charm/+git/hw-health-charm/+merge/363834
Your team Nagios Charm developers is subscribed to branch hw-health-charm:master.


References