← Back to team overview

nagios-charmers team mailing list archive

Re: [Merge] ~aluria/hw-health-charm/+git/hw-health-charm:unittests-engrot6 into hw-health-charm:master

 

Review: Approve

Looks good. A few comments inline, but I don't think anything requiring another review (ping me here or IRC if you believe otherwise).

Main issue is I suspect that reading from Popen object's stdout before the process has terminated could result in truncated data, and the Python docs warn you not to do it at all because it might lock up, so might want to switch to one of the higher level helpers.

Diff comments:

> diff --git a/src/files/check_mdadm.py b/src/files/check_mdadm.py
> old mode 100644
> new mode 100755
> index 741865e..56e37a8
> --- a/src/files/check_mdadm.py
> +++ b/src/files/check_mdadm.py
> @@ -2,108 +2,29 @@
>  # -*- coding: us-ascii -*-
>  
>  import os
> -import re
> -import subprocess
>  
> +from nagios_plugin3 import CriticalError, WarnError, UnknownError, try_check
>  
> -def get_devices():
> -    if os.path.exists('/sbin/mdadm'):
> -        try:
> -            devices_raw = subprocess.Popen(['/sbin/mdadm',
> -                                            '--detail', '--scan'],
> -                                           stdout=subprocess.PIPE,
> -                                           stderr=subprocess.PIPE)
> -            devices_re = re.compile('^ARRAY\s+([^ ]+) ')
> -            devices = set()
> -            for line in devices_raw.stdout.readline().decode():
> -                device_re = devices_re.match(line)
> -                if device_re is not None:
> -                    devices.add(device_re.group(1))
> -            return devices
> -        except Exception as e:
> -            # log error
> -            pass
> -    return
> +INPUT_FILE = '/var/lib/nagios/mdadm.out'
>  
>  
> -def main():
> -    devices = get_devices()
> -    if len(devices) == 0:
> -        print('WARNING: unexpectedly checked no devices')
> -        return 1
> -
> -    mdadm_detail = ['/sbin/mdadm', '--detail']
> -    mdadm_detail.extend(sorted(devices))
> -    try:
> -        devices_details_raw = subprocess.Popen(mdadm_detail,
> -                                               stdout=subprocess.PIPE,
> -                                               stderr=subprocess.PIPE)
> -    except Exception as error:
> -        print('WARNING: error executing mdadm: {}'.format(error))
> -        return 1
> -
> -    devices_re = '^(/[^ ]+):$'
> -    state_re = '^\s*State :\s+(\S+)\s*'
> -    status_re = '^\s*(Active|Working|Failed|Spare) Devices :\s+(\d+)'
> -
> -    devices_cre = re.compile(devices_re)
> -    state_cre = re.compile(state_re)
> -    status_cre = re.compile(status_re)
> -
> -    device = None
> -    devices_stats = {}
> -    for line in devices_details_raw.stdout.readline().decode():
> -        m = devices_cre.match(line)
> -        if m:
> -            device = m.group(1)
> -            devices_stats[device] = {'stats': {'Active': 0,
> -                                               'Working': 0,
> -                                               'Failed': 0,
> -                                               'Spare': 0},
> -                                     'degraded': False}
> -            continue
> +def parse_output():
> +    if not os.path.exists(INPUT_FILE):
> +        raise UnknownError('UNKNOWN: file not found ({})'.format(INPUT_FILE))

You also need to check the age of the file. Otherwise, there is no notification if the cron_mdadm.py starts failing because old data will just be used.

>  
> -        m = state_cre.match(line)
> -        if m:
> -            if 'degraded' in m.group(1) and device:
> -                devices_stats[device]['degraded'] = True
> -            continue
> +    with open(INPUT_FILE, 'r') as fd:
> +        for line in fd.readlines():
> +            line = line.strip()
> +            if line.startswith('CRITICAL: '):
> +                raise CriticalError(line)
> +            elif line.startswith('WARNING: '):
> +                raise WarnError(line)
> +            else:
> +                print(line)
>  
> -        m = status_cre.match(line)
> -        if m and device:
> -            devices_stats[device]['stats'][m.group(1)] = int(m.group(2))
>  
> -    rc = devices_details_raw.wait()
> -    if rc:
> -        print('WARNING: mdadm returned exit status {}'.format(int(rc)))
> -        return 1
> -
> -    parts = []
> -    msg = []
> -    critical = False
> -    for device in devices_stats:
> -        if devices_stats[device]['degraded']:
> -            critical = True
> -            parts.append('{} degraded'.format(device))
> -        else:
> -            parts.append('{} ok'.format(device))
> -
> -        for status in sorted(devices_stats[device]['stats']):
> -            parts.append('{}[{}]'
> -                         .format(status,
> -                                 devices_stats[device]['stats'][status]))
> -            if status == 'Failed' \
> -                    and devices_stats[device]['stats'][status] > 0:
> -                critical = True
> -        msg.append(', '.join(parts))
> -
> -    msg = '; '.join(msg)
> -    if critical:
> -        print('CRITICAL: {}'.format(msg))
> -        return 2
> -    else:
> -        print('OK: {}'.format(msg))
> -        return 0
> +def main():
> +    try_check(parse_output)
>  
>  
>  if __name__ == '__main__':
> diff --git a/src/files/cron_mdadm.py b/src/files/cron_mdadm.py
> new file mode 100755
> index 0000000..bc91b5a
> --- /dev/null
> +++ b/src/files/cron_mdadm.py
> @@ -0,0 +1,127 @@
> +#!/usr/bin/env python3
> +# -*- coding: us-ascii -*-
> +import os
> +import re
> +import subprocess
> +
> +OUTPUT_FILE = '/var/lib/nagios/mdadm.out'
> +TEMP_FILE = '/tmp/mdadm.out'
> +
> +
> +def get_devices():
> +    if os.path.exists('/sbin/mdadm'):
> +        try:
> +            devices_raw = subprocess.Popen(
> +                ['/sbin/mdadm', '--detail', '--scan'],
> +                stdout=subprocess.PIPE,
> +                stderr=subprocess.PIPE)
> +            devices_re = re.compile(r'^ARRAY\s+([^ ]+) ')
> +            devices = set()
> +            for line in devices_raw.stdout.readlines():

You are not waiting for the process to terminate, so you might get truncated data. Best to use .communicate() (per the warning in the Python docs about not reading from stdout, or use the higher level calls like subprocess.check_output or the new subprocess.run

> +                line = line.decode().strip()
> +                device_re = devices_re.match(line)
> +                if device_re is not None:
> +                    devices.add(device_re.group(1))
> +            return devices
> +        except Exception as e:
> +            # log error

Given that this is for monitoring, it seems very wrong to ignore all errors. And the comment claims to log them, but you don't (and stderr was redirected).

> +            pass
> +    return set()
> +
> +
> +def generate_output(msg):
> +    try:
> +        with open(TEMP_FILE, 'w') as fd:
> +            fd.write(msg)
> +        os.rename(TEMP_FILE, OUTPUT_FILE)
> +        return True
> +    except Exception as error:
> +        print('Unable to generate output file')
> +        return False
> +
> +
> +def parse_output():
> +    devices = get_devices()
> +    if len(devices) == 0:
> +        return generate_output('WARNING: unexpectedly checked no devices')
> +
> +    mdadm_detail = ['/sbin/mdadm', '--detail']
> +    mdadm_detail.extend(sorted(devices))
> +    try:
> +        devices_details_raw = subprocess.Popen(
> +            mdadm_detail, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
> +    except Exception as error:
> +        return generate_output(
> +            'WARNING: error executing mdadm: {}'.format(error))
> +
> +    devices_re = r'^(/\S+):$'
> +    state_re = r'^\s*State\s+:\s+(\S+)$'
> +    status_re = r'^\s*(Active|Working|Failed|Spare) Devices\s+:\s+(\d+)$'
> +
> +    devices_cre = re.compile(devices_re)
> +    state_cre = re.compile(state_re)
> +    status_cre = re.compile(status_re)
> +
> +    device = None
> +    devices_stats = {}
> +    for line in devices_details_raw.stdout.readlines():
> +        line = line.decode().rstrip()
> +        m = devices_cre.match(line)
> +        if m:
> +            device = m.group(1)
> +            devices_stats[device] = {
> +                'stats': {
> +                    'Active': 0,
> +                    'Working': 0,
> +                    'Failed': 0,
> +                    'Spare': 0
> +                },
> +                'degraded': False
> +            }
> +            continue
> +
> +        m = state_cre.match(line)
> +        if m:
> +            if 'degraded' in m.group(1) and device:
> +                devices_stats[device]['degraded'] = True
> +            continue
> +
> +        m = status_cre.match(line)
> +        if m and device:
> +            devices_stats[device]['stats'][m.group(1)] = int(m.group(2))
> +
> +    rc = devices_details_raw.wait()
> +    if rc:
> +        return generate_output('WARNING: mdadm returned exit'
> +                               ' status {}'.format(int(rc)))
> +
> +    msg = []
> +    critical = False
> +    for device in devices_stats:
> +        # Is device degraded?
> +        if devices_stats[device]['degraded']:
> +            critical = True
> +            parts = ['{} degraded'.format(device)]
> +        else:
> +            parts = ['{} ok'.format(device)]
> +
> +        # If Failed drives are found, list counters (how many?)
> +        failed_cnt = devices_stats[device]['stats'].get('Failed', 0)
> +        if failed_cnt > 0:
> +            critical = True
> +            dev_stats = [
> +                '{}[{}]'.format(status, devices_stats[device]['stats'][status])
> +                for status in sorted(devices_stats[device]['stats'])
> +            ]
> +            parts.extend(dev_stats)
> +        msg.append(', '.join(parts))
> +
> +    if critical:
> +        msg = 'CRITICAL: {}'.format('; '.join(msg))
> +    else:
> +        msg = 'OK: {}'.format('; '.join(msg))
> +    return generate_output(msg)
> +
> +
> +if __name__ == '__main__':
> +    parse_output()

If parse_output raises an exception, will this cause an alert to be triggered? Or do you need to wrap parse_output in a try/except and generate_output('CRITICAL: {}'.format(exception)) if an exception is raised?

> diff --git a/src/lib/utils/hwdiscovery.py b/src/lib/utils/hwdiscovery.py
> index 43c8abf..c3a4a59 100644
> --- a/src/lib/utils/hwdiscovery.py
> +++ b/src/lib/utils/hwdiscovery.py
> @@ -67,10 +70,10 @@ def dmidecode(key):
>  def _supports_mdadm():
>      if os.path.exists('/sbin/mdadm'):
>          try:
> -            devices_raw = subprocess.Popen(['/sbin/mdadm',
> -                                            '--detail', '--scan'],
> -                                           stdout=subprocess.PIPE,
> -                                           stderr=subprocess.PIPE)
> +            devices_raw = subprocess.Popen(
> +                ['/sbin/mdadm', '--detail', '--scan'],
> +                stdout=subprocess.PIPE,
> +                stderr=subprocess.PIPE)

Here too you should probably be using subprocess.check_output or .communicate on the Popen object, because the process may run slowly and you don't get all the output, or a deadlock if the output is large per Python doc warnings.

>              devices_re = re.compile('^ARRAY\s+(\S+) ')
>              for line in devices_raw.stdout.readlines():
>                  line = line.decode().strip()
> diff --git a/src/lib/utils/tooling.py b/src/lib/utils/tooling.py
> index 387324f..c2d4aa1 100644
> --- a/src/lib/utils/tooling.py
> +++ b/src/lib/utils/tooling.py
> @@ -1,30 +1,82 @@
>  # -*- coding: us-ascii -*-
> +import glob
>  import os
> -from charmhelpers.contrib.charmsupport import nrpe
> -from charmhelpers.core.hookenv import (
> -    log,
> -    DEBUG,
> -    ERROR,
> -    INFO
> -)
> -# from charms.layer import snap
> -from shutil import copy2
> +import shutil
>  import subprocess
> +import tempfile
> +import zipfile
> +
> +from charmhelpers.contrib.charmsupport import nrpe
> +from charmhelpers.core import hookenv
>  
>  TOOLING = {
> -    'megacli': {'snap': 'megacli',
> -                'filename': 'megaraid/check_megacli.py',
> -                'cronjob': 'megaraid/check_megacli.sh'},
> -    'sas2ircu': {'snap': 'sas2ircu',
> -                 'filename': 'mpt/check_sas2ircu.py',
> -                 'cronjob': 'mpt/check_sas2ircu.sh'},
> -    'sas3ircu': {'snap': 'sas3ircu',
> -                 'filename': 'mpt/check_sas3ircu.py',
> -                 'cronjob': 'mpt/check_sas3ircu.sh'},
> -    'mdadm': {'filename': 'check_mdadm.py'},
> +    'megacli': {
> +        'snap': 'megacli',
> +        'filename': 'megaraid/check_megacli.py',
> +        'cronjob': 'megaraid/check_megacli.sh'
> +    },
> +    'sas2ircu': {
> +        'snap': 'sas2ircu',
> +        'filename': 'mpt/check_sas2ircu.py',
> +        'cronjob': 'mpt/check_sas2ircu.sh'
> +    },
> +    'sas3ircu': {
> +        'snap': 'sas3ircu',
> +        'filename': 'mpt/check_sas3ircu.py',
> +        'cronjob': 'mpt/check_sas3ircu.sh'
> +    },
> +    'mdadm': {
> +        'filename': 'check_mdadm.py',
> +        'cronjob': 'cron_mdadm.py'
> +    },
>  }
>  
>  PLUGINS_DIR = '/usr/local/lib/nagios/plugins/'
> +TOOLS_DIR = '/usr/local/bin/'
> +CRONJOB_PATH = '/etc/cron.d/hw_health_{}'
> +
> +
> +def install_resource(tools, resource):
> +    ntools = len(tools)
> +    if ntools == 0:
> +        return False
> +    elif 'mdadm' in tools:
> +        tools = [tool for tool in tools if tool != 'mdadm']
> +        ntools -= 1
> +        if ntools == 0:
> +            return True
> +
> +    if type(resource) != str \

I prefer isinstance(resource, str), which also passes if resource is a subclass of str. But it won't be, so whatever floats your boat.

> +            or not resource.endswith('.zip') \
> +            or not os.path.exists(resource):
> +        return False
> +
> +    with tempfile.TemporaryDirectory() as tmpdir:
> +        try:
> +            with zipfile.ZipFile(resource, 'r') as zf:
> +                zf.extractall(tmpdir)
> +
> +            count = 0
> +            for filepath in glob.glob(os.path.join(tmpdir, '*')):
> +                filebasename = os.path.basename(filepath)
> +                if filebasename in tools \
> +                        and filebasename in TOOLING \
> +                        and os.path.isfile(filepath):
> +                    os.chmod(filepath, 0o755)
> +                    shutil.chown(filepath, user=0, group=0)
> +                    shutil.copy2(filepath, TOOLS_DIR)
> +                    count += 1
> +
> +            return ntools == count and count > 0
> +
> +        except zipfile.BadZipFile as error:
> +            hookenv.log('BadZipFile: {}'.format(error), hookenv.ERROR)
> +
> +        except PermissionError as error:
> +            hookenv.log(
> +                'Unable to unzip the resource: {}'.format(error), hookenv.ERROR)
> +
> +    return False
>  
>  
>  def install_tools(tools, method=None):
> diff --git a/src/reactive/hw_health.py b/src/reactive/hw_health.py
> index 730d405..1a1402f 100644
> --- a/src/reactive/hw_health.py
> +++ b/src/reactive/hw_health.py
> @@ -71,6 +74,18 @@ def any_change():
>          status.maintenance('Charm upgrade in progress')
>  
>  
> +@when('hw-health.installed')
> +@when('nrpe-external-master.available')
> +def remove_checks(nrpe):
> +    if hookenv.hook_name() == 'stop':
> +        # Note(aluria): Remove nrpe check(s)
> +        kv = unitdata.kv()
> +        tools = kv.get('tools', set())
> +        removed = remove_nrpe_checks(tools)
> +        if removed:
> +            nrpe.updated()

Per earlier discussions, I'd normally say this should be using the @charms.reactive.hook decorator. It would look like this

from charms.reactive import hook, endpoint_from_flag, is_flag_set
@hook('stop')
def remove_checks():
    if not is_flag_set('hw-health.installed'):
        return
    nrpe = endpoint_from_flag('nrpe-external-master.available')
    if nrpe is None:
        return
    [ ... existing implementation ... ]

But, yeah, that isn't clearer is it. So whichever you prefer, at least until charms.reactive gets better support for update-charm and stop hooks, or allows @when decorators to mix with @hook.

> +
> +
>  @when('config.changed')
>  def config_changed():
>      # clear_flag('hw-health.configured')


-- 
https://code.launchpad.net/~aluria/hw-health-charm/+git/hw-health-charm/+merge/361785
Your team Nagios Charm developers is subscribed to branch hw-health-charm:master.


References