← Back to team overview

nagios-charmers team mailing list archive

Re: [Merge] ~aluria/hw-health-charm/+git/hw-health-charm:fix-check-ipmi into hw-health-charm:master

 

I'm jumping into the middle of this, so some of my comments may be a bit naive. Please feel free to push back if I'm being unreasonable, but I did have some concerns :-)

Diff comments:

> diff --git a/src/files/ipmi/check_ipmi.py b/src/files/ipmi/check_ipmi.py
> new file mode 100644
> index 0000000..cab68ca
> --- /dev/null
> +++ b/src/files/ipmi/check_ipmi.py
> @@ -0,0 +1,38 @@
> +#!/usr/bin/env python3
> +# -*- coding: us-ascii -*-
> +
> +import os
> +
> +from nagios_plugin3 import CriticalError, UnknownError, WarnError, check_file_freshness, try_check
> +
> +OUTPUT_FILE = '/var/lib/nagios/ipmi_sensors.out'
> +NAGIOS_ERRORS = {
> +    'CRITICAL': CriticalError,
> +    'UNKNOWN': UnknownError,
> +    'WARNING': WarnError,
> +}
> +
> +
> +def parse_output():
> +    if not os.path.exists(OUTPUT_FILE):
> +        raise UnknownError('UNKNOWN: {} does not exist (yet?)'.format(OUTPUT_FILE))
> +
> +    # Check if file is newwer than 10min

typo: newwer -> newer

> +    try_check(check_file_freshness, OUTPUT_FILE)
> +
> +    try:
> +        with open(OUTPUT_FILE, 'r') as fd:
> +            output = fd.read()
> +    except PermissionError as error:
> +        raise UnknownError(error)
> +
> +    for startline in NAGIOS_ERRORS:
> +        if output.startswith('{}: '.format(startline)):
> +            func = NAGIOS_ERRORS[startline]
> +            raise func(output)
> +
> +    print('OK: {}'.format(output))
> +
> +
> +if __name__ == '__main__':
> +    try_check(parse_output)
> diff --git a/src/files/ipmi/cron_ipmi_sensors.py b/src/files/ipmi/cron_ipmi_sensors.py
> new file mode 100644
> index 0000000..998e9f4
> --- /dev/null
> +++ b/src/files/ipmi/cron_ipmi_sensors.py
> @@ -0,0 +1,50 @@
> +#!/usr/bin/env python3
> +# -*- coding: us-ascii -*-
> +
> +import os
> +import subprocess
> +import sys
> +
> +CHECK_IPMI_PID = '/var/run/nagios/check_ipmi_sensors.pid'
> +TMP_OUTPUT_FILE = '/tmp/ipmi_sensors.out'
> +OUTPUT_FILE = '/var/lib/nagios/ipmi_sensors.out'
> +CMD = '/usr/local/lib/nagios/plugins/check_ipmi_sensor'
> +NAGIOS_ERRORS = {
> +    1: 'WARNING',
> +    2: 'CRITICAL',
> +    3: 'UNKNOWN',
> +}
> +
> +
> +def gather_metrics():
> +    # a child is already running
> +    if os.path.exists(CHECK_IPMI_PID):
> +        return
> +
> +    try:
> +        with open(CHECK_IPMI_PID, 'w') as fd:
> +            fd.write(str(os.getpid()))
> +
> +        cmdline = [CMD]
> +        if len(sys.argv) > 1:
> +            cmdline.extend(sys.argv[1:])
> +
> +        output = subprocess.check_output(cmdline)
> +        with open(TMP_OUTPUT_FILE, 'w') as fd:
> +            fd.write(output.decode(errors='ignore'))
> +        os.rename(TMP_OUTPUT_FILE, OUTPUT_FILE)
> +    except subprocess.CalledProcessError as error:
> +        output = error.stdout.decode(errors='ignore')
> +        with open(TMP_OUTPUT_FILE, 'w') as fd:
> +            fd.write('{}: {}'.format(NAGIOS_ERRORS[error.returncode], output))
> +        os.rename(TMP_OUTPUT_FILE, OUTPUT_FILE)
> +    except PermissionError as error:
> +        with (OUTPUT_FILE, 'w') as fd:
> +            fd.write('CRITICAL: {}'.format(error))

Should this be critical here, or a UNKNOWN, as in the functions above?

> +
> +    # remove pid reference
> +    os.remove(CHECK_IPMI_PID)
> +
> +
> +if __name__ == '__main__':
> +    gather_metrics()
> diff --git a/src/lib/hwhealth/tools.py b/src/lib/hwhealth/tools.py
> index 4b967c2..bc48e7b 100644
> --- a/src/lib/hwhealth/tools.py
> +++ b/src/lib/hwhealth/tools.py
> @@ -101,6 +101,54 @@ class Tool():
>          self._remove_nrpe_plugin()
>          hookenv.log('Removed tool [{}]'.format(self._shortname))
>  
> +    def _install_cronjob(self, cron_user='root'):
> +        assert self._cronjob_script is not None
> +
> +        # Copy the cronjob script to the nagios plugins directory
> +        src = os.path.join(self._files_dir, self._cronjob_script)
> +        dst = shutil.copy(src, self.NRPE_PLUGINS_DIR)
> +        os.chmod(dst, self.CRONJOB_SCRIPT_MODE)
> +        os.chown(dst, uid=self.CRONJOB_SCRIPT_UID, gid=self.CRONJOB_SCRIPT_GID)
> +        hookenv.log(
> +            'Cronjob script [{}] copied to {}'
> +            ''.format(self._cronjob_script, self.NRPE_PLUGINS_DIR),
> +            hookenv.DEBUG
> +        )
> +
> +        cmdline = [dst]
> +        if self._cronjob_script_args and isinstance(self._cronjob_script_args, str):
> +            cmdline.extend(self._cronjob_script_args.split())

This will split up strings passed as arguments with spaces in them. I believe there's something in the std lib that will take care of this for you. I don't recall offhand the name, though.

> +        else:
> +            # Run it once to generate the temp file, otherwise the nrpe check might
> +            # fail at first. For security reasons, cronjobs that allow parameters
> +            # shared by the user don't run the script as root (check_ipmi allows a
> +            # missing output file)
> +            subprocess.call(cmdline)
> +
> +        # Now generate the cronjob file
> +        cronjob_line = '*/5 * * * * {user} {cmd}\n'.format(user=cron_user, cmd=' '.join(cmdline))
> +        crond_file = os.path.join(self.CROND_DIR, 'hwhealth_{}'.format(self._shortname))
> +        with open(crond_file, 'w') as crond_fd:
> +            crond_fd.write(cronjob_line)
> +            hookenv.log(
> +                'Cronjob configured at {}'.format(crond_file),
> +                hookenv.DEBUG
> +            )
> +        return dst
> +
> +    def _remove_cronjob(self):
> +        assert self._cronjob_script is not None
> +
> +        crond_file = os.path.join(self.CROND_DIR, 'hwhealth_{}'.format(self._shortname))
> +        cronjob_script = os.path.join(self.NRPE_PLUGINS_DIR, self._cronjob_script)
> +        os.remove(crond_file)
> +        os.remove(cronjob_script)
> +        hookenv.log(
> +            'Removed cronjob files [{}, {}]'
> +            ''.format(crond_file, cronjob_script),
> +            hookenv.DEBUG
> +        )
> +
>  
>  class VendorTool(Tool):
>      """A class representing a vendor tool binary that is shipped via resource
> diff --git a/src/reactive/hw_health.py b/src/reactive/hw_health.py
> index f5d84d0..5044e89 100644
> --- a/src/reactive/hw_health.py
> +++ b/src/reactive/hw_health.py
> @@ -78,6 +72,11 @@ def config_changed():
>  
>  @when('config.changed.manufacturer')
>  def toolset_changed():
> +    if not is_flag_set('hw-health.installed'):
> +        # Note(aluria): useful for testing purposes
> +        clear_flag('hw-health.unsupported')
> +        return

I'm unclear why we'd clear the hw-health.unsupported if hw-health is not installed. I'm sure there's a good reason to do it, but I'd like to see a more detailed comment about why.

> +
>      # Changing the manufacturer option will trigger a reinstallation of the
>      # tools
>      remove_tools()
> @@ -101,11 +100,18 @@ def configure_nrpe():
>  
>      status.maintenance('Configuring nrpe checks')
>  
> +    nrpe_setup = nrpe.NRPE(primary=False)
>      unitdb = unitdata.kv()
>      for tool_class_name in unitdb.get('toolset', set()):
>          # Re-instantiate the tool from the saved class name
>          tool_class = getattr(tools, tool_class_name)
> -        tool_class().configure_nrpe_check()
> +        tool_class().configure_nrpe_check(nrpe_setup)
> +
> +    if unitdb.get('toolset'):
> +        # Note(aluria): This needs to be run outside of tool_class().configure_nrpe_check
> +        # or shared dictionary with the nagios unit will list the last added check

Huh. This sounds like maybe a bug in the tool_class. Are you certain that it doesn't have a hook to do setup stuff inside of itself? That would encapsulate things much more neatly.

> +        nrpe_setup.write()
> +
>      status.active('ready')
>      set_flag('hw-health.configured')
>  
> diff --git a/src/tests/functional/test_hwhealth.py b/src/tests/functional/test_hwhealth.py
> index 6c2a66b..0414d1e 100644
> --- a/src/tests/functional/test_hwhealth.py
> +++ b/src/tests/functional/test_hwhealth.py
> @@ -171,11 +173,15 @@ async def test_stats(toolset, deployed_unit, file_stat):
>              assert test_stat['mode'] == oct(tool.CRONJOB_SCRIPT_MODE)
>  
>              # Have we installed the cronjob itself?
> -            cronjob_path = os.path.join(tool.CROND_DIR, tool._shortname)
> +            cronjob_path = os.path.join(tool.CROND_DIR, 'hwhealth_{}'.format(tool._shortname))

This should fail a linter check for lines > 80 characters. Please format like:

cronjob_path = os.path.join(foo,
                            bar)

or 

cronjob_path = os.path.join(
    foo,
    bar
)

>              print('Checking {}'.format(cronjob_path))
>              test_stat = await file_stat(cronjob_path, deployed_unit)
>              assert test_stat['size'] > 0
>  
> +            if isinstance(tool, tools.Ipmi):
> +                # it's a nrpe script, a helper script and a cronjob (no tools installed)
> +                continue
> +
>              # Have we installed the vendor binary?
>              if isinstance(tool, tools.Mdadm):
>                  bin_path = os.path.join('/sbin', tool._shortname)
> @@ -231,12 +238,12 @@ async def test_removal(toolset, model, deploy_app, file_stat):
>                  await file_stat(cronjob_path, ubuntu_unit)
>  
>              # Have we removed the cronjob itself?
> -            cronjob_path = os.path.join(tool.CROND_DIR, tool._shortname)
> +            cronjob_path = os.path.join(tool.CROND_DIR, 'hwhealth_{}'.format(tool._shortname))

See above about 80 character lines (I know this is nitpicky, but a lot of people take advantage of the 80 character limit to view code in side-by-side buffers, or otherwise manage their screen real estate efficiently).

>              print('Checking {}'.format(cronjob_path))
>              with pytest.raises(RuntimeError):
>                  await file_stat(cronjob_path, ubuntu_unit)
>  
> -            if not isinstance(tool, tools.Mdadm):
> +            if not isinstance(tool, tools.Mdadm) and not isinstance(tool, tools.Ipmi):
>                  # /sbin/mdadm will not be removed, but the vendor binaries
>                  # should have been
>                  bin_path = os.path.join(tool.TOOLS_DIR, tool._shortname)


-- 
https://code.launchpad.net/~aluria/hw-health-charm/+git/hw-health-charm/+merge/365060
Your team Nagios Charm developers is requested to review the proposed merge of ~aluria/hw-health-charm/+git/hw-health-charm:fix-check-ipmi into hw-health-charm:master.


References