nagios-charmers team mailing list archive
-
nagios-charmers team
-
Mailing list archive
-
Message #00404
Re: [Merge] ~aluria/hw-health-charm/+git/hw-health-charm:fix-check-ipmi into hw-health-charm:master
Thank you both for your comments. I have added replies inline.
Diff comments:
> diff --git a/src/files/ipmi/cron_ipmi_sensors.py b/src/files/ipmi/cron_ipmi_sensors.py
> new file mode 100644
> index 0000000..998e9f4
> --- /dev/null
> +++ b/src/files/ipmi/cron_ipmi_sensors.py
> @@ -0,0 +1,50 @@
> +#!/usr/bin/env python3
> +# -*- coding: us-ascii -*-
> +
> +import os
> +import subprocess
> +import sys
> +
> +CHECK_IPMI_PID = '/var/run/nagios/check_ipmi_sensors.pid'
> +TMP_OUTPUT_FILE = '/tmp/ipmi_sensors.out'
> +OUTPUT_FILE = '/var/lib/nagios/ipmi_sensors.out'
> +CMD = '/usr/local/lib/nagios/plugins/check_ipmi_sensor'
> +NAGIOS_ERRORS = {
> + 1: 'WARNING',
> + 2: 'CRITICAL',
> + 3: 'UNKNOWN',
> +}
> +
> +
> +def gather_metrics():
> + # a child is already running
> + if os.path.exists(CHECK_IPMI_PID):
> + return
> +
> + try:
> + with open(CHECK_IPMI_PID, 'w') as fd:
> + fd.write(str(os.getpid()))
> +
> + cmdline = [CMD]
> + if len(sys.argv) > 1:
> + cmdline.extend(sys.argv[1:])
> +
> + output = subprocess.check_output(cmdline)
> + with open(TMP_OUTPUT_FILE, 'w') as fd:
> + fd.write(output.decode(errors='ignore'))
> + os.rename(TMP_OUTPUT_FILE, OUTPUT_FILE)
> + except subprocess.CalledProcessError as error:
> + output = error.stdout.decode(errors='ignore')
> + with open(TMP_OUTPUT_FILE, 'w') as fd:
> + fd.write('{}: {}'.format(NAGIOS_ERRORS[error.returncode], output))
> + os.rename(TMP_OUTPUT_FILE, OUTPUT_FILE)
> + except PermissionError as error:
> + with (OUTPUT_FILE, 'w') as fd:
> + fd.write('CRITICAL: {}'.format(error))
I think you are right. We won't get paged, but we will see an issue in a dashboard. Thank you.
> +
> + # remove pid reference
> + os.remove(CHECK_IPMI_PID)
> +
> +
> +if __name__ == '__main__':
> + gather_metrics()
> diff --git a/src/lib/hwhealth/tools.py b/src/lib/hwhealth/tools.py
> index 4b967c2..bc48e7b 100644
> --- a/src/lib/hwhealth/tools.py
> +++ b/src/lib/hwhealth/tools.py
> @@ -101,6 +101,54 @@ class Tool():
> self._remove_nrpe_plugin()
> hookenv.log('Removed tool [{}]'.format(self._shortname))
>
> + def _install_cronjob(self, cron_user='root'):
> + assert self._cronjob_script is not None
> +
> + # Copy the cronjob script to the nagios plugins directory
> + src = os.path.join(self._files_dir, self._cronjob_script)
> + dst = shutil.copy(src, self.NRPE_PLUGINS_DIR)
> + os.chmod(dst, self.CRONJOB_SCRIPT_MODE)
> + os.chown(dst, uid=self.CRONJOB_SCRIPT_UID, gid=self.CRONJOB_SCRIPT_GID)
> + hookenv.log(
> + 'Cronjob script [{}] copied to {}'
> + ''.format(self._cronjob_script, self.NRPE_PLUGINS_DIR),
> + hookenv.DEBUG
> + )
> +
> + cmdline = [dst]
> + if self._cronjob_script_args and isinstance(self._cronjob_script_args, str):
> + cmdline.extend(self._cronjob_script_args.split())
I see subprocess.check_output's shell=True allows to pass strings instead of a list. However, this option is not available in py35, and we are going to deploy this charm in Xenial.
> + else:
> + # Run it once to generate the temp file, otherwise the nrpe check might
> + # fail at first. For security reasons, cronjobs that allow parameters
> + # shared by the user don't run the script as root (check_ipmi allows a
> + # missing output file)
> + subprocess.call(cmdline)
> +
> + # Now generate the cronjob file
> + cronjob_line = '*/5 * * * * {user} {cmd}\n'.format(user=cron_user, cmd=' '.join(cmdline))
Thank you. I've added the code into _install_cronjob.
> + crond_file = os.path.join(self.CROND_DIR, 'hwhealth_{}'.format(self._shortname))
> + with open(crond_file, 'w') as crond_fd:
> + crond_fd.write(cronjob_line)
> + hookenv.log(
> + 'Cronjob configured at {}'.format(crond_file),
> + hookenv.DEBUG
> + )
> + return dst
> +
> + def _remove_cronjob(self):
> + assert self._cronjob_script is not None
> +
> + crond_file = os.path.join(self.CROND_DIR, 'hwhealth_{}'.format(self._shortname))
> + cronjob_script = os.path.join(self.NRPE_PLUGINS_DIR, self._cronjob_script)
> + os.remove(crond_file)
> + os.remove(cronjob_script)
> + hookenv.log(
> + 'Removed cronjob files [{}, {}]'
> + ''.format(crond_file, cronjob_script),
> + hookenv.DEBUG
> + )
> +
>
> class VendorTool(Tool):
> """A class representing a vendor tool binary that is shipped via resource
> diff --git a/src/reactive/hw_health.py b/src/reactive/hw_health.py
> index f5d84d0..5044e89 100644
> --- a/src/reactive/hw_health.py
> +++ b/src/reactive/hw_health.py
> @@ -78,6 +72,11 @@ def config_changed():
>
> @when('config.changed.manufacturer')
> def toolset_changed():
> + if not is_flag_set('hw-health.installed'):
> + # Note(aluria): useful for testing purposes
> + clear_flag('hw-health.unsupported')
> + return
This only make sense here below. Since functional tests are generally run backed by LXD, hw-health.unsupported is likely to be set before we force manufacturer='test' during said functional testing. In a normal situation, I would expect "hw-health.installed" flag to be set and "remove_tools()" to be run (which already clears all flags to reprocess tools detection, install and configuration. "manufacturer=auto" is the default, but in the future, we may want to say "manufacturer=nvme" (or dell) and until detect/install/configure a subset of the hardware running on a host.
"""
@when_none('hw-health.installed', 'hw-health.unsupported')
@when('general-info.connected')
def install():
"""
> +
> # Changing the manufacturer option will trigger a reinstallation of the
> # tools
> remove_tools()
> @@ -101,11 +100,18 @@ def configure_nrpe():
>
> status.maintenance('Configuring nrpe checks')
>
> + nrpe_setup = nrpe.NRPE(primary=False)
> unitdb = unitdata.kv()
> for tool_class_name in unitdb.get('toolset', set()):
> # Re-instantiate the tool from the saved class name
> tool_class = getattr(tools, tool_class_name)
> - tool_class().configure_nrpe_check()
> + tool_class().configure_nrpe_check(nrpe_setup)
> +
> + if unitdb.get('toolset'):
> + # Note(aluria): This needs to be run outside of tool_class().configure_nrpe_check
> + # or shared dictionary with the nagios unit will list the last added check
The main issue is on charmhelpers.contrib.charmsupport.nrpe.NRPE.write method. We can hide the problem in the helpers lib, but I think that comment highlights the issue (probably adding "LP#1821602" into the comment will better explain it)
https://bugs.launchpad.net/charm-helpers/+bug/1821602
OTOH, remove_nrpe_check will call NRPE.write after removing a specific check, which I think will hit the bug, too :-\
As mentioned in the bug report, the nrpe-external-master interface doesn't use this dict in a shared relation key, so not sure if that is the approach we should follow :?
> + nrpe_setup.write()
> +
> status.active('ready')
> set_flag('hw-health.configured')
>
> diff --git a/src/tests/functional/test_hwhealth.py b/src/tests/functional/test_hwhealth.py
> index 6c2a66b..0414d1e 100644
> --- a/src/tests/functional/test_hwhealth.py
> +++ b/src/tests/functional/test_hwhealth.py
> @@ -171,11 +173,15 @@ async def test_stats(toolset, deployed_unit, file_stat):
> assert test_stat['mode'] == oct(tool.CRONJOB_SCRIPT_MODE)
>
> # Have we installed the cronjob itself?
> - cronjob_path = os.path.join(tool.CROND_DIR, tool._shortname)
> + cronjob_path = os.path.join(tool.CROND_DIR, 'hwhealth_{}'.format(tool._shortname))
This makes total sense, although I have had conflicting reviews about it, and template-python-pytest repo (where the code comes from) allows up to 120 chars.
https://git.launchpad.net/template-python-pytest/tree/tox.ini#n44
We should probably change that value in the template, since your reasoning is what I followed in the past, too.
> print('Checking {}'.format(cronjob_path))
> test_stat = await file_stat(cronjob_path, deployed_unit)
> assert test_stat['size'] > 0
>
> + if isinstance(tool, tools.Ipmi):
> + # it's a nrpe script, a helper script and a cronjob (no tools installed)
> + continue
> +
> # Have we installed the vendor binary?
> if isinstance(tool, tools.Mdadm):
> bin_path = os.path.join('/sbin', tool._shortname)
--
https://code.launchpad.net/~aluria/hw-health-charm/+git/hw-health-charm/+merge/365060
Your team Nagios Charm developers is requested to review the proposed merge of ~aluria/hw-health-charm/+git/hw-health-charm:fix-check-ipmi into hw-health-charm:master.
References