nagios-charmers team mailing list archive
-
nagios-charmers team
-
Mailing list archive
-
Message #00401
Re: [Merge] ~aluria/hw-health-charm/+git/hw-health-charm:fix-check-ipmi into hw-health-charm:master
I'm jumping into the middle of this, so some of my comments may be a bit naive. Please feel free to push back if I'm being unreasonable, but I did have some concerns :-)
Diff comments:
> diff --git a/src/files/ipmi/check_ipmi.py b/src/files/ipmi/check_ipmi.py
> new file mode 100644
> index 0000000..cab68ca
> --- /dev/null
> +++ b/src/files/ipmi/check_ipmi.py
> @@ -0,0 +1,38 @@
> +#!/usr/bin/env python3
> +# -*- coding: us-ascii -*-
> +
> +import os
> +
> +from nagios_plugin3 import CriticalError, UnknownError, WarnError, check_file_freshness, try_check
> +
> +OUTPUT_FILE = '/var/lib/nagios/ipmi_sensors.out'
> +NAGIOS_ERRORS = {
> + 'CRITICAL': CriticalError,
> + 'UNKNOWN': UnknownError,
> + 'WARNING': WarnError,
> +}
> +
> +
> +def parse_output():
> + if not os.path.exists(OUTPUT_FILE):
> + raise UnknownError('UNKNOWN: {} does not exist (yet?)'.format(OUTPUT_FILE))
> +
> + # Check if file is newwer than 10min
typo: newwer -> newer
> + try_check(check_file_freshness, OUTPUT_FILE)
> +
> + try:
> + with open(OUTPUT_FILE, 'r') as fd:
> + output = fd.read()
> + except PermissionError as error:
> + raise UnknownError(error)
> +
> + for startline in NAGIOS_ERRORS:
> + if output.startswith('{}: '.format(startline)):
> + func = NAGIOS_ERRORS[startline]
> + raise func(output)
> +
> + print('OK: {}'.format(output))
> +
> +
> +if __name__ == '__main__':
> + try_check(parse_output)
> diff --git a/src/files/ipmi/cron_ipmi_sensors.py b/src/files/ipmi/cron_ipmi_sensors.py
> new file mode 100644
> index 0000000..998e9f4
> --- /dev/null
> +++ b/src/files/ipmi/cron_ipmi_sensors.py
> @@ -0,0 +1,50 @@
> +#!/usr/bin/env python3
> +# -*- coding: us-ascii -*-
> +
> +import os
> +import subprocess
> +import sys
> +
> +CHECK_IPMI_PID = '/var/run/nagios/check_ipmi_sensors.pid'
> +TMP_OUTPUT_FILE = '/tmp/ipmi_sensors.out'
> +OUTPUT_FILE = '/var/lib/nagios/ipmi_sensors.out'
> +CMD = '/usr/local/lib/nagios/plugins/check_ipmi_sensor'
> +NAGIOS_ERRORS = {
> + 1: 'WARNING',
> + 2: 'CRITICAL',
> + 3: 'UNKNOWN',
> +}
> +
> +
> +def gather_metrics():
> + # a child is already running
> + if os.path.exists(CHECK_IPMI_PID):
> + return
> +
> + try:
> + with open(CHECK_IPMI_PID, 'w') as fd:
> + fd.write(str(os.getpid()))
> +
> + cmdline = [CMD]
> + if len(sys.argv) > 1:
> + cmdline.extend(sys.argv[1:])
> +
> + output = subprocess.check_output(cmdline)
> + with open(TMP_OUTPUT_FILE, 'w') as fd:
> + fd.write(output.decode(errors='ignore'))
> + os.rename(TMP_OUTPUT_FILE, OUTPUT_FILE)
> + except subprocess.CalledProcessError as error:
> + output = error.stdout.decode(errors='ignore')
> + with open(TMP_OUTPUT_FILE, 'w') as fd:
> + fd.write('{}: {}'.format(NAGIOS_ERRORS[error.returncode], output))
> + os.rename(TMP_OUTPUT_FILE, OUTPUT_FILE)
> + except PermissionError as error:
> + with (OUTPUT_FILE, 'w') as fd:
> + fd.write('CRITICAL: {}'.format(error))
Should this be critical here, or a UNKNOWN, as in the functions above?
> +
> + # remove pid reference
> + os.remove(CHECK_IPMI_PID)
> +
> +
> +if __name__ == '__main__':
> + gather_metrics()
> diff --git a/src/lib/hwhealth/tools.py b/src/lib/hwhealth/tools.py
> index 4b967c2..bc48e7b 100644
> --- a/src/lib/hwhealth/tools.py
> +++ b/src/lib/hwhealth/tools.py
> @@ -101,6 +101,54 @@ class Tool():
> self._remove_nrpe_plugin()
> hookenv.log('Removed tool [{}]'.format(self._shortname))
>
> + def _install_cronjob(self, cron_user='root'):
> + assert self._cronjob_script is not None
> +
> + # Copy the cronjob script to the nagios plugins directory
> + src = os.path.join(self._files_dir, self._cronjob_script)
> + dst = shutil.copy(src, self.NRPE_PLUGINS_DIR)
> + os.chmod(dst, self.CRONJOB_SCRIPT_MODE)
> + os.chown(dst, uid=self.CRONJOB_SCRIPT_UID, gid=self.CRONJOB_SCRIPT_GID)
> + hookenv.log(
> + 'Cronjob script [{}] copied to {}'
> + ''.format(self._cronjob_script, self.NRPE_PLUGINS_DIR),
> + hookenv.DEBUG
> + )
> +
> + cmdline = [dst]
> + if self._cronjob_script_args and isinstance(self._cronjob_script_args, str):
> + cmdline.extend(self._cronjob_script_args.split())
This will split up strings passed as arguments with spaces in them. I believe there's something in the std lib that will take care of this for you. I don't recall offhand the name, though.
> + else:
> + # Run it once to generate the temp file, otherwise the nrpe check might
> + # fail at first. For security reasons, cronjobs that allow parameters
> + # shared by the user don't run the script as root (check_ipmi allows a
> + # missing output file)
> + subprocess.call(cmdline)
> +
> + # Now generate the cronjob file
> + cronjob_line = '*/5 * * * * {user} {cmd}\n'.format(user=cron_user, cmd=' '.join(cmdline))
> + crond_file = os.path.join(self.CROND_DIR, 'hwhealth_{}'.format(self._shortname))
> + with open(crond_file, 'w') as crond_fd:
> + crond_fd.write(cronjob_line)
> + hookenv.log(
> + 'Cronjob configured at {}'.format(crond_file),
> + hookenv.DEBUG
> + )
> + return dst
> +
> + def _remove_cronjob(self):
> + assert self._cronjob_script is not None
> +
> + crond_file = os.path.join(self.CROND_DIR, 'hwhealth_{}'.format(self._shortname))
> + cronjob_script = os.path.join(self.NRPE_PLUGINS_DIR, self._cronjob_script)
> + os.remove(crond_file)
> + os.remove(cronjob_script)
> + hookenv.log(
> + 'Removed cronjob files [{}, {}]'
> + ''.format(crond_file, cronjob_script),
> + hookenv.DEBUG
> + )
> +
>
> class VendorTool(Tool):
> """A class representing a vendor tool binary that is shipped via resource
> diff --git a/src/reactive/hw_health.py b/src/reactive/hw_health.py
> index f5d84d0..5044e89 100644
> --- a/src/reactive/hw_health.py
> +++ b/src/reactive/hw_health.py
> @@ -78,6 +72,11 @@ def config_changed():
>
> @when('config.changed.manufacturer')
> def toolset_changed():
> + if not is_flag_set('hw-health.installed'):
> + # Note(aluria): useful for testing purposes
> + clear_flag('hw-health.unsupported')
> + return
I'm unclear why we'd clear the hw-health.unsupported if hw-health is not installed. I'm sure there's a good reason to do it, but I'd like to see a more detailed comment about why.
> +
> # Changing the manufacturer option will trigger a reinstallation of the
> # tools
> remove_tools()
> @@ -101,11 +100,18 @@ def configure_nrpe():
>
> status.maintenance('Configuring nrpe checks')
>
> + nrpe_setup = nrpe.NRPE(primary=False)
> unitdb = unitdata.kv()
> for tool_class_name in unitdb.get('toolset', set()):
> # Re-instantiate the tool from the saved class name
> tool_class = getattr(tools, tool_class_name)
> - tool_class().configure_nrpe_check()
> + tool_class().configure_nrpe_check(nrpe_setup)
> +
> + if unitdb.get('toolset'):
> + # Note(aluria): This needs to be run outside of tool_class().configure_nrpe_check
> + # or shared dictionary with the nagios unit will list the last added check
Huh. This sounds like maybe a bug in the tool_class. Are you certain that it doesn't have a hook to do setup stuff inside of itself? That would encapsulate things much more neatly.
> + nrpe_setup.write()
> +
> status.active('ready')
> set_flag('hw-health.configured')
>
> diff --git a/src/tests/functional/test_hwhealth.py b/src/tests/functional/test_hwhealth.py
> index 6c2a66b..0414d1e 100644
> --- a/src/tests/functional/test_hwhealth.py
> +++ b/src/tests/functional/test_hwhealth.py
> @@ -171,11 +173,15 @@ async def test_stats(toolset, deployed_unit, file_stat):
> assert test_stat['mode'] == oct(tool.CRONJOB_SCRIPT_MODE)
>
> # Have we installed the cronjob itself?
> - cronjob_path = os.path.join(tool.CROND_DIR, tool._shortname)
> + cronjob_path = os.path.join(tool.CROND_DIR, 'hwhealth_{}'.format(tool._shortname))
This should fail a linter check for lines > 80 characters. Please format like:
cronjob_path = os.path.join(foo,
bar)
or
cronjob_path = os.path.join(
foo,
bar
)
> print('Checking {}'.format(cronjob_path))
> test_stat = await file_stat(cronjob_path, deployed_unit)
> assert test_stat['size'] > 0
>
> + if isinstance(tool, tools.Ipmi):
> + # it's a nrpe script, a helper script and a cronjob (no tools installed)
> + continue
> +
> # Have we installed the vendor binary?
> if isinstance(tool, tools.Mdadm):
> bin_path = os.path.join('/sbin', tool._shortname)
> @@ -231,12 +238,12 @@ async def test_removal(toolset, model, deploy_app, file_stat):
> await file_stat(cronjob_path, ubuntu_unit)
>
> # Have we removed the cronjob itself?
> - cronjob_path = os.path.join(tool.CROND_DIR, tool._shortname)
> + cronjob_path = os.path.join(tool.CROND_DIR, 'hwhealth_{}'.format(tool._shortname))
See above about 80 character lines (I know this is nitpicky, but a lot of people take advantage of the 80 character limit to view code in side-by-side buffers, or otherwise manage their screen real estate efficiently).
> print('Checking {}'.format(cronjob_path))
> with pytest.raises(RuntimeError):
> await file_stat(cronjob_path, ubuntu_unit)
>
> - if not isinstance(tool, tools.Mdadm):
> + if not isinstance(tool, tools.Mdadm) and not isinstance(tool, tools.Ipmi):
> # /sbin/mdadm will not be removed, but the vendor binaries
> # should have been
> bin_path = os.path.join(tool.TOOLS_DIR, tool._shortname)
--
https://code.launchpad.net/~aluria/hw-health-charm/+git/hw-health-charm/+merge/365060
Your team Nagios Charm developers is requested to review the proposed merge of ~aluria/hw-health-charm/+git/hw-health-charm:fix-check-ipmi into hw-health-charm:master.
References