nagios-charmers team mailing list archive
-
nagios-charmers team
-
Mailing list archive
-
Message #00397
[Merge] ~aluria/hw-health-charm/+git/hw-health-charm:fix-check-ipmi into hw-health-charm:master
Alvaro Uría has proposed merging ~aluria/hw-health-charm/+git/hw-health-charm:fix-check-ipmi into hw-health-charm:master.
Requested reviews:
Nagios Charm developers (nagios-charmers)
For more details, see:
https://code.launchpad.net/~aluria/hw-health-charm/+git/hw-health-charm/+merge/365060
--
Your team Nagios Charm developers is requested to review the proposed merge of ~aluria/hw-health-charm/+git/hw-health-charm:fix-check-ipmi into hw-health-charm:master.
diff --git a/src/files/ipmi/99-check_ipmi_sensor b/src/files/ipmi/99-check_ipmi_sensor
new file mode 100644
index 0000000..ead55da
--- /dev/null
+++ b/src/files/ipmi/99-check_ipmi_sensor
@@ -0,0 +1 @@
+nagios ALL=(root) NOPASSWD: /usr/sbin/ipmimonitoring, /usr/sbin/ipmi-dcmi, /usr/sbin/ipmi-fru, /usr/sbin/ipmi-sensors, /usr/sbin/ipmi-sel
diff --git a/src/files/ipmi/99-check_ipmi_sensor.sudoer b/src/files/ipmi/99-check_ipmi_sensor.sudoer
deleted file mode 100644
index 6571238..0000000
--- a/src/files/ipmi/99-check_ipmi_sensor.sudoer
+++ /dev/null
@@ -1 +0,0 @@
-nagios ALL=(root) NOPASSWD: /usr/sbin/ipmi-sensors, /usr/sbin/ipmi-sel
diff --git a/src/files/ipmi/check_ipmi.py b/src/files/ipmi/check_ipmi.py
new file mode 100644
index 0000000..cab68ca
--- /dev/null
+++ b/src/files/ipmi/check_ipmi.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+# -*- coding: us-ascii -*-
+
+import os
+
+from nagios_plugin3 import CriticalError, UnknownError, WarnError, check_file_freshness, try_check
+
+OUTPUT_FILE = '/var/lib/nagios/ipmi_sensors.out'
+NAGIOS_ERRORS = {
+ 'CRITICAL': CriticalError,
+ 'UNKNOWN': UnknownError,
+ 'WARNING': WarnError,
+}
+
+
+def parse_output():
+ if not os.path.exists(OUTPUT_FILE):
+ raise UnknownError('UNKNOWN: {} does not exist (yet?)'.format(OUTPUT_FILE))
+
+ # Check if file is newwer than 10min
+ try_check(check_file_freshness, OUTPUT_FILE)
+
+ try:
+ with open(OUTPUT_FILE, 'r') as fd:
+ output = fd.read()
+ except PermissionError as error:
+ raise UnknownError(error)
+
+ for startline in NAGIOS_ERRORS:
+ if output.startswith('{}: '.format(startline)):
+ func = NAGIOS_ERRORS[startline]
+ raise func(output)
+
+ print('OK: {}'.format(output))
+
+
+if __name__ == '__main__':
+ try_check(parse_output)
diff --git a/src/files/ipmi/cron_ipmi_sensors.py b/src/files/ipmi/cron_ipmi_sensors.py
new file mode 100644
index 0000000..998e9f4
--- /dev/null
+++ b/src/files/ipmi/cron_ipmi_sensors.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+# -*- coding: us-ascii -*-
+
+import os
+import subprocess
+import sys
+
+CHECK_IPMI_PID = '/var/run/nagios/check_ipmi_sensors.pid'
+TMP_OUTPUT_FILE = '/tmp/ipmi_sensors.out'
+OUTPUT_FILE = '/var/lib/nagios/ipmi_sensors.out'
+CMD = '/usr/local/lib/nagios/plugins/check_ipmi_sensor'
+NAGIOS_ERRORS = {
+ 1: 'WARNING',
+ 2: 'CRITICAL',
+ 3: 'UNKNOWN',
+}
+
+
+def gather_metrics():
+ # a child is already running
+ if os.path.exists(CHECK_IPMI_PID):
+ return
+
+ try:
+ with open(CHECK_IPMI_PID, 'w') as fd:
+ fd.write(str(os.getpid()))
+
+ cmdline = [CMD]
+ if len(sys.argv) > 1:
+ cmdline.extend(sys.argv[1:])
+
+ output = subprocess.check_output(cmdline)
+ with open(TMP_OUTPUT_FILE, 'w') as fd:
+ fd.write(output.decode(errors='ignore'))
+ os.rename(TMP_OUTPUT_FILE, OUTPUT_FILE)
+ except subprocess.CalledProcessError as error:
+ output = error.stdout.decode(errors='ignore')
+ with open(TMP_OUTPUT_FILE, 'w') as fd:
+ fd.write('{}: {}'.format(NAGIOS_ERRORS[error.returncode], output))
+ os.rename(TMP_OUTPUT_FILE, OUTPUT_FILE)
+ except PermissionError as error:
+ with (OUTPUT_FILE, 'w') as fd:
+ fd.write('CRITICAL: {}'.format(error))
+
+ # remove pid reference
+ os.remove(CHECK_IPMI_PID)
+
+
+if __name__ == '__main__':
+ gather_metrics()
diff --git a/src/lib/hwhealth/tools.py b/src/lib/hwhealth/tools.py
index 4b967c2..bc48e7b 100644
--- a/src/lib/hwhealth/tools.py
+++ b/src/lib/hwhealth/tools.py
@@ -44,6 +44,8 @@ class Tool():
'files',
self._shortname)
self._nrpe_script = 'check_{}'.format(shortname)
+ self._cronjob_script = None
+ self._cronjob_script_args = None
def _install_nrpe_plugin(self):
src = os.path.join(self._files_dir, self._nrpe_script)
@@ -65,15 +67,13 @@ class Tool():
hookenv.DEBUG
)
- def configure_nrpe_check(self):
- nrpe_setup = nrpe.NRPE(hostname=self._nagios_hostname, primary=False)
+ def configure_nrpe_check(self, nrpe_setup):
cmd = ' '.join([self._nrpe_script, self._nrpe_opts])
nrpe_setup.add_check(
shortname=self._shortname,
description='{} Hardware Health',
- check_cmd=cmd
+ check_cmd=cmd,
)
- nrpe_setup.write()
hookenv.log(
'configured NRPE check for tool [{}]'.format(self._shortname),
hookenv.DEBUG
@@ -101,6 +101,54 @@ class Tool():
self._remove_nrpe_plugin()
hookenv.log('Removed tool [{}]'.format(self._shortname))
+ def _install_cronjob(self, cron_user='root'):
+ assert self._cronjob_script is not None
+
+ # Copy the cronjob script to the nagios plugins directory
+ src = os.path.join(self._files_dir, self._cronjob_script)
+ dst = shutil.copy(src, self.NRPE_PLUGINS_DIR)
+ os.chmod(dst, self.CRONJOB_SCRIPT_MODE)
+ os.chown(dst, uid=self.CRONJOB_SCRIPT_UID, gid=self.CRONJOB_SCRIPT_GID)
+ hookenv.log(
+ 'Cronjob script [{}] copied to {}'
+ ''.format(self._cronjob_script, self.NRPE_PLUGINS_DIR),
+ hookenv.DEBUG
+ )
+
+ cmdline = [dst]
+ if self._cronjob_script_args and isinstance(self._cronjob_script_args, str):
+ cmdline.extend(self._cronjob_script_args.split())
+ else:
+ # Run it once to generate the temp file, otherwise the nrpe check might
+ # fail at first. For security reasons, cronjobs that allow parameters
+ # shared by the user don't run the script as root (check_ipmi allows a
+ # missing output file)
+ subprocess.call(cmdline)
+
+ # Now generate the cronjob file
+ cronjob_line = '*/5 * * * * {user} {cmd}\n'.format(user=cron_user, cmd=' '.join(cmdline))
+ crond_file = os.path.join(self.CROND_DIR, 'hwhealth_{}'.format(self._shortname))
+ with open(crond_file, 'w') as crond_fd:
+ crond_fd.write(cronjob_line)
+ hookenv.log(
+ 'Cronjob configured at {}'.format(crond_file),
+ hookenv.DEBUG
+ )
+ return dst
+
+ def _remove_cronjob(self):
+ assert self._cronjob_script is not None
+
+ crond_file = os.path.join(self.CROND_DIR, 'hwhealth_{}'.format(self._shortname))
+ cronjob_script = os.path.join(self.NRPE_PLUGINS_DIR, self._cronjob_script)
+ os.remove(crond_file)
+ os.remove(cronjob_script)
+ hookenv.log(
+ 'Removed cronjob files [{}, {}]'
+ ''.format(crond_file, cronjob_script),
+ hookenv.DEBUG
+ )
+
class VendorTool(Tool):
"""A class representing a vendor tool binary that is shipped via resource
@@ -179,44 +227,6 @@ class VendorTool(Tool):
hookenv.DEBUG
)
- def _install_cronjob(self):
- # Copy the cronjob script to the nagios plugins directory
- src = os.path.join(self._files_dir, self._cronjob_script)
- dst = shutil.copy(src, self.NRPE_PLUGINS_DIR)
- os.chmod(dst, self.CRONJOB_SCRIPT_MODE)
- os.chown(dst, uid=self.CRONJOB_SCRIPT_UID, gid=self.CRONJOB_SCRIPT_GID)
- hookenv.log(
- 'Cronjob script [{}] copied to {}'
- ''.format(self._cronjob_script, self.NRPE_PLUGINS_DIR),
- hookenv.DEBUG
- )
- # Run it once to generate the temp file, otherwise the nrpe check might
- # fail at first
- subprocess.call(dst)
- # Now generate the cronjob file
- cronjob_line = '*/5 * * * * root {}\n'.format(dst)
- crond_file = os.path.join(self.CROND_DIR, self._shortname)
- with open(crond_file, 'w') as crond_fd:
- crond_fd.write(cronjob_line)
- hookenv.log(
- 'Cronjob configured at {}'.format(crond_file),
- hookenv.DEBUG
- )
-
- return dst
-
- def _remove_cronjob(self):
- crond_file = os.path.join(self.CROND_DIR, self._shortname)
- cronjob_script = os.path.join(self.NRPE_PLUGINS_DIR,
- self._cronjob_script)
- os.remove(crond_file)
- os.remove(cronjob_script)
- hookenv.log(
- 'Removed cronjob files [{}, {}]'
- ''.format(crond_file, cronjob_script),
- hookenv.DEBUG
- )
-
class Sas3Ircu(VendorTool):
"""A class representing the sas3ircu tool
@@ -280,20 +290,45 @@ class Ipmi(Tool):
actual nrpe check, which is imported as a git submodule
"""
def __init__(self, nrpe_opts=''):
- super().__init__(shortname='ipmi',
- nrpe_opts=hookenv.config('ipmi_check_options'))
- self._nrpe_script = 'check_ipmi_sensor'
- self._sudoer_file = '99-check_ipmi_sensor.sudoer'
+ super().__init__(shortname='ipmi')
+ self._nrpe_script = 'check_ipmi.py'
+ self._sudoer_file = '99-check_ipmi_sensor'
+ self._cronjob_script = 'cron_ipmi_sensors.py'
+ self._cronjob_script_args = hookenv.config('ipmi_check_options')
+
+ def configure_nrpe_check(self, nrpe_setup):
+ # extra options for check_ipmi_sensors Perl script are configured in the cronjob
+ self._install_cronjob(cron_user='nagios')
+ super().configure_nrpe_check(nrpe_setup)
def install(self):
# Install the sudoer file
self._install_sudoer()
+ # Install Perl script called by the (Python) cronjob
+ self._install_nrpe_helper_plugin()
+ self._install_cronjob(cron_user='nagios')
+
+ # Install the Python script called by check_nrpe
super().install()
def remove(self):
self._remove_sudoer()
+ self._remove_nrpe_helper_plugin()
+ self._remove_cronjob()
super().remove()
+ def _install_nrpe_helper_plugin(self):
+ original_nrpe_script = self._nrpe_script
+ self._nrpe_script = 'check_ipmi_sensor'
+ super()._install_nrpe_plugin()
+ self._nrpe_script = original_nrpe_script
+
+ def _remove_nrpe_helper_plugin(self):
+ original_nrpe_script = self._nrpe_script
+ self._nrpe_script = 'check_ipmi_sensor'
+ super()._remove_nrpe_plugin()
+ self._nrpe_script = original_nrpe_script
+
def _install_sudoer(self):
# User nagios is normally not able to run ipmi-sel and ipmi-sensors, so
# we need to add a sudoer entry for it
diff --git a/src/reactive/hw_health.py b/src/reactive/hw_health.py
index f5d84d0..5044e89 100644
--- a/src/reactive/hw_health.py
+++ b/src/reactive/hw_health.py
@@ -1,17 +1,11 @@
import os
-from charmhelpers.core import hookenv, host, unitdata
from charms.layer import status
-from charms.reactive.flags import (
- set_flag,
- clear_flag,
-)
-from charms.reactive import (
- hook,
- when,
- when_none,
- when_not,
-)
+from charms.reactive.flags import clear_flag, is_flag_set, set_flag
+from charms.reactive import hook, when, when_none, when_not
+from charmhelpers.contrib.charmsupport import nrpe
+from charmhelpers.core import hookenv, host, unitdata
+
from hwhealth.hwdiscovery import get_tools
from hwhealth import tools
@@ -78,6 +72,11 @@ def config_changed():
@when('config.changed.manufacturer')
def toolset_changed():
+ if not is_flag_set('hw-health.installed'):
+ # Note(aluria): useful for testing purposes
+ clear_flag('hw-health.unsupported')
+ return
+
# Changing the manufacturer option will trigger a reinstallation of the
# tools
remove_tools()
@@ -101,11 +100,18 @@ def configure_nrpe():
status.maintenance('Configuring nrpe checks')
+ nrpe_setup = nrpe.NRPE(primary=False)
unitdb = unitdata.kv()
for tool_class_name in unitdb.get('toolset', set()):
# Re-instantiate the tool from the saved class name
tool_class = getattr(tools, tool_class_name)
- tool_class().configure_nrpe_check()
+ tool_class().configure_nrpe_check(nrpe_setup)
+
+ if unitdb.get('toolset'):
+ # Note(aluria): This needs to be run outside of tool_class().configure_nrpe_check
+ # or shared dictionary with the nagios unit will list the last added check
+ nrpe_setup.write()
+
status.active('ready')
set_flag('hw-health.configured')
diff --git a/src/tests/functional/test_hwhealth.py b/src/tests/functional/test_hwhealth.py
index 6c2a66b..0414d1e 100644
--- a/src/tests/functional/test_hwhealth.py
+++ b/src/tests/functional/test_hwhealth.py
@@ -151,6 +151,7 @@ async def test_stats(toolset, deployed_unit, file_stat):
assert test_stat['gid'] == tool.NRPE_PLUGINS_GID
assert test_stat['uid'] == tool.NRPE_PLUGINS_UID
assert test_stat['mode'] == oct(tool.NRPE_PLUGINS_MODE)
+
if isinstance(tool, tools.Ipmi):
# Have we added sudo rights for running freeipmi commands?
sudoer_path = os.path.join(tool.SUDOERS_DIR, tool._sudoer_file)
@@ -160,7 +161,8 @@ async def test_stats(toolset, deployed_unit, file_stat):
assert test_stat['gid'] == tool.SUDOERS_GID
assert test_stat['uid'] == tool.SUDOERS_UID
assert test_stat['mode'] == oct(tool.SUDOERS_MODE)
- elif isinstance(tool, tools.VendorTool):
+
+ if isinstance(tool, tools.Ipmi) or isinstance(tool, tools.VendorTool):
# Have we installed the cronjob script helper?
cronjob_script_path = os.path.join(tool.NRPE_PLUGINS_DIR, tool._cronjob_script)
print('Checking {}'.format(cronjob_script_path))
@@ -171,11 +173,15 @@ async def test_stats(toolset, deployed_unit, file_stat):
assert test_stat['mode'] == oct(tool.CRONJOB_SCRIPT_MODE)
# Have we installed the cronjob itself?
- cronjob_path = os.path.join(tool.CROND_DIR, tool._shortname)
+ cronjob_path = os.path.join(tool.CROND_DIR, 'hwhealth_{}'.format(tool._shortname))
print('Checking {}'.format(cronjob_path))
test_stat = await file_stat(cronjob_path, deployed_unit)
assert test_stat['size'] > 0
+ if isinstance(tool, tools.Ipmi):
+ # it's a nrpe script, a helper script and a cronjob (no tools installed)
+ continue
+
# Have we installed the vendor binary?
if isinstance(tool, tools.Mdadm):
bin_path = os.path.join('/sbin', tool._shortname)
@@ -222,7 +228,8 @@ async def test_removal(toolset, model, deploy_app, file_stat):
print('Checking {}'.format(sudoer_path))
with pytest.raises(RuntimeError):
await file_stat(sudoer_path, ubuntu_unit)
- elif isinstance(tool, tools.VendorTool):
+
+ if isinstance(tool, tools.Ipmi) or isinstance(tool, tools.VendorTool):
# Have we removed the cronjob script helper?
cronjob_path = os.path.join(tool.NRPE_PLUGINS_DIR,
tool._cronjob_script)
@@ -231,12 +238,12 @@ async def test_removal(toolset, model, deploy_app, file_stat):
await file_stat(cronjob_path, ubuntu_unit)
# Have we removed the cronjob itself?
- cronjob_path = os.path.join(tool.CROND_DIR, tool._shortname)
+ cronjob_path = os.path.join(tool.CROND_DIR, 'hwhealth_{}'.format(tool._shortname))
print('Checking {}'.format(cronjob_path))
with pytest.raises(RuntimeError):
await file_stat(cronjob_path, ubuntu_unit)
- if not isinstance(tool, tools.Mdadm):
+ if not isinstance(tool, tools.Mdadm) and not isinstance(tool, tools.Ipmi):
# /sbin/mdadm will not be removed, but the vendor binaries
# should have been
bin_path = os.path.join(tool.TOOLS_DIR, tool._shortname)
Follow ups