← Back to team overview

nagios-charmers team mailing list archive

[Merge] ~aluria/hw-health-charm/+git/hw-health-charm:fix-check-ipmi into hw-health-charm:master

 

Alvaro Uría has proposed merging ~aluria/hw-health-charm/+git/hw-health-charm:fix-check-ipmi into hw-health-charm:master.

Requested reviews:
  Nagios Charm developers (nagios-charmers)

For more details, see:
https://code.launchpad.net/~aluria/hw-health-charm/+git/hw-health-charm/+merge/365060
-- 
Your team Nagios Charm developers is requested to review the proposed merge of ~aluria/hw-health-charm/+git/hw-health-charm:fix-check-ipmi into hw-health-charm:master.
diff --git a/src/files/ipmi/99-check_ipmi_sensor b/src/files/ipmi/99-check_ipmi_sensor
new file mode 100644
index 0000000..ead55da
--- /dev/null
+++ b/src/files/ipmi/99-check_ipmi_sensor
@@ -0,0 +1 @@
+nagios ALL=(root) NOPASSWD: /usr/sbin/ipmimonitoring, /usr/sbin/ipmi-dcmi, /usr/sbin/ipmi-fru, /usr/sbin/ipmi-sensors, /usr/sbin/ipmi-sel
diff --git a/src/files/ipmi/99-check_ipmi_sensor.sudoer b/src/files/ipmi/99-check_ipmi_sensor.sudoer
deleted file mode 100644
index 6571238..0000000
--- a/src/files/ipmi/99-check_ipmi_sensor.sudoer
+++ /dev/null
@@ -1 +0,0 @@
-nagios ALL=(root) NOPASSWD: /usr/sbin/ipmi-sensors, /usr/sbin/ipmi-sel
diff --git a/src/files/ipmi/check_ipmi.py b/src/files/ipmi/check_ipmi.py
new file mode 100644
index 0000000..cab68ca
--- /dev/null
+++ b/src/files/ipmi/check_ipmi.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+# -*- coding: us-ascii -*-
+
+import os
+
+from nagios_plugin3 import CriticalError, UnknownError, WarnError, check_file_freshness, try_check
+
+OUTPUT_FILE = '/var/lib/nagios/ipmi_sensors.out'
+NAGIOS_ERRORS = {
+    'CRITICAL': CriticalError,
+    'UNKNOWN': UnknownError,
+    'WARNING': WarnError,
+}
+
+
+def parse_output():
+    if not os.path.exists(OUTPUT_FILE):
+        raise UnknownError('UNKNOWN: {} does not exist (yet?)'.format(OUTPUT_FILE))
+
+    # Check if file is newwer than 10min
+    try_check(check_file_freshness, OUTPUT_FILE)
+
+    try:
+        with open(OUTPUT_FILE, 'r') as fd:
+            output = fd.read()
+    except PermissionError as error:
+        raise UnknownError(error)
+
+    for startline in NAGIOS_ERRORS:
+        if output.startswith('{}: '.format(startline)):
+            func = NAGIOS_ERRORS[startline]
+            raise func(output)
+
+    print('OK: {}'.format(output))
+
+
+if __name__ == '__main__':
+    try_check(parse_output)
diff --git a/src/files/ipmi/cron_ipmi_sensors.py b/src/files/ipmi/cron_ipmi_sensors.py
new file mode 100644
index 0000000..998e9f4
--- /dev/null
+++ b/src/files/ipmi/cron_ipmi_sensors.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+# -*- coding: us-ascii -*-
+
+import os
+import subprocess
+import sys
+
+CHECK_IPMI_PID = '/var/run/nagios/check_ipmi_sensors.pid'
+TMP_OUTPUT_FILE = '/tmp/ipmi_sensors.out'
+OUTPUT_FILE = '/var/lib/nagios/ipmi_sensors.out'
+CMD = '/usr/local/lib/nagios/plugins/check_ipmi_sensor'
+NAGIOS_ERRORS = {
+    1: 'WARNING',
+    2: 'CRITICAL',
+    3: 'UNKNOWN',
+}
+
+
+def gather_metrics():
+    # a child is already running
+    if os.path.exists(CHECK_IPMI_PID):
+        return
+
+    try:
+        with open(CHECK_IPMI_PID, 'w') as fd:
+            fd.write(str(os.getpid()))
+
+        cmdline = [CMD]
+        if len(sys.argv) > 1:
+            cmdline.extend(sys.argv[1:])
+
+        output = subprocess.check_output(cmdline)
+        with open(TMP_OUTPUT_FILE, 'w') as fd:
+            fd.write(output.decode(errors='ignore'))
+        os.rename(TMP_OUTPUT_FILE, OUTPUT_FILE)
+    except subprocess.CalledProcessError as error:
+        output = error.stdout.decode(errors='ignore')
+        with open(TMP_OUTPUT_FILE, 'w') as fd:
+            fd.write('{}: {}'.format(NAGIOS_ERRORS[error.returncode], output))
+        os.rename(TMP_OUTPUT_FILE, OUTPUT_FILE)
+    except PermissionError as error:
+        with (OUTPUT_FILE, 'w') as fd:
+            fd.write('CRITICAL: {}'.format(error))
+
+    # remove pid reference
+    os.remove(CHECK_IPMI_PID)
+
+
+if __name__ == '__main__':
+    gather_metrics()
diff --git a/src/lib/hwhealth/tools.py b/src/lib/hwhealth/tools.py
index 4b967c2..bc48e7b 100644
--- a/src/lib/hwhealth/tools.py
+++ b/src/lib/hwhealth/tools.py
@@ -44,6 +44,8 @@ class Tool():
                                        'files',
                                        self._shortname)
         self._nrpe_script = 'check_{}'.format(shortname)
+        self._cronjob_script = None
+        self._cronjob_script_args = None
 
     def _install_nrpe_plugin(self):
         src = os.path.join(self._files_dir, self._nrpe_script)
@@ -65,15 +67,13 @@ class Tool():
             hookenv.DEBUG
         )
 
-    def configure_nrpe_check(self):
-        nrpe_setup = nrpe.NRPE(hostname=self._nagios_hostname, primary=False)
+    def configure_nrpe_check(self, nrpe_setup):
         cmd = ' '.join([self._nrpe_script, self._nrpe_opts])
         nrpe_setup.add_check(
             shortname=self._shortname,
             description='{} Hardware Health',
-            check_cmd=cmd
+            check_cmd=cmd,
         )
-        nrpe_setup.write()
         hookenv.log(
             'configured NRPE check for tool [{}]'.format(self._shortname),
             hookenv.DEBUG
@@ -101,6 +101,54 @@ class Tool():
         self._remove_nrpe_plugin()
         hookenv.log('Removed tool [{}]'.format(self._shortname))
 
+    def _install_cronjob(self, cron_user='root'):
+        assert self._cronjob_script is not None
+
+        # Copy the cronjob script to the nagios plugins directory
+        src = os.path.join(self._files_dir, self._cronjob_script)
+        dst = shutil.copy(src, self.NRPE_PLUGINS_DIR)
+        os.chmod(dst, self.CRONJOB_SCRIPT_MODE)
+        os.chown(dst, uid=self.CRONJOB_SCRIPT_UID, gid=self.CRONJOB_SCRIPT_GID)
+        hookenv.log(
+            'Cronjob script [{}] copied to {}'
+            ''.format(self._cronjob_script, self.NRPE_PLUGINS_DIR),
+            hookenv.DEBUG
+        )
+
+        cmdline = [dst]
+        if self._cronjob_script_args and isinstance(self._cronjob_script_args, str):
+            cmdline.extend(self._cronjob_script_args.split())
+        else:
+            # Run it once to generate the temp file, otherwise the nrpe check might
+            # fail at first. For security reasons, cronjobs that allow parameters
+            # shared by the user don't run the script as root (check_ipmi allows a
+            # missing output file)
+            subprocess.call(cmdline)
+
+        # Now generate the cronjob file
+        cronjob_line = '*/5 * * * * {user} {cmd}\n'.format(user=cron_user, cmd=' '.join(cmdline))
+        crond_file = os.path.join(self.CROND_DIR, 'hwhealth_{}'.format(self._shortname))
+        with open(crond_file, 'w') as crond_fd:
+            crond_fd.write(cronjob_line)
+            hookenv.log(
+                'Cronjob configured at {}'.format(crond_file),
+                hookenv.DEBUG
+            )
+        return dst
+
+    def _remove_cronjob(self):
+        assert self._cronjob_script is not None
+
+        crond_file = os.path.join(self.CROND_DIR, 'hwhealth_{}'.format(self._shortname))
+        cronjob_script = os.path.join(self.NRPE_PLUGINS_DIR, self._cronjob_script)
+        os.remove(crond_file)
+        os.remove(cronjob_script)
+        hookenv.log(
+            'Removed cronjob files [{}, {}]'
+            ''.format(crond_file, cronjob_script),
+            hookenv.DEBUG
+        )
+
 
 class VendorTool(Tool):
     """A class representing a vendor tool binary that is shipped via resource
@@ -179,44 +227,6 @@ class VendorTool(Tool):
             hookenv.DEBUG
         )
 
-    def _install_cronjob(self):
-        # Copy the cronjob script to the nagios plugins directory
-        src = os.path.join(self._files_dir, self._cronjob_script)
-        dst = shutil.copy(src, self.NRPE_PLUGINS_DIR)
-        os.chmod(dst, self.CRONJOB_SCRIPT_MODE)
-        os.chown(dst, uid=self.CRONJOB_SCRIPT_UID, gid=self.CRONJOB_SCRIPT_GID)
-        hookenv.log(
-            'Cronjob script [{}] copied to {}'
-            ''.format(self._cronjob_script, self.NRPE_PLUGINS_DIR),
-            hookenv.DEBUG
-        )
-        # Run it once to generate the temp file, otherwise the nrpe check might
-        # fail at first
-        subprocess.call(dst)
-        # Now generate the cronjob file
-        cronjob_line = '*/5 * * * * root {}\n'.format(dst)
-        crond_file = os.path.join(self.CROND_DIR, self._shortname)
-        with open(crond_file, 'w') as crond_fd:
-            crond_fd.write(cronjob_line)
-            hookenv.log(
-                'Cronjob configured at {}'.format(crond_file),
-                hookenv.DEBUG
-            )
-
-        return dst
-
-    def _remove_cronjob(self):
-        crond_file = os.path.join(self.CROND_DIR, self._shortname)
-        cronjob_script = os.path.join(self.NRPE_PLUGINS_DIR,
-                                      self._cronjob_script)
-        os.remove(crond_file)
-        os.remove(cronjob_script)
-        hookenv.log(
-            'Removed cronjob files [{}, {}]'
-            ''.format(crond_file, cronjob_script),
-            hookenv.DEBUG
-        )
-
 
 class Sas3Ircu(VendorTool):
     """A class representing the sas3ircu tool
@@ -280,20 +290,45 @@ class Ipmi(Tool):
     actual nrpe check, which is imported as a git submodule
     """
     def __init__(self, nrpe_opts=''):
-        super().__init__(shortname='ipmi',
-                         nrpe_opts=hookenv.config('ipmi_check_options'))
-        self._nrpe_script = 'check_ipmi_sensor'
-        self._sudoer_file = '99-check_ipmi_sensor.sudoer'
+        super().__init__(shortname='ipmi')
+        self._nrpe_script = 'check_ipmi.py'
+        self._sudoer_file = '99-check_ipmi_sensor'
+        self._cronjob_script = 'cron_ipmi_sensors.py'
+        self._cronjob_script_args = hookenv.config('ipmi_check_options')
+
+    def configure_nrpe_check(self, nrpe_setup):
+        # extra options for check_ipmi_sensors Perl script are configured in the cronjob
+        self._install_cronjob(cron_user='nagios')
+        super().configure_nrpe_check(nrpe_setup)
 
     def install(self):
         # Install the sudoer file
         self._install_sudoer()
+        # Install Perl script called by the (Python) cronjob
+        self._install_nrpe_helper_plugin()
+        self._install_cronjob(cron_user='nagios')
+
+        # Install the Python script called by check_nrpe
         super().install()
 
     def remove(self):
         self._remove_sudoer()
+        self._remove_nrpe_helper_plugin()
+        self._remove_cronjob()
         super().remove()
 
+    def _install_nrpe_helper_plugin(self):
+        original_nrpe_script = self._nrpe_script
+        self._nrpe_script = 'check_ipmi_sensor'
+        super()._install_nrpe_plugin()
+        self._nrpe_script = original_nrpe_script
+
+    def _remove_nrpe_helper_plugin(self):
+        original_nrpe_script = self._nrpe_script
+        self._nrpe_script = 'check_ipmi_sensor'
+        super()._remove_nrpe_plugin()
+        self._nrpe_script = original_nrpe_script
+
     def _install_sudoer(self):
         # User nagios is normally not able to run ipmi-sel and ipmi-sensors, so
         # we need to add a sudoer entry for it
diff --git a/src/reactive/hw_health.py b/src/reactive/hw_health.py
index f5d84d0..5044e89 100644
--- a/src/reactive/hw_health.py
+++ b/src/reactive/hw_health.py
@@ -1,17 +1,11 @@
 import os
 
-from charmhelpers.core import hookenv, host, unitdata
 from charms.layer import status
-from charms.reactive.flags import (
-    set_flag,
-    clear_flag,
-)
-from charms.reactive import (
-    hook,
-    when,
-    when_none,
-    when_not,
-)
+from charms.reactive.flags import clear_flag, is_flag_set, set_flag
+from charms.reactive import hook, when, when_none, when_not
+from charmhelpers.contrib.charmsupport import nrpe
+from charmhelpers.core import hookenv, host, unitdata
+
 from hwhealth.hwdiscovery import get_tools
 from hwhealth import tools
 
@@ -78,6 +72,11 @@ def config_changed():
 
 @when('config.changed.manufacturer')
 def toolset_changed():
+    if not is_flag_set('hw-health.installed'):
+        # Note(aluria): useful for testing purposes
+        clear_flag('hw-health.unsupported')
+        return
+
     # Changing the manufacturer option will trigger a reinstallation of the
     # tools
     remove_tools()
@@ -101,11 +100,18 @@ def configure_nrpe():
 
     status.maintenance('Configuring nrpe checks')
 
+    nrpe_setup = nrpe.NRPE(primary=False)
     unitdb = unitdata.kv()
     for tool_class_name in unitdb.get('toolset', set()):
         # Re-instantiate the tool from the saved class name
         tool_class = getattr(tools, tool_class_name)
-        tool_class().configure_nrpe_check()
+        tool_class().configure_nrpe_check(nrpe_setup)
+
+    if unitdb.get('toolset'):
+        # Note(aluria): This needs to be run outside of tool_class().configure_nrpe_check
+        # or shared dictionary with the nagios unit will list the last added check
+        nrpe_setup.write()
+
     status.active('ready')
     set_flag('hw-health.configured')
 
diff --git a/src/tests/functional/test_hwhealth.py b/src/tests/functional/test_hwhealth.py
index 6c2a66b..0414d1e 100644
--- a/src/tests/functional/test_hwhealth.py
+++ b/src/tests/functional/test_hwhealth.py
@@ -151,6 +151,7 @@ async def test_stats(toolset, deployed_unit, file_stat):
         assert test_stat['gid'] == tool.NRPE_PLUGINS_GID
         assert test_stat['uid'] == tool.NRPE_PLUGINS_UID
         assert test_stat['mode'] == oct(tool.NRPE_PLUGINS_MODE)
+
         if isinstance(tool, tools.Ipmi):
             # Have we added sudo rights for running freeipmi commands?
             sudoer_path = os.path.join(tool.SUDOERS_DIR, tool._sudoer_file)
@@ -160,7 +161,8 @@ async def test_stats(toolset, deployed_unit, file_stat):
             assert test_stat['gid'] == tool.SUDOERS_GID
             assert test_stat['uid'] == tool.SUDOERS_UID
             assert test_stat['mode'] == oct(tool.SUDOERS_MODE)
-        elif isinstance(tool, tools.VendorTool):
+
+        if isinstance(tool, tools.Ipmi) or isinstance(tool, tools.VendorTool):
             # Have we installed the cronjob script helper?
             cronjob_script_path = os.path.join(tool.NRPE_PLUGINS_DIR, tool._cronjob_script)
             print('Checking {}'.format(cronjob_script_path))
@@ -171,11 +173,15 @@ async def test_stats(toolset, deployed_unit, file_stat):
             assert test_stat['mode'] == oct(tool.CRONJOB_SCRIPT_MODE)
 
             # Have we installed the cronjob itself?
-            cronjob_path = os.path.join(tool.CROND_DIR, tool._shortname)
+            cronjob_path = os.path.join(tool.CROND_DIR, 'hwhealth_{}'.format(tool._shortname))
             print('Checking {}'.format(cronjob_path))
             test_stat = await file_stat(cronjob_path, deployed_unit)
             assert test_stat['size'] > 0
 
+            if isinstance(tool, tools.Ipmi):
+                # it's a nrpe script, a helper script and a cronjob (no tools installed)
+                continue
+
             # Have we installed the vendor binary?
             if isinstance(tool, tools.Mdadm):
                 bin_path = os.path.join('/sbin', tool._shortname)
@@ -222,7 +228,8 @@ async def test_removal(toolset, model, deploy_app, file_stat):
             print('Checking {}'.format(sudoer_path))
             with pytest.raises(RuntimeError):
                 await file_stat(sudoer_path, ubuntu_unit)
-        elif isinstance(tool, tools.VendorTool):
+
+        if isinstance(tool, tools.Ipmi) or isinstance(tool, tools.VendorTool):
             # Have we removed the cronjob script helper?
             cronjob_path = os.path.join(tool.NRPE_PLUGINS_DIR,
                                         tool._cronjob_script)
@@ -231,12 +238,12 @@ async def test_removal(toolset, model, deploy_app, file_stat):
                 await file_stat(cronjob_path, ubuntu_unit)
 
             # Have we removed the cronjob itself?
-            cronjob_path = os.path.join(tool.CROND_DIR, tool._shortname)
+            cronjob_path = os.path.join(tool.CROND_DIR, 'hwhealth_{}'.format(tool._shortname))
             print('Checking {}'.format(cronjob_path))
             with pytest.raises(RuntimeError):
                 await file_stat(cronjob_path, ubuntu_unit)
 
-            if not isinstance(tool, tools.Mdadm):
+            if not isinstance(tool, tools.Mdadm) and not isinstance(tool, tools.Ipmi):
                 # /sbin/mdadm will not be removed, but the vendor binaries
                 # should have been
                 bin_path = os.path.join(tool.TOOLS_DIR, tool._shortname)

Follow ups