nagios-charmers team mailing list archive
-
nagios-charmers team
-
Mailing list archive
-
Message #00360
Re: [Merge] ~aieri/hw-health-charm:oo-rewrite-core into hw-health-charm:oo-rewrite-integration
Review: Approve
lgtm
Diff comments:
> diff --git a/src/lib/hwhealth/classes.py b/src/lib/hwhealth/classes.py
> new file mode 100644
> index 0000000..1884059
> --- /dev/null
> +++ b/src/lib/hwhealth/classes.py
> @@ -0,0 +1,298 @@
> +import os
> +import shutil
> +import subprocess
> +from zipfile import ZipFile, BadZipFile
> +from tempfile import TemporaryDirectory
> +from charmhelpers import fetch
> +from charmhelpers.core import hookenv
> +from charmhelpers.core.hookenv import status_set
> +from charmhelpers.contrib.charmsupport import nrpe
> +
> +
> +class Tool():
> + '''
> + This is an abstract class representing a "tool".
> + The idea is to delegate install/configure duties to specific per-tool
> + classes instead of handling multiple exceptional cases in a main loop
> + during the install hook.
> + Every tool should implement its own internal logic regarding how to be
> + installed, configured, and removed.
> + '''
> + NRPE_PLUGINS_DIR = '/usr/local/lib/nagios/plugins'
> + NRPE_PLUGINS_MODE = 0o100755
> + NRPE_PLUGINS_UID = 0
> + NRPE_PLUGINS_GID = 0
> + TOOLS_DIR = '/usr/local/bin'
> + TOOLS_MODE = 0o100755
> + TOOLS_UID = 0
> + TOOLS_GID = 0
> + CROND_DIR = '/etc/cron.d'
> + CRONJOB_SCRIPT_MODE = 0o100755
> + CRONJOB_SCRIPT_UID = 0
> + CRONJOB_SCRIPT_GID = 0
> + SUDOERS_DIR = '/etc/sudoers.d'
> + SUDOERS_MODE = 0o100440
> + SUDOERS_UID = 0
> + SUDOERS_GID = 0
> +
> + def __init__(self, shortname, nrpe_opts=''):
> + self._nagios_hostname = nrpe.get_nagios_hostname()
> + self._nrpe_opts = nrpe_opts
> + self._shortname = shortname
> + self._files_dir = os.path.join(hookenv.charm_dir(),
> + 'files',
> + self._shortname)
> + self._nrpe_script = 'check_{}'.format(shortname)
> +
> + def _install_nrpe_plugin(self):
> + src = os.path.join(self._files_dir, self._nrpe_script)
> + dst = shutil.copy(src, self.NRPE_PLUGINS_DIR)
> + os.chmod(dst, self.NRPE_PLUGINS_MODE)
> + os.chown(dst, uid=self.NRPE_PLUGINS_UID, gid=self.NRPE_PLUGINS_GID)
> + hookenv.log('NRPE script for tool [{}] installed at as {}'
> + ''.format(self._shortname, dst),
> + hookenv.DEBUG)
> + return dst
> +
> + def _remove_nrpe_plugin(self):
> + plugin_path = os.path.join(self.NRPE_PLUGINS_DIR, self._nrpe_script)
> + os.remove(plugin_path)
> + hookenv.log('deleted NRPE script for tool [{}]'
> + ''.format(self._shortname),
> + hookenv.DEBUG)
> +
> + def configure_nrpe_check(self):
> + nrpe_setup = nrpe.NRPE(hostname=self._nagios_hostname, primary=False)
> + cmd = ' '.join([self._nrpe_script, self._nrpe_opts])
> + nrpe_setup.add_check(
> + shortname=self._shortname,
> + description='{} Hardware Health',
> + check_cmd=cmd
> + )
> + nrpe_setup.write()
> + hookenv.log('configured NRPE check for tool [{}]'
> + ''.format(self._shortname),
> + hookenv.DEBUG)
> +
> + def remove_nrpe_check(self):
> + nrpe_setup = nrpe.NRPE(hostname=self._nagios_hostname, primary=False)
> + cmd = ' '.join([self._nrpe_script, self._nrpe_opts])
> + nrpe_setup.remove_check(
> + shortname=self._shortname,
> + check_cmd=cmd
> + )
> + nrpe_setup.write()
> + hookenv.log('removed NRPE check for tool [{}]'
> + ''.format(self._shortname),
> + hookenv.DEBUG)
> +
> + def install(self):
> + self._install_nrpe_plugin()
> + hookenv.log('Installed tool [{}]'.format(self._shortname))
> +
> + def remove(self):
> + self.remove_nrpe_check()
> + self._remove_nrpe_plugin()
> + hookenv.log('Removed tool [{}]'.format(self._shortname))
> +
> +
> +class VendorTool(Tool):
> + '''
> + A Vendor tool is a statically linked binary we ship via a resource.
> + Since it generally requires root privileges to run, we also deploy a
> + cronjob that runs as root and saves the tool output in a temporary file
> + that nrpe can read (as nagios user).
> + '''
> + def __init__(self, shortname):
> + super().__init__(shortname=shortname)
> + self._cronjob_script = 'check_{}.sh'.format(shortname)
> + self._nrpe_script = 'check_{}.py'.format(shortname)
> +
> + def install(self):
> + self._install_from_resource()
> + # TODO(aluria): install vendor utilities via snaps
> + # https://forum.snapcraft.io/t/request-for-classic-confinement-sas2ircu/9023
> + # Snap interface for hw tooling is not supported
> + # in strict mode (confined). Classic mode
> + # (unconfined snaps) need approval
> + # Install the cronjob that generates the temp output
> + self._install_cronjob()
> + super().install()
> +
> + def remove(self):
> + self._remove_cronjob()
> + self._remove_binary()
> + super().remove()
> +
> + def _install_from_resource(self):
> + resource = hookenv.resource_get('tools')
> + if resource:
> + hookenv.log('Installing tool [{}] from resource'
> + ''.format(self._shortname),
> + hookenv.DEBUG)
> + status_set('waiting', 'Installing from attached resource')
> + # Move in from a temp directory to be atomic
> + with TemporaryDirectory() as tmpdir:
> + try:
> + with ZipFile(resource, 'r') as zipfile:
> + tmpfile = zipfile.extract(self._shortname, tmpdir)
> + # We could just use self.TOOLS_DIR as a destination
> + # here, but shutil.move refuses to overwrite the
> + # destination file unless it receives a full path
> + dst = shutil.move(tmpfile,
> + os.path.join(self.TOOLS_DIR,
> + self._shortname))
> + os.chmod(dst, self.TOOLS_MODE)
> + os.chown(dst, uid=self.TOOLS_UID, gid=self.TOOLS_GID)
> +
> + except BadZipFile as error:
> + hookenv.log('BadZipFile: {}'.format(error), hookenv.ERROR)
> +
> + except PermissionError as error:
> + hookenv.log('Unable to unzip tool {} from the provided '
> + 'resource: {}'.format(self._shortname, error),
> + hookenv.ERROR)
> + except KeyError as error:
> + hookenv.log('Tool {} not found in the provided resource: '
> + '{}'.format(self._shortname, error),
> + hookenv.ERROR)
> + status_set('blocked',
> + 'Tool {} not found'.format(self._shortname))
> +
> + else:
> + hookenv.log(
> + 'Missing Juju resource: tools - alternative method is not '
> + 'available yet', hookenv.ERROR)
> + status_set('blocked', 'Missing Juju resource: tools')
> +
> + def _remove_binary(self):
> + binary_path = os.path.join(self.TOOLS_DIR, self._shortname)
> + os.remove(binary_path)
> + hookenv.log('Removed binary tool {}'.format(binary_path),
> + hookenv.DEBUG)
> +
> + def _install_cronjob(self):
> + # Copy the cronjob script to the nagios plugins directory
> + src = os.path.join(self._files_dir, self._cronjob_script)
> + dst = shutil.copy(src, self.NRPE_PLUGINS_DIR)
> + os.chmod(dst, self.CRONJOB_SCRIPT_MODE)
> + os.chown(dst,
> + uid=self.CRONJOB_SCRIPT_UID,
> + gid=self.CRONJOB_SCRIPT_GID)
> + hookenv.log('Cronjob script [{}] copied to {}'
> + ''.format(self._cronjob_script, self.NRPE_PLUGINS_DIR),
> + hookenv.DEBUG)
> + # Run it once to generate the temp file, otherwise the nrpe check might
> + # fail at first
> + subprocess.call(dst)
> + # Now generate the cronjob file
> + cronjob_line = '*/5 * * * * root {}\n'.format(dst)
> + crond_file = os.path.join(self.CROND_DIR, self._shortname)
> + with open(crond_file, 'w') as crond_fd:
> + crond_fd.write(cronjob_line)
> + hookenv.log('Cronjob configured at {}'.format(crond_file),
> + hookenv.DEBUG)
> +
> + return dst
> +
> + def _remove_cronjob(self):
> + crond_file = os.path.join(self.CROND_DIR, self._shortname)
> + cronjob_script = os.path.join(self.NRPE_PLUGINS_DIR,
> + self._cronjob_script)
> + os.remove(crond_file)
> + os.remove(cronjob_script)
> + hookenv.log('Removed cronjob files [{}, {}]'
> + ''.format(crond_file, cronjob_script),
> + hookenv.DEBUG)
> +
> +
> +class Sas3IrcuTool(VendorTool):
> + def __init__(self):
> + super().__init__(shortname='sas3ircu')
> +
> +
> +class Sas2IrcuTool(VendorTool):
> + def __init__(self):
> + super().__init__(shortname='sas2ircu')
> +
> +
> +class MegaCLITool(VendorTool):
> + def __init__(self):
> + super().__init__(shortname='megacli')
> +
> +
> +class MdadmTool(VendorTool):
> + '''
> + Our mdadm check kind of behaves like a VendorTool for the purpose of
> + installation as it has a cronjob + check script
> + '''
> + def __init__(self):
> + shortname = 'mdadm'
> + super().__init__(shortname=shortname)
> + self._cronjob_script = 'cron_{}.py'.format(shortname)
> + self._nrpe_script = 'check_{}.py'.format(shortname)
> +
> + def install(self):
> + # mdadm should already be installed, but let's check
> + fetch.apt_install(['mdadm'], fatal=True)
> + self._install_cronjob()
> + # No vendor binary to install
> + Tool.install(self)
> +
> + def remove(self):
> + self._remove_cronjob()
> + # There's no vendor binary to remove
> + Tool.remove(self)
> +
> +
> +class IpmiTool(Tool):
> + '''
> + The IPMI check does not rely on a vendor binary, it only uses freeipmi
> + (available via the main repos) and a nrpe check that is imported as a git
> + submodule
> + '''
> + def __init__(self, nrpe_opts=''):
> + super().__init__(shortname='ipmi',
> + nrpe_opts=hookenv.config('ipmi_check_options'))
> + self._nrpe_script = 'check_ipmi_sensor'
> + self._sudoer_file = 'check_ipmi_sensor.sudoer'
Why aren't these constants (in upper case, out of the __init__ constructor) as in the Tool class?
> +
> + def install(self):
> + # Install the sudoer file
> + self._install_sudoer()
> + super().install()
> +
> + def remove(self):
> + self._remove_sudoer()
> + super().remove()
> +
> + def _install_sudoer(self):
> + '''
> + User nagios is normally not able to run ipmi-sel and ipmi-sensors, so
> + we need to add a sudoer entry for it
> + '''
> + src = os.path.join(self._files_dir, self._sudoer_file)
> + dst = shutil.copy(src, self.SUDOERS_DIR)
> + os.chmod(dst, self.SUDOERS_MODE)
> + os.chown(dst, uid=self.SUDOERS_UID, gid=self.SUDOERS_GID)
> + hookenv.log('sudoer file for tool [{}] installed at {}'
> + ''.format(self._shortname, dst),
> + hookenv.DEBUG)
> + return dst
> +
> + def _remove_sudoer(self):
> + sudoer_path = os.path.join(self.SUDOERS_DIR, self._sudoer_file)
> + os.remove(sudoer_path)
> + hookenv.log('deleted sudoer file'.format(sudoer_path), hookenv.DEBUG)
> +
> +
> +class HPTool(VendorTool):
> + # ['hponcfg', 'hp-health', 'hpssacli']
> + def __init__(self):
> + raise NotImplementedError
> +
> +
> +class HPETool(VendorTool):
> + # ['hponcfg', 'ilorest']
> + def __init__(self):
> + raise NotImplementedError
> diff --git a/src/reactive/hw_health.py b/src/reactive/hw_health.py
> index 20bc354..941b33a 100644
> --- a/src/reactive/hw_health.py
> +++ b/src/reactive/hw_health.py
> @@ -2,69 +2,49 @@ import os
>
> from charmhelpers.core import hookenv, host, unitdata
> from charms.layer import status
> -from charms.reactive import (
> +from charms.reactive.flags import (
> + set_flag,
> clear_flag,
> +)
> +from charms.reactive import (
> hook,
> - set_flag,
> when,
> when_none,
> when_not,
> )
> from hwhealth.hwdiscovery import get_tools
> -from hwhealth.tooling import (
> - configure_nrpe_checks,
> - install_resource,
> - remove_nrpe_checks,
> -)
> +from hwhealth import classes
>
>
> @when_none('hw-health.installed', 'hw-health.unsupported')
> +@when('general-info.connected')
> def install():
> - if host.is_container():
> + manufacturer = hookenv.config('manufacturer')
We may want to implement this in the future as a lib helper (hwhealth_helper = HwhealthHelper(); not hwheatlh_helper.is_supported(), etc.)
> + if host.is_container() and manufacturer != 'test':
> status.blocked('Containers are not supported')
> set_flag('hw-health.unsupported')
> return
>
> - config = hookenv.config()
> - if config.get('manufacturer', '') != 'auto':
> + if manufacturer not in ['auto', 'test']:
> status.blocked('manufacturer needs to be set to auto')
> return
>
> + # Detect hardware and return a list of tools we need to use
> status.maintenance('Autodiscovering hardware')
> - tools = get_tools()
> + tools = get_tools(manufacturer)
> if not tools:
> - set_flag('hw-health.unsupported')
> status.blocked('Hardware not supported')
> + set_flag('hw-health.unsupported')
> else:
> - resource = hookenv.resource_get('tools')
> - if resource:
> - hookenv.log('Installing from resource')
> - status.waiting('Installing from attached resource')
> - installed = install_resource(tools, resource)
> - else:
> - hookenv.log(
> - 'Missing Juju resource: tools - alternative method is not'
> - ' available yet', hookenv.ERROR)
> - status.blocked('Missing Juju resource: tools')
> - return
> - # TODO(aluria): install vendor utilities via snaps
> - # https://forum.snapcraft.io/t/request-for-classic-confinement-sas2ircu/9023
> - # Snap interface for hw tooling is not supported
> - # in strict mode (confined). Classic mode
> - # (unconfined snaps) need approval
> - #
> - # hookenv.log('Installing from snap')
> - # status.waiting('Installing from snap')
> - # installed = install_tools(tools, method='snap')
> -
> - if installed:
> - kv = unitdata.kv()
> - kv.set('tools', list(tools))
> - set_flag('hw-health.installed')
> - status.waiting('Preparing scripts installation')
> - else:
> - missing_tools = tools - {'mdadm', 'ipmi'}
> - status.blocked('Tools not found: {}'.format(', '.join(missing_tools)))
> + tool_list = list()
> + for tool in tools:
> + tool.install()
> + # Save the class name in the unit kv db. This will be reused when
> + # reconfiguring or removing the checks
> + tool_list.append(type(tool).__name__)
> + unitdb = unitdata.kv()
> + unitdb.set('tools', tool_list)
> + set_flag('hw-health.installed')
>
>
> @hook('upgrade-charm')
> @@ -75,23 +55,36 @@ def upgrade():
> status.maintenance('Charm upgrade in progress')
>
>
> -@hook('stop')
> -def remove_checks(nrpe):
> - # Note(aluria): Remove nrpe check(s)
> - kv = unitdata.kv()
> - tools = kv.get('tools', set())
> - removed = remove_nrpe_checks(tools)
> - if removed:
> - nrpe.updated()
> +@when('hw-health.installed')
> +@when_not('general-info.available')
> +def remove_tools():
This probably needs a docstring to understand when general-info.available is not set (unit is a subordinate, and not having the juju-info relation would mean that it is stopped).
> + unitdb = unitdata.kv()
> + for tool_class_name in unitdb.get('tools', set()):
> + # Re-instantiate the tool from the saved class name
> + tool_class = getattr(classes, tool_class_name)
> + tool_class().remove()
> + clear_flag('hw-health.installed')
> + clear_flag('hw-health.unsupported')
> + clear_flag('hw-health.configured')
>
>
> @when('config.changed')
> +@when_not('config.changed.manufacturer')
> def config_changed():
> clear_flag('hw-health.configured')
>
>
> +@when('config.changed.manufacturer')
> +def toolset_changed():
> + # Changing the manufacturer option will trigger a reinstallation of the
> + # tools
> + remove_tools()
> + status.maintenance('Reinstallation of tools in progress')
> +
> +
> @when('hw-health.installed')
> @when_not('nrpe-external-master.available')
> +@when_not('hw-health.configured')
> def blocked_on_nrpe():
> status.blocked('Missing relations: nrpe-external-master')
>
--
https://code.launchpad.net/~aieri/hw-health-charm/+git/hw-health-charm/+merge/364757
Your team Nagios Charm developers is subscribed to branch hw-health-charm:oo-rewrite-integration.
References