← Back to team overview

nagios-charmers team mailing list archive

Re: [Merge] ~aieri/hw-health-charm:oo-rewrite-core into hw-health-charm:oo-rewrite-integration

 

Review: Approve

lgtm

Diff comments:

> diff --git a/src/lib/hwhealth/classes.py b/src/lib/hwhealth/classes.py
> new file mode 100644
> index 0000000..1884059
> --- /dev/null
> +++ b/src/lib/hwhealth/classes.py
> @@ -0,0 +1,298 @@
> +import os
> +import shutil
> +import subprocess
> +from zipfile import ZipFile, BadZipFile
> +from tempfile import TemporaryDirectory
> +from charmhelpers import fetch
> +from charmhelpers.core import hookenv
> +from charmhelpers.core.hookenv import status_set
> +from charmhelpers.contrib.charmsupport import nrpe
> +
> +
> +class Tool():
> +    '''
> +    This is an abstract class representing a "tool".
> +    The idea is to delegate install/configure duties to specific per-tool
> +    classes instead of handling multiple exceptional cases in a main loop
> +    during the install hook.
> +    Every tool should implement its own internal logic regarding how to be
> +    installed, configured, and removed.
> +    '''
> +    NRPE_PLUGINS_DIR = '/usr/local/lib/nagios/plugins'
> +    NRPE_PLUGINS_MODE = 0o100755
> +    NRPE_PLUGINS_UID = 0
> +    NRPE_PLUGINS_GID = 0
> +    TOOLS_DIR = '/usr/local/bin'
> +    TOOLS_MODE = 0o100755
> +    TOOLS_UID = 0
> +    TOOLS_GID = 0
> +    CROND_DIR = '/etc/cron.d'
> +    CRONJOB_SCRIPT_MODE = 0o100755
> +    CRONJOB_SCRIPT_UID = 0
> +    CRONJOB_SCRIPT_GID = 0
> +    SUDOERS_DIR = '/etc/sudoers.d'
> +    SUDOERS_MODE = 0o100440
> +    SUDOERS_UID = 0
> +    SUDOERS_GID = 0
> +
> +    def __init__(self, shortname, nrpe_opts=''):
> +        self._nagios_hostname = nrpe.get_nagios_hostname()
> +        self._nrpe_opts = nrpe_opts
> +        self._shortname = shortname
> +        self._files_dir = os.path.join(hookenv.charm_dir(),
> +                                       'files',
> +                                       self._shortname)
> +        self._nrpe_script = 'check_{}'.format(shortname)
> +
> +    def _install_nrpe_plugin(self):
> +        src = os.path.join(self._files_dir, self._nrpe_script)
> +        dst = shutil.copy(src, self.NRPE_PLUGINS_DIR)
> +        os.chmod(dst, self.NRPE_PLUGINS_MODE)
> +        os.chown(dst, uid=self.NRPE_PLUGINS_UID, gid=self.NRPE_PLUGINS_GID)
> +        hookenv.log('NRPE script for tool [{}] installed at as {}'
> +                    ''.format(self._shortname, dst),
> +                    hookenv.DEBUG)
> +        return dst
> +
> +    def _remove_nrpe_plugin(self):
> +        plugin_path = os.path.join(self.NRPE_PLUGINS_DIR, self._nrpe_script)
> +        os.remove(plugin_path)
> +        hookenv.log('deleted NRPE script for tool [{}]'
> +                    ''.format(self._shortname),
> +                    hookenv.DEBUG)
> +
> +    def configure_nrpe_check(self):
> +        nrpe_setup = nrpe.NRPE(hostname=self._nagios_hostname, primary=False)
> +        cmd = ' '.join([self._nrpe_script, self._nrpe_opts])
> +        nrpe_setup.add_check(
> +            shortname=self._shortname,
> +            description='{} Hardware Health',
> +            check_cmd=cmd
> +        )
> +        nrpe_setup.write()
> +        hookenv.log('configured NRPE check for tool [{}]'
> +                    ''.format(self._shortname),
> +                    hookenv.DEBUG)
> +
> +    def remove_nrpe_check(self):
> +        nrpe_setup = nrpe.NRPE(hostname=self._nagios_hostname, primary=False)
> +        cmd = ' '.join([self._nrpe_script, self._nrpe_opts])
> +        nrpe_setup.remove_check(
> +            shortname=self._shortname,
> +            check_cmd=cmd
> +        )
> +        nrpe_setup.write()
> +        hookenv.log('removed NRPE check for tool [{}]'
> +                    ''.format(self._shortname),
> +                    hookenv.DEBUG)
> +
> +    def install(self):
> +        self._install_nrpe_plugin()
> +        hookenv.log('Installed tool [{}]'.format(self._shortname))
> +
> +    def remove(self):
> +        self.remove_nrpe_check()
> +        self._remove_nrpe_plugin()
> +        hookenv.log('Removed tool [{}]'.format(self._shortname))
> +
> +
> +class VendorTool(Tool):
> +    '''
> +    A Vendor tool is a statically linked binary we ship via a resource.
> +    Since it generally requires root privileges to run, we also deploy a
> +    cronjob that runs as root and saves the tool output in a temporary file
> +    that nrpe can read (as nagios user).
> +    '''
> +    def __init__(self, shortname):
> +        super().__init__(shortname=shortname)
> +        self._cronjob_script = 'check_{}.sh'.format(shortname)
> +        self._nrpe_script = 'check_{}.py'.format(shortname)
> +
> +    def install(self):
> +        self._install_from_resource()
> +        # TODO(aluria): install vendor utilities via snaps
> +        # https://forum.snapcraft.io/t/request-for-classic-confinement-sas2ircu/9023
> +        # Snap interface for hw tooling is not supported
> +        # in strict mode (confined). Classic mode
> +        # (unconfined snaps) need approval
> +        # Install the cronjob that generates the temp output
> +        self._install_cronjob()
> +        super().install()
> +
> +    def remove(self):
> +        self._remove_cronjob()
> +        self._remove_binary()
> +        super().remove()
> +
> +    def _install_from_resource(self):
> +        resource = hookenv.resource_get('tools')
> +        if resource:
> +            hookenv.log('Installing tool [{}] from resource'
> +                        ''.format(self._shortname),
> +                        hookenv.DEBUG)
> +            status_set('waiting', 'Installing from attached resource')
> +            # Move in from a temp directory to be atomic
> +            with TemporaryDirectory() as tmpdir:
> +                try:
> +                    with ZipFile(resource, 'r') as zipfile:
> +                        tmpfile = zipfile.extract(self._shortname, tmpdir)
> +                        # We could just use self.TOOLS_DIR as a destination
> +                        # here, but shutil.move refuses to overwrite the
> +                        # destination file unless it receives a full path
> +                        dst = shutil.move(tmpfile,
> +                                          os.path.join(self.TOOLS_DIR,
> +                                                       self._shortname))
> +                        os.chmod(dst, self.TOOLS_MODE)
> +                        os.chown(dst, uid=self.TOOLS_UID, gid=self.TOOLS_GID)
> +
> +                except BadZipFile as error:
> +                    hookenv.log('BadZipFile: {}'.format(error), hookenv.ERROR)
> +
> +                except PermissionError as error:
> +                    hookenv.log('Unable to unzip tool {} from the provided '
> +                                'resource: {}'.format(self._shortname, error),
> +                                hookenv.ERROR)
> +                except KeyError as error:
> +                    hookenv.log('Tool {} not found in the provided resource: '
> +                                '{}'.format(self._shortname, error),
> +                                hookenv.ERROR)
> +                    status_set('blocked',
> +                               'Tool {} not found'.format(self._shortname))
> +
> +        else:
> +            hookenv.log(
> +                'Missing Juju resource: tools - alternative method is not '
> +                'available yet', hookenv.ERROR)
> +            status_set('blocked', 'Missing Juju resource: tools')
> +
> +    def _remove_binary(self):
> +        binary_path = os.path.join(self.TOOLS_DIR, self._shortname)
> +        os.remove(binary_path)
> +        hookenv.log('Removed binary tool {}'.format(binary_path),
> +                    hookenv.DEBUG)
> +
> +    def _install_cronjob(self):
> +        # Copy the cronjob script to the nagios plugins directory
> +        src = os.path.join(self._files_dir, self._cronjob_script)
> +        dst = shutil.copy(src, self.NRPE_PLUGINS_DIR)
> +        os.chmod(dst, self.CRONJOB_SCRIPT_MODE)
> +        os.chown(dst,
> +                 uid=self.CRONJOB_SCRIPT_UID,
> +                 gid=self.CRONJOB_SCRIPT_GID)
> +        hookenv.log('Cronjob script [{}] copied to {}'
> +                    ''.format(self._cronjob_script, self.NRPE_PLUGINS_DIR),
> +                    hookenv.DEBUG)
> +        # Run it once to generate the temp file, otherwise the nrpe check might
> +        # fail at first
> +        subprocess.call(dst)
> +        # Now generate the cronjob file
> +        cronjob_line = '*/5 * * * * root {}\n'.format(dst)
> +        crond_file = os.path.join(self.CROND_DIR, self._shortname)
> +        with open(crond_file, 'w') as crond_fd:
> +            crond_fd.write(cronjob_line)
> +            hookenv.log('Cronjob configured at {}'.format(crond_file),
> +                        hookenv.DEBUG)
> +
> +        return dst
> +
> +    def _remove_cronjob(self):
> +        crond_file = os.path.join(self.CROND_DIR, self._shortname)
> +        cronjob_script = os.path.join(self.NRPE_PLUGINS_DIR,
> +                                      self._cronjob_script)
> +        os.remove(crond_file)
> +        os.remove(cronjob_script)
> +        hookenv.log('Removed cronjob files [{}, {}]'
> +                    ''.format(crond_file, cronjob_script),
> +                    hookenv.DEBUG)
> +
> +
> +class Sas3IrcuTool(VendorTool):
> +    def __init__(self):
> +        super().__init__(shortname='sas3ircu')
> +
> +
> +class Sas2IrcuTool(VendorTool):
> +    def __init__(self):
> +        super().__init__(shortname='sas2ircu')
> +
> +
> +class MegaCLITool(VendorTool):
> +    def __init__(self):
> +        super().__init__(shortname='megacli')
> +
> +
> +class MdadmTool(VendorTool):
> +    '''
> +    Our mdadm check kind of behaves like a VendorTool for the purpose of
> +    installation as it has a cronjob + check script
> +    '''
> +    def __init__(self):
> +        shortname = 'mdadm'
> +        super().__init__(shortname=shortname)
> +        self._cronjob_script = 'cron_{}.py'.format(shortname)
> +        self._nrpe_script = 'check_{}.py'.format(shortname)
> +
> +    def install(self):
> +        # mdadm should already be installed, but let's check
> +        fetch.apt_install(['mdadm'], fatal=True)
> +        self._install_cronjob()
> +        # No vendor binary to install
> +        Tool.install(self)
> +
> +    def remove(self):
> +        self._remove_cronjob()
> +        # There's no vendor binary to remove
> +        Tool.remove(self)
> +
> +
> +class IpmiTool(Tool):
> +    '''
> +    The IPMI check does not rely on a vendor binary, it only uses freeipmi
> +    (available via the main repos) and a nrpe check that is imported as a git
> +    submodule
> +    '''
> +    def __init__(self, nrpe_opts=''):
> +        super().__init__(shortname='ipmi',
> +                         nrpe_opts=hookenv.config('ipmi_check_options'))
> +        self._nrpe_script = 'check_ipmi_sensor'
> +        self._sudoer_file = 'check_ipmi_sensor.sudoer'

Why aren't these constants (in upper case, out of the __init__ constructor) as in the Tool class?

> +
> +    def install(self):
> +        # Install the sudoer file
> +        self._install_sudoer()
> +        super().install()
> +
> +    def remove(self):
> +        self._remove_sudoer()
> +        super().remove()
> +
> +    def _install_sudoer(self):
> +        '''
> +        User nagios is normally not able to run ipmi-sel and ipmi-sensors, so
> +        we need to add a sudoer entry for it
> +        '''
> +        src = os.path.join(self._files_dir, self._sudoer_file)
> +        dst = shutil.copy(src, self.SUDOERS_DIR)
> +        os.chmod(dst, self.SUDOERS_MODE)
> +        os.chown(dst, uid=self.SUDOERS_UID, gid=self.SUDOERS_GID)
> +        hookenv.log('sudoer file for tool [{}] installed at {}'
> +                    ''.format(self._shortname, dst),
> +                    hookenv.DEBUG)
> +        return dst
> +
> +    def _remove_sudoer(self):
> +        sudoer_path = os.path.join(self.SUDOERS_DIR, self._sudoer_file)
> +        os.remove(sudoer_path)
> +        hookenv.log('deleted sudoer file'.format(sudoer_path), hookenv.DEBUG)
> +
> +
> +class HPTool(VendorTool):
> +    # ['hponcfg', 'hp-health', 'hpssacli']
> +    def __init__(self):
> +        raise NotImplementedError
> +
> +
> +class HPETool(VendorTool):
> +    # ['hponcfg', 'ilorest']
> +    def __init__(self):
> +        raise NotImplementedError
> diff --git a/src/reactive/hw_health.py b/src/reactive/hw_health.py
> index 20bc354..941b33a 100644
> --- a/src/reactive/hw_health.py
> +++ b/src/reactive/hw_health.py
> @@ -2,69 +2,49 @@ import os
>  
>  from charmhelpers.core import hookenv, host, unitdata
>  from charms.layer import status
> -from charms.reactive import (
> +from charms.reactive.flags import (
> +    set_flag,
>      clear_flag,
> +)
> +from charms.reactive import (
>      hook,
> -    set_flag,
>      when,
>      when_none,
>      when_not,
>  )
>  from hwhealth.hwdiscovery import get_tools
> -from hwhealth.tooling import (
> -    configure_nrpe_checks,
> -    install_resource,
> -    remove_nrpe_checks,
> -)
> +from hwhealth import classes
>  
>  
>  @when_none('hw-health.installed', 'hw-health.unsupported')
> +@when('general-info.connected')
>  def install():
> -    if host.is_container():
> +    manufacturer = hookenv.config('manufacturer')

We may want to implement this in the future as a lib helper (hwhealth_helper = HwhealthHelper(); not hwheatlh_helper.is_supported(), etc.)

> +    if host.is_container() and manufacturer != 'test':
>          status.blocked('Containers are not supported')
>          set_flag('hw-health.unsupported')
>          return
>  
> -    config = hookenv.config()
> -    if config.get('manufacturer', '') != 'auto':
> +    if manufacturer not in ['auto', 'test']:
>          status.blocked('manufacturer needs to be set to auto')
>          return
>  
> +    # Detect hardware and return a list of tools we need to use
>      status.maintenance('Autodiscovering hardware')
> -    tools = get_tools()
> +    tools = get_tools(manufacturer)
>      if not tools:
> -        set_flag('hw-health.unsupported')
>          status.blocked('Hardware not supported')
> +        set_flag('hw-health.unsupported')
>      else:
> -        resource = hookenv.resource_get('tools')
> -        if resource:
> -            hookenv.log('Installing from resource')
> -            status.waiting('Installing from attached resource')
> -            installed = install_resource(tools, resource)
> -        else:
> -            hookenv.log(
> -                'Missing Juju resource: tools - alternative method is not'
> -                ' available yet', hookenv.ERROR)
> -            status.blocked('Missing Juju resource: tools')
> -            return
> -            # TODO(aluria): install vendor utilities via snaps
> -            # https://forum.snapcraft.io/t/request-for-classic-confinement-sas2ircu/9023
> -            # Snap interface for hw tooling is not supported
> -            # in strict mode (confined). Classic mode
> -            # (unconfined snaps) need approval
> -            #
> -            # hookenv.log('Installing from snap')
> -            # status.waiting('Installing from snap')
> -            # installed = install_tools(tools, method='snap')
> -
> -        if installed:
> -            kv = unitdata.kv()
> -            kv.set('tools', list(tools))
> -            set_flag('hw-health.installed')
> -            status.waiting('Preparing scripts installation')
> -        else:
> -            missing_tools = tools - {'mdadm', 'ipmi'}
> -            status.blocked('Tools not found: {}'.format(', '.join(missing_tools)))
> +        tool_list = list()
> +        for tool in tools:
> +            tool.install()
> +            # Save the class name in the unit kv db. This will be reused when
> +            # reconfiguring or removing the checks
> +            tool_list.append(type(tool).__name__)
> +        unitdb = unitdata.kv()
> +        unitdb.set('tools', tool_list)
> +        set_flag('hw-health.installed')
>  
>  
>  @hook('upgrade-charm')
> @@ -75,23 +55,36 @@ def upgrade():
>      status.maintenance('Charm upgrade in progress')
>  
>  
> -@hook('stop')
> -def remove_checks(nrpe):
> -    # Note(aluria): Remove nrpe check(s)
> -    kv = unitdata.kv()
> -    tools = kv.get('tools', set())
> -    removed = remove_nrpe_checks(tools)
> -    if removed:
> -        nrpe.updated()
> +@when('hw-health.installed')
> +@when_not('general-info.available')
> +def remove_tools():

This probably needs a docstring to understand when general-info.available is not set (unit is a subordinate, and not having the juju-info relation would mean that it is stopped).

> +    unitdb = unitdata.kv()
> +    for tool_class_name in unitdb.get('tools', set()):
> +        # Re-instantiate the tool from the saved class name
> +        tool_class = getattr(classes, tool_class_name)
> +        tool_class().remove()
> +    clear_flag('hw-health.installed')
> +    clear_flag('hw-health.unsupported')
> +    clear_flag('hw-health.configured')
>  
>  
>  @when('config.changed')
> +@when_not('config.changed.manufacturer')
>  def config_changed():
>      clear_flag('hw-health.configured')
>  
>  
> +@when('config.changed.manufacturer')
> +def toolset_changed():
> +    # Changing the manufacturer option will trigger a reinstallation of the
> +    # tools
> +    remove_tools()
> +    status.maintenance('Reinstallation of tools in progress')
> +
> +
>  @when('hw-health.installed')
>  @when_not('nrpe-external-master.available')
> +@when_not('hw-health.configured')
>  def blocked_on_nrpe():
>      status.blocked('Missing relations: nrpe-external-master')
>  


-- 
https://code.launchpad.net/~aieri/hw-health-charm/+git/hw-health-charm/+merge/364757
Your team Nagios Charm developers is subscribed to branch hw-health-charm:oo-rewrite-integration.


References