← Back to team overview

nagios-charmers team mailing list archive

Re: [Merge] ~afreiberger/charm-hw-health:add-mdadm-checks into ~nagios-charmers/charm-hw-health:master

 

Thanks for the feedback!  Thanks for picking up those bits.  There are some clunky bits that could use some clean up after yesterday's merge of other contributions and rebase.

Diff comments:

> diff --git a/src/files/mdadm/cron_mdadm.py b/src/files/mdadm/cron_mdadm.py
> index 0b0b365..d57c44c 100755
> --- a/src/files/mdadm/cron_mdadm.py
> +++ b/src/files/mdadm/cron_mdadm.py
> @@ -31,90 +35,150 @@ def get_devices():
>  
>  def generate_output(msg):
>      try:
> -        with open(TEMP_FILE, "w") as fd:
> +        with open(TEMP_FILE, 'w') as fd:
>              fd.write(msg)
>          shutil.move(TEMP_FILE, OUTPUT_FILE)
>          return True
>      except Exception as error:
> -        print("Unable to generate output file:", error)
> +        print('Unable to generate output file:', error)
>          return False
>  
>  
>  def get_devices_stats(devices):
> -    mdadm_detail = ["/sbin/mdadm", "--detail"]
> +    mdadm_detail = ['/sbin/mdadm', '--detail']
>      mdadm_detail.extend(sorted(devices))
> -    devices_details_raw = subprocess.check_output(mdadm_detail)
> -
> -    devices_re = r"^(/\S+):$"
> -    state_re = r"^\s*State\s+:\s+(\S+)$"
> -    status_re = r"^\s*(Active|Working|Failed|Spare) Devices\s+:\s+(\d+)$"
> +    try:
> +        devices_details_raw = subprocess.check_output(mdadm_detail)
> +    except subprocess.CalledProcessError as error:
> +        return generate_output(
> +            'WARNING: error executing mdadm: {}'.format(error)
> +        )
> +
> +    devices_re = r'^(/\S+):$'
> +    state_re = r'^\s*State\s+:\s+(.+)\s*$'
> +    status_re = r'^\s*(Active|Working|Failed|Spare) Devices\s+:\s+(\d+)$'
> +    rebuild_status_re = r'^\s*Rebuild Status\s+:\s+(\d+%\s+\S+)$'
> +    removed_re = r'^\s*-\s+0\s+0\s+(\d+)\s+removed$'
> +    #        4 8 162 3 spare rebuilding /dev/sdk2
> +    rebuilding_re = r'^\s*\d+\s+\d+\s+\d+\s+\d+\s+\S+\s+rebuilding\s+(\S+)$'
>  
>      devices_cre = re.compile(devices_re)
>      state_cre = re.compile(state_re)
>      status_cre = re.compile(status_re)
> +    rebuild_status_cre = re.compile(rebuild_status_re)
> +    removed_cre = re.compile(removed_re)
> +    rebuilding_cre = re.compile(rebuilding_re)
>  
>      device = None
>      devices_stats = {}
> -    for line in devices_details_raw.decode().split("\n"):
> +    for line in devices_details_raw.decode().split('\n'):
>          line = line.rstrip()
>          m = devices_cre.match(line)
>          if m:
>              device = m.group(1)
>              devices_stats[device] = {
> -                "stats": {"Active": 0, "Working": 0, "Failed": 0, "Spare": 0},
> -                "degraded": False,
> +                'stats': {
> +                    'Active': 0,
> +                    'Working': 0,
> +                    'Failed': 0,
> +                    'Spare': 0,
> +                },
> +                'rebuild_status': '',
> +                'degraded': False,
> +                'recovering': False,
> +                'removed': [],
> +                'rebuilding': [],
>              }
>              continue
>  
>          m = state_cre.match(line)
>          if m:
> -            if "degraded" in m.group(1) and device:
> -                devices_stats[device]["degraded"] = True
> +            # format for State line can be "clean" or "clean, degraded" or "active, degraded, rebuilding", etc.
> +            states = m.group(1).split(", ")
> +            if 'degraded' in states and device:
> +                devices_stats[device]['degraded'] = True
> +            if 'recovering' in states and device:
> +                devices_stats[device]['recovering'] = True
>              continue
>  
>          m = status_cre.match(line)
>          if m and device:
> -            devices_stats[device]["stats"][m.group(1)] = int(m.group(2))
> +            devices_stats[device]['stats'][m.group(1)] = int(m.group(2))
> +            continue
> +
> +        m = removed_cre.match(line)
> +        if m and device:
> +            devices_stats[device]['removed'].append(m.group(1))
> +            continue
> +
> +        m = rebuild_status_cre.match(line)
> +        if m and device:
> +            devices_stats[device]['rebuild_status'] = m.group(1)
> +            continue
> +
> +        m = rebuilding_cre.match(line)
> +        if m and device:
> +            devices_stats[device]['rebuilding'].append(m.group(1))
> +            continue
> +
>      return devices_stats
>  
>  
>  def parse_output():
>      devices = get_devices()
>      if len(devices) == 0:
> -        return generate_output("WARNING: unexpectedly checked no devices")
> +        return generate_output('WARNING: unexpectedly checked no devices')
>  
> -    try:
> -        devices_stats = get_devices_stats(devices)
> -    except subprocess.CalledProcessError as error:
> -        return generate_output("WARNING: error executing mdadm: {}".format(error))
> +    devices_stats = get_devices_stats(devices)
> +    if isinstance(devices_stats, bool):
> +        # if the device_stats is boolean, generate_output was already called

Perhaps the called function should raise something to be caught by parse_output instead of calling generate_output directly.

> +        return devices_stats
>  
>      msg = []
>      critical = False
> +    warning = False
>      for device in devices_stats:
> +        parts = []
>          # Is device degraded?
> -        if devices_stats[device]["degraded"]:
> +        if devices_stats[device]['degraded'] and devices_stats[device]['recovering']:
> +            warning = True
> +            parts = ['{} recovering'.format(device)]
> +        elif devices_stats[device]['degraded']:
>              critical = True
> -            parts = ["{} degraded".format(device)]
> +            parts = ['{} degraded'.format(device)]
>          else:
> -            parts = ["{} ok".format(device)]
> +            parts = ['{} ok'.format(device)]
>  
>          # If Failed drives are found, list counters (how many?)
> -        failed_cnt = devices_stats[device]["stats"].get("Failed", 0)
> +        failed_cnt = devices_stats[device]['stats'].get('Failed', 0)
>          if failed_cnt > 0:
>              critical = True
>              dev_stats = [
> -                "{}[{}]".format(status, devices_stats[device]["stats"][status])
> -                for status in sorted(devices_stats[device]["stats"])
> +                '{}[{}]'.format(status, devices_stats[device]['stats'][status])
> +                for status in sorted(devices_stats[device]['stats'])
>              ]
>              parts.extend(dev_stats)
> -        msg.append(", ".join(parts))
> +
> +        if len(devices_stats[device]['removed']) != 0:
> +            critical = True
> +            members = " and ".join(devices_stats[device]['removed'])
> +            parts.append('RaidDevice(s) {} marked removed'.format(members))
> +
> +        if len(devices_stats[device]['rebuilding']) != 0:
> +            rebuilding_members = " ".join(devices_stats[device]['rebuilding'])
> +            rebuild_status = devices_stats[device]['rebuild_status']
> +            parts.append('{} rebuilding ({})'.format(rebuilding_members, rebuild_status))
> +
> +        msg.append(', '.join(parts))
>  
>      if critical:
> -        msg = "CRITICAL: {}".format("; ".join(msg))
> +        msg = 'CRITICAL: {}'.format('; '.join(msg))
> +    elif warning:
> +        msg = 'WARNING: {}'.format('; '.join(msg))
>      else:
> -        msg = "OK: {}".format("; ".join(msg))
> +        msg = 'OK: {}'.format('; '.join(msg))
>      return generate_output(msg)
>  
>  
> -if __name__ == "__main__":
> +if __name__ == '__main__':
>      parse_output()
> diff --git a/src/tests/functional/test_hwhealth.py b/src/tests/functional/test_hwhealth.py
> index f16878f..4dcc0d9 100644
> --- a/src/tests/functional/test_hwhealth.py
> +++ b/src/tests/functional/test_hwhealth.py
> @@ -65,10 +69,17 @@ async def deploy_app(request, model):
>      hw_health_checksum_app_name = 'hw-health-checksum-{}'.format(release)
>  
>      for principal_app in ['ubuntu', 'nagios']:
> +        relname = series = release
> +        if principal_app == "nagios" and release == "focal":
> +            # NOTE(aluria): cs:nagios was not available in focal
> +            # On the other hand, bionic testing would create nagios-bionic
> +            # hence nagios-bionic2

as the nagios package is not being included in focal, focal's new monitoring target will be icinga.  We could definitely put in a TODO comment to update to icinga or other focal monitoring charm.

> +            relname = "bionic2"
> +            series = "bionic"
>          await model.deploy(
>              principal_app,
> -            application_name='{}-{}'.format(principal_app, release),
> -            series=release,
> +            application_name='{}-{}'.format(principal_app, relname),
> +            series=series,
>              channel=channel,
>          )
>      await model.deploy('ubuntu', application_name='ubuntu-checksum-{}'.format(release),


-- 
https://code.launchpad.net/~afreiberger/charm-hw-health/+git/hw-health-charm/+merge/374838
Your team Nagios Charm developers is subscribed to branch ~nagios-charmers/charm-hw-health:master.


References