Re: [Merge] ~afreiberger/charm-hw-health:add-mdadm-checks into ~nagios-charmers/charm-hw-health:master
Review: Approve
Looks good. A few inline comments on things worth fixing before landing, in particular moving the lint pragmas into global flake8 config, since black is just going to keep producing more of them. There are also some remaining os.getcwd() calls in the test suite that can easily be replaced by making the paths relative to the TESTS_DIR constant that now exists, which makes the test suite less fragile.
Diff comments:
> diff --git a/src/files/ipmi/cron_ipmi_sensors.py b/src/files/ipmi/cron_ipmi_sensors.py
> index a11f9fb..2b6cdd5 100644
> --- a/src/files/ipmi/cron_ipmi_sensors.py
> +++ b/src/files/ipmi/cron_ipmi_sensors.py
> @@ -16,6 +15,16 @@ NAGIOS_ERRORS = {
> }
>
>
> +def write_output_file(output):
> + try:
> + with open(TMP_OUTPUT_FILE, 'w') as fd:
> + fd.write(output)
> + except IOError as e:
> + print("Cannot write output file {}, error {}".format(TMP_OUTPUT_FILE, e))
> + sys.exit(1)
> + os.rename(TMP_OUTPUT_FILE, OUTPUT_FILE)
A fixed temp filename and output filename seem OK in this case (a cron job that needs to write to a well-known location). Testing would likely be easier if they were passed in as parameters, but that can wait until someone writing tests for this actually needs it.
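If that refactor ever happens, something like this (a hypothetical sketch, not needed for this MP; it relies on the module's existing os/sys imports and constants) would keep the cron path working via defaults while letting tests pass their own paths:

    def write_output_file(output, tmp_path=TMP_OUTPUT_FILE, final_path=OUTPUT_FILE):
        # Write to a temp file first, then rename into place so readers never
        # see a partially written file.
        try:
            with open(tmp_path, 'w') as fd:
                fd.write(output)
        except IOError as e:
            print("Cannot write output file {}, error {}".format(tmp_path, e))
            sys.exit(1)
        os.rename(tmp_path, final_path)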
> +
> +
> def gather_metrics():
> # Check if a PID file exists
> if os.path.exists(CHECK_IPMI_PID):
> diff --git a/src/files/mdadm/cron_mdadm.py b/src/files/mdadm/cron_mdadm.py
> index 0b0b365..d57c44c 100755
> --- a/src/files/mdadm/cron_mdadm.py
> +++ b/src/files/mdadm/cron_mdadm.py
> @@ -31,90 +35,150 @@ def get_devices():
>
> def generate_output(msg):
> try:
> - with open(TEMP_FILE, "w") as fd:
> + with open(TEMP_FILE, 'w') as fd:
> fd.write(msg)
> shutil.move(TEMP_FILE, OUTPUT_FILE)
> return True
> except Exception as error:
> - print("Unable to generate output file:", error)
> + print('Unable to generate output file:', error)
> return False
>
>
> def get_devices_stats(devices):
> - mdadm_detail = ["/sbin/mdadm", "--detail"]
> + mdadm_detail = ['/sbin/mdadm', '--detail']
> mdadm_detail.extend(sorted(devices))
> - devices_details_raw = subprocess.check_output(mdadm_detail)
> -
> - devices_re = r"^(/\S+):$"
> - state_re = r"^\s*State\s+:\s+(\S+)$"
> - status_re = r"^\s*(Active|Working|Failed|Spare) Devices\s+:\s+(\d+)$"
> + try:
> + devices_details_raw = subprocess.check_output(mdadm_detail)
> + except subprocess.CalledProcessError as error:
> + return generate_output(
> + 'WARNING: error executing mdadm: {}'.format(error)
> + )
> +
> + devices_re = r'^(/\S+):$'
> + state_re = r'^\s*State\s+:\s+(.+)\s*$'
> + status_re = r'^\s*(Active|Working|Failed|Spare) Devices\s+:\s+(\d+)$'
> + rebuild_status_re = r'^\s*Rebuild Status\s+:\s+(\d+%\s+\S+)$'
> + removed_re = r'^\s*-\s+0\s+0\s+(\d+)\s+removed$'
> + # 4 8 162 3 spare rebuilding /dev/sdk2
> + rebuilding_re = r'^\s*\d+\s+\d+\s+\d+\s+\d+\s+\S+\s+rebuilding\s+(\S+)$'
>
> devices_cre = re.compile(devices_re)
> state_cre = re.compile(state_re)
> status_cre = re.compile(status_re)
> + rebuild_status_cre = re.compile(rebuild_status_re)
> + removed_cre = re.compile(removed_re)
> + rebuilding_cre = re.compile(rebuilding_re)
>
> device = None
> devices_stats = {}
> - for line in devices_details_raw.decode().split("\n"):
> + for line in devices_details_raw.decode().split('\n'):
> line = line.rstrip()
> m = devices_cre.match(line)
> if m:
> device = m.group(1)
> devices_stats[device] = {
> - "stats": {"Active": 0, "Working": 0, "Failed": 0, "Spare": 0},
> - "degraded": False,
> + 'stats': {
> + 'Active': 0,
> + 'Working': 0,
> + 'Failed': 0,
> + 'Spare': 0,
> + },
> + 'rebuild_status': '',
> + 'degraded': False,
> + 'recovering': False,
> + 'removed': [],
> + 'rebuilding': [],
> }
> continue
>
> m = state_cre.match(line)
> if m:
> - if "degraded" in m.group(1) and device:
> - devices_stats[device]["degraded"] = True
> + # format for State line can be "clean" or "clean, degraded" or "active, degraded, rebuilding", etc.
> + states = m.group(1).split(", ")
> + if 'degraded' in states and device:
> + devices_stats[device]['degraded'] = True
> + if 'recovering' in states and device:
> + devices_stats[device]['recovering'] = True
> continue
>
> m = status_cre.match(line)
> if m and device:
> - devices_stats[device]["stats"][m.group(1)] = int(m.group(2))
> + devices_stats[device]['stats'][m.group(1)] = int(m.group(2))
> + continue
> +
> + m = removed_cre.match(line)
> + if m and device:
> + devices_stats[device]['removed'].append(m.group(1))
> + continue
> +
> + m = rebuild_status_cre.match(line)
> + if m and device:
> + devices_stats[device]['rebuild_status'] = m.group(1)
> + continue
> +
> + m = rebuilding_cre.match(line)
> + if m and device:
> + devices_stats[device]['rebuilding'].append(m.group(1))
> + continue
> +
> return devices_stats
>
>
> def parse_output():
> devices = get_devices()
> if len(devices) == 0:
> - return generate_output("WARNING: unexpectedly checked no devices")
> + return generate_output('WARNING: unexpectedly checked no devices')
>
> - try:
> - devices_stats = get_devices_stats(devices)
> - except subprocess.CalledProcessError as error:
> - return generate_output("WARNING: error executing mdadm: {}".format(error))
> + devices_stats = get_devices_stats(devices)
> + if isinstance(devices_stats, bool):
> + # if the device_stats is boolean, generate_output was already called
Yuck. A well-deserved comment for what seems like a poor and surprising API.
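The simplest alternative (just a sketch) would be to keep parse_output()'s original try/except and let get_devices_stats() raise, so its return type stays a plain dict:

    def parse_output():
        devices = get_devices()
        if len(devices) == 0:
            return generate_output('WARNING: unexpectedly checked no devices')
        try:
            # get_devices_stats() no longer catches CalledProcessError itself
            devices_stats = get_devices_stats(devices)
        except subprocess.CalledProcessError as error:
            return generate_output('WARNING: error executing mdadm: {}'.format(error))
        ...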
> + return devices_stats
>
> msg = []
> critical = False
> + warning = False
> for device in devices_stats:
> + parts = []
> # Is device degraded?
> - if devices_stats[device]["degraded"]:
> + if devices_stats[device]['degraded'] and devices_stats[device]['recovering']:
> + warning = True
> + parts = ['{} recovering'.format(device)]
> + elif devices_stats[device]['degraded']:
> critical = True
> - parts = ["{} degraded".format(device)]
> + parts = ['{} degraded'.format(device)]
> else:
> - parts = ["{} ok".format(device)]
> + parts = ['{} ok'.format(device)]
>
> # If Failed drives are found, list counters (how many?)
> - failed_cnt = devices_stats[device]["stats"].get("Failed", 0)
> + failed_cnt = devices_stats[device]['stats'].get('Failed', 0)
> if failed_cnt > 0:
> critical = True
> dev_stats = [
> - "{}[{}]".format(status, devices_stats[device]["stats"][status])
> - for status in sorted(devices_stats[device]["stats"])
> + '{}[{}]'.format(status, devices_stats[device]['stats'][status])
> + for status in sorted(devices_stats[device]['stats'])
> ]
> parts.extend(dev_stats)
> - msg.append(", ".join(parts))
> +
> + if len(devices_stats[device]['removed']) != 0:
> + critical = True
> + members = " and ".join(devices_stats[device]['removed'])
> + parts.append('RaidDevice(s) {} marked removed'.format(members))
> +
> + if len(devices_stats[device]['rebuilding']) != 0:
> + rebuilding_members = " ".join(devices_stats[device]['rebuilding'])
> + rebuild_status = devices_stats[device]['rebuild_status']
> + parts.append('{} rebuilding ({})'.format(rebuilding_members, rebuild_status))
> +
> + msg.append(', '.join(parts))
>
> if critical:
> - msg = "CRITICAL: {}".format("; ".join(msg))
> + msg = 'CRITICAL: {}'.format('; '.join(msg))
> + elif warning:
> + msg = 'WARNING: {}'.format('; '.join(msg))
> else:
> - msg = "OK: {}".format("; ".join(msg))
> + msg = 'OK: {}'.format('; '.join(msg))
> return generate_output(msg)
>
>
> -if __name__ == "__main__":
> +if __name__ == '__main__':
> parse_output()
> diff --git a/src/tests/download_nagios_plugin3.py b/src/tests/download_nagios_plugin3.py
> index 173fa31..b454caf 100755
> --- a/src/tests/download_nagios_plugin3.py
> +++ b/src/tests/download_nagios_plugin3.py
> @@ -18,7 +18,7 @@ def content():
>
>
> def main():
> - for i in glob('.tox/unit/lib/python3*'):
> + for i in glob('.tox/unit/lib/python3*/site-packages'):
> mod_path = os.path.join(i, MODULE_NAME)
> if os.path.isdir(i) and not os.path.exists(mod_path):
> open(mod_path, 'wb').write(content())
o_O Tests that self-modify their own test environment? One that tox expects to be maintaining? It's existing code, so fine, but future travelers might consider adding an entry to sys.path for the plugins, one that the test suite can write to and clean up afterwards.
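For example (hypothetical, and only if someone revisits this), a pytest conftest.py could give the run its own writable plugin directory instead of touching the tox virtualenv:

    import shutil
    import sys
    import tempfile

    import pytest


    @pytest.fixture(scope='session', autouse=True)
    def nagios_plugin_dir():
        # Writable directory for the downloaded plugin module, added to
        # sys.path for the session and cleaned up when it ends.
        plugin_dir = tempfile.mkdtemp(prefix='nagios-plugins-')
        sys.path.insert(0, plugin_dir)
        try:
            yield plugin_dir
        finally:
            sys.path.remove(plugin_dir)
            shutil.rmtree(plugin_dir)

The download step (or the fixture itself) would then write the plugin module into that directory rather than into site-packages.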
> diff --git a/src/tests/functional/test_hwhealth.py b/src/tests/functional/test_hwhealth.py
> index f16878f..4dcc0d9 100644
> --- a/src/tests/functional/test_hwhealth.py
> +++ b/src/tests/functional/test_hwhealth.py
> @@ -65,10 +69,17 @@ async def deploy_app(request, model):
> hw_health_checksum_app_name = 'hw-health-checksum-{}'.format(release)
>
> for principal_app in ['ubuntu', 'nagios']:
> + relname = series = release
> + if principal_app == "nagios" and release == "focal":
> + # NOTE(aluria): cs:nagios was not available in focal
> + # On the other hand, bionic testing would create nagios-bionic
> + # hence nagios-bionic2
Should this comment also mention that this workaround should be removed once cs:nagios exists with focal support?
> + relname = "bionic2"
> + series = "bionic"
> await model.deploy(
> principal_app,
> - application_name='{}-{}'.format(principal_app, release),
> - series=release,
> + application_name='{}-{}'.format(principal_app, relname),
> + series=series,
> channel=channel,
> )
> await model.deploy('ubuntu', application_name='ubuntu-checksum-{}'.format(release),
> @@ -117,7 +128,7 @@ async def deploy_app(request, model):
> # The app will initially be in blocked state because it's running in a
> # container
> await model.block_until(
> - lambda: (hw_health_app.status == 'blocked' and
> + lambda: (hw_health_app.status == 'blocked' and # noqa: W504
Warnings introduced by black are best fixed by ignoring them in the flake8 config (a [flake8] section in tox.ini is probably appropriate for this project), e.g. ignore = W503,W504,E203,E231. Black and flake8 disagree about PEP 8, and the disagreement is documented, along with evidence supporting black's side.
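i.e. something along these lines in tox.ini:

    [flake8]
    ignore = W503,W504,E203,E231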
> hw_health_checksum_app.status == 'blocked'),
> timeout=600
> )
> @@ -190,7 +201,7 @@ async def test_checksum_forced_deploy(deploy_app, model, run_command):
>
> await hw_health_checksum_app.set_config({'manufacturer': 'test'})
> await model.block_until(
> - lambda: (hw_health_checksum_app.status == 'blocked' and
> + lambda: (hw_health_checksum_app.status == 'blocked' and # noqa: W504
Project config, not inline # noqa
> hw_health_checksum_unit.workload_status_message == 'Tool megacli - checksum error'),
> timeout=600)
>
> @@ -206,7 +217,7 @@ async def test_checksum_updated_resource_missing(deploy_app, model):
> break
>
> await model.block_until(
> - lambda: (hw_health_checksum_app.status == 'blocked' and
> + lambda: (hw_health_checksum_app.status == 'blocked' and # noqa: W504
here
> hw_health_checksum_unit.workload_status_message == 'Tool megacli not found'),
> timeout=600
> )
> @@ -223,7 +234,7 @@ async def test_checksum_updated_resource_ok(deploy_app, model):
> break
>
> await model.block_until(
> - lambda: (hw_health_checksum_app.status == 'active' and
> + lambda: (hw_health_checksum_app.status == 'active' and # noqa: W504
here
> hw_health_checksum_unit.workload_status_message == 'ready'),
> timeout=600
> )
> diff --git a/src/tests/unit/test_cron_mdadm.py b/src/tests/unit/test_cron_mdadm.py
> index d124931..9aa75f2 100644
> --- a/src/tests/unit/test_cron_mdadm.py
> +++ b/src/tests/unit/test_cron_mdadm.py
> @@ -100,3 +104,35 @@ class TestCronMdadm(unittest.TestCase):
> ' /dev/md2 ok')
> self.assertTrue(cron_mdadm.parse_output())
> genout.assert_called_once_with(expected)
> +
> + @mock.patch('cron_mdadm.generate_output')
> + @mock.patch('cron_mdadm.get_devices')
> + @mock.patch('subprocess.check_output')
> + def test_parse_output_removed(self, mdadm_details, devices, genout):
> + test_output = os.path.join(
> + os.getcwd(), 'tests', 'hw-health-samples',
Best to make this relative to TESTS_DIR rather than os.getcwd(), which makes the tests more flexible in how they are run (from a Makefile at the top level, with a test runner like pytest (or is it nose?) that implicitly changes the cwd, etc.).
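i.e. something like this (assuming TESTS_DIR resolves to the src/tests directory, as in test_cron_mdadm.py):

    test_output = os.path.join(
        TESTS_DIR, 'hw-health-samples', 'mdadm.output.critical.2')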
> + 'mdadm.output.critical.2')
> + with open(test_output, 'r') as fd:
> + mdadm_details.return_value = ''.join(fd.readlines()).encode()
> +
> + devices.return_value = set(['/dev/md1'])
> + genout.return_value = True
> + expected = ('CRITICAL: /dev/md1 degraded, RaidDevice(s) 2 marked removed')
> + self.assertTrue(cron_mdadm.parse_output())
> + genout.assert_called_once_with(expected)
> +
> + @mock.patch('cron_mdadm.generate_output')
> + @mock.patch('cron_mdadm.get_devices')
> + @mock.patch('subprocess.check_output')
> + def test_parse_output_rebuilding(self, mdadm_details, devices, genout):
> + test_output = os.path.join(
> + os.getcwd(), 'tests', 'hw-health-samples',
Here too... relative to TESTS_DIR rather than getcwd()
> + 'mdadm.output.warning')
> + with open(test_output, 'r') as fd:
> + mdadm_details.return_value = ''.join(fd.readlines()).encode()
> +
> + devices.return_value = set(['/dev/md0', '/dev/md1', '/dev/md2'])
> + genout.return_value = True
> + expected = ('WARNING: /dev/md1 recovering, /dev/sdk2 rebuilding (13% complete)')
> + self.assertTrue(cron_mdadm.parse_output())
> + genout.assert_called_once_with(expected)
> diff --git a/src/tests/unit/test_hwdiscovery.py b/src/tests/unit/test_hwdiscovery.py
> index 6e5bf3d..0590132 100644
> --- a/src/tests/unit/test_hwdiscovery.py
> +++ b/src/tests/unit/test_hwdiscovery.py
> @@ -83,8 +83,9 @@ class TestGetTools(unittest.TestCase):
> 'lshw.supermicro.sas.02.json': set(['Nvme', 'Sas3Ircu']),
> }
>
> - for filename in glob.glob(os.path.join(
> - os.getcwd(), 'tests/hw-health-samples/lshw.*.json')):
> + for filename in glob.glob(os.path.join(os.getcwd(),
And here... although you will need to duplicate the TESTS_DIR definition from test_cron_mdadm.py
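For reference, one way to define it here (an assumption; copy whatever test_cron_mdadm.py actually does, this just resolves to src/tests) and drop the getcwd() call:

    TESTS_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    for filename in glob.glob(
            os.path.join(TESTS_DIR, 'hw-health-samples', 'lshw.*.json')):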
> + 'tests/hw-health-samples/lshw.*.json')):
> +
> mock_hwinfo.return_value = Hardware(filename)
> actual = hwdiscovery._get_tools()
> if os.path.basename(filename) in TOOLS:
--
https://code.launchpad.net/~afreiberger/charm-hw-health/+git/hw-health-charm/+merge/374838
Your team Nagios Charm developers is subscribed to branch ~nagios-charmers/charm-hw-health:master.