← Back to team overview

livepatch-charmers team mailing list archive

[Merge] ~barryprice/canonical-livepatch-charm/+git/canonical-livepatch-charm:master into canonical-livepatch-charm:master

 

Barry Price has proposed merging ~barryprice/canonical-livepatch-charm/+git/canonical-livepatch-charm:master into canonical-livepatch-charm:master.

Commit message:
Clearer check output, and warn if the client output format changes (i.e. if checkState or patchState go away).

Requested reviews:
  Livepatch charm developers (livepatch-charmers)
Related bugs:
  Bug #1794327 in Canonical Livepatch Charm: "Nagios check can miss output changes"
  https://bugs.launchpad.net/canonical-livepatch-charm/+bug/1794327

For more details, see:
https://code.launchpad.net/~barryprice/canonical-livepatch-charm/+git/canonical-livepatch-charm/+merge/355822
-- 
Your team Livepatch charm developers is requested to review the proposed merge of ~barryprice/canonical-livepatch-charm/+git/canonical-livepatch-charm:master into canonical-livepatch-charm:master.
diff --git a/files/check_canonical-livepatch.py b/files/check_canonical-livepatch.py
index 7878169..f9a0c24 100755
--- a/files/check_canonical-livepatch.py
+++ b/files/check_canonical-livepatch.py
@@ -5,72 +5,122 @@
 import os
 import nagios_plugin
 from subprocess import check_output, call
+from yaml import safe_load
 
 supported_archs = ['x86_64']
 
 
 ##############################################################################
 
-def check_package_installed():
+def check_snap_installed():
+    """Confirm the snap is installed, raise an error if not"""
     cmd = ['snap', 'list', 'canonical-livepatch']
     try:
         check_output(cmd, universal_newlines=True)
     except Exception:
-        raise nagios_plugin.CriticalError("canonical-livepatch snap is not installed")
+        raise nagios_plugin.CriticalError('canonical-livepatch snap is not installed')
 
 
 ##############################################################################
 
-def check_vmlinuz():
-    vmlinuz = '/vmlinuz'
-    if os.path.exists(vmlinuz):
-        full_kernel_path = os.path.realpath(vmlinuz)
-    elif os.path.exists('/boot' + vmlinuz):
-        vmlinuz = '/boot' + vmlinuz
-        full_kernel_path = os.path.realpath(vmlinuz)
-    else:
-        return 'no /vmlinuz or /boot/vmlinuz'
-    kernel_filename = os.path.basename(full_kernel_path)
-    # remove 'vmlinuz'-' from start:
-    kernel_version = '-'.join(kernel_filename.split('-', 1)[1:])
-    # check for '-generic-pae' kernels that need two removes:
-    if '-generic-pae' in kernel_version:
-        kernel_version = '-'.join(kernel_version.split('-')[:-1])
-    # remove e.g. '-generic' from end:
-    kernel_version = '-'.join(kernel_version.split('-')[:-1])
-    return kernel_version.strip()
+def parse_status():
+    """Load the cached status from disk, return it as a string"""
+    livepatch_output_path = '/var/lib/nagios/canonical-livepatch-status.txt'
+
+    with open(livepatch_output_path, 'r') as canonical_livepatch_log:
+        livepatch_status = canonical_livepatch_log.read()
+
+    return livepatch_status
+
+
+##############################################################################
+
+def check_enabled():
+    """Confirm machine is enabled, raise an error if not"""
+    livepatch_status = parse_status()
+    if 'Machine is not enabled' in livepatch_status:
+        raise nagios_plugin.CriticalError('Machine is not enabled.')
+
+
+##############################################################################
+
+def active_kernel_version():
+    """Return the active kernel version, from livepatch's perspective"""
+    livepatch_status = parse_status()
+    status_yaml = safe_load(livepatch_status)
+    for kernel in status_yaml.get('status'):
+        if kernel.get('running') is True:
+            return kernel.get('kernel')
 
 
 ##############################################################################
 
 def check_status():
-    livepatch_output_path = '/var/lib/nagios/canonical-livepatch-status.txt'
-    err_lines = []
-    wrn_lines = []
+    """Check the cached status, raise an error if we find any issues"""
+    livepatch_status = parse_status()
+    err = ''
 
-    with open(livepatch_output_path, 'r') as canonical_livepatch_log:
-        for line in canonical_livepatch_log:
-            line = line.strip()
-            if 'State:' in line:
-                if 'apply-failed' in line:
-                    err_lines.append('Livepatch failed to apply patches.')
-                elif 'check-failed' in line:
-                    err_lines.append('Livepatch failed to check the remote service for patches.')
-                elif 'unknown' in line:
-                    err_lines.append('Livepatch reports an unknown error.')
-                elif 'kernel-upgrade-required' in line:
-                    err_lines.append('A kernel upgrade (and reboot) is required.')
-            elif 'Machine is not enabled' in line:
-                err_lines.append('Machine is not enabled.')
-
-    if err_lines:
-        err = " ".join(err_lines)
+    status_yaml = safe_load(livepatch_status)
+
+    for kernel in status_yaml.get('status'):
+        if kernel.get('running') is True:
+            check_state = kernel.get('livepatch').get('checkState')
+            patch_state = kernel.get('livepatch').get('patchState')
+
+    check_state_errors = check_check_state(check_state)
+    patch_state_errors = check_patch_state(patch_state)
+
+    if check_state_errors:
+        err = err + ' '.join(check_state_errors)
+
+    if err != '':
+        err = err + ' '
+
+    if patch_state_errors:
+        err = err + ' '.join(patch_state_errors)
+
+    if err != '':
         raise nagios_plugin.CriticalError(err)
-    elif wrn_lines:
-        wrn = " ".join(wrn_lines)
-        raise nagios_plugin.WarnError(wrn)
 
 
+##############################################################################
+
+def check_check_state(check_state):
+    """Check for issues with checkState, including unexpected output"""
+    error_list = []
+    if check_state in ['checked', 'needs-check']:
+        pass
+    elif check_state == 'check-failed':
+        error_list.append('Livepatch failed to check the remote service for patches.')
+    else:
+        error_list.append('Unknown check state: {}'.format(check_state))
+
+    return error_list
+
+
+##############################################################################
+
+def check_patch_state(patch_state):
+    """Check for issues with patchState, including unexpected output"""
+    error_list = []
+    if patch_state in ['applied', 'nothing-to-apply']:
+        pass
+    elif patch_state == 'unapplied':
+        error_list.append('Livepatch has not applied the downloaded patches.')
+    elif patch_state == 'applied-with-bug':
+        error_list.append('Livepatch has detected a kernel bug while applying patches.')
+    elif patch_state == 'apply-failed':
+        error_list.append('Livepatch failed to apply patches.')
+    elif patch_state == 'kernel-upgrade-required':
+        error_list.append('A kernel upgrade (and reboot) is required.')
+    else:
+        error_list.append('Unknown patch state: {}'.format(patch_state))
+
+    return error_list
+
+
+##############################################################################
+
 def lsb_release():
     """Return /etc/lsb-release in a dict"""
     d = {}
@@ -81,6 +131,8 @@ def lsb_release():
     return d
 
 
+##############################################################################
+
 def init_is_systemd():
     """Return True if the host system uses systemd, False otherwise."""
     if lsb_release()['DISTRIB_CODENAME'] == 'trusty':
@@ -88,6 +140,8 @@ def init_is_systemd():
     return os.path.isdir('/run/systemd/system')
 
 
+##############################################################################
+
 def is_container():
     """Determine whether unit is running in a container
 
@@ -112,9 +166,11 @@ def main():
     elif is_container():
         print("canonical-livepatch not needed in OS containers.")
     else:
-        nagios_plugin.try_check(check_package_installed)
+        nagios_plugin.try_check(check_snap_installed)
+        nagios_plugin.try_check(check_enabled)
         nagios_plugin.try_check(check_status)
-        print("OK - canonical-livepatch seems to be installed and working")
+        kernel_version = active_kernel_version()
+        print("OK - canonical-livepatch is active on kernel {}".format(kernel_version))
 
 
 ##############################################################################

Follow ups