cf-charmers team mailing list archive
-
cf-charmers team
-
Mailing list archive
-
Message #00625
[Merge] lp:~johnsca/charms/trusty/cloudfoundry/better-basic-reconciler-status into lp:~cf-charmers/charms/trusty/cloudfoundry/trunk
Cory Johns has proposed merging lp:~johnsca/charms/trusty/cloudfoundry/better-basic-reconciler-status into lp:~cf-charmers/charms/trusty/cloudfoundry/trunk.
Requested reviews:
Cloud Foundry Charmers (cf-charmers)
For more details, see:
https://code.launchpad.net/~johnsca/charms/trusty/cloudfoundry/better-basic-reconciler-status/+merge/242372
Since the monit status is so unreliable, switch the basic health check to use the self-reported Juju status until we have a check that introspects service health more deeply.
--
Your team Cloud Foundry Charmers is requested to review the proposed merge of lp:~johnsca/charms/trusty/cloudfoundry/better-basic-reconciler-status into lp:~cf-charmers/charms/trusty/cloudfoundry/trunk.
=== modified file 'cloudfoundry/health_checks.py'
--- cloudfoundry/health_checks.py 2014-08-24 21:36:50 +0000
+++ cloudfoundry/health_checks.py 2014-11-20 16:14:14 +0000
@@ -1,3 +1,4 @@
+from charmhelpers.core import hookenv
from cloudfoundry import tasks
@@ -17,3 +18,21 @@
message='not all services running',
data={'services': summary})
return result
+
+
+def status(service):
+ result = {
+ 'name': 'monit_summary',
+ 'health': 'pass',
+ 'message': None,
+ 'data': {},
+ }
+ status = hookenv.juju_status()
+ if status['status'] == 'error':
+ return dict(result, health='fail', message=status['message'])
+ elif status['status'] == 'blocked' and status['manual']:
+ return dict(result, health='fail', message='Blocked: %s' % status['blockers'])
+ elif status['status'] == 'up':
+ return result
+ else:
+ return dict(result, health='warn', message='Working (%s)' % status['status'])
=== modified file 'cloudfoundry/jobs.py'
--- cloudfoundry/jobs.py 2014-10-03 15:47:17 +0000
+++ cloudfoundry/jobs.py 2014-11-20 16:14:14 +0000
@@ -68,7 +68,7 @@
service_def = service_data[charm_name]
results = []
health = 'pass'
- checks = service_def.get('health', []) + [health_checks.monit_summary]
+ checks = service_def.get('health', []) + [health_checks.status]
for health_check in checks:
result = health_check(service_def)
if result['health'] == 'fail':
=== modified file 'reconciler/app.py'
--- reconciler/app.py 2014-11-14 17:00:08 +0000
+++ reconciler/app.py 2014-11-20 16:14:14 +0000
@@ -83,13 +83,14 @@
units = service.get('Units', {}) or {}
for unit_name, unit in units.iteritems():
unit_addr = unit.get('PublicAddress')
+ unit_state = unit.get('AgentState')
if unit_addr:
loop = tornado.ioloop.IOLoop.instance()
loop.add_callback(check_health, service_name,
- unit_name, unit_addr)
-
-
-def check_health(service_name, unit_name, unit_addr):
+ unit_name, unit_addr, unit_state)
+
+
+def check_health(service_name, unit_name, unit_addr, unit_state):
service = health.setdefault(service_name, {
'name': service_name,
'health': 'unknown',
@@ -118,8 +119,13 @@
unit['health'] = 'fail'
unit['state'] = {'message': 'Unable to parse health: {}'.format(output)}
except subprocess.CalledProcessError as e:
- unit['health'] = 'warn'
unit['state'] = {'message': 'Unable to retrieve health: {}'.format(e.output)}
+ if unit_state == 'started':
+ unit['health'] = 'pass'
+ elif unit_state == 'error':
+ unit['health'] = 'fail'
+ else:
+ unit['health'] = 'warn'
units_fail = [u['health'] == 'fail' for u in service['units'].values()]
units_not_pass = [u['health'] != 'pass' for u in service['units'].values()]
Follow ups