bigdata-dev team mailing list archive
-
bigdata-dev team
-
Mailing list archive
-
Message #00083
[Merge] lp:~aisrael/charms/trusty/apache-hadoop-client/benchmarks into lp:~bigdata-dev/charms/trusty/apache-hadoop-client/trunk
Adam Israel has proposed merging lp:~aisrael/charms/trusty/apache-hadoop-client/benchmarks into lp:~bigdata-dev/charms/trusty/apache-hadoop-client/trunk.
Requested reviews:
Juju Big Data Development (bigdata-dev)
For more details, see:
https://code.launchpad.net/~aisrael/charms/trusty/apache-hadoop-client/benchmarks/+merge/260526
This merge proposal adds support for benchmarking, and implements a 'terasort' benchmark. This adds two external dependencies: python-pip (which may already be installed via other requirements) and charm-benchmark, which is installed via pip.
--
Your team Juju Big Data Development is requested to review the proposed merge of lp:~aisrael/charms/trusty/apache-hadoop-client/benchmarks into lp:~bigdata-dev/charms/trusty/apache-hadoop-client/trunk.
=== added directory 'actions'
=== added file 'actions.yaml'
--- actions.yaml 1970-01-01 00:00:00 +0000
+++ actions.yaml 2015-05-28 21:01:32 +0000
@@ -0,0 +1,38 @@
teragen:
  description: Generate a sample dataset in HDFS for the terasort benchmark.
  params:
    size:
      description: The number of 100 byte rows, default to 100MB of data to generate and sort
      type: string
      default: "10000000"
    indir:
      description: HDFS directory in which the generated data is stored.
      type: string
      default: 'tera_demo_in'
terasort:
  description: Run the terasort benchmark (generating input data first if needed).
  params:
    indir:
      description: HDFS directory containing the data to sort.
      type: string
      default: 'tera_demo_in'
    outdir:
      description: HDFS directory in which the sorted output is written.
      type: string
      default: 'tera_demo_out'
    size:
      description: The number of 100 byte rows, default to 100MB of data to generate and sort
      type: string
      default: "10000000"
    maps:
      description: The default number of map tasks per job. 1-20
      type: integer
      default: 1
    reduces:
      description: The default number of reduce tasks per job. Typically set to 99% of the cluster's reduce capacity, so that if a node fails the reduces can still be executed in a single wave. Try 1-20
      type: integer
      default: 1
    numtasks:
      description: How many tasks to run per jvm. If set to -1, there is no limit.
      type: integer
      default: 1
=== added file 'actions/parseTerasort.py'
--- actions/parseTerasort.py 1970-01-01 00:00:00 +0000
+++ actions/parseTerasort.py 2015-05-28 21:01:32 +0000
@@ -0,0 +1,54 @@
#!/usr/bin/env python
"""
Parse the counter output of a Hadoop terasort run and reformat it as
JSON action results for sending back to juju.
"""
import json
import re
import subprocess
import sys

from charmhelpers.contrib.benchmark import Benchmark
+
+
def action_set(key, val):
    """Publish a juju action result via the ``action-set`` CLI tool.

    Nested dicts are flattened into dotted keys, e.g. ``{'a': {'b': 1}}``
    is emitted as ``a.b=1``.
    """
    action_cmd = ['action-set']
    if isinstance(val, dict):
        # items() (not the Python-2-only iteritems()) keeps this script
        # runnable under both Python 2 and 3.
        for k, v in val.items():
            action_set('%s.%s' % (key, k), v)
        return

    action_cmd.append('%s=%s' % (key, val))
    subprocess.check_call(action_cmd)
+
+
def parse_terasort_output():
    """Read terasort job counters from stdin and publish action results.

    Counter lines of the form ``\\t<name>=<value>`` are collected into a
    dict and posted raw via action-set. If the CPU/GC counters are
    present, their sum (in ms, lower is better) is registered as the
    composite benchmark score.
    """
    results = {}

    # Find all of the interesting things: counters are tab-indented
    # "name=value" pairs.
    regex = re.compile(r'\t+(.*)=(.*)')
    for line in sys.stdin.readlines():
        m = regex.match(line)
        if m:
            results[m.group(1)] = m.group(2)
    action_set("results.raw", json.dumps(results))

    # Calculate what's important
    if 'CPU time spent (ms)' in results:
        composite = int(results['CPU time spent (ms)']) + int(results['GC time elapsed (ms)'])
        Benchmark.set_composite_score(
            composite,
            'ms',
            'asc'
        )
    else:
        # Single-argument print() is valid on both Python 2 and 3;
        # the bare print statement form would break under Python 3.
        print('Invalid test results')
        print(results)
+
+
+if __name__ == "__main__":
+ parse_terasort_output()
=== added file 'actions/teragen'
--- actions/teragen 1970-01-01 00:00:00 +0000
+++ actions/teragen 2015-05-28 21:01:32 +0000
@@ -0,0 +1,21 @@
#!/bin/bash
# teragen action: generate sample data in HDFS for the terasort benchmark.
set -eux
# Action parameters (declared in actions.yaml): row count and HDFS input dir.
SIZE=`action-get size`
IN_DIR=`action-get indir`

benchmark-start

# I don't know why, but have to source /etc/environment before and after
# invoking the bash shell to get it working.
# NOTE(review): the heredoc below is unquoted, so ${JAVA_HOME}/${IN_DIR}/etc.
# are expanded by THIS shell (after sourcing /etc/environment) before being
# handed to the ubuntu user's shell — the ordering appears deliberate.
. /etc/environment
su ubuntu << EOF
. /etc/environment
if JAVA_HOME=${JAVA_HOME} hadoop fs -stat ${IN_DIR}; then
  JAVA_HOME=${JAVA_HOME} hadoop fs -rm -r -skipTrash ${IN_DIR} || true
fi

JAVA_HOME=${JAVA_HOME} hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples*.jar teragen ${SIZE} ${IN_DIR}

EOF

benchmark-finish
=== added file 'actions/terasort'
--- actions/terasort 1970-01-01 00:00:00 +0000
+++ actions/terasort 2015-05-28 21:01:32 +0000
@@ -0,0 +1,49 @@
#!/bin/bash
# terasort action: run the terasort benchmark and report results to juju.

# Action parameters (declared in actions.yaml).
IN_DIR=$(action-get indir)
OUT_DIR=$(action-get outdir)
SIZE=$(action-get size)
OPTIONS=''

MAPS=$(action-get maps)
REDUCES=$(action-get reduces)
NUMTASKS=$(action-get numtasks)

# Pass the tunables through to mapreduce as -D properties.
OPTIONS="${OPTIONS} -D mapreduce.job.maps=${MAPS}"
OPTIONS="${OPTIONS} -D mapreduce.job.reduces=${REDUCES}"
OPTIONS="${OPTIONS} -D mapreduce.job.jvm.numtasks=${NUMTASKS}"

# Each run logs into its own timestamped results directory.
mkdir -p /opt/terasort
chown ubuntu:ubuntu /opt/terasort
run=$(date +%s)

# HACK: the environment reset below is munging the PATH
OLDPATH=$PATH


# I don't know why, but have to source /etc/environment before and after
# invoking the bash shell to get it working.
. /etc/environment
su ubuntu << EOF
. /etc/environment

mkdir -p /opt/terasort/results/$run

# If there's no data generated yet, create it using the action defaults
if ! JAVA_HOME=${JAVA_HOME} hadoop fs -stat ${IN_DIR} &> /dev/null; then
    JAVA_HOME=${JAVA_HOME} hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples*.jar teragen ${SIZE} ${IN_DIR} > /dev/null
fi

# If there's already sorted data, remove it
if JAVA_HOME=${JAVA_HOME} hadoop fs -stat ${OUT_DIR} &> /dev/null; then
    JAVA_HOME=${JAVA_HOME} hadoop fs -rm -r -skipTrash ${OUT_DIR} || true
fi

benchmark-start
JAVA_HOME=${JAVA_HOME} hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples*.jar terasort ${OPTIONS} ${IN_DIR} ${OUT_DIR} &> /opt/terasort/results/$run/terasort.log
benchmark-finish

EOF
PATH=$OLDPATH

# Feed the job counters to the parser, which posts them via action-set.
# BUGFIX: the previous version wrapped this pipeline in backticks, which
# made bash execute the parser's stdout as a command (failing whenever the
# parser printed anything, e.g. "Invalid test results").
python $CHARM_DIR/actions/parseTerasort.py < /opt/terasort/results/$run/terasort.log
=== added file 'hooks/benchmark-relation-changed'
--- hooks/benchmark-relation-changed 1970-01-01 00:00:00 +0000
+++ hooks/benchmark-relation-changed 2015-05-28 21:01:32 +0000
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+relation-set benchmarks=terasort
=== modified file 'hooks/install'
--- hooks/install 2015-05-11 22:25:12 +0000
+++ hooks/install 2015-05-28 21:01:32 +0000
@@ -1,2 +1,4 @@
#!/bin/bash
+apt-get install -y python-pip && pip install -U charm-benchmark
+
hooks/status-set blocked "Please add relation to apache-hadoop-plugin"
=== added symlink 'hooks/upgrade-charm'
=== target is u'install'
=== modified file 'metadata.yaml'
--- metadata.yaml 2015-05-12 22:18:09 +0000
+++ metadata.yaml 2015-05-28 21:01:32 +0000
@@ -12,3 +12,5 @@
hadoop-plugin:
interface: hadoop-plugin
scope: container
+ benchmark:
+ interface: benchmark
Follow ups