bigdata-dev team mailing list archive
-
bigdata-dev team
-
Mailing list archive
-
Message #00293
[Merge] lp:~bigdata-dev/charms/trusty/apache-zeppelin/trunk into lp:charms/trusty/apache-zeppelin
Kevin W Monroe has proposed merging lp:~bigdata-dev/charms/trusty/apache-zeppelin/trunk into lp:charms/trusty/apache-zeppelin.
Requested reviews:
Juju Big Data Development (bigdata-dev)
For more details, see:
https://code.launchpad.net/~bigdata-dev/charms/trusty/apache-zeppelin/trunk/+merge/271385
--
Your team Juju Big Data Development is requested to review the proposed merge of lp:~bigdata-dev/charms/trusty/apache-zeppelin/trunk into lp:charms/trusty/apache-zeppelin.
=== modified file 'hooks/callbacks.py'
--- hooks/callbacks.py 2015-08-25 02:14:22 +0000
+++ hooks/callbacks.py 2015-09-16 21:19:37 +0000
@@ -89,23 +89,29 @@
# default env). Include our own tutorial, which does work in a
# spark+hdfs env. Inspiration for this notebook came from here:
# https://github.com/apache/incubator-zeppelin/pull/46
- tutorial_source = Path('resources/hdfs-tutorial')
- tutorial_source.copytree(self.dist_config.path('zeppelin_notebooks') / 'hdfs-tutorial')
-
- # move the tutorial dir included in the tarball to our notebook dir and
- # symlink that dir under our zeppelin home. we've seen issues where
- # zepp doesn't honor ZEPPELIN_NOTEBOOK_DIR and instead looks for
- # notebooks in ZEPPELIN_HOME/notebook.
notebook_dir = self.dist_config.path('zeppelin_notebooks')
dist_notebook_dir = self.dist_config.path('zeppelin') / 'notebook'
dist_tutorial_dir = dist_notebook_dir.dirs()[0]
dist_tutorial_dir.move(notebook_dir)
+ self.copy_tutorial("hdfs-tutorial")
+ self.copy_tutorial("flume-tutorial")
dist_notebook_dir.rmtree_p()
+ # move the tutorial dir included in the tarball to our notebook dir and
+ # symlink that dir under our zeppelin home. we've seen issues where
+ # zepp doesn't honor ZEPPELIN_NOTEBOOK_DIR and instead looks for
+ # notebooks in ZEPPELIN_HOME/notebook.
notebook_dir.symlink(dist_notebook_dir)
# make sure the notebook dir's contents are owned by our user
cmd = "chown -R ubuntu:hadoop {}".format(notebook_dir)
call(cmd.split())
+
+
+ def copy_tutorial(self, tutorial_name):
+ tutorial_source = Path('resources/{}'.format(tutorial_name))
+ tutorial_source.copytree(self.dist_config.path('zeppelin_notebooks') / tutorial_name)
+
+
def configure_zeppelin(self):
'''
=== added directory 'resources/flume-tutorial'
=== added file 'resources/flume-tutorial/note.json'
--- resources/flume-tutorial/note.json 1970-01-01 00:00:00 +0000
+++ resources/flume-tutorial/note.json 2015-09-16 21:19:37 +0000
@@ -0,0 +1,337 @@
+{
+ "paragraphs": [
+ {
+      "text": "%md\n## Welcome to Realtime Syslog Analytic Tutorial Powered by Juju.\n### In this live tutorial we will demonstrate three main phases of any big data solution:\n#### 1. Data Ingestion - Apache Flume-syslog -\u003e Apache flume-hdfs\n#### 2. Data Processing - Apache Spark+YARN\n#### 3. Data Visualization - SparkSQL",
+ "config": {
+ "colWidth": 12.0,
+ "graph": {
+ "mode": "table",
+ "height": 300.0,
+ "optionOpen": false,
+ "keys": [],
+ "values": [],
+ "groups": [],
+ "scatter": {}
+ }
+ },
+ "settings": {
+ "params": {},
+ "forms": {}
+ },
+ "jobName": "paragraph_1440101679810_1108841391",
+ "id": "20150820-151439_133078543",
+ "result": {
+ "code": "SUCCESS",
+ "type": "HTML",
+        "msg": "\u003ch2\u003eWelcome to Realtime Syslog Analytic Tutorial Powered by Juju.\u003c/h2\u003e\n\u003ch3\u003eIn this live tutorial we will demonstrate three main phases of any big data solution:\u003c/h3\u003e\n\u003ch4\u003e1. Data Ingestion - Apache Flume-syslog -\u003e Apache flume-hdfs\u003c/h4\u003e\n\u003ch4\u003e2. Data Processing - Apache Spark+YARN\u003c/h4\u003e\n\u003ch4\u003e3. Data Visualization - SparkSQL\u003c/h4\u003e\n"
+ },
+ "dateCreated": "Aug 20, 2015 3:14:39 PM",
+ "dateStarted": "Aug 25, 2015 9:34:23 AM",
+ "dateFinished": "Aug 25, 2015 9:34:23 AM",
+ "status": "FINISHED",
+ "progressUpdateIntervalMs": 500
+ },
+ {
+ "title": "Data Ingestion",
+      "text": "import sys.process._\n// Generate syslog messages by running a spark job\n\"/home/ubuntu/sparkpi.sh\" !!\n// Verify that Flume has collected and sent the syslog messages to HDFS\n\"hadoop fs -ls -R /user/flume/flume-syslog\" !!",
+ "config": {
+ "colWidth": 12.0,
+ "graph": {
+ "mode": "table",
+ "height": 300.0,
+ "optionOpen": false,
+ "keys": [],
+ "values": [],
+ "groups": [],
+ "scatter": {}
+ },
+ "title": true
+ },
+ "settings": {
+ "params": {},
+ "forms": {}
+ },
+ "jobName": "paragraph_1440112183363_1890510694",
+ "id": "20150820-180943_1527660289",
+ "result": {
+ "code": "SUCCESS",
+ "type": "TEXT",
+ "msg": "" },
+ "dateCreated": "Aug 20, 2015 6:09:43 PM",
+ "dateStarted": "Aug 24, 2015 10:51:34 PM",
+ "dateFinished": "Aug 24, 2015 10:52:11 PM",
+ "status": "FINISHED",
+ "progressUpdateIntervalMs": 500
+ },
+ {
+ "title": "Data Processing in python",
+ "text": "%pyspark\nsc.textFile(\"/user/flume/flume-syslog/*/*/*\").filter(lambda l: \"sshd\" in l).collect()",
+ "config": {
+ "colWidth": 12.0,
+ "graph": {
+ "mode": "table",
+ "height": 300.0,
+ "optionOpen": false,
+ "keys": [],
+ "values": [],
+ "groups": [],
+ "scatter": {}
+ },
+ "title": true,
+ "tableHide": false,
+ "editorHide": false
+ },
+ "settings": {
+ "params": {},
+ "forms": {}
+ },
+ "jobName": "paragraph_1440112260119_-1393028364",
+ "id": "20150820-181100_389628381",
+ "result": {
+ "code": "SUCCESS",
+ "type": "TEXT",
+ "msg": "" },
+ "dateCreated": "Aug 20, 2015 6:11:00 PM",
+ "dateStarted": "Aug 24, 2015 10:54:10 PM",
+ "dateFinished": "Aug 24, 2015 10:54:15 PM",
+ "status": "FINISHED",
+ "progressUpdateIntervalMs": 500
+ },
+ {
+ "title": "Data Processing In Scala",
+ "text": "import org.joda.time.DateTime\nimport org.joda.time.format.{DateTimeFormatterBuilder, DateTimeFormat}\nimport scala.util.Try\nval reSystemLog \u003d \"\"\"^\\\u003c\\d+\\\u003e([A-Za-z0-9, ]+\\d{2}:\\d{2}:\\d{2}(?:\\.\\d{3})?)\\s+(\\S+)\\s+([^\\[]+)\\[(\\d+)\\]\\s*:?\\s*(.*)\"\"\".r\ncase class SyslogMessage(timestamp: String, host: Option[String], process: String, pid: Int, message: String)\n\nval lines \u003d sc.textFile(\"/user/flume/flume-syslog/*/*/*\")\nval events \u003d lines.flatMap {\n case reSystemLog(timestamp,hostname, proc, pidS, msg) \u003d\u003e\n for {pid \u003c- Try(pidS.toInt).toOption} yield SyslogMessage(timestamp,Some(hostname), proc, pid, msg)\n case _ \u003d\u003e None\n }.toDF()\n\nevents.registerTempTable(\"syslog\")\n",
+ "config": {
+ "colWidth": 12.0,
+ "graph": {
+ "mode": "table",
+ "height": 300.0,
+ "optionOpen": false,
+ "keys": [],
+ "values": [],
+ "groups": [],
+ "scatter": {}
+ },
+ "title": true,
+ "editorHide": false,
+ "tableHide": false
+ },
+ "settings": {
+ "params": {},
+ "forms": {}
+ },
+ "jobName": "paragraph_1440133397982_798196016",
+ "id": "20150821-000317_766530322",
+ "result": {
+ "code": "SUCCESS",
+ "type": "TEXT",
+ "msg": "import org.joda.time.DateTime\nimport org.joda.time.format.{DateTimeFormatterBuilder, DateTimeFormat}\nimport scala.util.Try\nreSystemLog: scala.util.matching.Regex \u003d ^\\\u003c\\d+\\\u003e([A-Za-z0-9, ]+\\d{2}:\\d{2}:\\d{2}(?:\\.\\d{3})?)\\s+(\\S+)\\s+([^\\[]+)\\[(\\d+)\\]\\s*:?\\s*(.*)\ndefined class SyslogMessage\nlines: org.apache.spark.rdd.RDD[String] \u003d /user/flume/flume-syslog/*/*/* MapPartitionsRDD[509] at textFile at \u003cconsole\u003e:73\nevents: org.apache.spark.sql.DataFrame \u003d [timestamp: string, host: string, process: string, pid: int, message: string]\n"
+ },
+ "dateCreated": "Aug 21, 2015 12:03:17 AM",
+ "dateStarted": "Aug 24, 2015 10:54:28 PM",
+ "dateFinished": "Aug 24, 2015 10:54:29 PM",
+ "status": "FINISHED",
+ "progressUpdateIntervalMs": 500
+ },
+ {
+ "title": "Data Visualization",
+ "text": "%sql \nselect process, count(1) value\nfrom syslog\ngroup by process \norder by process",
+ "config": {
+ "colWidth": 4.0,
+ "graph": {
+ "mode": "pieChart",
+ "height": 300.0,
+ "optionOpen": false,
+ "keys": [
+ {
+ "name": "process",
+ "index": 0.0,
+ "aggr": "sum"
+ }
+ ],
+ "values": [
+ {
+ "name": "value",
+ "index": 1.0,
+ "aggr": "sum"
+ }
+ ],
+ "groups": [],
+ "scatter": {
+ "xAxis": {
+ "name": "process",
+ "index": 0.0,
+ "aggr": "sum"
+ },
+ "yAxis": {
+ "name": "value",
+ "index": 1.0,
+ "aggr": "sum"
+ }
+ }
+ },
+ "title": true
+ },
+ "settings": {
+ "params": {},
+ "forms": {}
+ },
+ "jobName": "paragraph_1440473498968_444762596",
+ "id": "20150824-223138_1548703563",
+ "result": {
+ "code": "SUCCESS",
+ "type": "TABLE",
+ "msg": "process\tvalue\nCRON\t180\nntpdate\t1\nsshd\t6\nsu\t1\nsystemd-logind\t1\n"
+ },
+ "dateCreated": "Aug 24, 2015 10:31:38 PM",
+ "dateStarted": "Aug 24, 2015 10:54:37 PM",
+ "dateFinished": "Aug 24, 2015 10:54:41 PM",
+ "status": "FINISHED",
+ "progressUpdateIntervalMs": 500
+ },
+ {
+ "title": "Data Visualization",
+ "text": "%sql \nselect pid, count(1) value\nfrom syslog\nwhere pid \u003e 5000 and pid \u003c 20000 and timestamp \u003e ${maxDate\u003d\"Aug 24\"}\ngroup by pid \norder by pid\n",
+ "config": {
+ "colWidth": 4.0,
+ "graph": {
+ "mode": "pieChart",
+ "height": 300.0,
+ "optionOpen": false,
+ "keys": [
+ {
+ "name": "pid",
+ "index": 0.0,
+ "aggr": "sum"
+ }
+ ],
+ "values": [
+ {
+ "name": "value",
+ "index": 1.0,
+ "aggr": "sum"
+ }
+ ],
+ "groups": [],
+ "scatter": {
+ "xAxis": {
+ "name": "pid",
+ "index": 0.0,
+ "aggr": "sum"
+ },
+ "yAxis": {
+ "name": "value",
+ "index": 1.0,
+ "aggr": "sum"
+ }
+ }
+ },
+ "title": true
+ },
+ "settings": {
+ "params": {},
+ "forms": {
+ "maxDate": {
+ "name": "maxDate",
+ "defaultValue": "\"Aug 24\"",
+ "hidden": false
+ }
+ }
+ },
+ "jobName": "paragraph_1440137477230_886878134",
+ "id": "20150821-011117_310225391",
+ "result": {
+ "code": "SUCCESS",
+ "type": "TABLE",
+ "msg": "pid\tvalue\n5073\t2\n5074\t1\n5218\t2\n5219\t1\n5374\t2\n5375\t1\n5485\t2\n5881\t2\n5882\t1\n"
+ },
+ "dateCreated": "Aug 21, 2015 1:11:17 AM",
+ "dateStarted": "Aug 24, 2015 10:54:43 PM",
+ "dateFinished": "Aug 24, 2015 10:54:45 PM",
+ "status": "FINISHED",
+ "progressUpdateIntervalMs": 500
+ },
+ {
+ "title": "Data Visualization",
+ "text": "%sql \nselect timestamp, count(1) value\nfrom syslog\nwhere timestamp \u003e ${maxDate\u003d\"Aug 24\"} and process \u003d\u003d \"sshd\"\ngroup by timestamp\norder by timestamp",
+ "config": {
+ "colWidth": 4.0,
+ "graph": {
+ "mode": "pieChart",
+ "height": 300.0,
+ "optionOpen": false,
+ "keys": [
+ {
+ "name": "timestamp",
+ "index": 0.0,
+ "aggr": "sum"
+ }
+ ],
+ "values": [
+ {
+ "name": "value",
+ "index": 1.0,
+ "aggr": "sum"
+ }
+ ],
+ "groups": [],
+ "scatter": {
+ "xAxis": {
+ "name": "timestamp",
+ "index": 0.0,
+ "aggr": "sum"
+ },
+ "yAxis": {
+ "name": "value",
+ "index": 1.0,
+ "aggr": "sum"
+ }
+ }
+ },
+ "title": true
+ },
+ "settings": {
+ "params": {
+ "maxDate": "\"Aug 20\""
+ },
+ "forms": {
+ "maxDate": {
+ "name": "maxDate",
+ "defaultValue": "\"Aug 24\"",
+ "hidden": false
+ }
+ }
+ },
+ "jobName": "paragraph_1440163786226_421898739",
+ "id": "20150821-082946_601268612",
+ "result": {
+ "code": "SUCCESS",
+ "type": "TABLE",
+ "msg": "timestamp\tvalue\nAug 21 11:20:45\t2\nAug 21 19:58:30\t2\nAug 24 21:59:47\t2\n"
+ },
+ "dateCreated": "Aug 21, 2015 8:29:46 AM",
+ "dateStarted": "Aug 24, 2015 10:54:54 PM",
+ "dateFinished": "Aug 24, 2015 10:54:55 PM",
+ "status": "FINISHED",
+ "progressUpdateIntervalMs": 500
+ },
+ {
+ "config": {},
+ "settings": {
+ "params": {},
+ "forms": {}
+ },
+ "jobName": "paragraph_1440473909272_653880463",
+ "id": "20150824-223829_186145308",
+ "dateCreated": "Aug 24, 2015 10:38:29 PM",
+ "status": "READY",
+ "progressUpdateIntervalMs": 500
+ }
+ ],
+ "name": "Real-time Analytic Tutorial",
+ "id": "flume-tutorial",
+ "angularObjects": {},
+ "config": {
+ "looknfeel": "default"
+ },
+ "info": {}
+}
=== added file 'resources/python/jujuresources-0.2.11.tar.gz'
Binary files resources/python/jujuresources-0.2.11.tar.gz 1970-01-01 00:00:00 +0000 and resources/python/jujuresources-0.2.11.tar.gz 2015-09-16 21:19:37 +0000 differ
=== removed file 'resources/python/jujuresources-0.2.9.tar.gz'
Binary files resources/python/jujuresources-0.2.9.tar.gz 2015-06-29 21:07:04 +0000 and resources/python/jujuresources-0.2.9.tar.gz 1970-01-01 00:00:00 +0000 differ
=== modified file 'tests/00-setup'
--- tests/00-setup 2015-05-05 03:25:30 +0000
+++ tests/00-setup 2015-09-16 21:19:37 +0000
@@ -1,5 +1,8 @@
#!/bin/bash
-sudo add-apt-repository ppa:juju/stable -y
-sudo apt-get update
-sudo apt-get install python3 amulet -y
+if ! dpkg -s amulet &> /dev/null; then
+ echo Installing Amulet...
+ sudo add-apt-repository -y ppa:juju/stable
+ sudo apt-get update
+ sudo apt-get -y install amulet
+fi
=== modified file 'tests/100-deploy-spark-hdfs-yarn'
--- tests/100-deploy-spark-hdfs-yarn 2015-08-25 02:14:22 +0000
+++ tests/100-deploy-spark-hdfs-yarn 2015-09-16 21:19:37 +0000
@@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
import unittest
import amulet
@@ -6,18 +6,18 @@
class TestDeploy(unittest.TestCase):
"""
- Deployment test for Apache Spark using HDFS as shared storage and YARN as
- cluster job manager.
+ Deployment test for Apache Spark+Zeppelin using HDFS as shared storage
+ and YARN as cluster job manager.
"""
@classmethod
def setUpClass(cls):
cls.d = amulet.Deployment(series='trusty')
# Deploy a hadoop cluster
- cls.d.add('yarn-master', charm='cs:~bigdata-dev/trusty/apache-hadoop-yarn-master')
- cls.d.add('hdfs-master', charm='cs:~bigdata-dev/trusty/apache-hadoop-hdfs-master')
- cls.d.add('compute-slave', charm='cs:~bigdata-dev/trusty/apache-hadoop-compute-slave', units=3)
- cls.d.add('plugin', charm='cs:~bigdata-dev/trusty/apache-hadoop-plugin')
+ cls.d.add('yarn-master', charm='cs:trusty/apache-hadoop-yarn-master')
+ cls.d.add('hdfs-master', charm='cs:trusty/apache-hadoop-hdfs-master')
+ cls.d.add('compute-slave', charm='cs:trusty/apache-hadoop-compute-slave', units=3)
+ cls.d.add('plugin', charm='cs:trusty/apache-hadoop-plugin')
cls.d.relate('yarn-master:namenode', 'hdfs-master:namenode')
cls.d.relate('compute-slave:nodemanager', 'yarn-master:nodemanager')
cls.d.relate('compute-slave:datanode', 'hdfs-master:datanode')
@@ -25,15 +25,15 @@
cls.d.relate('plugin:namenode', 'hdfs-master:namenode')
# Add Spark Service
- cls.d.add('spark', charm='cs:~bigdata-dev/trusty/apache-spark')
+ cls.d.add('spark', charm='cs:trusty/apache-spark')
cls.d.relate('spark:hadoop-plugin', 'plugin:hadoop-plugin')
# Add Apache Zeppelin
- cls.d.add('zeppelin', charm='cs:~bigdata-dev/trusty/apache-zeppelin')
+ cls.d.add('zeppelin', charm='cs:trusty/apache-zeppelin')
cls.d.relate('zeppelin:spark', 'spark:spark')
cls.d.setup(timeout=3600)
- cls.d.sentry.wait()
+ cls.d.sentry.wait(timeout=3600)
cls.unit = cls.d.sentry.unit['zeppelin/0']
###########################################################################
Follow ups