
bigdata-dev team mailing list archive

[Merge] lp:~bigdata-dev/charms/trusty/apache-zeppelin/trunk into lp:charms/trusty/apache-zeppelin

 

Kevin W Monroe has proposed merging lp:~bigdata-dev/charms/trusty/apache-zeppelin/trunk into lp:charms/trusty/apache-zeppelin.

Requested reviews:
  Juju Big Data Development (bigdata-dev)

For more details, see:
https://code.launchpad.net/~bigdata-dev/charms/trusty/apache-zeppelin/trunk/+merge/271385
-- 
Your team Juju Big Data Development is requested to review the proposed merge of lp:~bigdata-dev/charms/trusty/apache-zeppelin/trunk into lp:charms/trusty/apache-zeppelin.
=== modified file 'hooks/callbacks.py'
--- hooks/callbacks.py	2015-08-25 02:14:22 +0000
+++ hooks/callbacks.py	2015-09-16 21:19:37 +0000
@@ -89,23 +89,29 @@
         # default env). Include our own tutorial, which does work in a
         # spark+hdfs env. Inspiration for this notebook came from here:
         #   https://github.com/apache/incubator-zeppelin/pull/46
-        tutorial_source = Path('resources/hdfs-tutorial')
-        tutorial_source.copytree(self.dist_config.path('zeppelin_notebooks') / 'hdfs-tutorial')
-
-        # move the tutorial dir included in the tarball to our notebook dir and
-        # symlink that dir under our zeppelin home. we've seen issues where
-        # zepp doesn't honor ZEPPELIN_NOTEBOOK_DIR and instead looks for
-        # notebooks in ZEPPELIN_HOME/notebook.
         notebook_dir = self.dist_config.path('zeppelin_notebooks')
         dist_notebook_dir = self.dist_config.path('zeppelin') / 'notebook'
         dist_tutorial_dir = dist_notebook_dir.dirs()[0]
         dist_tutorial_dir.move(notebook_dir)
+        self.copy_tutorial("hdfs-tutorial")
+        self.copy_tutorial("flume-tutorial")
         dist_notebook_dir.rmtree_p()
+        # move the tutorial dir included in the tarball to our notebook dir and
+        # symlink that dir under our zeppelin home. we've seen issues where
+        # zepp doesn't honor ZEPPELIN_NOTEBOOK_DIR and instead looks for
+        # notebooks in ZEPPELIN_HOME/notebook.
         notebook_dir.symlink(dist_notebook_dir)
 
         # make sure the notebook dir's contents are owned by our user
         cmd = "chown -R ubuntu:hadoop {}".format(notebook_dir)
         call(cmd.split())
+        
+        
+    def copy_tutorial(self, tutorial_name):
+        tutorial_source = Path('resources/{}'.format(tutorial_name))
+        tutorial_source.copytree(self.dist_config.path('zeppelin_notebooks') / tutorial_name)
+
+        
 
     def configure_zeppelin(self):
         '''

=== added directory 'resources/flume-tutorial'
=== added file 'resources/flume-tutorial/note.json'
--- resources/flume-tutorial/note.json	1970-01-01 00:00:00 +0000
+++ resources/flume-tutorial/note.json	2015-09-16 21:19:37 +0000
@@ -0,0 +1,337 @@
+{
+  "paragraphs": [
+    {
+      "text": "%md\n## Welcome to Realtime Syslog Analytic Tutorial Powered by Juju.\n### In this live tutorial we will demonstrate three main phases of any big data solution:\n#### 1. Data Ingestion - Apache Flume-syslog -\u003e Apache Flume-hdfs\n#### 2. Data Processing - Apache Spark+YARN\n#### 3. Data Visualization - SparkSQL",
+      "config": {
+        "colWidth": 12.0,
+        "graph": {
+          "mode": "table",
+          "height": 300.0,
+          "optionOpen": false,
+          "keys": [],
+          "values": [],
+          "groups": [],
+          "scatter": {}
+        }
+      },
+      "settings": {
+        "params": {},
+        "forms": {}
+      },
+      "jobName": "paragraph_1440101679810_1108841391",
+      "id": "20150820-151439_133078543",
+      "result": {
+        "code": "SUCCESS",
+        "type": "HTML",
+        "msg": "\u003ch2\u003eWelcome to Realtime Syslog Analytic Tutorial Powered by Juju.\u003c/h2\u003e\n\u003ch3\u003eIn this live tutorial we will demonstrate three main phases of any big data solution:\u003c/h3\u003e\n\u003ch4\u003e1. Data Ingestion - Apache Flume-syslog -\u003e Apache Flume-hdfs\u003c/h4\u003e\n\u003ch4\u003e2. Data Processing - Apache Spark+YARN\u003c/h4\u003e\n\u003ch4\u003e3. Data Visualization - SparkSQL\u003c/h4\u003e\n"
+      },
+      "dateCreated": "Aug 20, 2015 3:14:39 PM",
+      "dateStarted": "Aug 25, 2015 9:34:23 AM",
+      "dateFinished": "Aug 25, 2015 9:34:23 AM",
+      "status": "FINISHED",
+      "progressUpdateIntervalMs": 500
+    },
+    {
+      "title": "Data Ingestion",
+      "text": "import sys.process._\n// Generate syslog messages by running a Spark job\n\"/home/ubuntu/sparkpi.sh\" !!\n// Verify that Flume has collected and sent the syslog messages to HDFS\n\"hadoop fs -ls -R /user/flume/flume-syslog\" !!",
+      "config": {
+        "colWidth": 12.0,
+        "graph": {
+          "mode": "table",
+          "height": 300.0,
+          "optionOpen": false,
+          "keys": [],
+          "values": [],
+          "groups": [],
+          "scatter": {}
+        },
+        "title": true
+      },
+      "settings": {
+        "params": {},
+        "forms": {}
+      },
+      "jobName": "paragraph_1440112183363_1890510694",
+      "id": "20150820-180943_1527660289",
+      "result": {
+        "code": "SUCCESS",
+        "type": "TEXT",
+        "msg": ""      },
+      "dateCreated": "Aug 20, 2015 6:09:43 PM",
+      "dateStarted": "Aug 24, 2015 10:51:34 PM",
+      "dateFinished": "Aug 24, 2015 10:52:11 PM",
+      "status": "FINISHED",
+      "progressUpdateIntervalMs": 500
+    },
+    {
+      "title": "Data Processing in python",
+      "text": "%pyspark\nsc.textFile(\"/user/flume/flume-syslog/*/*/*\").filter(lambda l: \"sshd\" in l).collect()",
+      "config": {
+        "colWidth": 12.0,
+        "graph": {
+          "mode": "table",
+          "height": 300.0,
+          "optionOpen": false,
+          "keys": [],
+          "values": [],
+          "groups": [],
+          "scatter": {}
+        },
+        "title": true,
+        "tableHide": false,
+        "editorHide": false
+      },
+      "settings": {
+        "params": {},
+        "forms": {}
+      },
+      "jobName": "paragraph_1440112260119_-1393028364",
+      "id": "20150820-181100_389628381",
+      "result": {
+        "code": "SUCCESS",
+        "type": "TEXT",
+        "msg": "" },
+      "dateCreated": "Aug 20, 2015 6:11:00 PM",
+      "dateStarted": "Aug 24, 2015 10:54:10 PM",
+      "dateFinished": "Aug 24, 2015 10:54:15 PM",
+      "status": "FINISHED",
+      "progressUpdateIntervalMs": 500
+    },
+    {
+      "title": "Data Processing In Scala",
+      "text": "import org.joda.time.DateTime\nimport org.joda.time.format.{DateTimeFormatterBuilder, DateTimeFormat}\nimport scala.util.Try\nval reSystemLog \u003d \"\"\"^\\\u003c\\d+\\\u003e([A-Za-z0-9, ]+\\d{2}:\\d{2}:\\d{2}(?:\\.\\d{3})?)\\s+(\\S+)\\s+([^\\[]+)\\[(\\d+)\\]\\s*:?\\s*(.*)\"\"\".r\ncase class SyslogMessage(timestamp: String, host: Option[String], process: String, pid: Int, message: String)\n\nval lines \u003d sc.textFile(\"/user/flume/flume-syslog/*/*/*\")\nval events \u003d lines.flatMap {\n      case reSystemLog(timestamp,hostname, proc, pidS, msg) \u003d\u003e\n        for {pid \u003c- Try(pidS.toInt).toOption} yield SyslogMessage(timestamp,Some(hostname), proc, pid, msg)\n      case _ \u003d\u003e None\n    }.toDF()\n\nevents.registerTempTable(\"syslog\")\n",
+      "config": {
+        "colWidth": 12.0,
+        "graph": {
+          "mode": "table",
+          "height": 300.0,
+          "optionOpen": false,
+          "keys": [],
+          "values": [],
+          "groups": [],
+          "scatter": {}
+        },
+        "title": true,
+        "editorHide": false,
+        "tableHide": false
+      },
+      "settings": {
+        "params": {},
+        "forms": {}
+      },
+      "jobName": "paragraph_1440133397982_798196016",
+      "id": "20150821-000317_766530322",
+      "result": {
+        "code": "SUCCESS",
+        "type": "TEXT",
+        "msg": "import org.joda.time.DateTime\nimport org.joda.time.format.{DateTimeFormatterBuilder, DateTimeFormat}\nimport scala.util.Try\nreSystemLog: scala.util.matching.Regex \u003d ^\\\u003c\\d+\\\u003e([A-Za-z0-9, ]+\\d{2}:\\d{2}:\\d{2}(?:\\.\\d{3})?)\\s+(\\S+)\\s+([^\\[]+)\\[(\\d+)\\]\\s*:?\\s*(.*)\ndefined class SyslogMessage\nlines: org.apache.spark.rdd.RDD[String] \u003d /user/flume/flume-syslog/*/*/* MapPartitionsRDD[509] at textFile at \u003cconsole\u003e:73\nevents: org.apache.spark.sql.DataFrame \u003d [timestamp: string, host: string, process: string, pid: int, message: string]\n"
+      },
+      "dateCreated": "Aug 21, 2015 12:03:17 AM",
+      "dateStarted": "Aug 24, 2015 10:54:28 PM",
+      "dateFinished": "Aug 24, 2015 10:54:29 PM",
+      "status": "FINISHED",
+      "progressUpdateIntervalMs": 500
+    },
+    {
+      "title": "Data Visualization",
+      "text": "%sql \nselect process, count(1) value\nfrom syslog\ngroup by process \norder by process",
+      "config": {
+        "colWidth": 4.0,
+        "graph": {
+          "mode": "pieChart",
+          "height": 300.0,
+          "optionOpen": false,
+          "keys": [
+            {
+              "name": "process",
+              "index": 0.0,
+              "aggr": "sum"
+            }
+          ],
+          "values": [
+            {
+              "name": "value",
+              "index": 1.0,
+              "aggr": "sum"
+            }
+          ],
+          "groups": [],
+          "scatter": {
+            "xAxis": {
+              "name": "process",
+              "index": 0.0,
+              "aggr": "sum"
+            },
+            "yAxis": {
+              "name": "value",
+              "index": 1.0,
+              "aggr": "sum"
+            }
+          }
+        },
+        "title": true
+      },
+      "settings": {
+        "params": {},
+        "forms": {}
+      },
+      "jobName": "paragraph_1440473498968_444762596",
+      "id": "20150824-223138_1548703563",
+      "result": {
+        "code": "SUCCESS",
+        "type": "TABLE",
+        "msg": "process\tvalue\nCRON\t180\nntpdate\t1\nsshd\t6\nsu\t1\nsystemd-logind\t1\n"
+      },
+      "dateCreated": "Aug 24, 2015 10:31:38 PM",
+      "dateStarted": "Aug 24, 2015 10:54:37 PM",
+      "dateFinished": "Aug 24, 2015 10:54:41 PM",
+      "status": "FINISHED",
+      "progressUpdateIntervalMs": 500
+    },
+    {
+      "title": "Data Visualization",
+      "text": "%sql \nselect pid, count(1) value\nfrom syslog\nwhere pid \u003e 5000 and pid \u003c 20000 and timestamp \u003e ${maxDate\u003d\"Aug 24\"}\ngroup by pid \norder by pid\n",
+      "config": {
+        "colWidth": 4.0,
+        "graph": {
+          "mode": "pieChart",
+          "height": 300.0,
+          "optionOpen": false,
+          "keys": [
+            {
+              "name": "pid",
+              "index": 0.0,
+              "aggr": "sum"
+            }
+          ],
+          "values": [
+            {
+              "name": "value",
+              "index": 1.0,
+              "aggr": "sum"
+            }
+          ],
+          "groups": [],
+          "scatter": {
+            "xAxis": {
+              "name": "pid",
+              "index": 0.0,
+              "aggr": "sum"
+            },
+            "yAxis": {
+              "name": "value",
+              "index": 1.0,
+              "aggr": "sum"
+            }
+          }
+        },
+        "title": true
+      },
+      "settings": {
+        "params": {},
+        "forms": {
+          "maxDate": {
+            "name": "maxDate",
+            "defaultValue": "\"Aug 24\"",
+            "hidden": false
+          }
+        }
+      },
+      "jobName": "paragraph_1440137477230_886878134",
+      "id": "20150821-011117_310225391",
+      "result": {
+        "code": "SUCCESS",
+        "type": "TABLE",
+        "msg": "pid\tvalue\n5073\t2\n5074\t1\n5218\t2\n5219\t1\n5374\t2\n5375\t1\n5485\t2\n5881\t2\n5882\t1\n"
+      },
+      "dateCreated": "Aug 21, 2015 1:11:17 AM",
+      "dateStarted": "Aug 24, 2015 10:54:43 PM",
+      "dateFinished": "Aug 24, 2015 10:54:45 PM",
+      "status": "FINISHED",
+      "progressUpdateIntervalMs": 500
+    },
+    {
+      "title": "Data Visualization",
+      "text": "%sql \nselect timestamp, count(1) value\nfrom syslog\nwhere timestamp \u003e ${maxDate\u003d\"Aug 24\"} and process \u003d\u003d \"sshd\"\ngroup by timestamp\norder by timestamp",
+      "config": {
+        "colWidth": 4.0,
+        "graph": {
+          "mode": "pieChart",
+          "height": 300.0,
+          "optionOpen": false,
+          "keys": [
+            {
+              "name": "timestamp",
+              "index": 0.0,
+              "aggr": "sum"
+            }
+          ],
+          "values": [
+            {
+              "name": "value",
+              "index": 1.0,
+              "aggr": "sum"
+            }
+          ],
+          "groups": [],
+          "scatter": {
+            "xAxis": {
+              "name": "timestamp",
+              "index": 0.0,
+              "aggr": "sum"
+            },
+            "yAxis": {
+              "name": "value",
+              "index": 1.0,
+              "aggr": "sum"
+            }
+          }
+        },
+        "title": true
+      },
+      "settings": {
+        "params": {
+          "maxDate": "\"Aug 20\""
+        },
+        "forms": {
+          "maxDate": {
+            "name": "maxDate",
+            "defaultValue": "\"Aug 24\"",
+            "hidden": false
+          }
+        }
+      },
+      "jobName": "paragraph_1440163786226_421898739",
+      "id": "20150821-082946_601268612",
+      "result": {
+        "code": "SUCCESS",
+        "type": "TABLE",
+        "msg": "timestamp\tvalue\nAug 21 11:20:45\t2\nAug 21 19:58:30\t2\nAug 24 21:59:47\t2\n"
+      },
+      "dateCreated": "Aug 21, 2015 8:29:46 AM",
+      "dateStarted": "Aug 24, 2015 10:54:54 PM",
+      "dateFinished": "Aug 24, 2015 10:54:55 PM",
+      "status": "FINISHED",
+      "progressUpdateIntervalMs": 500
+    },
+    {
+      "config": {},
+      "settings": {
+        "params": {},
+        "forms": {}
+      },
+      "jobName": "paragraph_1440473909272_653880463",
+      "id": "20150824-223829_186145308",
+      "dateCreated": "Aug 24, 2015 10:38:29 PM",
+      "status": "READY",
+      "progressUpdateIntervalMs": 500
+    }
+  ],
+  "name": "Real-time Analytic Tutorial",
+  "id": "flume-tutorial",
+  "angularObjects": {},
+  "config": {
+    "looknfeel": "default"
+  },
+  "info": {}
+}

=== added file 'resources/python/jujuresources-0.2.11.tar.gz'
Binary files resources/python/jujuresources-0.2.11.tar.gz	1970-01-01 00:00:00 +0000 and resources/python/jujuresources-0.2.11.tar.gz	2015-09-16 21:19:37 +0000 differ
=== removed file 'resources/python/jujuresources-0.2.9.tar.gz'
Binary files resources/python/jujuresources-0.2.9.tar.gz	2015-06-29 21:07:04 +0000 and resources/python/jujuresources-0.2.9.tar.gz	1970-01-01 00:00:00 +0000 differ
=== modified file 'tests/00-setup'
--- tests/00-setup	2015-05-05 03:25:30 +0000
+++ tests/00-setup	2015-09-16 21:19:37 +0000
@@ -1,5 +1,8 @@
 #!/bin/bash
 
-sudo add-apt-repository ppa:juju/stable -y
-sudo apt-get update
-sudo apt-get install python3 amulet -y
+if ! dpkg -s amulet &> /dev/null; then
+    echo Installing Amulet...
+    sudo add-apt-repository -y ppa:juju/stable
+    sudo apt-get update
+    sudo apt-get -y install amulet
+fi

=== modified file 'tests/100-deploy-spark-hdfs-yarn'
--- tests/100-deploy-spark-hdfs-yarn	2015-08-25 02:14:22 +0000
+++ tests/100-deploy-spark-hdfs-yarn	2015-09-16 21:19:37 +0000
@@ -1,4 +1,4 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
 
 import unittest
 import amulet
@@ -6,18 +6,18 @@
 
 class TestDeploy(unittest.TestCase):
     """
-    Deployment test for Apache Spark using HDFS as shared storage and YARN as
-    cluster job manager.
+    Deployment test for Apache Spark+Zeppelin using HDFS as shared storage
+    and YARN as cluster job manager.
     """
 
     @classmethod
     def setUpClass(cls):
         cls.d = amulet.Deployment(series='trusty')
         # Deploy a hadoop cluster
-        cls.d.add('yarn-master', charm='cs:~bigdata-dev/trusty/apache-hadoop-yarn-master')
-        cls.d.add('hdfs-master', charm='cs:~bigdata-dev/trusty/apache-hadoop-hdfs-master')
-        cls.d.add('compute-slave', charm='cs:~bigdata-dev/trusty/apache-hadoop-compute-slave', units=3)
-        cls.d.add('plugin', charm='cs:~bigdata-dev/trusty/apache-hadoop-plugin')
+        cls.d.add('yarn-master', charm='cs:trusty/apache-hadoop-yarn-master')
+        cls.d.add('hdfs-master', charm='cs:trusty/apache-hadoop-hdfs-master')
+        cls.d.add('compute-slave', charm='cs:trusty/apache-hadoop-compute-slave', units=3)
+        cls.d.add('plugin', charm='cs:trusty/apache-hadoop-plugin')
         cls.d.relate('yarn-master:namenode', 'hdfs-master:namenode')
         cls.d.relate('compute-slave:nodemanager', 'yarn-master:nodemanager')
         cls.d.relate('compute-slave:datanode', 'hdfs-master:datanode')
@@ -25,15 +25,15 @@
         cls.d.relate('plugin:namenode', 'hdfs-master:namenode')
 
         # Add Spark Service
-        cls.d.add('spark', charm='cs:~bigdata-dev/trusty/apache-spark')
+        cls.d.add('spark', charm='cs:trusty/apache-spark')
         cls.d.relate('spark:hadoop-plugin', 'plugin:hadoop-plugin')
 
         # Add Apache Zeppelin
-        cls.d.add('zeppelin', charm='cs:~bigdata-dev/trusty/apache-zeppelin')
+        cls.d.add('zeppelin', charm='cs:trusty/apache-zeppelin')
         cls.d.relate('zeppelin:spark', 'spark:spark')
 
         cls.d.setup(timeout=3600)
-        cls.d.sentry.wait()
+        cls.d.sentry.wait(timeout=3600)
         cls.unit = cls.d.sentry.unit['zeppelin/0']
 
 ###########################################################################
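
For reviewers, a minimal standalone sketch of the notebook staging flow that the callbacks.py hunk above introduces. The function name, the explicit zeppelin_home/notebook_dir arguments, and the example paths are illustrative assumptions; in the charm itself this logic runs inside the install callback and pulls its paths from dist_config.

# Minimal sketch, not the charm code itself: stage tutorial notebooks and
# symlink the notebook dir under ZEPPELIN_HOME, as the hook above does.
# Paths, the function name, and the tutorial list are illustrative assumptions.
from subprocess import call

from path import Path  # path.py, shipped with the charm's python resources


def stage_notebooks(zeppelin_home, notebook_dir,
                    tutorials=('hdfs-tutorial', 'flume-tutorial')):
    zeppelin_home = Path(zeppelin_home)
    notebook_dir = Path(notebook_dir)
    dist_notebook_dir = zeppelin_home / 'notebook'

    # keep the tutorial that ships in the Zeppelin tarball
    dist_notebook_dir.dirs()[0].move(notebook_dir)

    # add the charm-provided tutorials from resources/
    for name in tutorials:
        Path('resources/{}'.format(name)).copytree(notebook_dir / name)

    # Zeppelin has been seen to ignore ZEPPELIN_NOTEBOOK_DIR and read
    # ZEPPELIN_HOME/notebook instead, so replace that dir with a symlink
    # back to our notebook dir
    dist_notebook_dir.rmtree_p()
    notebook_dir.symlink(dist_notebook_dir)

    # make sure the service user owns everything we just staged
    call("chown -R ubuntu:hadoop {}".format(notebook_dir).split())

Called as, for example, stage_notebooks('/usr/lib/zeppelin', '/var/lib/zeppelin/notebooks') (paths hypothetical), this reproduces the install-time layout that the deployment test below exercises.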

