Posted to commits@tinkerpop.apache.org by ok...@apache.org on 2017/11/06 16:26:48 UTC

[10/14] tinkerpop git commit: Added spark-yarn recipe and missing manifest items in spark-gremlin

Added spark-yarn recipe and missing manifest items in spark-gremlin


Project: http://git-wip-us.apache.org/repos/asf/tinkerpop/repo
Commit: http://git-wip-us.apache.org/repos/asf/tinkerpop/commit/3396e924
Tree: http://git-wip-us.apache.org/repos/asf/tinkerpop/tree/3396e924
Diff: http://git-wip-us.apache.org/repos/asf/tinkerpop/diff/3396e924

Branch: refs/heads/TINKERPOP-1802
Commit: 3396e924243845204de0f47962b58a3ffef87459
Parents: 19e261c
Author: HadoopMarc <vt...@xs4all.nl>
Authored: Sun Sep 10 14:45:45 2017 +0200
Committer: HadoopMarc <vt...@xs4all.nl>
Committed: Thu Oct 19 16:11:57 2017 +0200

----------------------------------------------------------------------
 docs/preprocessor/preprocess-file.sh       |   2 +-
 docs/src/recipes/index.asciidoc            |   2 +
 docs/src/recipes/olap-spark-yarn.asciidoc  | 145 ++++++++++++++++++++++++
 hadoop-gremlin/conf/hadoop-gryo.properties |   2 +-
 pom.xml                                    |   1 +
 spark-gremlin/pom.xml                      |   5 +-
 6 files changed, 153 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tinkerpop/blob/3396e924/docs/preprocessor/preprocess-file.sh
----------------------------------------------------------------------
diff --git a/docs/preprocessor/preprocess-file.sh b/docs/preprocessor/preprocess-file.sh
index 16612fe..0ca534a 100755
--- a/docs/preprocessor/preprocess-file.sh
+++ b/docs/preprocessor/preprocess-file.sh
@@ -107,7 +107,7 @@ if [ ! ${SKIP} ] && [ $(grep -c '^\[gremlin' ${input}) -gt 0 ]; then
       mv ext/spark-gremlin .ext/
       cat ext/plugins.txt | tee .ext/plugins.all | grep -Fv 'SparkGremlinPlugin' > .ext/plugins.txt
       ;;
-    "implementations-hadoop-start" | "implementations-hadoop-end" | "implementations-spark" | "implementations-giraph")
+    "implementations-hadoop-start" | "implementations-hadoop-end" | "implementations-spark" | "implementations-giraph" | "olap-spark-yarn")
       # deactivate Neo4j plugin to prevent version conflicts between TinkerPop's Spark jars and Neo4j's Spark jars
       mkdir .ext
       mv ext/neo4j-gremlin .ext/

http://git-wip-us.apache.org/repos/asf/tinkerpop/blob/3396e924/docs/src/recipes/index.asciidoc
----------------------------------------------------------------------
diff --git a/docs/src/recipes/index.asciidoc b/docs/src/recipes/index.asciidoc
index f549b1f..bb88301 100644
--- a/docs/src/recipes/index.asciidoc
+++ b/docs/src/recipes/index.asciidoc
@@ -58,6 +58,8 @@ include::traversal-induced-values.asciidoc[]
 
 include::tree.asciidoc[]
 
+include::olap-spark-yarn.asciidoc[]
+
 = Implementation Recipes
 
 include::style-guide.asciidoc[]

http://git-wip-us.apache.org/repos/asf/tinkerpop/blob/3396e924/docs/src/recipes/olap-spark-yarn.asciidoc
----------------------------------------------------------------------
diff --git a/docs/src/recipes/olap-spark-yarn.asciidoc b/docs/src/recipes/olap-spark-yarn.asciidoc
new file mode 100644
index 0000000..fbe9c8f
--- /dev/null
+++ b/docs/src/recipes/olap-spark-yarn.asciidoc
@@ -0,0 +1,145 @@
+////
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements.  See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to You under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+////
+[[olap-spark-yarn]]
+OLAP traversals with Spark on Yarn
+----------------------------------
+
+TinkerPop's combination of http://tinkerpop.apache.org/docs/current/reference/#sparkgraphcomputer[SparkGraphComputer]
+and http://tinkerpop.apache.org/docs/current/reference/#_properties_files[HadoopGraph] allows for running
+distributed, analytical graph queries (OLAP) on a computer cluster. The
+http://tinkerpop.apache.org/docs/current/reference/#sparkgraphcomputer[reference documentation] covers the cases
+where Spark runs locally or where the cluster is managed by a Spark server. However, many users can only run OLAP jobs
+via the http://hadoop.apache.org/[Hadoop 2.x] Resource Manager (Yarn), which requires `SparkGraphComputer` to be
+configured differently. This recipe describes that configuration.
+
+Approach
+~~~~~~~~
+
+Most configuration problems of TinkerPop with Spark on Yarn stem from three causes:
+
+1. `SparkGraphComputer` creates its own `SparkContext`, so it does not pick up any configuration from the usual
+`spark-submit` command (see the sketch after this list).
+2. The TinkerPop Spark plugin did not include the Spark on Yarn runtime dependencies until versions 3.2.7/3.3.1.
+3. Resolving cause 2 by adding the cluster's `spark-assembly` jar to the classpath creates a host of version
+conflicts, because Spark 1.x dependency versions have remained frozen since 2014.
+
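+For cause 1 this means, for example, that a setting like the executor memory has to go into the graph configuration
+rather than onto a `spark-submit` command line. A minimal sketch, using the property file and classes from the full
+example further below:
+
+[source]
+----
+conf = new PropertiesConfiguration('conf/hadoop/hadoop-gryo.properties')
+conf.setProperty('spark.executor.memory', '2g')  // read by SparkGraphComputer's own SparkContext
+graph = GraphFactory.open(conf)
+----
+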
+The current recipe follows a minimalist approach in which no dependencies are added beyond those already
+included in the TinkerPop binary distribution. The Hadoop cluster's Spark installation is completely ignored. This
+approach minimizes the chance of dependency version conflicts.
+
+Prerequisites
+~~~~~~~~~~~~~
+This recipe is suitable for both a real external Hadoop cluster and a local pseudo-cluster. While the recipe is
+maintained for the vanilla Hadoop pseudo-cluster, it has been reported to work on real clusters with Hadoop
+distributions from various vendors.
+
+If you want to try the recipe on a local Hadoop pseudo-cluster, the easiest way to install
+it is to look at the install script at https://github.com/apache/tinkerpop/blob/x.y.z/docker/hadoop/install.sh
+and the `start hadoop` section of https://github.com/apache/tinkerpop/blob/x.y.z/docker/scripts/build.sh.
+
+This recipe assumes that you installed the Gremlin Console with the
+http://tinkerpop.apache.org/docs/x.y.z/reference/#spark-plugin[Spark plugin] (the
+http://tinkerpop.apache.org/docs/x.y.z/reference/#hadoop-plugin[Hadoop plugin] is optional). Your Hadoop cluster
+may have been configured to use file compression, e.g. LZO compression. If so, you need to copy the relevant
+jar (e.g. `hadoop-lzo-*.jar`) to the Gremlin Console's `ext/spark-gremlin/lib` folder.
+
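+For example, with `HADOOP_HOME` and `GREMLIN_HOME` as in the shell script below (the jar's source location is an
+assumption that differs per Hadoop distribution):
+
+[source]
+----
+cp $HADOOP_HOME/lib/hadoop-lzo-*.jar $GREMLIN_HOME/ext/spark-gremlin/lib/
+----
+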
+To start the Gremlin Console in the right environment, create a shell script (e.g. `bin/spark-yarn.sh`) with the
+contents below. Of course, the actual values for `GREMLIN_HOME`, `HADOOP_HOME` and `HADOOP_CONF_DIR` need to be
+adapted to your particular environment.
+
+[source]
+----
+#!/bin/bash
+# Variables to be adapted to the actual environment
+GREMLIN_HOME=/home/yourdir/lib/apache-tinkerpop-gremlin-console-x.y.z-standalone
+export HADOOP_HOME=/usr/local/lib/hadoop-2.7.2
+export HADOOP_CONF_DIR=/usr/local/lib/hadoop-2.7.2/etc/hadoop
+
+# Have TinkerPop find the Hadoop cluster configs and Hadoop native libraries
+export CLASSPATH=$HADOOP_CONF_DIR
+export JAVA_OPTIONS="-Djava.library.path=$HADOOP_HOME/lib/native:$HADOOP_HOME/lib/native/Linux-amd64-64"
+
+# Start gremlin-console without getting the HADOOP_GREMLIN_LIBS warning
+cd $GREMLIN_HOME
+[ ! -e empty ] && mkdir empty
+export HADOOP_GREMLIN_LIBS=$GREMLIN_HOME/empty
+bin/gremlin.sh
+----
+
+Running the job
+~~~~~~~~~~~~~~~
+
+You can now run a Gremlin OLAP query with Spark on Yarn:
+
+[source]
+----
+$ hdfs dfs -put data/tinkerpop-modern.kryo .
+$ . bin/spark-yarn.sh
+----
+
+[gremlin-groovy]
+----
+hadoop = System.getenv('HADOOP_HOME')
+hadoopConfDir = System.getenv('HADOOP_CONF_DIR')
+archive = 'spark-gremlin.zip'
+archivePath = "/tmp/$archive"
+// zip the Spark plugin's jars so that Yarn can distribute them to its containers
+['bash', '-c', "rm $archivePath 2>/dev/null; cd ext/spark-gremlin/lib && zip $archivePath *.jar"].execute()
+conf = new PropertiesConfiguration('conf/hadoop/hadoop-gryo.properties')
+conf.setProperty('spark.master', 'yarn-client')
+// ship the archive to the Yarn containers and put the jars inside it on their classpaths
+conf.setProperty('spark.yarn.dist.archives', "$archivePath")
+conf.setProperty('spark.yarn.appMasterEnv.CLASSPATH', "./$archive/*:$hadoopConfDir")
+conf.setProperty('spark.executor.extraClassPath', "./$archive/*:$hadoopConfDir")
+// make the Hadoop native libraries available to the Spark driver and executors
+conf.setProperty('spark.driver.extraLibraryPath', "$hadoop/lib/native:$hadoop/lib/native/Linux-amd64-64")
+conf.setProperty('spark.executor.extraLibraryPath', "$hadoop/lib/native:$hadoop/lib/native/Linux-amd64-64")
+graph = GraphFactory.open(conf)
+g = graph.traversal().withComputer(SparkGraphComputer)
+g.V().group().by(values('name')).by(both().count())
+----
+
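+For the toy graph in `tinkerpop-modern.kryo`, this traversal should return each vertex name mapped to the vertex
+degree (e.g. `marko` mapped to 3).
+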
+If you run into exceptions, the best way to see what is going on is to look in the Yarn Resource Manager UI
+(e.g. http://rm.your.domain:8088/cluster) to find the `applicationId`, and then get the logs from the command shell
+with `yarn logs -applicationId application_1498627870374_0008`.
+
+Explanation
+~~~~~~~~~~~
+
+This recipe does not require running the `bin/hadoop/init-tp-spark.sh` script described in the
+http://tinkerpop.apache.org/docs/current/reference/#sparkgraphcomputer[reference documentation] and is thus also
+valid for cluster users without the access permissions to do so.
+Rather, it exploits the `spark.yarn.dist.archives` property, which points to an archive of jars on the local file
+system that is loaded into the various Yarn containers. As a result, the `spark-gremlin.zip` archive becomes available
+as a directory named `spark-gremlin.zip` in the Yarn containers. The `spark.executor.extraClassPath` and
+`spark.yarn.appMasterEnv.CLASSPATH` properties point to the files inside this archive, which is why they contain the
+`./spark-gremlin.zip/*` item: just because a Spark executor has the archive of jars loaded into its container does
+not mean it knows how to access them.
+Also, the `HADOOP_GREMLIN_LIBS` mechanism is not used, because it cannot work for Spark on Yarn as implemented (jars
+added to the `SparkContext` are not available to the Yarn application master).
+
+Additional configuration options
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+This recipe does most of the graph configuration in the Gremlin Console so that environment variables can be used and
+the chance of configuration mistakes is minimal. Once you have your setup working, it is probably easier to make a copy
+of the `conf/hadoop/hadoop-gryo.properties` file and put the property values specific to your environment there. This is
+also the right moment to take a look at the `spark-defaults.conf` file of your cluster, in particular the settings for
+the Spark History Server, which allows you to access the logs of finished jobs via the Yarn Resource Manager UI.
+
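+Such a dedicated properties file might look like the sketch below. The file name is hypothetical, the paths are the
+assumptions from the shell script above, and the remaining settings are inherited from the copied
+`conf/hadoop/hadoop-gryo.properties`:
+
+[source]
+----
+# conf/hadoop/hadoop-gryo-yarn.properties (hypothetical name), copied from hadoop-gryo.properties
+spark.master=yarn-client
+spark.yarn.dist.archives=/tmp/spark-gremlin.zip
+spark.yarn.appMasterEnv.CLASSPATH=./spark-gremlin.zip/*:/usr/local/lib/hadoop-2.7.2/etc/hadoop
+spark.executor.extraClassPath=./spark-gremlin.zip/*:/usr/local/lib/hadoop-2.7.2/etc/hadoop
+spark.driver.extraLibraryPath=/usr/local/lib/hadoop-2.7.2/lib/native:/usr/local/lib/hadoop-2.7.2/lib/native/Linux-amd64-64
+spark.executor.extraLibraryPath=/usr/local/lib/hadoop-2.7.2/lib/native:/usr/local/lib/hadoop-2.7.2/lib/native/Linux-amd64-64
+----
+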
+This recipe uses the Gremlin Console, but things should not be very different for your own JVM-based application,
+as long as you do not use the `spark-submit` or `spark-shell` commands (see the sketch below).
+
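+As a sketch, the same setup embedded in your own application could look as follows (Groovy syntax; this assumes the
+jars from the Gremlin Console's `ext/spark-gremlin/lib` folder, or the equivalent Maven dependencies, are on the
+application's classpath):
+
+[source]
+----
+import org.apache.commons.configuration.PropertiesConfiguration
+import org.apache.tinkerpop.gremlin.spark.process.computer.SparkGraphComputer
+import org.apache.tinkerpop.gremlin.structure.util.GraphFactory
+
+// same properties as in the console example above
+def conf = new PropertiesConfiguration('conf/hadoop/hadoop-gryo.properties')
+conf.setProperty('spark.master', 'yarn-client')
+conf.setProperty('spark.yarn.dist.archives', '/tmp/spark-gremlin.zip')
+// ... plus the classpath and library path properties shown earlier ...
+def graph = GraphFactory.open(conf)
+def g = graph.traversal().withComputer(SparkGraphComputer)
+println g.V().count().next()
+graph.close()
+----
+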
+You may not like the idea that the Hadoop and Spark jars from the TinkerPop distribution differ from the versions in
+your cluster. If so, just build TinkerPop from source with the corresponding dependencies changed in the various `pom.xml`
+files (e.g. `spark-core_2.10-1.6.1-some-vendor.jar` instead of `spark-core_2.10-1.6.1.jar`). Of course, TinkerPop will
+only build for exactly matching or slightly differing artifact versions.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tinkerpop/blob/3396e924/hadoop-gremlin/conf/hadoop-gryo.properties
----------------------------------------------------------------------
diff --git a/hadoop-gremlin/conf/hadoop-gryo.properties b/hadoop-gremlin/conf/hadoop-gryo.properties
index aaab24d..7990431 100644
--- a/hadoop-gremlin/conf/hadoop-gryo.properties
+++ b/hadoop-gremlin/conf/hadoop-gryo.properties
@@ -29,8 +29,8 @@ gremlin.hadoop.outputLocation=output
 spark.master=local[4]
 spark.executor.memory=1g
 spark.serializer=org.apache.tinkerpop.gremlin.spark.structure.io.gryo.GryoSerializer
+gremlin.spark.persistContext=true
 # gremlin.spark.graphStorageLevel=MEMORY_AND_DISK
-# gremlin.spark.persistContext=true
 # gremlin.spark.graphWriter=org.apache.tinkerpop.gremlin.spark.structure.io.PersistedOutputRDD
 # gremlin.spark.persistStorageLevel=DISK_ONLY
 # spark.kryo.registrationRequired=true

http://git-wip-us.apache.org/repos/asf/tinkerpop/blob/3396e924/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 867aaf4..5a93109 100644
--- a/pom.xml
+++ b/pom.xml
@@ -149,6 +149,7 @@ limitations under the License.
         <netty.version>4.0.50.Final</netty.version>
         <slf4j.version>1.7.21</slf4j.version>
         <snakeyaml.version>1.15</snakeyaml.version>
+        <spark.version>1.6.1</spark.version>
 
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
         <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>

http://git-wip-us.apache.org/repos/asf/tinkerpop/blob/3396e924/spark-gremlin/pom.xml
----------------------------------------------------------------------
diff --git a/spark-gremlin/pom.xml b/spark-gremlin/pom.xml
index 560e236..77a455b 100644
--- a/spark-gremlin/pom.xml
+++ b/spark-gremlin/pom.xml
@@ -104,7 +104,7 @@
         <dependency>
             <groupId>org.apache.spark</groupId>
             <artifactId>spark-core_2.10</artifactId>
-            <version>1.6.1</version>
+            <version>${spark.version}</version>
             <exclusions>
                 <!-- self conflicts -->
                 <exclusion>
@@ -382,7 +382,8 @@
                 <configuration>
                     <archive>
                         <manifestEntries>
-                            <Gremlin-Plugin-Dependencies>org.apache.hadoop:hadoop-client:2.7.2
+                            <Gremlin-Plugin-Dependencies>
+                                org.apache.hadoop:hadoop-client:${hadoop.version};org.apache.hadoop:hadoop-yarn-server-web-proxy:${hadoop.version};org.apache.spark:spark-yarn_2.10:${spark.version}
                             </Gremlin-Plugin-Dependencies>
                             <!-- deletes the servlet-api jar from the path after install - causes conflicts -->
                             <Gremlin-Plugin-Paths>servlet-api-2.5.jar=</Gremlin-Plugin-Paths>