You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by xu...@apache.org on 2015/09/18 01:33:51 UTC
svn commit: r1703730 - in /pig/branches/spark: bin/pig build.xml ivy.xml
ivy/libraries.properties src/docs/src/documentation/content/xdocs/start.xml
Author: xuefu
Date: Thu Sep 17 23:33:50 2015
New Revision: 1703730
URL: http://svn.apache.org/viewvc?rev=1703730&view=rev
Log:
PIG-4667: Enable Pig on Spark to run on Yarn Client mode (Srikanth via Xuefu)
Modified:
pig/branches/spark/bin/pig
pig/branches/spark/build.xml
pig/branches/spark/ivy.xml
pig/branches/spark/ivy/libraries.properties
pig/branches/spark/src/docs/src/documentation/content/xdocs/start.xml
Modified: pig/branches/spark/bin/pig
URL: http://svn.apache.org/viewvc/pig/branches/spark/bin/pig?rev=1703730&r1=1703729&r2=1703730&view=diff
==============================================================================
--- pig/branches/spark/bin/pig (original)
+++ pig/branches/spark/bin/pig Thu Sep 17 23:33:50 2015
@@ -361,6 +361,38 @@ if [ "$includeHCatalog" == "true" ]; the
PIG_OPTS="$PIG_OPTS -Dpig.additional.jars.uris=$ADDITIONAL_CLASSPATHS"
fi
+################# ADDING SPARK DEPENDENCIES ##################
+# Spark typically works with a single assembly file. However this
+# assembly isn't available as an artifact to pull in via ivy.
+# To work around this shortcoming, we add all the jars barring
+# spark-yarn to DIST through dist-files and then add them to classpath
+# of the executors through an independent env variable. The reason
+# for excluding spark-yarn is because spark-yarn is already being added
+# by the spark-yarn-client via jarOf(Client.Class)
+
+for f in $PIG_HOME/lib/spark/*.jar; do
+ if [[ $f == $PIG_HOME/lib/spark/spark-yarn* ]]; then
+ # Exclude spark-yarn.jar from shipped jars, but retain in classpath
+ SPARK_JARS=${SPARK_JARS}:$f;
+ else
+ SPARK_JARS=${SPARK_JARS}:$f;
+ SPARK_YARN_DIST_FILES=${SPARK_YARN_DIST_FILES},file://$f;
+ SPARK_DIST_CLASSPATH=${SPARK_DIST_CLASSPATH}:\${PWD}/`basename $f`
+ fi
+done
+
+for f in $PIG_HOME/lib/*.jar; do
+ SPARK_JARS=${SPARK_JARS}:$f;
+ SPARK_YARN_DIST_FILES=${SPARK_YARN_DIST_FILES},file://$f;
+ SPARK_DIST_CLASSPATH=${SPARK_DIST_CLASSPATH}:\${PWD}/`basename $f`
+done
+CLASSPATH=${CLASSPATH}:${SPARK_JARS}
+
+export SPARK_YARN_DIST_FILES=`echo ${SPARK_YARN_DIST_FILES} | sed 's/^,//g'`
+export SPARK_JARS=${SPARK_YARN_DIST_FILES}
+export SPARK_DIST_CLASSPATH
+################# ADDING SPARK DEPENDENCIES ##################
+
# run it
if [ -n "$HADOOP_BIN" ]; then
if [ "$debug" == "true" ]; then
@@ -389,6 +421,12 @@ if [ -n "$HADOOP_BIN" ]; then
CLASSPATH=${CLASSPATH}:$f;
done
+ ###### Set Spark related env #####
+
+ export SPARK_PIG_JAR=${PIG_JAR}
+
+ ###### Set Spark related env #####
+
export HADOOP_CLASSPATH=$CLASSPATH:$HADOOP_CLASSPATH
export HADOOP_CLIENT_OPTS="$JAVA_HEAP_MAX $PIG_OPTS $HADOOP_CLIENT_OPTS"
if [ "$debug" == "true" ]; then
@@ -425,6 +463,12 @@ else
echo "Cannot find local hadoop installation, using bundled `java -cp $CLASSPATH org.apache.hadoop.util.VersionInfo | head -1`"
fi
+ ###### Set Spark related env #####
+
+ export SPARK_PIG_JAR=${PIG_JAR}
+
+ ###### Set Spark related env #####
+
CLASS=org.apache.pig.Main
if [ "$debug" == "true" ]; then
echo "dry run:"
Modified: pig/branches/spark/build.xml
URL: http://svn.apache.org/viewvc/pig/branches/spark/build.xml?rev=1703730&r1=1703729&r2=1703730&view=diff
==============================================================================
--- pig/branches/spark/build.xml (original)
+++ pig/branches/spark/build.xml Thu Sep 17 23:33:50 2015
@@ -39,6 +39,7 @@
<!-- source properties -->
<property name="lib.dir" value="${basedir}/lib" />
+ <property name="spark.lib.dir" value="${basedir}/lib/spark" />
<property name="src.dir" value="${basedir}/src" />
<property name="python.src.dir" value="${src.dir}/python" />
<property name="src.lib.dir" value="${basedir}/lib-src" />
@@ -256,6 +257,7 @@
<property name="build.dir" location="build" />
<property name="build.ivy.dir" location="${build.dir}/ivy" />
<property name="build.ivy.lib.dir" location="${build.ivy.dir}/lib" />
+ <property name="build.ivy.spark.lib.dir" location="${build.ivy.dir}/lib/spark" />
<property name="ivy.lib.dir" location="${build.ivy.lib.dir}/${ant.project.name}"/>
<property name="build.ivy.report.dir" location="${build.ivy.dir}/report" />
<property name="build.ivy.maven.dir" location="${build.ivy.dir}/maven" />
@@ -357,6 +359,7 @@
<path refid="compile.classpath"/>
<fileset file="${ivy.lib.dir}/${zookeeper.jarfile}"/>
<fileset dir="${ivy.lib.dir}" includes="*.jar"/>
+ <fileset dir="${build.ivy.spark.lib.dir}/${ant.project.name}" includes="*.jar"/>
</path>
<!-- javadoc-classpath -->
@@ -719,6 +722,7 @@
<buildJar svnString="${svn.revision}" outputFile="${output.jarfile.core}" includedJars="core.dependencies.jar"/>
<buildJar svnString="${svn.revision}" outputFile="${output.jarfile.withouthadoop}" includedJars="runtime.dependencies-withouthadoop.jar"/>
<antcall target="copyCommonDependencies"/>
+ <antcall target="copySparkDependencies"/>
<antcall target="copyh1Dependencies"/>
<antcall target="copyh2Dependencies"/>
</target>
@@ -750,27 +754,17 @@
<fileset dir="${ivy.lib.dir}" includes="zookeeper-*.jar"/>
<fileset dir="${ivy.lib.dir}" includes="accumulo-*.jar" excludes="accumulo-minicluster*.jar"/>
<fileset dir="${ivy.lib.dir}" includes="json-simple-*.jar"/>
- <fileset dir="${ivy.lib.dir}" includes="spark*.jar"/>
- <fileset dir="${ivy.lib.dir}" includes="scala*.jar"/>
- <fileset dir="${ivy.lib.dir}" includes="akka*.jar"/>
- <fileset dir="${ivy.lib.dir}" includes="jcl-over-slf4j*.jar"/>
- <fileset dir="${ivy.lib.dir}" includes="jul-to-slf4j*.jar"/>
- <fileset dir="${ivy.lib.dir}" includes="slf4j*.jar"/>
- <fileset dir="${ivy.lib.dir}" includes="commons-lang3*.jar"/>
- <fileset dir="${ivy.lib.dir}" includes="config*.jar"/>
- <fileset dir="${ivy.lib.dir}" includes="netty*.jar"/>
- <fileset dir="${ivy.lib.dir}" includes="jetty*.jar"/>
- <fileset dir="${ivy.lib.dir}" includes="metrics-core*.jar"/>
- <fileset dir="${ivy.lib.dir}" includes="jackson*.jar"/>
- <fileset dir="${ivy.lib.dir}" includes="metrics-json-*.jar"/>
- <fileset dir="${ivy.lib.dir}" includes="json4s-*.jar"/>
- <fileset dir="${ivy.lib.dir}" includes="javax.servlet-*.jar"/>
- <fileset dir="${ivy.lib.dir}" includes="reflectasm*.jar"/>
- <fileset dir="${ivy.lib.dir}" includes="mesos*.jar"/>
<fileset dir="${ivy.lib.dir}" includes="kryo-*.jar"/>
</copy>
</target>
+ <target name="copySparkDependencies">
+ <mkdir dir="${spark.lib.dir}" />
+ <copy todir="${spark.lib.dir}">
+ <fileset dir="${build.ivy.spark.lib.dir}/${ant.project.name}" includes="*.jar"/>
+ </copy>
+ </target>
+
<target name="copyh1Dependencies" unless="isHadoop23">
<mkdir dir="${lib.dir}/h1" />
<copy todir="${lib.dir}/h1">
@@ -1720,6 +1714,8 @@
<target name="ivy-compile" depends="ivy-resolve" description="Retrieve Ivy-managed artifacts for compile configuration">
<ivy:retrieve settingsRef="${ant.project.name}.ivy.settings" log="${loglevel}"
pattern="${build.ivy.lib.dir}/${ivy.artifact.retrieve.pattern}" conf="compile"/>
+ <ivy:retrieve settingsRef="${ant.project.name}.ivy.settings" log="${loglevel}"
+ pattern="${build.ivy.spark.lib.dir}/${ivy.artifact.retrieve.pattern}" conf="spark"/>
<ivy:cachepath pathid="compile.classpath" conf="compile"/>
</target>
Modified: pig/branches/spark/ivy.xml
URL: http://svn.apache.org/viewvc/pig/branches/spark/ivy.xml?rev=1703730&r1=1703729&r2=1703730&view=diff
==============================================================================
--- pig/branches/spark/ivy.xml (original)
+++ pig/branches/spark/ivy.xml Thu Sep 17 23:33:50 2015
@@ -42,6 +42,7 @@
<conf name="hadoop23" visibility="private"/>
<conf name="hbase94" visibility="private"/>
<conf name="hbase95" visibility="private"/>
+ <conf name="spark" visibility="private" />
</configurations>
<publications>
<artifact name="pig" conf="master"/>
@@ -435,16 +436,20 @@
<dependency org="com.twitter" name="parquet-pig-bundle" rev="${parquet-pig-bundle.version}" conf="compile->master"/>
<!-- for Spark integration -->
- <dependency org="org.apache.spark" name="spark-core_2.10" rev="${spark.version}" conf="compile->default">
+ <dependency org="org.apache.spark" name="spark-core_2.10" rev="${spark.version}" conf="spark->default">
<exclude org="org.eclipse.jetty.orbit" module="javax.servlet"/>
<exclude org="org.eclipse.jetty.orbit" module="javax.transaction"/>
<exclude org="org.eclipse.jetty.orbit" module="javax.mail.glassfish"/>
<exclude org="org.eclipse.jetty.orbit" module="javax.activation"/>
+ <exclude org="org.apache.hadoop" />
+ <exclude org="com.esotericsoftware.kryo" />
+ <exclude org="com.google.guava" />
</dependency>
- <dependency org="asm" name="asm" rev="${asm.version}" conf="compile->master">
+ <dependency org="org.apache.spark" name="spark-yarn_2.10" rev="${spark.version}" conf="spark->default">
+ <exclude org="org.apache.hadoop" />
</dependency>
- <dependency org="javax.servlet" name="javax.servlet-api" rev="3.0.1"
- conf="compile->master"/>
+ <dependency org="asm" name="asm" rev="${asm.version}" conf="compile->master"/>
+ <dependency org="javax.servlet" name="javax.servlet-api" rev="3.0.1" conf="spark->default"/>
<!-- for Tez integration -->
Modified: pig/branches/spark/ivy/libraries.properties
URL: http://svn.apache.org/viewvc/pig/branches/spark/ivy/libraries.properties?rev=1703730&r1=1703729&r2=1703730&view=diff
==============================================================================
--- pig/branches/spark/ivy/libraries.properties (original)
+++ pig/branches/spark/ivy/libraries.properties Thu Sep 17 23:33:50 2015
@@ -77,7 +77,7 @@ netty.version=3.2.2
rats-lib.version=0.5.1
slf4j-api.version=1.6.1
slf4j-log4j12.version=1.6.1
-spark.version=1.3.0
+spark.version=1.4.1
xerces.version=2.10.0
xalan.version=2.7.1
wagon-http.version=1.0-beta-2
@@ -94,7 +94,7 @@ jsr311-api.version=1.1.1
mockito.version=1.8.4
jansi.version=1.9
asm.version=3.3.1
-snappy-java.version=1.1.0.1
+snappy-java.version=1.1.1.7
tez.version=0.5.3
parquet-pig-bundle.version=1.2.3
snappy.version=0.2
Modified: pig/branches/spark/src/docs/src/documentation/content/xdocs/start.xml
URL: http://svn.apache.org/viewvc/pig/branches/spark/src/docs/src/documentation/content/xdocs/start.xml?rev=1703730&r1=1703729&r2=1703730&view=diff
==============================================================================
--- pig/branches/spark/src/docs/src/documentation/content/xdocs/start.xml (original)
+++ pig/branches/spark/src/docs/src/documentation/content/xdocs/start.xml Thu Sep 17 23:33:50 2015
@@ -91,46 +91,57 @@ Test the Pig installation with this simp
<!-- RUNNING PIG -->
<section id="run">
- <title>Running Pig </title>
+ <title>Running Pig </title>
<p>You can run Pig (execute Pig Latin statements and Pig commands) using various modes.</p>
<table>
<tr>
<td></td>
- <td><strong>Local Mode</strong></td>
- <td><strong>Tez Local Mode</strong></td>
- <td><strong>Mapreduce Mode</strong></td>
- <td><strong>Tez Mode</strong></td>
+ <td><strong>Interactive Mode </strong></td>
+ <td><strong>Batch Mode</strong> </td>
</tr>
<tr>
- <td><strong>Interactive Mode </strong></td>
- <td>yes</td>
- <td>experimental</td>
+ <td><strong>Local Mode</strong></td>
<td>yes</td>
<td>yes</td>
</tr>
+ <tr>
+ <td><strong>Mapreduce Mode</strong></td>
+ <td>yes</td>
+ <td>yes</td>
+ </tr>
<tr>
- <td><strong>Batch Mode</strong> </td>
- <td>yes</td>
+ <td><strong>Tez Local Mode</strong></td>
+ <td>experimental</td>
<td>experimental</td>
+ </tr>
+ <tr>
+ <td><strong>Tez Mode</strong></td>
<td>yes</td>
<td>yes</td>
</tr>
+ <tr>
+ <td><strong>Spark Mode</strong></td>
+ <td>experimental</td>
+ <td>experimental</td>
+ </tr>
</table>
-
+
<!-- ++++++++++++++++++++++++++++++++++ -->
<section id="execution-modes">
- <title>Execution Modes</title>
-<p>Pig has two execution modes or exectypes: </p>
+ <title>Execution Modes</title>
+<p>Pig has five execution modes or exectypes: </p>
<ul>
<li><strong>Local Mode</strong> - To run Pig in local mode, you need access to a single machine; all files are installed and run using your local host and file system. Specify local mode using the -x flag (pig -x local).
</li>
+<li><strong>Mapreduce Mode</strong> - To run Pig in mapreduce mode, you need access to a Hadoop cluster and HDFS installation. Mapreduce mode is the default mode; you can, <em>but don't need to</em>, specify it using the -x flag (pig OR pig -x mapreduce).
+</li>
<li><strong>Tez Local Mode</strong> - To run Pig in tez local mode. It is similar to local mode, except internally Pig will invoke tez runtime engine. Specify Tez local mode using the -x flag (pig -x tez_local).
<p><strong>Note:</strong> Tez local mode is experimental. There are some queries which just error out on bigger data in local mode.</p>
</li>
-<li><strong>Mapreduce Mode</strong> - To run Pig in mapreduce mode, you need access to a Hadoop cluster and HDFS installation. Mapreduce mode is the default mode; you can, <em>but don't need to</em>, specify it using the -x flag (pig OR pig -x mapreduce).
-</li>
<li><strong>Tez Mode</strong> - To run Pig in Tez mode, you need access to a Hadoop cluster and HDFS installation. Specify Tez mode using the -x flag (-x tez).
</li>
+<li><strong>Spark Mode</strong> - To run Pig in Spark mode, you need access to a Spark, Yarn or Mesos cluster and HDFS installation. Specify Spark mode using the -x flag (-x spark). In Spark execution mode, it is necessary to set env::SPARK_MASTER to an appropriate value (local - local mode, yarn-client - yarn-client mode, mesos://host:port - spark on mesos or spark://host:port - spark cluster. For more information refer to the Spark documentation on Master URLs; <em>yarn-cluster mode is currently not supported</em>)
+</li>
</ul>
<p></p>
@@ -156,6 +167,9 @@ $ pig -x mapreduce ...
/* Tez mode */
$ pig -x tez ...
+
+/* Spark mode */
+$ pig -x spark ...
</source>
</section>
@@ -180,7 +194,7 @@ grunt> dump B;
<source>
$ pig -x local
... - Connecting to ...
-grunt>
+grunt>
</source>
<p><strong>Tez Local Mode</strong></p>
@@ -209,6 +223,14 @@ $ pig -x tez
... - Connecting to ...
grunt>
</source>
+
+<p><strong>Spark Mode</strong> </p>
+<source>
+$ pig -x spark
+... - Connecting to ...
+grunt>
+</source>
+
</section>
</section>
@@ -248,10 +270,14 @@ $ pig -x mapreduce id.pig
<source>
$ pig -x tez id.pig
</source>
+<p><strong>Spark Mode</strong> </p>
+<source>
+$ pig -x spark id.pig
+</source>
</section>
<!-- ==================================================================== -->
-
+
<!-- PIG SCRIPTS -->
<section id="pig-scripts">
<title>Pig Scripts</title>