Posted to commits@pig.apache.org by xu...@apache.org on 2015/09/18 01:33:51 UTC

svn commit: r1703730 - in /pig/branches/spark: bin/pig build.xml ivy.xml ivy/libraries.properties src/docs/src/documentation/content/xdocs/start.xml

Author: xuefu
Date: Thu Sep 17 23:33:50 2015
New Revision: 1703730

URL: http://svn.apache.org/viewvc?rev=1703730&view=rev
Log:
PIG-4667: Enable Pig on Spark to run on Yarn Client mode (Srikanth via Xuefu)

Modified:
    pig/branches/spark/bin/pig
    pig/branches/spark/build.xml
    pig/branches/spark/ivy.xml
    pig/branches/spark/ivy/libraries.properties
    pig/branches/spark/src/docs/src/documentation/content/xdocs/start.xml

Modified: pig/branches/spark/bin/pig
URL: http://svn.apache.org/viewvc/pig/branches/spark/bin/pig?rev=1703730&r1=1703729&r2=1703730&view=diff
==============================================================================
--- pig/branches/spark/bin/pig (original)
+++ pig/branches/spark/bin/pig Thu Sep 17 23:33:50 2015
@@ -361,6 +361,38 @@ if [ "$includeHCatalog" == "true" ]; the
   PIG_OPTS="$PIG_OPTS -Dpig.additional.jars.uris=$ADDITIONAL_CLASSPATHS"
 fi
 
+################# ADDING SPARK DEPENDENCIES ##################
+# Spark typically works with a single assembly file. However, this
+# assembly isn't available as an artifact to pull in via Ivy.
+# To work around this shortcoming, we ship all the jars except
+# spark-yarn to the cluster through dist-files and then add them to
+# the executors' classpath through an independent env variable.
+# spark-yarn is excluded because it is already being added by the
+# Spark YARN client via jarOf(Client.class)
+
+for f in $PIG_HOME/lib/spark/*.jar; do
+    if [[ $f == $PIG_HOME/lib/spark/spark-yarn* ]]; then
+        # Exclude spark-yarn.jar from shipped jars, but retain in classpath
+        SPARK_JARS=${SPARK_JARS}:$f;
+    else
+        SPARK_JARS=${SPARK_JARS}:$f;
+        SPARK_YARN_DIST_FILES=${SPARK_YARN_DIST_FILES},file://$f;
+        SPARK_DIST_CLASSPATH=${SPARK_DIST_CLASSPATH}:\${PWD}/`basename $f`
+    fi
+done
+
+for f in $PIG_HOME/lib/*.jar; do
+    SPARK_JARS=${SPARK_JARS}:$f;
+    SPARK_YARN_DIST_FILES=${SPARK_YARN_DIST_FILES},file://$f;
+    SPARK_DIST_CLASSPATH=${SPARK_DIST_CLASSPATH}:\${PWD}/`basename $f`
+done
+CLASSPATH=${CLASSPATH}:${SPARK_JARS}
+
+export SPARK_YARN_DIST_FILES=`echo ${SPARK_YARN_DIST_FILES} | sed 's/^,//g'`
+export SPARK_JARS=${SPARK_YARN_DIST_FILES}
+export SPARK_DIST_CLASSPATH
+################# ADDING SPARK DEPENDENCIES ##################
+
 # run it
 if [ -n "$HADOOP_BIN" ]; then
     if [ "$debug" == "true" ]; then
@@ -389,6 +421,12 @@ if [ -n "$HADOOP_BIN" ]; then
         CLASSPATH=${CLASSPATH}:$f;
     done
 
+    ###### Set Spark related env #####
+
+    export SPARK_PIG_JAR=${PIG_JAR}
+
+    ###### Set Spark related env #####
+
     export HADOOP_CLASSPATH=$CLASSPATH:$HADOOP_CLASSPATH
     export HADOOP_CLIENT_OPTS="$JAVA_HEAP_MAX $PIG_OPTS $HADOOP_CLIENT_OPTS"
     if [ "$debug" == "true" ]; then
@@ -425,6 +463,12 @@ else
         echo "Cannot find local hadoop installation, using bundled `java -cp $CLASSPATH org.apache.hadoop.util.VersionInfo | head -1`"
     fi
 
+    ###### Set Spark related env #####
+
+    export SPARK_PIG_JAR=${PIG_JAR}
+
+    ###### Set Spark related env #####
+
     CLASS=org.apache.pig.Main
     if [ "$debug" == "true" ]; then
         echo "dry run:"

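For illustration, here is roughly what the shipping block above produces at runtime, and how a yarn-client run is launched afterwards. The PIG_HOME path and jar names are hypothetical, and the launch sequence follows the start.xml changes later in this commit:

# Sketch only: assuming PIG_HOME=/opt/pig, after the loops and exports the
# environment looks roughly like this (values abbreviated):
#
#   SPARK_YARN_DIST_FILES=file:///opt/pig/lib/spark/scala-library-2.10.4.jar,file:///opt/pig/lib/pig-0.15.0.jar,...
#   SPARK_JARS=$SPARK_YARN_DIST_FILES    # reassigned after the colon-separated form extended CLASSPATH
#   SPARK_DIST_CLASSPATH=:${PWD}/scala-library-2.10.4.jar:${PWD}/pig-0.15.0.jar:...
#
# spark-yarn*.jar lands only on the classpath, never in the dist files,
# for the reason given in the comment block above.

# Launching Pig on Spark in yarn-client mode:
export SPARK_MASTER=yarn-client
pig -x spark id.pig
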
Modified: pig/branches/spark/build.xml
URL: http://svn.apache.org/viewvc/pig/branches/spark/build.xml?rev=1703730&r1=1703729&r2=1703730&view=diff
==============================================================================
--- pig/branches/spark/build.xml (original)
+++ pig/branches/spark/build.xml Thu Sep 17 23:33:50 2015
@@ -39,6 +39,7 @@
 
     <!-- source properties -->
     <property name="lib.dir" value="${basedir}/lib" />
+    <property name="spark.lib.dir" value="${basedir}/lib/spark" />
     <property name="src.dir" value="${basedir}/src" />
     <property name="python.src.dir" value="${src.dir}/python" />
     <property name="src.lib.dir" value="${basedir}/lib-src" />
@@ -256,6 +257,7 @@
     <property name="build.dir" location="build" />
     <property name="build.ivy.dir" location="${build.dir}/ivy" />
     <property name="build.ivy.lib.dir" location="${build.ivy.dir}/lib" />
+    <property name="build.ivy.spark.lib.dir" location="${build.ivy.dir}/lib/spark" />
     <property name="ivy.lib.dir" location="${build.ivy.lib.dir}/${ant.project.name}"/>
     <property name="build.ivy.report.dir" location="${build.ivy.dir}/report" />
     <property name="build.ivy.maven.dir" location="${build.ivy.dir}/maven" />
@@ -357,6 +359,7 @@
         <path refid="compile.classpath"/>
         <fileset file="${ivy.lib.dir}/${zookeeper.jarfile}"/>
         <fileset dir="${ivy.lib.dir}" includes="*.jar"/>
+        <fileset dir="${build.ivy.spark.lib.dir}/${ant.project.name}" includes="*.jar"/>
     </path>
 
     <!-- javadoc-classpath -->
@@ -719,6 +722,7 @@
         <buildJar svnString="${svn.revision}" outputFile="${output.jarfile.core}" includedJars="core.dependencies.jar"/>
         <buildJar svnString="${svn.revision}" outputFile="${output.jarfile.withouthadoop}" includedJars="runtime.dependencies-withouthadoop.jar"/>
         <antcall target="copyCommonDependencies"/>
+        <antcall target="copySparkDependencies"/>
         <antcall target="copyh1Dependencies"/>
         <antcall target="copyh2Dependencies"/>
     </target>
@@ -750,27 +754,17 @@
             <fileset dir="${ivy.lib.dir}" includes="zookeeper-*.jar"/>
             <fileset dir="${ivy.lib.dir}" includes="accumulo-*.jar" excludes="accumulo-minicluster*.jar"/>
             <fileset dir="${ivy.lib.dir}" includes="json-simple-*.jar"/>
-            <fileset dir="${ivy.lib.dir}" includes="spark*.jar"/>
-            <fileset dir="${ivy.lib.dir}" includes="scala*.jar"/>
-            <fileset dir="${ivy.lib.dir}" includes="akka*.jar"/>
-            <fileset dir="${ivy.lib.dir}" includes="jcl-over-slf4j*.jar"/>
-            <fileset dir="${ivy.lib.dir}" includes="jul-to-slf4j*.jar"/>
-            <fileset dir="${ivy.lib.dir}" includes="slf4j*.jar"/>
-            <fileset dir="${ivy.lib.dir}" includes="commons-lang3*.jar"/>
-            <fileset dir="${ivy.lib.dir}" includes="config*.jar"/>
-            <fileset dir="${ivy.lib.dir}" includes="netty*.jar"/>
-            <fileset dir="${ivy.lib.dir}" includes="jetty*.jar"/>
-            <fileset dir="${ivy.lib.dir}" includes="metrics-core*.jar"/>
-            <fileset dir="${ivy.lib.dir}" includes="jackson*.jar"/>
-            <fileset dir="${ivy.lib.dir}" includes="metrics-json-*.jar"/>
-            <fileset dir="${ivy.lib.dir}" includes="json4s-*.jar"/>
-            <fileset dir="${ivy.lib.dir}" includes="javax.servlet-*.jar"/>
-            <fileset dir="${ivy.lib.dir}" includes="reflectasm*.jar"/>
-            <fileset dir="${ivy.lib.dir}" includes="mesos*.jar"/>
             <fileset dir="${ivy.lib.dir}" includes="kryo-*.jar"/>
         </copy>
     </target>
 
+    <target name="copySparkDependencies">
+        <mkdir dir="${spark.lib.dir}" />
+        <copy todir="${spark.lib.dir}">
+            <fileset dir="${build.ivy.spark.lib.dir}/${ant.project.name}" includes="*.jar"/>
+        </copy>
+    </target>
+
     <target name="copyh1Dependencies" unless="isHadoop23">
         <mkdir dir="${lib.dir}/h1" />
         <copy todir="${lib.dir}/h1">
@@ -1720,6 +1714,8 @@
      <target name="ivy-compile" depends="ivy-resolve" description="Retrieve Ivy-managed artifacts for compile configuration">
        <ivy:retrieve settingsRef="${ant.project.name}.ivy.settings" log="${loglevel}"
                  pattern="${build.ivy.lib.dir}/${ivy.artifact.retrieve.pattern}" conf="compile"/>
+       <ivy:retrieve settingsRef="${ant.project.name}.ivy.settings" log="${loglevel}"
+                 pattern="${build.ivy.spark.lib.dir}/${ivy.artifact.retrieve.pattern}" conf="spark"/>
        <ivy:cachepath pathid="compile.classpath" conf="compile"/>
      </target>
 

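To make the build flow concrete: ivy-compile now retrieves the new spark conf into ${build.ivy.spark.lib.dir}/pig, and copySparkDependencies then copies those jars into lib/spark, the directory bin/pig scans above. A sketch, assuming a Hadoop 2 build (the hadoopversion value is an assumption, not part of this change):

# Build the jars; ivy-compile retrieves conf="spark", then copySparkDependencies runs:
ant -Dhadoopversion=23 jar
# Jars retrieved for the spark conf:
ls build/ivy/lib/spark/pig/
# The same jars, copied to where bin/pig picks them up:
ls lib/spark/
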
Modified: pig/branches/spark/ivy.xml
URL: http://svn.apache.org/viewvc/pig/branches/spark/ivy.xml?rev=1703730&r1=1703729&r2=1703730&view=diff
==============================================================================
--- pig/branches/spark/ivy.xml (original)
+++ pig/branches/spark/ivy.xml Thu Sep 17 23:33:50 2015
@@ -42,6 +42,7 @@
     <conf name="hadoop23" visibility="private"/>
     <conf name="hbase94" visibility="private"/>
     <conf name="hbase95" visibility="private"/>
+    <conf name="spark" visibility="private" />
   </configurations>
   <publications>
     <artifact name="pig" conf="master"/>
@@ -435,16 +436,20 @@
     <dependency org="com.twitter" name="parquet-pig-bundle" rev="${parquet-pig-bundle.version}" conf="compile->master"/>
 
     <!-- for Spark integration -->
-    <dependency org="org.apache.spark" name="spark-core_2.10" rev="${spark.version}" conf="compile->default">
+    <dependency org="org.apache.spark" name="spark-core_2.10" rev="${spark.version}" conf="spark->default">
         <exclude org="org.eclipse.jetty.orbit" module="javax.servlet"/>
         <exclude org="org.eclipse.jetty.orbit" module="javax.transaction"/>
         <exclude org="org.eclipse.jetty.orbit" module="javax.mail.glassfish"/>
         <exclude org="org.eclipse.jetty.orbit" module="javax.activation"/>
+        <exclude org="org.apache.hadoop" />
+        <exclude org="com.esotericsoftware.kryo" />
+        <exclude org="com.google.guava" />
     </dependency>
-    <dependency org="asm" name="asm" rev="${asm.version}" conf="compile->master">
+    <dependency org="org.apache.spark" name="spark-yarn_2.10" rev="${spark.version}" conf="spark->default">
+        <exclude org="org.apache.hadoop" />
     </dependency>
-    <dependency org="javax.servlet" name="javax.servlet-api" rev="3.0.1"
-             conf="compile->master"/>
+    <dependency org="asm" name="asm" rev="${asm.version}" conf="compile->master"/>
+    <dependency org="javax.servlet" name="javax.servlet-api" rev="3.0.1" conf="spark->default"/>
 
 
     <!-- for Tez integration -->

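The new excludes keep Hadoop, Kryo and Guava out of the spark conf, since Pig supplies its own copies and duplicates on the executor classpath would conflict. A quick sanity check after a resolve, sketched under the assumption that ant.project.name=pig and the retrieve pattern from build.xml is in effect:

ant ivy-compile
# Expect no matches here; any hit means an exclusion leaked:
ls build/ivy/lib/spark/pig/*.jar | grep -Ei 'hadoop|kryo|guava' || echo "exclusions hold"
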
Modified: pig/branches/spark/ivy/libraries.properties
URL: http://svn.apache.org/viewvc/pig/branches/spark/ivy/libraries.properties?rev=1703730&r1=1703729&r2=1703730&view=diff
==============================================================================
--- pig/branches/spark/ivy/libraries.properties (original)
+++ pig/branches/spark/ivy/libraries.properties Thu Sep 17 23:33:50 2015
@@ -77,7 +77,7 @@ netty.version=3.2.2
 rats-lib.version=0.5.1
 slf4j-api.version=1.6.1
 slf4j-log4j12.version=1.6.1
-spark.version=1.3.0
+spark.version=1.4.1
 xerces.version=2.10.0
 xalan.version=2.7.1
 wagon-http.version=1.0-beta-2
@@ -94,7 +94,7 @@ jsr311-api.version=1.1.1
 mockito.version=1.8.4
 jansi.version=1.9
 asm.version=3.3.1
-snappy-java.version=1.1.0.1
+snappy-java.version=1.1.1.7
 tez.version=0.5.3
 parquet-pig-bundle.version=1.2.3
 snappy.version=0.2

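After the version bumps, a quick way to confirm which Spark and snappy-java jars were actually resolved; the directory layout here follows the earlier notes and is an assumption, not part of the change:

# Spark 1.4.1 artifacts copied for the spark conf:
ls lib/spark | grep -E 'spark-(core|yarn)_2.10-1.4.1'
# snappy-java resolved for the main confs (expect 1.1.1.7):
ls build/ivy/lib/pig | grep snappy-java
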
Modified: pig/branches/spark/src/docs/src/documentation/content/xdocs/start.xml
URL: http://svn.apache.org/viewvc/pig/branches/spark/src/docs/src/documentation/content/xdocs/start.xml?rev=1703730&r1=1703729&r2=1703730&view=diff
==============================================================================
--- pig/branches/spark/src/docs/src/documentation/content/xdocs/start.xml (original)
+++ pig/branches/spark/src/docs/src/documentation/content/xdocs/start.xml Thu Sep 17 23:33:50 2015
@@ -91,46 +91,57 @@ Test the Pig installation with this simp
     
    <!-- RUNNING PIG  -->
    <section id="run">
-	<title>Running Pig </title> 
+	<title>Running Pig </title>
 	<p>You can run Pig (execute Pig Latin statements and Pig commands) using various modes.</p>
 	<table>
 	<tr>
 	<td></td>
-    <td><strong>Local Mode</strong></td>
-    <td><strong>Tez Local Mode</strong></td>
-    <td><strong>Mapreduce Mode</strong></td>
-    <td><strong>Tez Mode</strong></td>
+    <td><strong>Interactive Mode </strong></td>
+    <td><strong>Batch Mode</strong> </td>
 	</tr>
 	<tr>
-	<td><strong>Interactive Mode </strong></td>
-    <td>yes</td>
-    <td>experimental</td>
+    <td><strong>Local Mode</strong></td>
     <td>yes</td>
     <td>yes</td>
 	</tr>
+  <tr>
+     <td><strong>Mapreduce Mode</strong></td>
+     <td>yes</td>
+     <td>yes</td>
+  </tr>
 	<tr>
-	<td><strong>Batch Mode</strong> </td>
-    <td>yes</td>
+    <td><strong>Tez Local Mode</strong></td>
+    <td>experimental</td>
     <td>experimental</td>
+	</tr>
+	<tr>
+    <td><strong>Tez Mode</strong></td>
     <td>yes</td>
     <td>yes</td>
 	</tr>
+	<tr>
+    <td><strong>Spark Mode</strong></td>
+    <td>experimental</td>
+    <td>experimental</td>
+	</tr>
 	</table>
-	
+
 	<!-- ++++++++++++++++++++++++++++++++++ -->
 	   <section id="execution-modes">
-	<title>Execution Modes</title> 
-<p>Pig has two execution modes or exectypes: </p>
+	<title>Execution Modes</title>
+<p>Pig has five execution modes or exectypes: </p>
 <ul>
 <li><strong>Local Mode</strong> - To run Pig in local mode, you need access to a single machine; all files are installed and run using your local host and file system. Specify local mode using the -x flag (pig -x local).
 </li>
+<li><strong>Mapreduce Mode</strong> - To run Pig in mapreduce mode, you need access to a Hadoop cluster and HDFS installation. Mapreduce mode is the default mode; you can, <em>but don't need to</em>, specify it using the -x flag (pig OR pig -x mapreduce).
+</li>
 <li><strong>Tez Local Mode</strong> - Tez local mode is similar to local mode, except that internally Pig invokes the Tez runtime engine. Specify Tez local mode using the -x flag (pig -x tez_local).
 <p><strong>Note:</strong> Tez local mode is experimental. There are some queries that simply error out on bigger data in local mode.</p>
 </li>
-<li><strong>Mapreduce Mode</strong> - To run Pig in mapreduce mode, you need access to a Hadoop cluster and HDFS installation. Mapreduce mode is the default mode; you can, <em>but don't need to</em>, specify it using the -x flag (pig OR pig -x mapreduce).
-</li>
 <li><strong>Tez Mode</strong> - To run Pig in Tez mode, you need access to a Hadoop cluster and HDFS installation. Specify Tez mode using the -x flag (-x tez).
 </li>
+<li><strong>Spark Mode</strong> - To run Pig in Spark mode, you need access to a Spark, Yarn or Mesos cluster and an HDFS installation. Specify Spark mode using the -x flag (-x spark). In Spark execution mode, it is necessary to set the SPARK_MASTER environment variable to an appropriate value: local (local mode), yarn-client (yarn-client mode), mesos://host:port (Spark on Mesos) or spark://host:port (Spark cluster); for more information refer to the Spark documentation on Master URLs. <em>yarn-cluster mode is currently not supported.</em>
+</li>
 </ul>
 <p></p>
 
@@ -156,6 +167,9 @@ $ pig -x mapreduce ...
 
 /* Tez mode */
 $ pig -x tez ...
+
+/* Spark mode */
+$ pig -x spark ...
 </source>
 
 </section>
@@ -180,7 +194,7 @@ grunt&gt; dump B;
 <source>
 $ pig -x local
 ... - Connecting to ...
-grunt> 
+grunt>
 </source>
 
 <p><strong>Tez Local Mode</strong></p>
@@ -209,6 +223,14 @@ $ pig -x tez
 ... - Connecting to ...
 grunt> 
 </source>
+
+<p><strong>Spark Mode</strong> </p>
+<source>
+$ pig -x spark
+... - Connecting to ...
+grunt>
+</source>
+
 </section>
 </section>
 
@@ -248,10 +270,14 @@ $ pig -x mapreduce id.pig
 <source>
 $ pig -x tez id.pig
 </source>
+<p><strong>Spark Mode</strong> </p>
+<source>
+$ pig -x spark id.pig
+</source>
 </section>
 
   <!-- ==================================================================== -->
-    
+
    <!-- PIG SCRIPTS -->
    <section id="pig-scripts">
 	<title>Pig Scripts</title>