Posted to commits@spark.apache.org by an...@apache.org on 2015/04/14 22:41:44 UTC

spark git commit: [SPARK-5808] [build] Package pyspark files in sbt assembly.

Repository: spark
Updated Branches:
  refs/heads/master 6adb8bcbf -> 65774370a


[SPARK-5808] [build] Package pyspark files in sbt assembly.

This turned out to be more complicated than I wanted because the
layout of python/ doesn't really follow the usual Maven conventions,
so some extra code is needed to copy just the right files.
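
For context, the mechanism the fix relies on is sbt's resource
generators: a task writes files under resourceManaged and returns the
files it produced, and sbt packages whatever is returned alongside the
compiled classes. That is how the copied .py files end up inside the
assembly jar. A minimal sketch of the idiom, in the sbt 0.13 syntax
SparkBuild.scala used at the time (the file name here is illustrative
only, not part of the commit):

    // Assumes import sbt._ and import Keys._, as in SparkBuild.scala.
    // Every File returned from the task gets packaged with the classes.
    resourceGenerators in Compile <+= resourceManaged in Compile map { outDir: File =>
      val out = new File(outDir, "example.txt")  // hypothetical file
      IO.write(out, "created at build time")
      Seq(out)
    }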

Author: Marcelo Vanzin <va...@cloudera.com>

Closes #5461 from vanzin/SPARK-5808 and squashes the following commits:

7153dac [Marcelo Vanzin] Only try to create resource dir if it doesn't already exist.
ee90e84 [Marcelo Vanzin] [SPARK-5808] [build] Package pyspark files in sbt assembly.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/65774370
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/65774370
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/65774370

Branch: refs/heads/master
Commit: 65774370a1275e25cd8a3357e397d116767793a9
Parents: 6adb8bc
Author: Marcelo Vanzin <va...@cloudera.com>
Authored: Tue Apr 14 13:41:38 2015 -0700
Committer: Andrew Or <an...@databricks.com>
Committed: Tue Apr 14 13:41:38 2015 -0700

----------------------------------------------------------------------
 project/SparkBuild.scala | 60 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 59 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/65774370/project/SparkBuild.scala
----------------------------------------------------------------------
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
index 5f51f4b..09b4976 100644
--- a/project/SparkBuild.scala
+++ b/project/SparkBuild.scala
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-import java.io.File
+import java.io._
 
 import scala.util.Properties
 import scala.collection.JavaConversions._
@@ -166,6 +166,9 @@ object SparkBuild extends PomBuild {
   /* Enable Assembly for all assembly projects */
   assemblyProjects.foreach(enable(Assembly.settings))
 
+  /* Package pyspark artifacts in the main assembly. */
+  enable(PySparkAssembly.settings)(assembly)
+
   /* Enable unidoc only for the root spark project */
   enable(Unidoc.settings)(spark)
 
@@ -316,6 +319,7 @@ object Hive {
 }
 
 object Assembly {
+  import sbtassembly.AssemblyUtils._
   import sbtassembly.Plugin._
   import AssemblyKeys._
 
@@ -347,6 +351,60 @@ object Assembly {
   )
 }
 
+object PySparkAssembly {
+  import sbtassembly.Plugin._
+  import AssemblyKeys._
+
+  lazy val settings = Seq(
+    unmanagedJars in Compile += { BuildCommons.sparkHome / "python/lib/py4j-0.8.2.1-src.zip" },
+    // Use a resource generator to copy all .py files from python/pyspark into a managed directory
+    // to be included in the assembly. We can't just add "python/" to the assembly's resource dir
+    // list since that will copy unneeded / unwanted files.
+    resourceGenerators in Compile <+= resourceManaged in Compile map { outDir: File =>
+      val dst = new File(outDir, "pyspark")
+      if (!dst.isDirectory()) {
+        require(dst.mkdirs())
+      }
+
+      val src = new File(BuildCommons.sparkHome, "python/pyspark")
+      copy(src, dst)
+    }
+  )
+
+  private def copy(src: File, dst: File): Seq[File] = {
+    src.listFiles().flatMap { f =>
+      val child = new File(dst, f.getName())
+      if (f.isDirectory()) {
+        child.mkdir()
+        copy(f, child)
+      } else if (f.getName().endsWith(".py")) {
+        var in: Option[FileInputStream] = None
+        var out: Option[FileOutputStream] = None
+        try {
+          in = Some(new FileInputStream(f))
+          out = Some(new FileOutputStream(child))
+
+          val bytes = new Array[Byte](1024)
+          var read = 0
+          while (read >= 0) {
+            read = in.get.read(bytes)
+            if (read > 0) {
+              out.get.write(bytes, 0, read)
+            }
+          }
+
+          Some(child)
+        } finally {
+          in.foreach(_.close())
+          out.foreach(_.close())
+        }
+      } else {
+        None
+      }
+    }
+  }
+}
+
 object Unidoc {
 
   import BuildCommons._

