You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by sr...@apache.org on 2016/12/06 22:09:38 UTC

spark git commit: [SPARK-18652][PYTHON] Include the example data and third-party licenses in pyspark package.

Repository: spark
Updated Branches:
  refs/heads/master eeed38eaf -> bd9a4a5ac


[SPARK-18652][PYTHON] Include the example data and third-party licenses in pyspark package.

## What changes were proposed in this pull request?

Since we already include the Python examples in the pyspark package, we should include the example data along with them.

We should also include the third-party licenses, since we distribute the corresponding third-party jars with the pyspark package.

## How was this patch tested?

Manually tested with Python 2.7 and Python 3.4:
```sh
$ ./build/mvn -DskipTests -Phive -Phive-thriftserver -Pyarn -Pmesos clean package
$ cd python
$ python setup.py sdist
$ pip install  dist/pyspark-2.1.0.dev0.tar.gz

$ ls -1 /usr/local/lib/python2.7/dist-packages/pyspark/data/
graphx
mllib
streaming

$ du -sh /usr/local/lib/python2.7/dist-packages/pyspark/data/
600K    /usr/local/lib/python2.7/dist-packages/pyspark/data/

$ ls -1  /usr/local/lib/python2.7/dist-packages/pyspark/licenses/|head -5
LICENSE-AnchorJS.txt
LICENSE-DPark.txt
LICENSE-Mockito.txt
LICENSE-SnapTree.txt
LICENSE-antlr.txt
```

Author: Shuai Lin <li...@gmail.com>

Closes #16082 from lins05/include-data-in-pyspark-dist.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bd9a4a5a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bd9a4a5a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bd9a4a5a

Branch: refs/heads/master
Commit: bd9a4a5ac3abcc48131d1249df55e7d68266343a
Parents: eeed38e
Author: Shuai Lin <li...@gmail.com>
Authored: Wed Dec 7 06:09:27 2016 +0800
Committer: Sean Owen <so...@cloudera.com>
Committed: Wed Dec 7 06:09:27 2016 +0800

----------------------------------------------------------------------
 python/MANIFEST.in |  2 ++
 python/setup.py    | 20 +++++++++++++++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/bd9a4a5a/python/MANIFEST.in
----------------------------------------------------------------------
diff --git a/python/MANIFEST.in b/python/MANIFEST.in
index bbcce1b..40f1fb2 100644
--- a/python/MANIFEST.in
+++ b/python/MANIFEST.in
@@ -17,6 +17,8 @@
 global-exclude *.py[cod] __pycache__ .DS_Store
 recursive-include deps/jars *.jar
 graft deps/bin
+recursive-include deps/data *.data *.txt
+recursive-include deps/licenses *.txt
 recursive-include deps/examples *.py
 recursive-include lib *.zip
 include README.md

http://git-wip-us.apache.org/repos/asf/spark/blob/bd9a4a5a/python/setup.py
----------------------------------------------------------------------
diff --git a/python/setup.py b/python/setup.py
index 625aea0..bc2eb4c 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -69,10 +69,14 @@ elif len(JARS_PATH) == 0 and not os.path.exists(TEMP_PATH):
 
 EXAMPLES_PATH = os.path.join(SPARK_HOME, "examples/src/main/python")
 SCRIPTS_PATH = os.path.join(SPARK_HOME, "bin")
+DATA_PATH = os.path.join(SPARK_HOME, "data")
+LICENSES_PATH = os.path.join(SPARK_HOME, "licenses")
+
 SCRIPTS_TARGET = os.path.join(TEMP_PATH, "bin")
 JARS_TARGET = os.path.join(TEMP_PATH, "jars")
 EXAMPLES_TARGET = os.path.join(TEMP_PATH, "examples")
-
+DATA_TARGET = os.path.join(TEMP_PATH, "data")
+LICENSES_TARGET = os.path.join(TEMP_PATH, "licenses")
 
 # Check and see if we are under the spark path in which case we need to build the symlink farm.
 # This is important because we only want to build the symlink farm while under Spark otherwise we
@@ -114,11 +118,15 @@ try:
             os.symlink(JARS_PATH, JARS_TARGET)
             os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET)
             os.symlink(EXAMPLES_PATH, EXAMPLES_TARGET)
+            os.symlink(DATA_PATH, DATA_TARGET)
+            os.symlink(LICENSES_PATH, LICENSES_TARGET)
         else:
             # For windows fall back to the slower copytree
             copytree(JARS_PATH, JARS_TARGET)
             copytree(SCRIPTS_PATH, SCRIPTS_TARGET)
             copytree(EXAMPLES_PATH, EXAMPLES_TARGET)
+            copytree(DATA_PATH, DATA_TARGET)
+            copytree(LICENSES_PATH, LICENSES_TARGET)
     else:
         # If we are not inside of SPARK_HOME verify we have the required symlink farm
         if not os.path.exists(JARS_TARGET):
@@ -161,18 +169,24 @@ try:
                   'pyspark.jars',
                   'pyspark.python.pyspark',
                   'pyspark.python.lib',
+                  'pyspark.data',
+                  'pyspark.licenses',
                   'pyspark.examples.src.main.python'],
         include_package_data=True,
         package_dir={
             'pyspark.jars': 'deps/jars',
             'pyspark.bin': 'deps/bin',
             'pyspark.python.lib': 'lib',
+            'pyspark.data': 'deps/data',
+            'pyspark.licenses': 'deps/licenses',
             'pyspark.examples.src.main.python': 'deps/examples',
         },
         package_data={
             'pyspark.jars': ['*.jar'],
             'pyspark.bin': ['*'],
             'pyspark.python.lib': ['*.zip'],
+            'pyspark.data': ['*.txt', '*.data'],
+            'pyspark.licenses': ['*.txt'],
             'pyspark.examples.src.main.python': ['*.py', '*/*.py']},
         scripts=scripts,
         license='http://www.apache.org/licenses/LICENSE-2.0',
@@ -202,8 +216,12 @@ finally:
             os.remove(os.path.join(TEMP_PATH, "jars"))
             os.remove(os.path.join(TEMP_PATH, "bin"))
             os.remove(os.path.join(TEMP_PATH, "examples"))
+            os.remove(os.path.join(TEMP_PATH, "data"))
+            os.remove(os.path.join(TEMP_PATH, "licenses"))
         else:
             rmtree(os.path.join(TEMP_PATH, "jars"))
             rmtree(os.path.join(TEMP_PATH, "bin"))
             rmtree(os.path.join(TEMP_PATH, "examples"))
+            rmtree(os.path.join(TEMP_PATH, "data"))
+            rmtree(os.path.join(TEMP_PATH, "licenses"))
         os.rmdir(TEMP_PATH)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org