Posted to commits@spark.apache.org by ma...@apache.org on 2014/07/26 10:07:11 UTC

git commit: [SPARK-2652] [PySpark] Tuning some default configs for PySpark

Repository: spark
Updated Branches:
  refs/heads/master 66f26a461 -> 75663b57f


[SPARK-2652] [PySpark] Tuning some default configs for PySpark

Add several default configs for PySpark related to serialization in the JVM:

spark.serializer = org.apache.spark.serializer.KryoSerializer
spark.serializer.objectStreamReset = 100
spark.rdd.compress = True

This helps reduce memory usage during RDD.partitionBy().
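
As a minimal sketch (master URL and app name below are hypothetical), these defaults are applied with setIfMissing(), so any value set explicitly on the user's SparkConf still takes precedence:

    from pyspark import SparkConf, SparkContext

    # Explicit user settings win: the PySpark defaults are only applied with
    # setIfMissing(), so spark.serializer stays JavaSerializer here, while
    # spark.serializer.objectStreamReset and spark.rdd.compress fall back to
    # the new PySpark defaults.
    conf = (SparkConf()
            .setMaster("local[2]")      # hypothetical master
            .setAppName("conf-demo")    # hypothetical app name
            .set("spark.serializer",
                 "org.apache.spark.serializer.JavaSerializer"))
    sc = SparkContext(conf=conf)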

Author: Davies Liu <da...@gmail.com>

Closes #1568 from davies/conf and squashes the following commits:

cd316f1 [Davies Liu] remove duplicated line
f71a355 [Davies Liu] rebase to master, add spark.rdd.compress = True
8f63f45 [Davies Liu] Merge branch 'master' into conf
8bc9f08 [Davies Liu] fix unittest
c04a83d [Davies Liu] some default configs for PySpark


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/75663b57
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/75663b57
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/75663b57

Branch: refs/heads/master
Commit: 75663b57f90bb173f0c6c288944ec568c4719b2a
Parents: 66f26a4
Author: Davies Liu <da...@gmail.com>
Authored: Sat Jul 26 01:07:08 2014 -0700
Committer: Matei Zaharia <ma...@databricks.com>
Committed: Sat Jul 26 01:07:08 2014 -0700

----------------------------------------------------------------------
 python/pyspark/context.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/75663b57/python/pyspark/context.py
----------------------------------------------------------------------
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index bdf14ea..e8ac989 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -37,6 +37,15 @@ from pyspark.rdd import RDD
 from py4j.java_collections import ListConverter
 
 
+# These are special default configs for PySpark; they overwrite the
+# Spark defaults unless the user has configured them explicitly.
+DEFAULT_CONFIGS = {
+    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
+    "spark.serializer.objectStreamReset": 100,
+    "spark.rdd.compress": True,
+}
+
+
 class SparkContext(object):
     """
     Main entry point for Spark functionality. A SparkContext represents the
@@ -101,7 +110,7 @@ class SparkContext(object):
         else:
             self.serializer = BatchedSerializer(self._unbatched_serializer,
                                                 batchSize)
-        self._conf.setIfMissing("spark.rdd.compress", "true")
+
         # Set any parameters passed directly to us on the conf
         if master:
             self._conf.setMaster(master)
@@ -112,6 +121,8 @@ class SparkContext(object):
         if environment:
             for key, value in environment.iteritems():
                 self._conf.setExecutorEnv(key, value)
+        for key, value in DEFAULT_CONFIGS.items():
+            self._conf.setIfMissing(key, value)
 
         # Check that we have at least the required parameters
         if not self._conf.contains("spark.master"):
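
A rough way to confirm which values end up in the effective configuration of the SparkContext created above (sc._conf is an internal attribute, used here only for illustration):

    for key in ("spark.serializer",
                "spark.serializer.objectStreamReset",
                "spark.rdd.compress"):
        print("%s = %s" % (key, sc._conf.get(key)))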