You are viewing a plain text version of this content. The canonical (HTML) version, with working links, is available in the mailing-list archive.
Posted to commits@spark.apache.org by ma...@apache.org on 2014/07/25 03:15:47 UTC
git commit: [SPARK-2014] Make PySpark store RDDs in MEMORY_ONLY_SER
with compression by default
Repository: spark
Updated Branches:
refs/heads/master a45d5480f -> eff9714e1
[SPARK-2014] Make PySpark store RDDs in MEMORY_ONLY_SER with compression by default
Author: Prashant Sharma <pr...@imaginea.com>
Closes #1051 from ScrapCodes/SPARK-2014/pyspark-cache and squashes the following commits:
f192df7 [Prashant Sharma] Code Review
2a2f43f [Prashant Sharma] [SPARK-2014] Make PySpark store RDDs in MEMORY_ONLY_SER with compression by default
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eff9714e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eff9714e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eff9714e
Branch: refs/heads/master
Commit: eff9714e1c88e39e28317358ca9ec87677f121dc
Parents: a45d548
Author: Prashant Sharma <pr...@imaginea.com>
Authored: Thu Jul 24 18:15:37 2014 -0700
Committer: Matei Zaharia <ma...@databricks.com>
Committed: Thu Jul 24 18:15:37 2014 -0700
----------------------------------------------------------------------
python/pyspark/conf.py | 6 ++++++
python/pyspark/context.py | 2 +-
python/pyspark/rdd.py | 4 ++--
3 files changed, 9 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/eff9714e/python/pyspark/conf.py
----------------------------------------------------------------------
diff --git a/python/pyspark/conf.py b/python/pyspark/conf.py
index b50590a..b4c82f5 100644
--- a/python/pyspark/conf.py
+++ b/python/pyspark/conf.py
@@ -100,6 +100,12 @@ class SparkConf(object):
self._jconf.set(key, unicode(value))
return self
+ def setIfMissing(self, key, value):
+ """Set a configuration property, if not already set."""
+ if self.get(key) is None:
+ self.set(key, value)
+ return self
+
def setMaster(self, value):
"""Set master URL to connect to."""
self._jconf.setMaster(value)
http://git-wip-us.apache.org/repos/asf/spark/blob/eff9714e/python/pyspark/context.py
----------------------------------------------------------------------
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index e21be0e..024fb88 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -101,7 +101,7 @@ class SparkContext(object):
else:
self.serializer = BatchedSerializer(self._unbatched_serializer,
batchSize)
-
+ self._conf.setIfMissing("spark.rdd.compress", "true")
# Set any parameters passed directly to us on the conf
if master:
self._conf.setMaster(master)
http://git-wip-us.apache.org/repos/asf/spark/blob/eff9714e/python/pyspark/rdd.py
----------------------------------------------------------------------
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 94ba223..a38dd0b 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -231,10 +231,10 @@ class RDD(object):
def cache(self):
"""
- Persist this RDD with the default storage level (C{MEMORY_ONLY}).
+ Persist this RDD with the default storage level (C{MEMORY_ONLY_SER}).
"""
self.is_cached = True
- self._jrdd.cache()
+ self.persist(StorageLevel.MEMORY_ONLY_SER)
return self
def persist(self, storageLevel):