You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by jk...@apache.org on 2015/02/24 23:50:07 UTC
spark git commit: [SPARK-5973] [PySpark] fix zip with two RDDs with AutoBatchedSerializer
Repository: spark
Updated Branches:
refs/heads/master a2b913792 -> da505e592
[SPARK-5973] [PySpark] fix zip with two RDDs with AutoBatchedSerializer
Author: Davies Liu <da...@databricks.com>
Closes #4745 from davies/fix_zip and squashes the following commits:
2124b2c [Davies Liu] Update tests.py
b5c828f [Davies Liu] increase the number of records
c1e40fd [Davies Liu] fix zip with two RDDs with AutoBatchedSerializer
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/da505e59
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/da505e59
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/da505e59
Branch: refs/heads/master
Commit: da505e59274d1c838653c1109db65ad374e65304
Parents: a2b9137
Author: Davies Liu <da...@databricks.com>
Authored: Tue Feb 24 14:50:00 2015 -0800
Committer: Joseph K. Bradley <jo...@databricks.com>
Committed: Tue Feb 24 14:50:00 2015 -0800
----------------------------------------------------------------------
python/pyspark/rdd.py | 2 +-
python/pyspark/tests.py | 6 ++++++
2 files changed, 7 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/da505e59/python/pyspark/rdd.py
----------------------------------------------------------------------
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index ba2347a..d3148de 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -1950,7 +1950,7 @@ class RDD(object):
my_batch = get_batch_size(self._jrdd_deserializer)
other_batch = get_batch_size(other._jrdd_deserializer)
- if my_batch != other_batch:
+ if my_batch != other_batch or not my_batch:
# use the smallest batchSize for both of them
batchSize = min(my_batch, other_batch)
if batchSize <= 0:
http://git-wip-us.apache.org/repos/asf/spark/blob/da505e59/python/pyspark/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index 52e8209..06ba2b4 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -543,6 +543,12 @@ class RDDTests(ReusedPySparkTestCase):
# regression test for bug in _reserializer()
self.assertEqual(cnt, t.zip(rdd).count())
+ def test_zip_with_different_object_sizes(self):
+ # regress test for SPARK-5973
+ a = self.sc.parallelize(range(10000)).map(lambda i: '*' * i)
+ b = self.sc.parallelize(range(10000, 20000)).map(lambda i: '*' * i)
+ self.assertEqual(10000, a.zip(b).count())
+
def test_zip_with_different_number_of_items(self):
a = self.sc.parallelize(range(5), 2)
# different number of partitions
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org