You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by jo...@apache.org on 2014/09/16 20:40:03 UTC
git commit: [SPARK-3519] add distinct(n) to PySpark
Repository: spark
Updated Branches:
refs/heads/master 86d253ec4 -> 9d5fa763d
[SPARK-3519] add distinct(n) to PySpark
Added the missing optional numPartitions argument to RDD.distinct() and SchemaRDD.distinct() in PySpark, along with associated tests
Author: Matthew Farrellee <ma...@redhat.com>
Closes #2383 from mattf/SPARK-3519 and squashes the following commits:
30b837a [Matthew Farrellee] Combine test cases to save on JVM startups
6bc4a2c [Matthew Farrellee] [SPARK-3519] add distinct(n) to SchemaRDD in PySpark
7a17f2b [Matthew Farrellee] [SPARK-3519] add distinct(n) to PySpark
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9d5fa763
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9d5fa763
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9d5fa763
Branch: refs/heads/master
Commit: 9d5fa763d8559ac412a18d7a2f43c4368a0af897
Parents: 86d253e
Author: Matthew Farrellee <ma...@redhat.com>
Authored: Tue Sep 16 11:39:57 2014 -0700
Committer: Josh Rosen <jo...@apache.org>
Committed: Tue Sep 16 11:39:57 2014 -0700
----------------------------------------------------------------------
python/pyspark/rdd.py | 4 ++--
python/pyspark/sql.py | 7 +++++--
python/pyspark/tests.py | 17 +++++++++++++++++
3 files changed, 24 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/9d5fa763/python/pyspark/rdd.py
----------------------------------------------------------------------
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 21f182b..cb09c19 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -301,7 +301,7 @@ class RDD(object):
return ifilter(f, iterator)
return self.mapPartitions(func, True)
- def distinct(self):
+ def distinct(self, numPartitions=None):
"""
Return a new RDD containing the distinct elements in this RDD.
@@ -309,7 +309,7 @@ class RDD(object):
[1, 2, 3]
"""
return self.map(lambda x: (x, None)) \
- .reduceByKey(lambda x, _: x) \
+ .reduceByKey(lambda x, _: x, numPartitions) \
.map(lambda (x, _): x)
def sample(self, withReplacement, fraction, seed=None):
http://git-wip-us.apache.org/repos/asf/spark/blob/9d5fa763/python/pyspark/sql.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py
index fc9310f..eac55cb 100644
--- a/python/pyspark/sql.py
+++ b/python/pyspark/sql.py
@@ -1694,8 +1694,11 @@ class SchemaRDD(RDD):
rdd = self._jschema_rdd.coalesce(numPartitions, shuffle)
return SchemaRDD(rdd, self.sql_ctx)
- def distinct(self):
- rdd = self._jschema_rdd.distinct()
+ def distinct(self, numPartitions=None):
+ if numPartitions is None:
+ rdd = self._jschema_rdd.distinct()
+ else:
+ rdd = self._jschema_rdd.distinct(numPartitions)
return SchemaRDD(rdd, self.sql_ctx)
def intersection(self, other):
http://git-wip-us.apache.org/repos/asf/spark/blob/9d5fa763/python/pyspark/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index f255b44..0b38543 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -587,6 +587,14 @@ class TestRDDFunctions(PySparkTestCase):
self.assertEquals(partitions[0], [(0, 5), (0, 8), (2, 6)])
self.assertEquals(partitions[1], [(1, 3), (3, 8), (3, 8)])
+ def test_distinct(self):
+ rdd = self.sc.parallelize((1, 2, 3)*10, 10)
+ self.assertEquals(rdd.getNumPartitions(), 10)
+ self.assertEquals(rdd.distinct().count(), 3)
+ result = rdd.distinct(5)
+ self.assertEquals(result.getNumPartitions(), 5)
+ self.assertEquals(result.count(), 3)
+
class TestSQL(PySparkTestCase):
@@ -636,6 +644,15 @@ class TestSQL(PySparkTestCase):
srdd.count()
srdd.collect()
+ def test_distinct(self):
+ rdd = self.sc.parallelize(['{"a": 1}', '{"b": 2}', '{"c": 3}']*10, 10)
+ srdd = self.sqlCtx.jsonRDD(rdd)
+ self.assertEquals(srdd.getNumPartitions(), 10)
+ self.assertEquals(srdd.distinct().count(), 3)
+ result = srdd.distinct(5)
+ self.assertEquals(result.getNumPartitions(), 5)
+ self.assertEquals(result.count(), 3)
+
class TestIO(PySparkTestCase):
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org