You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ma...@apache.org on 2014/03/12 23:57:50 UTC
git commit: SPARK-1162 Added top in python.
Repository: spark
Updated Branches:
refs/heads/master 5d1ec64e7 -> b8afe3052
SPARK-1162 Added top in python.
Author: Prashant Sharma <pr...@imaginea.com>
Closes #93 from ScrapCodes/SPARK-1162/pyspark-top-takeOrdered and squashes the following commits:
ece1fa4 [Prashant Sharma] Added top in python.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b8afe305
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b8afe305
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b8afe305
Branch: refs/heads/master
Commit: b8afe3052086547879ebf28d6e36207e0d370710
Parents: 5d1ec64
Author: Prashant Sharma <pr...@imaginea.com>
Authored: Wed Mar 12 15:57:44 2014 -0700
Committer: Matei Zaharia <ma...@databricks.com>
Committed: Wed Mar 12 15:57:44 2014 -0700
----------------------------------------------------------------------
python/pyspark/rdd.py | 25 +++++++++++++++++++++++++
1 file changed, 25 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/b8afe305/python/pyspark/rdd.py
----------------------------------------------------------------------
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
index 0f28dbd..6d549b4 100644
--- a/python/pyspark/rdd.py
+++ b/python/pyspark/rdd.py
@@ -29,6 +29,7 @@ from subprocess import Popen, PIPE
from tempfile import NamedTemporaryFile
from threading import Thread
import warnings
+from heapq import heappush, heappop, heappushpop
from pyspark.serializers import NoOpSerializer, CartesianDeserializer, \
BatchedSerializer, CloudPickleSerializer, PairDeserializer, pack_long
@@ -660,6 +661,30 @@ class RDD(object):
m1[k] += v
return m1
return self.mapPartitions(countPartition).reduce(mergeMaps)
+
+ def top(self, num):
+ """
+ Get the top N elements from an RDD.
+
+ Note: It returns the list sorted in ascending order.
+ >>> sc.parallelize([10, 4, 2, 12, 3]).top(1)
+ [12]
+ >>> sc.parallelize([2, 3, 4, 5, 6]).cache().top(2)
+ [5, 6]
+ """
+ def topIterator(iterator):
+ q = []
+ for k in iterator:
+ if len(q) < num:
+ heappush(q, k)
+ else:
+ heappushpop(q, k)
+ yield q
+
+ def merge(a, b):
+ return next(topIterator(a + b))
+
+ return sorted(self.mapPartitions(topIterator).reduce(merge))
def take(self, num):
"""