You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by da...@apache.org on 2015/09/29 22:38:19 UTC

spark git commit: [SPARK-6919] [PYSPARK] Add asDict method to StatCounter

Repository: spark
Updated Branches:
  refs/heads/master ab41864f9 -> 7d399c9da


[SPARK-6919] [PYSPARK] Add asDict method to StatCounter

Add method to easily convert a StatCounter instance into a Python dict

https://issues.apache.org/jira/browse/SPARK-6919

Note: This is my original work and the existing Spark license applies.

Author: Erik Shilts <er...@opower.com>

Closes #5516 from eshilts/statcounter-asdict.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7d399c9d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7d399c9d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7d399c9d

Branch: refs/heads/master
Commit: 7d399c9daa6769ab234890c551e1b3456e0e6e85
Parents: ab41864
Author: Erik Shilts <er...@opower.com>
Authored: Tue Sep 29 13:38:15 2015 -0700
Committer: Davies Liu <da...@gmail.com>
Committed: Tue Sep 29 13:38:15 2015 -0700

----------------------------------------------------------------------
 python/pyspark/statcounter.py | 22 ++++++++++++++++++++++
 python/pyspark/tests.py       | 20 ++++++++++++++++++++
 2 files changed, 42 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/7d399c9d/python/pyspark/statcounter.py
----------------------------------------------------------------------
diff --git a/python/pyspark/statcounter.py b/python/pyspark/statcounter.py
index 0fee3b2..03ea0b6 100644
--- a/python/pyspark/statcounter.py
+++ b/python/pyspark/statcounter.py
@@ -131,6 +131,28 @@ class StatCounter(object):
     def sampleStdev(self):
         return sqrt(self.sampleVariance())
 
+    def asDict(self, sample=False):
+        """Returns the :class:`StatCounter` members as a ``dict``.
+
+        With ``sample=False`` (the default) ``'stdev'``/``'variance'`` are
+        taken from ``sampleStdev()``/``sampleVariance()``; with
+        ``sample=True`` they come from ``stdev()``/``variance()``.
+        NOTE(review): reads as inverted w.r.t. the flag name -- confirm.
+
+        >>> sc.parallelize([1., 2., 3., 4.]).stats().asDict()
+        {'count': 4L, 'max': 4.0, 'mean': 2.5, 'min': 1.0,
+         'stdev': 1.2909944487358056, 'sum': 10.0, 'variance': 1.6666666666666667}
+        """
+        return {
+            'count': self.count(),
+            'mean': self.mean(),
+            'sum': self.sum(),
+            'min': self.min(),
+            'max': self.max(),
+            'stdev': self.stdev() if sample else self.sampleStdev(),
+            'variance': self.variance() if sample else self.sampleVariance()
+        }
+
     def __repr__(self):
         return ("(count: %s, mean: %s, stdev: %s, max: %s, min: %s)" %
                 (self.count(), self.mean(), self.stdev(), self.max(), self.min()))

http://git-wip-us.apache.org/repos/asf/spark/blob/7d399c9d/python/pyspark/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py
index f11aaf0..63cc87e 100644
--- a/python/pyspark/tests.py
+++ b/python/pyspark/tests.py
@@ -1976,6 +1976,26 @@ class NumPyTests(PySparkTestCase):
         self.assertSequenceEqual([3.0, 3.0], s.max().tolist())
         self.assertSequenceEqual([1.0, 1.0], s.sampleStdev().tolist())
 
+        stats_dict = s.asDict()
+        self.assertEqual(3, stats_dict['count'])
+        self.assertSequenceEqual([2.0, 2.0], stats_dict['mean'].tolist())
+        self.assertSequenceEqual([1.0, 1.0], stats_dict['min'].tolist())
+        self.assertSequenceEqual([3.0, 3.0], stats_dict['max'].tolist())
+        self.assertSequenceEqual([6.0, 6.0], stats_dict['sum'].tolist())
+        self.assertSequenceEqual([1.0, 1.0], stats_dict['stdev'].tolist())
+        self.assertSequenceEqual([1.0, 1.0], stats_dict['variance'].tolist())
+
+        stats_sample_dict = s.asDict(sample=True)
+        self.assertEqual(3, stats_dict['count'])
+        self.assertSequenceEqual([2.0, 2.0], stats_sample_dict['mean'].tolist())
+        self.assertSequenceEqual([1.0, 1.0], stats_sample_dict['min'].tolist())
+        self.assertSequenceEqual([3.0, 3.0], stats_sample_dict['max'].tolist())
+        self.assertSequenceEqual([6.0, 6.0], stats_sample_dict['sum'].tolist())
+        self.assertSequenceEqual(
+            [0.816496580927726, 0.816496580927726], stats_sample_dict['stdev'].tolist())
+        self.assertSequenceEqual(
+            [0.6666666666666666, 0.6666666666666666], stats_sample_dict['variance'].tolist())
+
 
 if __name__ == "__main__":
     if not _have_scipy:


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org