You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by da...@apache.org on 2016/02/12 21:43:17 UTC
spark git commit: [SPARK-12962] [SQL] [PySpark] PySpark support
for covar_samp and covar_pop
Repository: spark
Updated Branches:
refs/heads/master ac7d6af1c -> 90de6b2fa
[SPARK-12962] [SQL] [PySpark] PySpark support for covar_samp and covar_pop
Add PySpark support for ```covar_samp``` and ```covar_pop```.
cc rxin davies marmbrus
Author: Yanbo Liang <yb...@gmail.com>
Closes #10876 from yanboliang/spark-12962.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/90de6b2f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/90de6b2f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/90de6b2f
Branch: refs/heads/master
Commit: 90de6b2fae71d05415610be70300625c409f6092
Parents: ac7d6af
Author: Yanbo Liang <yb...@gmail.com>
Authored: Fri Feb 12 12:43:13 2016 -0800
Committer: Davies Liu <da...@gmail.com>
Committed: Fri Feb 12 12:43:13 2016 -0800
----------------------------------------------------------------------
python/pyspark/sql/functions.py | 41 ++++++++++++++++++++++++++++++------
1 file changed, 35 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/90de6b2f/python/pyspark/sql/functions.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 680493e..416d722 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -250,17 +250,46 @@ def corr(col1, col2):
"""Returns a new :class:`Column` for the Pearson Correlation Coefficient for ``col1``
and ``col2``.
- >>> a = [x * x - 2 * x + 3.5 for x in range(20)]
- >>> b = range(20)
- >>> corrDf = sqlContext.createDataFrame(zip(a, b))
- >>> corrDf = corrDf.agg(corr(corrDf._1, corrDf._2).alias('c'))
- >>> corrDf.selectExpr('abs(c - 0.9572339139475857) < 1e-16 as t').collect()
- [Row(t=True)]
+ >>> a = range(20)
+ >>> b = [2 * x for x in range(20)]
+ >>> df = sqlContext.createDataFrame(zip(a, b), ["a", "b"])
+ >>> df.agg(corr("a", "b").alias('c')).collect()
+ [Row(c=1.0)]
"""
sc = SparkContext._active_spark_context
return Column(sc._jvm.functions.corr(_to_java_column(col1), _to_java_column(col2)))
+@since(2.0)
+def covar_pop(col1, col2):
+ """Returns a new :class:`Column` for the population covariance of ``col1``
+ and ``col2``.
+
+ >>> a = [1] * 10
+ >>> b = [1] * 10
+ >>> df = sqlContext.createDataFrame(zip(a, b), ["a", "b"])
+ >>> df.agg(covar_pop("a", "b").alias('c')).collect()
+ [Row(c=0.0)]
+ """
+ sc = SparkContext._active_spark_context
+ return Column(sc._jvm.functions.covar_pop(_to_java_column(col1), _to_java_column(col2)))
+
+
+@since(2.0)
+def covar_samp(col1, col2):
+ """Returns a new :class:`Column` for the sample covariance of ``col1``
+ and ``col2``.
+
+ >>> a = [1] * 10
+ >>> b = [1] * 10
+ >>> df = sqlContext.createDataFrame(zip(a, b), ["a", "b"])
+ >>> df.agg(covar_samp("a", "b").alias('c')).collect()
+ [Row(c=0.0)]
+ """
+ sc = SparkContext._active_spark_context
+ return Column(sc._jvm.functions.covar_samp(_to_java_column(col1), _to_java_column(col2)))
+
+
@since(1.3)
def countDistinct(col, *cols):
"""Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org