Posted to commits@spark.apache.org by rx...@apache.org on 2016/01/05 01:14:52 UTC

spark git commit: [SPARK-12509][SQL] Fixed error messages for DataFrame correlation and covariance

Repository: spark
Updated Branches:
  refs/heads/master 34de24abb -> fdfac22d0


[SPARK-12509][SQL] Fixed error messages for DataFrame correlation and covariance

Currently, when we call corr or cov on a DataFrame with invalid input, we see the same cov-specific error messages for both corr and cov:
   -  "Currently cov supports calculating the covariance between two columns"
   -  "Covariance calculation for columns with dataType "[DataType Name]" not supported."

I've fixed this issue by passing the function name as an argument to the shared validation helper. Alternatively, the input checks could be done separately in each function, but I avoided that because it would duplicate code.
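
For example, before this patch, corr surfaced the covariance wording. A minimal sketch to reproduce it (assuming a SQLContext named sqlContext; the DataFrame and the column names "num" and "str" are hypothetical):

  // A two-column DataFrame: one numeric column, one string column.
  val df = sqlContext.createDataFrame(Seq((1, "a"), (2, "b"))).toDF("num", "str")

  // Throws: java.lang.IllegalArgumentException: requirement failed:
  //   Covariance calculation for columns with dataType StringType not supported.
  // even though corr, not cov, was called.
  df.stat.corr("num", "str")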

Thanks!

Author: Narine Kokhlikyan <na...@gmail.com>

Closes #10458 from NarineK/sparksqlstatsmessages.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fdfac22d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fdfac22d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fdfac22d

Branch: refs/heads/master
Commit: fdfac22d08fc4fdc640843dd93a29e2ce4aee2ef
Parents: 34de24a
Author: Narine Kokhlikyan <na...@gmail.com>
Authored: Mon Jan 4 16:14:49 2016 -0800
Committer: Reynold Xin <rx...@databricks.com>
Committed: Mon Jan 4 16:14:49 2016 -0800

----------------------------------------------------------------------
 .../spark/sql/execution/stat/StatFunctions.scala       | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/fdfac22d/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
index 00231d6..725d682 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
@@ -29,7 +29,7 @@ private[sql] object StatFunctions extends Logging {
 
   /** Calculate the Pearson Correlation Coefficient for the given columns */
   private[sql] def pearsonCorrelation(df: DataFrame, cols: Seq[String]): Double = {
-    val counts = collectStatisticalData(df, cols)
+    val counts = collectStatisticalData(df, cols, "correlation")
     counts.Ck / math.sqrt(counts.MkX * counts.MkY)
   }
 
@@ -73,13 +73,14 @@ private[sql] object StatFunctions extends Logging {
     def cov: Double = Ck / (count - 1)
   }
 
-  private def collectStatisticalData(df: DataFrame, cols: Seq[String]): CovarianceCounter = {
-    require(cols.length == 2, "Currently cov supports calculating the covariance " +
+  private def collectStatisticalData(df: DataFrame, cols: Seq[String],
+              functionName: String): CovarianceCounter = {
+    require(cols.length == 2, s"Currently $functionName calculation is supported " +
       "between two columns.")
     cols.map(name => (name, df.schema.fields.find(_.name == name))).foreach { case (name, data) =>
       require(data.nonEmpty, s"Couldn't find column with name $name")
-      require(data.get.dataType.isInstanceOf[NumericType], "Covariance calculation for columns " +
-        s"with dataType ${data.get.dataType} not supported.")
+      require(data.get.dataType.isInstanceOf[NumericType], s"Currently $functionName calculation " +
+        s"for columns with dataType ${data.get.dataType} not supported.")
     }
     val columns = cols.map(n => Column(Cast(Column(n).expr, DoubleType)))
     df.select(columns: _*).queryExecution.toRdd.aggregate(new CovarianceCounter)(
@@ -98,7 +99,7 @@ private[sql] object StatFunctions extends Logging {
    * @return the covariance of the two columns.
    */
   private[sql] def calculateCov(df: DataFrame, cols: Seq[String]): Double = {
-    val counts = collectStatisticalData(df, cols)
+    val counts = collectStatisticalData(df, cols, "covariance")
     counts.cov
   }
 


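With the patch applied, the same checks name the function that was actually invoked. A sketch of the expected behavior, reusing the hypothetical DataFrame from above:

  // requirement failed: Currently correlation calculation for columns
  //   with dataType StringType not supported.
  df.stat.corr("num", "str")

  // requirement failed: Currently covariance calculation for columns
  //   with dataType StringType not supported.
  df.stat.cov("num", "str")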