You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by rx...@apache.org on 2015/05/05 20:01:29 UTC
spark git commit: [SPARK-7243][SQL] Reduce size for Contingency Tables in DataFrames
Repository: spark
Updated Branches:
refs/heads/master 9f1f9b103 -> 18340d7be
[SPARK-7243][SQL] Reduce size for Contingency Tables in DataFrames
Reduced take size from 1e8 to 1e6.
cc rxin
Author: Burak Yavuz <br...@gmail.com>
Closes #5900 from brkyvz/df-cont-followup and squashes the following commits:
c11e762 [Burak Yavuz] fix grammar
b30ace2 [Burak Yavuz] address comments
a417ba5 [Burak Yavuz] [SPARK-7243][SQL] Reduce size for Contingency Tables in DataFrames
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/18340d7b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/18340d7b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/18340d7b
Branch: refs/heads/master
Commit: 18340d7be55a6834918956555bf820c96769aa52
Parents: 9f1f9b1
Author: Burak Yavuz <br...@gmail.com>
Authored: Tue May 5 11:01:25 2015 -0700
Committer: Reynold Xin <rx...@databricks.com>
Committed: Tue May 5 11:01:25 2015 -0700
----------------------------------------------------------------------
python/pyspark/sql/dataframe.py | 9 +++++----
.../scala/org/apache/spark/sql/DataFrameStatFunctions.scala | 9 +++++----
.../org/apache/spark/sql/execution/stat/StatFunctions.scala | 6 +++---
3 files changed, 13 insertions(+), 11 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/18340d7b/python/pyspark/sql/dataframe.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index f30a92d..17448b3 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -934,10 +934,11 @@ class DataFrame(object):
def crosstab(self, col1, col2):
"""
Computes a pair-wise frequency table of the given columns. Also known as a contingency
- table. The number of distinct values for each column should be less than 1e4. The first
- column of each row will be the distinct values of `col1` and the column names will be the
- distinct values of `col2`. The name of the first column will be `$col1_$col2`. Pairs that
- have no occurrences will have `null` as their counts.
+ table. The number of distinct values for each column should be less than 1e4. At most 1e6
+ non-zero pair frequencies will be returned.
+ The first column of each row will be the distinct values of `col1` and the column names
+ will be the distinct values of `col2`. The name of the first column will be `$col1_$col2`.
+ Pairs that have no occurrences will have `null` as their counts.
:func:`DataFrame.crosstab` and :func:`DataFrameStatFunctions.crosstab` are aliases.
:param col1: The name of the first column. Distinct items will make the first item of
http://git-wip-us.apache.org/repos/asf/spark/blob/18340d7b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
index fcf21ca..cb88dea 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -65,10 +65,11 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
/**
* Computes a pair-wise frequency table of the given columns. Also known as a contingency table.
- * The number of distinct values for each column should be less than 1e4. The first
- * column of each row will be the distinct values of `col1` and the column names will be the
- * distinct values of `col2`. The name of the first column will be `$col1_$col2`. Counts will be
- * returned as `Long`s. Pairs that have no occurrences will have `null` as their counts.
+ * The number of distinct values for each column should be less than 1e4. At most 1e6 non-zero
+ * pair frequencies will be returned.
+ * The first column of each row will be the distinct values of `col1` and the column names will
+ * be the distinct values of `col2`. The name of the first column will be `$col1_$col2`. Counts
+ * will be returned as `Long`s. Pairs that have no occurrences will have `null` as their counts.
*
* @param col1 The name of the first column. Distinct items will make the first item of
* each row.
http://git-wip-us.apache.org/repos/asf/spark/blob/18340d7b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
index b50f606..386ac96 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala
@@ -102,9 +102,9 @@ private[sql] object StatFunctions extends Logging {
/** Generate a table of frequencies for the elements of two columns. */
private[sql] def crossTabulate(df: DataFrame, col1: String, col2: String): DataFrame = {
val tableName = s"${col1}_$col2"
- val counts = df.groupBy(col1, col2).agg(col(col1), col(col2), count("*")).take(1e8.toInt)
- if (counts.length == 1e8.toInt) {
- logWarning("The maximum limit of 1e8 pairs have been collected, which may not be all of " +
+ val counts = df.groupBy(col1, col2).agg(col(col1), col(col2), count("*")).take(1e6.toInt)
+ if (counts.length == 1e6.toInt) {
+ logWarning("The maximum limit of 1e6 pairs have been collected, which may not be all of " +
"the pairs. Please try reducing the amount of distinct items in your columns.")
}
// get the distinct values of column 2, so that we can make them the column names
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org