You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by sh...@apache.org on 2015/07/23 06:40:27 UTC
spark git commit: [SPARK-8364] [SPARKR] Add crosstab to SparkR
DataFrames
Repository: spark
Updated Branches:
refs/heads/master b217230f2 -> 2f5cbd860
[SPARK-8364] [SPARKR] Add crosstab to SparkR DataFrames
Add `crosstab` to SparkR DataFrames, which takes two column names and returns a local R data.frame. This is similar to `table` in R. However, `table` in SparkR is used for loading SQL tables as DataFrames. The return type is data.frame instead table for `crosstab` to be compatible with Scala/Python.
I couldn't run R tests successfully on my local. Many unit tests failed. So let's try Jenkins.
Author: Xiangrui Meng <me...@databricks.com>
Closes #7318 from mengxr/SPARK-8364 and squashes the following commits:
d75e894 [Xiangrui Meng] fix tests
53f6ddd [Xiangrui Meng] fix tests
f1348d6 [Xiangrui Meng] update test
47cb088 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-8364
5621262 [Xiangrui Meng] first version without test
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2f5cbd86
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2f5cbd86
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2f5cbd86
Branch: refs/heads/master
Commit: 2f5cbd860e487e7339e627dd7e2c9baa5116b819
Parents: b217230
Author: Xiangrui Meng <me...@databricks.com>
Authored: Wed Jul 22 21:40:23 2015 -0700
Committer: Shivaram Venkataraman <sh...@cs.berkeley.edu>
Committed: Wed Jul 22 21:40:23 2015 -0700
----------------------------------------------------------------------
R/pkg/NAMESPACE | 1 +
R/pkg/R/DataFrame.R | 28 ++++++++++++++++++++++++++++
R/pkg/R/generics.R | 4 ++++
R/pkg/inst/tests/test_sparkSQL.R | 13 +++++++++++++
4 files changed, 46 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/2f5cbd86/R/pkg/NAMESPACE
----------------------------------------------------------------------
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 5834813..7f7a8a2 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -26,6 +26,7 @@ exportMethods("arrange",
"collect",
"columns",
"count",
+ "crosstab",
"describe",
"distinct",
"dropna",
http://git-wip-us.apache.org/repos/asf/spark/blob/2f5cbd86/R/pkg/R/DataFrame.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index a58433d..06dd6b7 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1554,3 +1554,31 @@ setMethod("fillna",
}
dataFrame(sdf)
})
+
+#' crosstab
+#'
+#' Computes a pair-wise frequency table of the given columns. Also known as a contingency
+#' table. The number of distinct values for each column should be less than 1e4. At most 1e6
+#' non-zero pair frequencies will be returned.
+#'
+#' @param col1 name of the first column. Distinct items will make the first item of each row.
+#' @param col2 name of the second column. Distinct items will make the column names of the output.
+#' @return a local R data.frame representing the contingency table. The first column of each row
+#' will be the distinct values of `col1` and the column names will be the distinct values
+#' of `col2`. The name of the first column will be `$col1_$col2`. Pairs that have no
+#' occurrences will have `null` as their counts.
+#'
+#' @rdname statfunctions
+#' @export
+#' @examples
+#' \dontrun{
+#' df <- jsonFile(sqlCtx, "/path/to/file.json")
+#' ct = crosstab(df, "title", "gender")
+#' }
+setMethod("crosstab",
+ signature(x = "DataFrame", col1 = "character", col2 = "character"),
+ function(x, col1, col2) {
+ statFunctions <- callJMethod(x@sdf, "stat")
+ sct <- callJMethod(statFunctions, "crosstab", col1, col2)
+ collect(dataFrame(sct))
+ })
http://git-wip-us.apache.org/repos/asf/spark/blob/2f5cbd86/R/pkg/R/generics.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 39b5586..836e017 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -59,6 +59,10 @@ setGeneric("count", function(x) { standardGeneric("count") })
# @export
setGeneric("countByValue", function(x) { standardGeneric("countByValue") })
+# @rdname statfunctions
+# @export
+setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") })
+
# @rdname distinct
# @export
setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") })
http://git-wip-us.apache.org/repos/asf/spark/blob/2f5cbd86/R/pkg/inst/tests/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index a3039d3..62fe48a 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -987,6 +987,19 @@ test_that("fillna() on a DataFrame", {
expect_identical(expected, actual)
})
+test_that("crosstab() on a DataFrame", {
+ rdd <- lapply(parallelize(sc, 0:3), function(x) {
+ list(paste0("a", x %% 3), paste0("b", x %% 2))
+ })
+ df <- toDF(rdd, list("a", "b"))
+ ct <- crosstab(df, "a", "b")
+ ordered <- ct[order(ct$a_b),]
+ row.names(ordered) <- NULL
+ expected <- data.frame("a_b" = c("a0", "a1", "a2"), "b0" = c(1, 0, 1), "b1" = c(1, 1, 0),
+ stringsAsFactors = FALSE, row.names = NULL)
+ expect_identical(expected, ordered)
+})
+
unlink(parquetPath)
unlink(jsonPath)
unlink(jsonPathNa)
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org