You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by sh...@apache.org on 2015/10/09 18:36:44 UTC
spark git commit: [SPARK-10905] [SPARKR] Export freqItems() for
DataFrameStatFunctions
Repository: spark
Updated Branches:
refs/heads/master 5994cfe81 -> 70f44ad2d
[SPARK-10905] [SPARKR] Export freqItems() for DataFrameStatFunctions
[SPARK-10905][SparkR]: Export freqItems() for DataFrameStatFunctions
- Add function (together with roxygen2 doc) to stats.R and generics.R
- Expose the function in NAMESPACE
- Add unit test for the function
Author: Rerngvit Yanggratoke <re...@kth.se>
Closes #8962 from rerngvit/SPARK-10905.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/70f44ad2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/70f44ad2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/70f44ad2
Branch: refs/heads/master
Commit: 70f44ad2d836236c74e1336a7368982d5fe3abff
Parents: 5994cfe
Author: Rerngvit Yanggratoke <re...@kth.se>
Authored: Fri Oct 9 09:36:40 2015 -0700
Committer: Shivaram Venkataraman <sh...@cs.berkeley.edu>
Committed: Fri Oct 9 09:36:40 2015 -0700
----------------------------------------------------------------------
R/pkg/NAMESPACE | 1 +
R/pkg/R/generics.R | 4 ++++
R/pkg/R/stats.R | 27 +++++++++++++++++++++++++++
R/pkg/inst/tests/test_sparkSQL.R | 21 +++++++++++++++++++++
4 files changed, 53 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/70f44ad2/R/pkg/NAMESPACE
----------------------------------------------------------------------
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 9aad354..255be2e 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -40,6 +40,7 @@ exportMethods("arrange",
"fillna",
"filter",
"first",
+ "freqItems",
"group_by",
"groupBy",
"head",
http://git-wip-us.apache.org/repos/asf/spark/blob/70f44ad2/R/pkg/R/generics.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index e9086fd..c447413 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -63,6 +63,10 @@ setGeneric("countByValue", function(x) { standardGeneric("countByValue") })
# @export
setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") })
+# S4 generic for freqItems(); `support` is optional and defaults to 0.01.
+# @rdname statfunctions
+# @export
+setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("freqItems") })
+
# @rdname distinct
# @export
setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") })
http://git-wip-us.apache.org/repos/asf/spark/blob/70f44ad2/R/pkg/R/stats.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R
index 06382d5..4928cf4 100644
--- a/R/pkg/R/stats.R
+++ b/R/pkg/R/stats.R
@@ -100,3 +100,30 @@ setMethod("corr",
statFunctions <- callJMethod(x@sdf, "stat")
callJMethod(statFunctions, "corr", col1, col2, method)
})
+
+#' freqItems
+#'
+#' Finds the frequent items of the given columns; the result may contain false positives.
+#' The computation uses the frequent element count algorithm proposed by Karp, Schenker,
+#' and Papadimitriou, described in \url{http://dx.doi.org/10.1145/762471.762473}.
+#'
+#' @param x A SparkSQL DataFrame.
+#' @param cols A character vector of column names to search for frequent items in.
+#' @param support (Optional) The minimum frequency for an item to be considered `frequent`.
+#'                Should be greater than 1e-4. Default support = 0.01.
+#' @return A local R data.frame with the frequent items in each column.
+#'
+#' @rdname statfunctions
+#' @name freqItems
+#' @export
+#' @examples
+#' \dontrun{
+#' df <- jsonFile(sqlContext, "/path/to/file.json")
+#' fi <- freqItems(df, c("title", "gender"))
+#' }
+setMethod("freqItems", signature(x = "DataFrame", cols = "character"),
+          function(x, cols, support = 0.01) {
+            # Look up the stat functions object of the underlying sdf.
+            jStat <- callJMethod(x@sdf, "stat")
+            # The column names are passed on as a list.
+            jFreq <- callJMethod(jStat, "freqItems", as.list(cols), support)
+            # Wrap the returned object as a DataFrame and collect it.
+            freqDF <- dataFrame(jFreq)
+            collect(freqDF)
+          })
http://git-wip-us.apache.org/repos/asf/spark/blob/70f44ad2/R/pkg/inst/tests/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index e85de25..4804ecf 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -1350,6 +1350,27 @@ test_that("cov() and corr() on a DataFrame", {
expect_true(abs(result - 1.0) < 1e-12)
})
+test_that("freqItems() on a DataFrame", {
+  # Every row whose index is a multiple of 3 is overwritten with the same values, so
+  # 1, "1" and -1 each occur in at least 333 of the 1000 rows (well above support = 0.1).
+  input <- 1:1000
+  rdf <- data.frame(numbers = input, letters = as.character(input),
+                    negDoubles = input * -1.0, stringsAsFactors = FALSE)
+  # Assign a list (one value per column) rather than c(1, "1", -1): a character vector
+  # is recycled column-wise by `[<-.data.frame` and would coerce every column to character.
+  rdf[input %% 3 == 0, ] <- list(1L, "1", -1)
+  df <- createDataFrame(sqlContext, rdf)
+  multiColResults <- freqItems(df, c("numbers", "letters"), support = 0.1)
+  expect_true(1 %in% multiColResults$numbers[[1]])
+  expect_true("1" %in% multiColResults$letters[[1]])
+  singleColResult <- freqItems(df, "negDoubles", support = 0.1)
+  expect_true(-1 %in% head(singleColResult$negDoubles)[[1]])
+
+  # Even values of i map to the constant row (1, -1), so those values fill half the rows.
+  l <- lapply(0:99, function(i) {
+    if (i %% 2 == 0) {
+      list(1L, -1.0)
+    } else {
+      list(i, i * -1.0)
+    }
+  })
+  df <- createDataFrame(sqlContext, l, c("a", "b"))
+  result <- freqItems(df, c("a", "b"), 0.4)
+  expect_identical(result[[1]], list(list(1L, 99L)))
+  expect_identical(result[[2]], list(list(-1, -99)))
+})
+
test_that("SQL error message is returned from JVM", {
retError <- tryCatch(sql(sqlContext, "select * from blah"), error = function(e) e)
expect_equal(grepl("Table Not Found: blah", retError), TRUE)
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org