You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by me...@apache.org on 2016/02/26 06:23:44 UTC

spark git commit: [SPARK-13504] [SPARKR] Add approxQuantile for SparkR

Repository: spark
Updated Branches:
  refs/heads/master f3be369ef -> 50e60e36f


[SPARK-13504] [SPARKR] Add approxQuantile for SparkR

## What changes were proposed in this pull request?
Add ```approxQuantile``` for SparkR.
## How was this patch tested?
unit tests

Author: Yanbo Liang <yb...@gmail.com>

Closes #11383 from yanboliang/spark-13504 and squashes the following commits:

4f17adb [Yanbo Liang] Add approxQuantile for SparkR


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/50e60e36
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/50e60e36
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/50e60e36

Branch: refs/heads/master
Commit: 50e60e36f7775a10cf39338e7c5716578a24d89f
Parents: f3be369
Author: Yanbo Liang <yb...@gmail.com>
Authored: Thu Feb 25 21:23:41 2016 -0800
Committer: Xiangrui Meng <me...@databricks.com>
Committed: Thu Feb 25 21:23:41 2016 -0800

----------------------------------------------------------------------
 R/pkg/NAMESPACE                           |  1 +
 R/pkg/R/generics.R                        |  7 +++++
 R/pkg/R/stats.R                           | 39 ++++++++++++++++++++++++++
 R/pkg/inst/tests/testthat/test_sparkSQL.R |  8 ++++++
 4 files changed, 55 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/50e60e36/R/pkg/NAMESPACE
----------------------------------------------------------------------
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 6a3d63f..636d39e 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -111,6 +111,7 @@ exportMethods("%in%",
               "add_months",
               "alias",
               "approxCountDistinct",
+              "approxQuantile",
               "array_contains",
               "asc",
               "ascii",

http://git-wip-us.apache.org/repos/asf/spark/blob/50e60e36/R/pkg/R/generics.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index ab61bce..3db72b5 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -67,6 +67,13 @@ setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") })
 # @export
 setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("freqItems") })
 
+# @rdname statfunctions
+# @export
+setGeneric("approxQuantile",
+           function(x, col, probabilities, relativeError) {
+             standardGeneric("approxQuantile")
+           })
+
 # @rdname distinct
 # @export
 setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") })

http://git-wip-us.apache.org/repos/asf/spark/blob/50e60e36/R/pkg/R/stats.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R
index 2e80768..edf7293 100644
--- a/R/pkg/R/stats.R
+++ b/R/pkg/R/stats.R
@@ -130,6 +130,45 @@ setMethod("freqItems", signature(x = "DataFrame", cols = "character"),
             collect(dataFrame(sct))
           })
 
+#' approxQuantile
+#'
+#' Calculates the approximate quantiles of a numerical column of a DataFrame.
+#'
+#' The result of this algorithm has the following deterministic bound:
+#' If the DataFrame has N elements and if we request the quantile at probability `p` up to error
+#' `err`, then the algorithm will return a sample `x` from the DataFrame so that the *exact* rank
+#' of `x` is close to (p * N). More precisely,
+#'   floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).
+#' This method implements a variation of the Greenwald-Khanna algorithm (with some speed
+#' optimizations). The algorithm was first presented in [[http://dx.doi.org/10.1145/375663.375670
+#' Space-efficient Online Computation of Quantile Summaries]] by Greenwald and Khanna.
+#'
+#' @param x A SparkSQL DataFrame.
+#' @param col The name of the numerical column.
+#' @param probabilities A numeric vector of quantile probabilities. Each number must belong to
+#'                      [0, 1]. For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
+#' @param relativeError The relative target precision to achieve (>= 0). If set to zero,
+#'                      the exact quantiles are computed, which could be very expensive.
+#'                      Note that values greater than 1 are accepted but give the same result as 1.
+#' @return The approximate quantiles at the given probabilities.
+#'
+#' @rdname statfunctions
+#' @name approxQuantile
+#' @export
+#' @examples
+#' \dontrun{
+#' df <- jsonFile(sqlContext, "/path/to/file.json")
+#' quantiles <- approxQuantile(df, "key", c(0.5, 0.8), 0.0)
+#' }
+setMethod("approxQuantile",
+          signature(x = "DataFrame", col = "character",
+                    probabilities = "numeric", relativeError = "numeric"),
+          function(x, col, probabilities, relativeError) {
+            statFunctions <- callJMethod(x@sdf, "stat")
+            callJMethod(statFunctions, "approxQuantile", col,
+                        as.list(probabilities), relativeError)
+          })
+
 #' sampleBy
 #'
 #' Returns a stratified sample without replacement based on the fraction given on each stratum.

http://git-wip-us.apache.org/repos/asf/spark/blob/50e60e36/R/pkg/inst/tests/testthat/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index cc11810..236bae6 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1785,6 +1785,14 @@ test_that("sampleBy() on a DataFrame", {
   expect_identical(as.list(result[2, ]), list(key = "1", count = 7))
 })
 
+test_that("approxQuantile() on a DataFrame", {
+  l <- lapply(c(0:99), function(i) { i })
+  df <- createDataFrame(sqlContext, l, "key")
+  quantiles <- approxQuantile(df, "key", c(0.5, 0.8), 0.0)
+  expect_equal(quantiles[[1]], 50)
+  expect_equal(quantiles[[2]], 80)
+})
+
 test_that("SQL error message is returned from JVM", {
   retError <- tryCatch(sql(sqlContext, "select * from blah"), error = function(e) e)
   expect_equal(grepl("Table not found: blah", retError), TRUE)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org