You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by fe...@apache.org on 2017/03/05 20:37:07 UTC
spark git commit: [SPARK-19795][SPARKR] add column functions to_json,
from_json
Repository: spark
Updated Branches:
refs/heads/master 14bb398fa -> 80d5338b3
[SPARK-19795][SPARKR] add column functions to_json, from_json
## What changes were proposed in this pull request?
Add column functions: to_json, from_json, and tests covering error cases.
## How was this patch tested?
unit tests, manual
Author: Felix Cheung <fe...@hotmail.com>
Closes #17134 from felixcheung/rtojson.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/80d5338b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/80d5338b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/80d5338b
Branch: refs/heads/master
Commit: 80d5338b32e856870cf187ce17bc87335d690761
Parents: 14bb398
Author: Felix Cheung <fe...@hotmail.com>
Authored: Sun Mar 5 12:37:02 2017 -0800
Committer: Felix Cheung <fe...@apache.org>
Committed: Sun Mar 5 12:37:02 2017 -0800
----------------------------------------------------------------------
R/pkg/NAMESPACE | 2 +
R/pkg/R/functions.R | 57 ++++++++++++++++++++++++++
R/pkg/R/generics.R | 8 ++++
R/pkg/inst/tests/testthat/test_sparkSQL.R | 43 +++++++++++++++----
4 files changed, 103 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/80d5338b/R/pkg/NAMESPACE
----------------------------------------------------------------------
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 81e1936..871f8e4 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -229,6 +229,7 @@ exportMethods("%in%",
"floor",
"format_number",
"format_string",
+ "from_json",
"from_unixtime",
"from_utc_timestamp",
"getField",
@@ -327,6 +328,7 @@ exportMethods("%in%",
"toDegrees",
"toRadians",
"to_date",
+ "to_json",
"to_timestamp",
"to_utc_timestamp",
"translate",
http://git-wip-us.apache.org/repos/asf/spark/blob/80d5338b/R/pkg/R/functions.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 9e50844..edf2bcf 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -1793,6 +1793,33 @@ setMethod("to_date",
column(jc)
})
+#' to_json
+#'
+#' Converts a column containing a \code{structType} into a Column of JSON string.
+#' Resolving the Column can fail if an unsupported type is encountered.
+#'
+#' @param x Column containing the struct
+#' @param ... additional named properties to control how it is converted, accepts the same options
+#' as the JSON data source.
+#'
+#' @family normal_funcs
+#' @rdname to_json
+#' @name to_json
+#' @aliases to_json,Column-method
+#' @export
+#' @examples
+#' \dontrun{
+#' to_json(df$t, dateFormat = 'dd/MM/yyyy')
+#' select(df, to_json(df$t))
+#'}
+#' @note to_json since 2.2.0
+setMethod("to_json", signature(x = "Column"),
+ function(x, ...) {
+ options <- varargsToStrEnv(...)
+ jc <- callJStatic("org.apache.spark.sql.functions", "to_json", x@jc, options)
+ column(jc)
+ })
+
#' to_timestamp
#'
#' Converts the column into a TimestampType. You may optionally specify a format
@@ -2403,6 +2430,36 @@ setMethod("date_format", signature(y = "Column", x = "character"),
column(jc)
})
+#' from_json
+#'
+#' Parses a column containing a JSON string into a Column of \code{structType} with the specified
+#' \code{schema}. If the string is unparseable, the Column will contain the value NA.
+#'
+#' @param x Column containing the JSON string.
+#' @param schema a structType object to use as the schema when parsing the JSON string.
+#' @param ... additional named properties to control how the json is parsed, accepts the same
+#' options as the JSON data source.
+#'
+#' @family normal_funcs
+#' @rdname from_json
+#' @name from_json
+#' @aliases from_json,Column,structType-method
+#' @export
+#' @examples
+#' \dontrun{
+#' schema <- structType(structField("name", "string"))
+#' select(df, from_json(df$value, schema, dateFormat = "dd/MM/yyyy"))
+#'}
+#' @note from_json since 2.2.0
+setMethod("from_json", signature(x = "Column", schema = "structType"),
+ function(x, schema, ...) {
+ options <- varargsToStrEnv(...)
+ jc <- callJStatic("org.apache.spark.sql.functions",
+ "from_json",
+ x@jc, schema$jobj, options)
+ column(jc)
+ })
+
#' from_utc_timestamp
#'
#' Given a timestamp, which corresponds to a certain time of day in UTC, returns another timestamp
http://git-wip-us.apache.org/repos/asf/spark/blob/80d5338b/R/pkg/R/generics.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 647cbbd..45bc127 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -991,6 +991,10 @@ setGeneric("format_number", function(y, x) { standardGeneric("format_number") })
#' @export
setGeneric("format_string", function(format, x, ...) { standardGeneric("format_string") })
+#' @rdname from_json
+#' @export
+setGeneric("from_json", function(x, schema, ...) { standardGeneric("from_json") })
+
#' @rdname from_unixtime
#' @export
setGeneric("from_unixtime", function(x, ...) { standardGeneric("from_unixtime") })
@@ -1265,6 +1269,10 @@ setGeneric("toRadians", function(x) { standardGeneric("toRadians") })
#' @export
setGeneric("to_date", function(x, format) { standardGeneric("to_date") })
+#' @rdname to_json
+#' @export
+setGeneric("to_json", function(x, ...) { standardGeneric("to_json") })
+
#' @rdname to_timestamp
#' @export
setGeneric("to_timestamp", function(x, format) { standardGeneric("to_timestamp") })
http://git-wip-us.apache.org/repos/asf/spark/blob/80d5338b/R/pkg/inst/tests/testthat/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
index 1dd8c5c..7c09659 100644
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -88,6 +88,13 @@ mockLinesComplexType <-
complexTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
writeLines(mockLinesComplexType, complexTypeJsonPath)
+# For test map type and struct type in DataFrame
+mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}",
+ "{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}",
+ "{\"name\":\"David\",\"info\":{\"age\":60,\"height\":180}}")
+mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+writeLines(mockLinesMapType, mapTypeJsonPath)
+
test_that("calling sparkRSQL.init returns existing SQL context", {
sqlContext <- suppressWarnings(sparkRSQL.init(sc))
expect_equal(suppressWarnings(sparkRSQL.init(sc)), sqlContext)
@@ -466,13 +473,6 @@ test_that("create DataFrame from a data.frame with complex types", {
expect_equal(ldf$an_envir, collected$an_envir)
})
-# For test map type and struct type in DataFrame
-mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}",
- "{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}",
- "{\"name\":\"David\",\"info\":{\"age\":60,\"height\":180}}")
-mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
-writeLines(mockLinesMapType, mapTypeJsonPath)
-
test_that("Collect DataFrame with complex types", {
# ArrayType
df <- read.json(complexTypeJsonPath)
@@ -1337,6 +1337,33 @@ test_that("column functions", {
df <- createDataFrame(data.frame(x = c(2.5, 3.5)))
expect_equal(collect(select(df, bround(df$x, 0)))[[1]][1], 2)
expect_equal(collect(select(df, bround(df$x, 0)))[[1]][2], 4)
+
+ # Test to_json(), from_json()
+ df <- read.json(mapTypeJsonPath)
+ j <- collect(select(df, alias(to_json(df$info), "json")))
+ expect_equal(j[order(j$json), ][1], "{\"age\":16,\"height\":176.5}")
+ df <- as.DataFrame(j)
+ schema <- structType(structField("age", "integer"),
+ structField("height", "double"))
+ s <- collect(select(df, alias(from_json(df$json, schema), "structcol")))
+ expect_equal(ncol(s), 1)
+ expect_equal(nrow(s), 3)
+ expect_is(s[[1]][[1]], "struct")
+ expect_true(any(apply(s, 1, function(x) { x[[1]]$age == 16 } )))
+
+ # passing option
+ df <- as.DataFrame(list(list("col" = "{\"date\":\"21/10/2014\"}")))
+ schema2 <- structType(structField("date", "date"))
+ expect_error(tryCatch(collect(select(df, from_json(df$col, schema2))),
+ error = function(e) { stop(e) }),
+ paste0(".*(java.lang.NumberFormatException: For input string:).*"))
+ s <- collect(select(df, from_json(df$col, schema2, dateFormat = "dd/MM/yyyy")))
+ expect_is(s[[1]][[1]]$date, "Date")
+ expect_equal(as.character(s[[1]][[1]]$date), "2014-10-21")
+
+ # check for unparseable
+ df <- as.DataFrame(list(list("a" = "")))
+ expect_equal(collect(select(df, from_json(df$a, schema)))[[1]][[1]], NA)
})
test_that("column binary mathfunctions", {
@@ -2867,5 +2894,7 @@ unlink(parquetPath)
unlink(orcPath)
unlink(jsonPath)
unlink(jsonPathNa)
+unlink(complexTypeJsonPath)
+unlink(mapTypeJsonPath)
sparkR.session.stop()
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org