Posted to commits@spark.apache.org by fe...@apache.org on 2017/06/11 07:00:37 UTC
[1/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN
Repository: spark
Updated Branches:
refs/heads/master 5301a19a0 -> dc4c35183
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_streaming.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_streaming.R b/R/pkg/tests/fulltests/test_streaming.R
new file mode 100644
index 0000000..b20b431
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_streaming.R
@@ -0,0 +1,167 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("Structured Streaming")
+
+# Tests for Structured Streaming functions in SparkR
+
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+jsonSubDir <- file.path("sparkr-test", "json", "")
+if (.Platform$OS.type == "windows") {
+ # file.path removes the empty separator on Windows, adds it back
+ jsonSubDir <- paste0(jsonSubDir, .Platform$file.sep)
+}
+jsonDir <- file.path(tempdir(), jsonSubDir)
+dir.create(jsonDir, recursive = TRUE)
+
+mockLines <- c("{\"name\":\"Michael\"}",
+ "{\"name\":\"Andy\", \"age\":30}",
+ "{\"name\":\"Justin\", \"age\":19}")
+jsonPath <- tempfile(pattern = jsonSubDir, fileext = ".tmp")
+writeLines(mockLines, jsonPath)
+
+mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}",
+ "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}",
+ "{\"name\":\"David\",\"age\":60,\"height\":null}")
+jsonPathNa <- tempfile(pattern = jsonSubDir, fileext = ".tmp")
+
+schema <- structType(structField("name", "string"),
+ structField("age", "integer"),
+ structField("count", "double"))
+
+test_that("read.stream, write.stream, awaitTermination, stopQuery", {
+ skip_on_cran()
+
+ df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1)
+ expect_true(isStreaming(df))
+ counts <- count(group_by(df, "name"))
+ q <- write.stream(counts, "memory", queryName = "people", outputMode = "complete")
+
+ expect_false(awaitTermination(q, 5 * 1000))
+ callJMethod(q@ssq, "processAllAvailable")
+ expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 3)
+
+ writeLines(mockLinesNa, jsonPathNa)
+ awaitTermination(q, 5 * 1000)
+ callJMethod(q@ssq, "processAllAvailable")
+ expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 6)
+
+ stopQuery(q)
+ expect_true(awaitTermination(q, 1))
+ expect_error(awaitTermination(q), NA)
+})
+
+test_that("print from explain, lastProgress, status, isActive", {
+ skip_on_cran()
+
+ df <- read.stream("json", path = jsonDir, schema = schema)
+ expect_true(isStreaming(df))
+ counts <- count(group_by(df, "name"))
+ q <- write.stream(counts, "memory", queryName = "people2", outputMode = "complete")
+
+ awaitTermination(q, 5 * 1000)
+ callJMethod(q@ssq, "processAllAvailable")
+
+ expect_equal(capture.output(explain(q))[[1]], "== Physical Plan ==")
+ expect_true(any(grepl("\"description\" : \"MemorySink\"", capture.output(lastProgress(q)))))
+ expect_true(any(grepl("\"isTriggerActive\" : ", capture.output(status(q)))))
+
+ expect_equal(queryName(q), "people2")
+ expect_true(isActive(q))
+
+ stopQuery(q)
+})
+
+test_that("Stream other format", {
+ skip_on_cran()
+
+ parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet")
+ df <- read.df(jsonPath, "json", schema)
+ write.df(df, parquetPath, "parquet", "overwrite")
+
+ df <- read.stream(path = parquetPath, schema = schema)
+ expect_true(isStreaming(df))
+ counts <- count(group_by(df, "name"))
+ q <- write.stream(counts, "memory", queryName = "people3", outputMode = "complete")
+
+ expect_false(awaitTermination(q, 5 * 1000))
+ callJMethod(q@ssq, "processAllAvailable")
+ expect_equal(head(sql("SELECT count(*) FROM people3"))[[1]], 3)
+
+ expect_equal(queryName(q), "people3")
+ expect_true(any(grepl("\"description\" : \"FileStreamSource[[:print:]]+parquet",
+ capture.output(lastProgress(q)))))
+ expect_true(isActive(q))
+
+ stopQuery(q)
+ expect_true(awaitTermination(q, 1))
+ expect_false(isActive(q))
+
+ unlink(parquetPath)
+})
+
+test_that("Non-streaming DataFrame", {
+ skip_on_cran()
+
+ c <- as.DataFrame(cars)
+ expect_false(isStreaming(c))
+
+ expect_error(write.stream(c, "memory", queryName = "people", outputMode = "complete"),
+ paste0(".*(writeStream : analysis error - 'writeStream' can be called only on ",
+ "streaming Dataset/DataFrame).*"))
+})
+
+test_that("Unsupported operation", {
+ skip_on_cran()
+
+ # memory sink without aggregation
+ df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1)
+ expect_error(write.stream(df, "memory", queryName = "people", outputMode = "complete"),
+ paste0(".*(start : analysis error - Complete output mode not supported when there ",
+ "are no streaming aggregations on streaming DataFrames/Datasets).*"))
+})
+
+test_that("Terminated by error", {
+ skip_on_cran()
+
+ df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = -1)
+ counts <- count(group_by(df, "name"))
+ # write.stream should not fail here and should return a StreamingQuery,
+ # though it may dump an error log at about the same time
+ expect_error(q <- write.stream(counts, "memory", queryName = "people4", outputMode = "complete"),
+ NA)
+
+ expect_error(awaitTermination(q, 5 * 1000),
+ paste0(".*(awaitTermination : streaming query error - Invalid value '-1' for option",
+ " 'maxFilesPerTrigger', must be a positive integer).*"))
+
+ expect_true(any(grepl("\"message\" : \"Terminated with exception: Invalid value",
+ capture.output(status(q)))))
+ expect_true(any(grepl("Streaming query has no progress", capture.output(lastProgress(q)))))
+ expect_equal(queryName(q), "people4")
+ expect_false(isActive(q))
+
+ stopQuery(q)
+})
+
+unlink(jsonPath)
+unlink(jsonPathNa)
+
+sparkR.session.stop()
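The tests above exercise SparkR's basic file-source-to-memory-sink streaming flow. A minimal standalone sketch of the same flow (illustrative only, not part of this diff; the query name "people_demo" is made up, and an active SparkR session plus the jsonDir and schema objects defined above are assumed):

  df <- read.stream("json", path = jsonDir, schema = schema)
  q <- write.stream(count(group_by(df, "name")), "memory",
                    queryName = "people_demo", outputMode = "complete")
  awaitTermination(q, 5 * 1000)
  head(sql("SELECT * FROM people_demo"))
  stopQuery(q)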
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_take.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_take.R b/R/pkg/tests/fulltests/test_take.R
new file mode 100644
index 0000000..c00723b
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_take.R
@@ -0,0 +1,71 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("tests RDD function take()")
+
+# Mock data
+numVector <- c(-10:97)
+numList <- list(sqrt(1), sqrt(2), sqrt(3), 4 ** 10)
+strVector <- c("Dexter Morgan: I suppose I should be upset, even feel",
+ "violated, but I'm not. No, in fact, I think this is a friendly",
+ "message, like \"Hey, wanna play?\" and yes, I want to play. ",
+ "I really, really do.")
+strList <- list("Dexter Morgan: Blood. Sometimes it sets my teeth on edge, ",
+ "other times it helps me control the chaos.",
+ "Dexter Morgan: Harry and Dorris Morgan did a wonderful job ",
+ "raising me. But they're both dead now. I didn't kill them. Honest.")
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+test_that("take() gives back the original elements in correct count and order", {
+ skip_on_cran()
+
+ numVectorRDD <- parallelize(sc, numVector, 10)
+ # case: number of elements to take is less than the size of the first partition
+ expect_equal(takeRDD(numVectorRDD, 1), as.list(head(numVector, n = 1)))
+ # case: number of elements to take is the same as the size of the first partition
+ expect_equal(takeRDD(numVectorRDD, 11), as.list(head(numVector, n = 11)))
+ # case: number of elements to take is greater than all elements
+ expect_equal(takeRDD(numVectorRDD, length(numVector)), as.list(numVector))
+ expect_equal(takeRDD(numVectorRDD, length(numVector) + 1), as.list(numVector))
+
+ numListRDD <- parallelize(sc, numList, 1)
+ numListRDD2 <- parallelize(sc, numList, 4)
+ expect_equal(takeRDD(numListRDD, 3), takeRDD(numListRDD2, 3))
+ expect_equal(takeRDD(numListRDD, 5), takeRDD(numListRDD2, 5))
+ expect_equal(takeRDD(numListRDD, 1), as.list(head(numList, n = 1)))
+ expect_equal(takeRDD(numListRDD2, 999), numList)
+
+ strVectorRDD <- parallelize(sc, strVector, 2)
+ strVectorRDD2 <- parallelize(sc, strVector, 3)
+ expect_equal(takeRDD(strVectorRDD, 4), as.list(strVector))
+ expect_equal(takeRDD(strVectorRDD2, 2), as.list(head(strVector, n = 2)))
+
+ strListRDD <- parallelize(sc, strList, 4)
+ strListRDD2 <- parallelize(sc, strList, 1)
+ expect_equal(takeRDD(strListRDD, 3), as.list(head(strList, n = 3)))
+ expect_equal(takeRDD(strListRDD2, 1), as.list(head(strList, n = 1)))
+
+ expect_equal(length(takeRDD(strListRDD, 0)), 0)
+ expect_equal(length(takeRDD(strVectorRDD, 0)), 0)
+ expect_equal(length(takeRDD(numListRDD, 0)), 0)
+ expect_equal(length(takeRDD(numVectorRDD, 0)), 0)
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_textFile.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_textFile.R b/R/pkg/tests/fulltests/test_textFile.R
new file mode 100644
index 0000000..e8a961c
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_textFile.R
@@ -0,0 +1,182 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("the textFile() function")
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+mockFile <- c("Spark is pretty.", "Spark is awesome.")
+
+test_that("textFile() on a local file returns an RDD", {
+ skip_on_cran()
+
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName)
+
+ rdd <- textFile(sc, fileName)
+ expect_is(rdd, "RDD")
+ expect_true(countRDD(rdd) > 0)
+ expect_equal(countRDD(rdd), 2)
+
+ unlink(fileName)
+})
+
+test_that("textFile() followed by a collect() returns the same content", {
+ skip_on_cran()
+
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName)
+
+ rdd <- textFile(sc, fileName)
+ expect_equal(collectRDD(rdd), as.list(mockFile))
+
+ unlink(fileName)
+})
+
+test_that("textFile() word count works as expected", {
+ skip_on_cran()
+
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName)
+
+ rdd <- textFile(sc, fileName)
+
+ words <- flatMap(rdd, function(line) { strsplit(line, " ")[[1]] })
+ wordCount <- lapply(words, function(word) { list(word, 1L) })
+
+ counts <- reduceByKey(wordCount, "+", 2L)
+ output <- collectRDD(counts)
+ expected <- list(list("pretty.", 1), list("is", 2), list("awesome.", 1),
+ list("Spark", 2))
+ expect_equal(sortKeyValueList(output), sortKeyValueList(expected))
+
+ unlink(fileName)
+})
+
+test_that("several transformations on RDD created by textFile()", {
+ skip_on_cran()
+
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName)
+
+ rdd <- textFile(sc, fileName) # RDD
+ for (i in 1:10) {
+ # PipelinedRDD initially created from RDD
+ rdd <- lapply(rdd, function(x) paste(x, x))
+ }
+ collectRDD(rdd)
+
+ unlink(fileName)
+})
+
+test_that("textFile() followed by a saveAsTextFile() returns the same content", {
+ skip_on_cran()
+
+ fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName1)
+
+ rdd <- textFile(sc, fileName1, 1L)
+ saveAsTextFile(rdd, fileName2)
+ rdd <- textFile(sc, fileName2)
+ expect_equal(collectRDD(rdd), as.list(mockFile))
+
+ unlink(fileName1)
+ unlink(fileName2)
+})
+
+test_that("saveAsTextFile() on a parallelized list works as expected", {
+ skip_on_cran()
+
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ l <- list(1, 2, 3)
+ rdd <- parallelize(sc, l, 1L)
+ saveAsTextFile(rdd, fileName)
+ rdd <- textFile(sc, fileName)
+ expect_equal(collectRDD(rdd), lapply(l, function(x) {toString(x)}))
+
+ unlink(fileName)
+})
+
+test_that("textFile() and saveAsTextFile() word count works as expected", {
+ skip_on_cran()
+
+ fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName1)
+
+ rdd <- textFile(sc, fileName1)
+
+ words <- flatMap(rdd, function(line) { strsplit(line, " ")[[1]] })
+ wordCount <- lapply(words, function(word) { list(word, 1L) })
+
+ counts <- reduceByKey(wordCount, "+", 2L)
+
+ saveAsTextFile(counts, fileName2)
+ rdd <- textFile(sc, fileName2)
+
+ output <- collectRDD(rdd)
+ expected <- list(list("awesome.", 1), list("Spark", 2),
+ list("pretty.", 1), list("is", 2))
+ expectedStr <- lapply(expected, function(x) { toString(x) })
+ expect_equal(sortKeyValueList(output), sortKeyValueList(expectedStr))
+
+ unlink(fileName1)
+ unlink(fileName2)
+})
+
+test_that("textFile() on multiple paths", {
+ skip_on_cran()
+
+ fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines("Spark is pretty.", fileName1)
+ writeLines("Spark is awesome.", fileName2)
+
+ rdd <- textFile(sc, c(fileName1, fileName2))
+ expect_equal(countRDD(rdd), 2)
+
+ unlink(fileName1)
+ unlink(fileName2)
+})
+
+test_that("Pipelined operations on RDDs created using textFile", {
+ skip_on_cran()
+
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName)
+
+ rdd <- textFile(sc, fileName)
+
+ lengths <- lapply(rdd, function(x) { length(x) })
+ expect_equal(collectRDD(lengths), list(1, 1))
+
+ lengthsPipelined <- lapply(lengths, function(x) { x + 10 })
+ expect_equal(collectRDD(lengthsPipelined), list(11, 11))
+
+ lengths30 <- lapply(lengthsPipelined, function(x) { x + 20 })
+ expect_equal(collectRDD(lengths30), list(31, 31))
+
+ lengths20 <- lapply(lengths, function(x) { x + 20 })
+ expect_equal(collectRDD(lengths20), list(21, 21))
+
+ unlink(fileName)
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_utils.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_utils.R b/R/pkg/tests/fulltests/test_utils.R
new file mode 100644
index 0000000..6197ae7
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_utils.R
@@ -0,0 +1,248 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("functions in utils.R")
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+test_that("convertJListToRList() gives back (deserializes) the original JLists
+ of strings and integers", {
+ skip_on_cran()
+ # It's hard to manually create a Java List using rJava, since it does not
+ # support generics well. Instead, we rely on collectRDD() returning a
+ # JList.
+ nums <- as.list(1:10)
+ rdd <- parallelize(sc, nums, 1L)
+ jList <- callJMethod(rdd@jrdd, "collect")
+ rList <- convertJListToRList(jList, flatten = TRUE)
+ expect_equal(rList, nums)
+
+ strs <- as.list(c("hello", "spark"))
+ rdd <- parallelize(sc, strs, 2L)
+ jList <- callJMethod(rdd@jrdd, "collect")
+ rList <- convertJListToRList(jList, flatten = TRUE)
+ expect_equal(rList, strs)
+})
+
+test_that("serializeToBytes on RDD", {
+ skip_on_cran()
+ # File content
+ mockFile <- c("Spark is pretty.", "Spark is awesome.")
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName)
+
+ text.rdd <- textFile(sc, fileName)
+ expect_equal(getSerializedMode(text.rdd), "string")
+ ser.rdd <- serializeToBytes(text.rdd)
+ expect_equal(collectRDD(ser.rdd), as.list(mockFile))
+ expect_equal(getSerializedMode(ser.rdd), "byte")
+
+ unlink(fileName)
+})
+
+test_that("cleanClosure on R functions", {
+ y <- c(1, 2, 3)
+ g <- function(x) { x + 1 }
+ f <- function(x) { g(x) + y }
+ newF <- cleanClosure(f)
+ env <- environment(newF)
+ expect_equal(length(ls(env)), 2) # y, g
+ actual <- get("y", envir = env, inherits = FALSE)
+ expect_equal(actual, y)
+ actual <- get("g", envir = env, inherits = FALSE)
+ expect_equal(actual, g)
+
+ # Test for nested enclosures and package variables.
+ env2 <- new.env()
+ funcEnv <- new.env(parent = env2)
+ f <- function(x) { log(g(x) + y) }
+ environment(f) <- funcEnv # enclosing relationship: f -> funcEnv -> env2 -> .GlobalEnv
+ newF <- cleanClosure(f)
+ env <- environment(newF)
+ expect_equal(length(ls(env)), 2) # base function "log" should not be included
+ actual <- get("y", envir = env, inherits = FALSE)
+ expect_equal(actual, y)
+ actual <- get("g", envir = env, inherits = FALSE)
+ expect_equal(actual, g)
+
+ base <- c(1, 2, 3)
+ l <- list(field = matrix(1))
+ field <- matrix(2)
+ defUse <- 3
+ g <- function(x) { x + y }
+ f <- function(x) {
+ defUse <- base::as.integer(x) + 1 # Test for access operators `::`.
+ lapply(x, g) + 1 # Test for capturing function call "g"'s closure as an argument of lapply.
+ l$field[1, 1] <- 3 # Test for access operators `$`.
+ res <- defUse + l$field[1, ] # Test for def-use chain of "defUse", and "" symbol.
+ f(res) # Test for recursive calls.
+ }
+ newF <- cleanClosure(f)
+ env <- environment(newF)
+ # TODO(shivaram): length(ls(env)) is 4 here for some reason and `lapply` is included in `env`.
+ # Disabling this test till we debug this.
+ #
+ # nolint start
+ # expect_equal(length(ls(env)), 3) # Only "g", "l" and "f". No "base", "field" or "defUse".
+ # nolint end
+ expect_true("g" %in% ls(env))
+ expect_true("l" %in% ls(env))
+ expect_true("f" %in% ls(env))
+ expect_equal(get("l", envir = env, inherits = FALSE), l)
+ # "y" should be in the environment of g.
+ newG <- get("g", envir = env, inherits = FALSE)
+ env <- environment(newG)
+ expect_equal(length(ls(env)), 1)
+ actual <- get("y", envir = env, inherits = FALSE)
+ expect_equal(actual, y)
+
+ # Test for function (and variable) definitions.
+ f <- function(x) {
+ g <- function(y) { y * 2 }
+ g(x)
+ }
+ newF <- cleanClosure(f)
+ env <- environment(newF)
+ expect_equal(length(ls(env)), 0) # "y" and "g" should not be included.
+
+ # Test for overriding variables in base namespace (Issue: SparkR-196).
+ nums <- as.list(1:10)
+ rdd <- parallelize(sc, nums, 2L)
+ t <- 4 # Override base::t in .GlobalEnv.
+ f <- function(x) { x > t }
+ newF <- cleanClosure(f)
+ env <- environment(newF)
+ expect_equal(ls(env), "t")
+ expect_equal(get("t", envir = env, inherits = FALSE), t)
+ actual <- collectRDD(lapply(rdd, f))
+ expected <- as.list(c(rep(FALSE, 4), rep(TRUE, 6)))
+ expect_equal(actual, expected)
+
+ # Test for broadcast variables.
+ a <- matrix(nrow = 10, ncol = 10, data = rnorm(100))
+ aBroadcast <- broadcastRDD(sc, a)
+ normMultiply <- function(x) { norm(aBroadcast$value) * x }
+ newnormMultiply <- SparkR:::cleanClosure(normMultiply)
+ env <- environment(newnormMultiply)
+ expect_equal(ls(env), "aBroadcast")
+ expect_equal(get("aBroadcast", envir = env, inherits = FALSE), aBroadcast)
+})
+
+test_that("varargsToJProperties", {
+ jprops <- newJObject("java.util.Properties")
+ expect_true(class(jprops) == "jobj")
+
+ jprops <- varargsToJProperties(abc = "123")
+ expect_true(class(jprops) == "jobj")
+ expect_equal(callJMethod(jprops, "getProperty", "abc"), "123")
+
+ jprops <- varargsToJProperties(abc = "abc", b = 1)
+ expect_equal(callJMethod(jprops, "getProperty", "abc"), "abc")
+ expect_equal(callJMethod(jprops, "getProperty", "b"), "1")
+
+ jprops <- varargsToJProperties()
+ expect_equal(callJMethod(jprops, "size"), 0L)
+})
+
+test_that("convertToJSaveMode", {
+ s <- convertToJSaveMode("error")
+ expect_true(class(s) == "jobj")
+ expect_match(capture.output(print.jobj(s)), "Java ref type org.apache.spark.sql.SaveMode id ")
+ expect_error(convertToJSaveMode("foo"),
+ 'mode should be one of "append", "overwrite", "error", "ignore"') #nolint
+})
+
+test_that("captureJVMException", {
+ skip_on_cran()
+
+ method <- "createStructField"
+ expect_error(tryCatch(callJStatic("org.apache.spark.sql.api.r.SQLUtils", method,
+ "col", "unknown", TRUE),
+ error = function(e) {
+ captureJVMException(e, method)
+ }),
+ "parse error - .*DataType unknown.*not supported.")
+})
+
+test_that("hashCode", {
+ skip_on_cran()
+
+ expect_error(hashCode("bc53d3605e8a5b7de1e8e271c2317645"), NA)
+})
+
+test_that("overrideEnvs", {
+ config <- new.env()
+ config[["spark.master"]] <- "foo"
+ config[["config_only"]] <- "ok"
+ param <- new.env()
+ param[["spark.master"]] <- "local"
+ param[["param_only"]] <- "blah"
+ overrideEnvs(config, param)
+ expect_equal(config[["spark.master"]], "local")
+ expect_equal(config[["param_only"]], "blah")
+ expect_equal(config[["config_only"]], "ok")
+})
+
+test_that("rbindRaws", {
+
+ # Mixed Column types
+ r <- serialize(1:5, connection = NULL)
+ r1 <- serialize(1, connection = NULL)
+ r2 <- serialize(letters, connection = NULL)
+ r3 <- serialize(1:10, connection = NULL)
+ inputData <- list(list(1L, r1, "a", r), list(2L, r2, "b", r),
+ list(3L, r3, "c", r))
+ expected <- data.frame(V1 = 1:3)
+ expected$V2 <- list(r1, r2, r3)
+ expected$V3 <- c("a", "b", "c")
+ expected$V4 <- list(r, r, r)
+ result <- rbindRaws(inputData)
+ expect_equal(expected, result)
+
+ # Single binary column
+ input <- list(list(r1), list(r2), list(r3))
+ expected <- subset(expected, select = "V2")
+ result <- setNames(rbindRaws(input), "V2")
+ expect_equal(expected, result)
+
+})
+
+test_that("varargsToStrEnv", {
+ strenv <- varargsToStrEnv(a = 1, b = 1.1, c = TRUE, d = "abcd")
+ env <- varargsToEnv(a = "1", b = "1.1", c = "true", d = "abcd")
+ expect_equal(strenv, env)
+ expect_error(varargsToStrEnv(a = list(1, "a")),
+ paste0("Unsupported type for a : list. Supported types are logical, ",
+ "numeric, character and NULL."))
+ expect_warning(varargsToStrEnv(a = 1, 2, 3, 4), "Unnamed arguments ignored: 2, 3, 4.")
+ expect_warning(varargsToStrEnv(1, 2, 3, 4), "Unnamed arguments ignored: 1, 2, 3, 4.")
+})
+
+test_that("basenameSansExtFromUrl", {
+ x <- paste0("http://people.apache.org/~pwendell/spark-nightly/spark-branch-2.1-bin/spark-2.1.1-",
+ "SNAPSHOT-2016_12_09_11_08-eb2d9bf-bin/spark-2.1.1-SNAPSHOT-bin-hadoop2.7.tgz")
+ expect_equal(basenameSansExtFromUrl(x), "spark-2.1.1-SNAPSHOT-bin-hadoop2.7")
+ z <- "http://people.apache.org/~pwendell/spark-releases/spark-2.1.0--hive.tar.gz"
+ expect_equal(basenameSansExtFromUrl(z), "spark-2.1.0--hive")
+})
+
+sparkR.session.stop()
+
+message("--- End test (utils) ", as.POSIXct(Sys.time(), tz = "GMT"))
+message("elapsed ", (proc.time() - timer_ptm)[3])
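The cleanClosure tests above check that only free variables actually referenced by a function are captured into its environment before serialization. A minimal illustrative sketch (it calls the internal SparkR:::cleanClosure directly, as the tests do; the inline result comments are assumptions based on the expectations above):

  y <- c(1, 2, 3)
  f <- function(x) { x + y }
  newF <- SparkR:::cleanClosure(f)
  ls(environment(newF))                 # only "y" is captured
  get("y", envir = environment(newF))   # c(1, 2, 3); base functions and unused globals are not captured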
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/run-all.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/run-all.R b/R/pkg/tests/run-all.R
index f0bef4f..d48e36c 100644
--- a/R/pkg/tests/run-all.R
+++ b/R/pkg/tests/run-all.R
@@ -43,3 +43,11 @@ if (identical(Sys.getenv("NOT_CRAN"), "true")) {
}
test_package("SparkR")
+
+if (identical(Sys.getenv("NOT_CRAN"), "true")) {
+ # for testthat 1.0.2 or later, change the reporter from "summary" to default_reporter()
+ testthat:::run_tests("SparkR",
+ file.path(sparkRDir, "pkg", "tests", "fulltests"),
+ NULL,
+ "summary")
+}
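The fulltests directory added by this change only runs when the NOT_CRAN environment variable is "true". A minimal sketch of invoking the full suite by hand (an assumed workflow, not part of this diff; it presumes the SparkR package is installed and SPARK_HOME points at a Spark checkout):

  Sys.setenv(NOT_CRAN = "true")
  Sys.setenv(SPARK_HOME = "/path/to/spark")  # assumed checkout location
  source(file.path(Sys.getenv("SPARK_HOME"), "R", "pkg", "tests", "run-all.R"))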
[6/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN
Posted by fe...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_mllib_regression.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R
deleted file mode 100644
index b05fdd3..0000000
--- a/R/pkg/inst/tests/testthat/test_mllib_regression.R
+++ /dev/null
@@ -1,480 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("MLlib regression algorithms, except for tree-based algorithms")
-
-# Tests for MLlib regression algorithms in SparkR
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-test_that("formula of spark.glm", {
- skip_on_cran()
-
- training <- suppressWarnings(createDataFrame(iris))
- # directly calling the spark API
- # dot minus and intercept vs native glm
- model <- spark.glm(training, Sepal_Width ~ . - Species + 0)
- vals <- collect(select(predict(model, training), "prediction"))
- rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
- expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
- # feature interaction vs native glm
- model <- spark.glm(training, Sepal_Width ~ Species:Sepal_Length)
- vals <- collect(select(predict(model, training), "prediction"))
- rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
- expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
- # glm should work with long formula
- training <- suppressWarnings(createDataFrame(iris))
- training$LongLongLongLongLongName <- training$Sepal_Width
- training$VeryLongLongLongLonLongName <- training$Sepal_Length
- training$AnotherLongLongLongLongName <- training$Species
- model <- spark.glm(training, LongLongLongLongLongName ~ VeryLongLongLongLonLongName +
- AnotherLongLongLongLongName)
- vals <- collect(select(predict(model, training), "prediction"))
- rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
- expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-})
-
-test_that("spark.glm and predict", {
- training <- suppressWarnings(createDataFrame(iris))
- # gaussian family
- model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species)
- prediction <- predict(model, training)
- expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
- vals <- collect(select(prediction, "prediction"))
- rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
- expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
- # poisson family
- model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
- family = poisson(link = identity))
- prediction <- predict(model, training)
- expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
- vals <- collect(select(prediction, "prediction"))
- rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species,
- data = iris, family = poisson(link = identity)), iris))
- expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
- # Gamma family
- x <- runif(100, -1, 1)
- y <- rgamma(100, rate = 10 / exp(0.5 + 1.2 * x), shape = 10)
- df <- as.DataFrame(as.data.frame(list(x = x, y = y)))
- model <- glm(y ~ x, family = Gamma, df)
- out <- capture.output(print(summary(model)))
- expect_true(any(grepl("Dispersion parameter for gamma family", out)))
-
- # tweedie family
- model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
- family = "tweedie", var.power = 1.2, link.power = 0.0)
- prediction <- predict(model, training)
- expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
- vals <- collect(select(prediction, "prediction"))
-
- # manual calculation of the R predicted values to avoid dependence on statmod
- #' library(statmod)
- #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
- #' family = tweedie(var.power = 1.2, link.power = 0.0))
- #' print(coef(rModel))
-
- rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174)
- rVals <- exp(as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species,
- data = iris) %*% rCoef))
- expect_true(all(abs(rVals - vals) < 1e-5), rVals - vals)
-
- # Test stats::predict is working
- x <- rnorm(15)
- y <- x + rnorm(15)
- expect_equal(length(predict(lm(y ~ x))), 15)
-})
-
-test_that("spark.glm summary", {
- # gaussian family
- training <- suppressWarnings(createDataFrame(iris))
- stats <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + Species))
- rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
-
- # test summary coefficients return matrix type
- expect_true(class(stats$coefficients) == "matrix")
- expect_true(class(stats$coefficients[, 1]) == "numeric")
-
- coefs <- stats$coefficients
- rCoefs <- rStats$coefficients
- expect_true(all(abs(rCoefs - coefs) < 1e-4))
- expect_true(all(
- rownames(stats$coefficients) ==
- c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
- expect_equal(stats$dispersion, rStats$dispersion)
- expect_equal(stats$null.deviance, rStats$null.deviance)
- expect_equal(stats$deviance, rStats$deviance)
- expect_equal(stats$df.null, rStats$df.null)
- expect_equal(stats$df.residual, rStats$df.residual)
- expect_equal(stats$aic, rStats$aic)
-
- out <- capture.output(print(stats))
- expect_match(out[2], "Deviance Residuals:")
- expect_true(any(grepl("AIC: 59.22", out)))
-
- # binomial family
- df <- suppressWarnings(createDataFrame(iris))
- training <- df[df$Species %in% c("versicolor", "virginica"), ]
- stats <- summary(spark.glm(training, Species ~ Sepal_Length + Sepal_Width,
- family = binomial(link = "logit")))
-
- rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
- rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
- family = binomial(link = "logit")))
-
- coefs <- stats$coefficients
- rCoefs <- rStats$coefficients
- expect_true(all(abs(rCoefs - coefs) < 1e-4))
- expect_true(all(
- rownames(stats$coefficients) ==
- c("(Intercept)", "Sepal_Length", "Sepal_Width")))
- expect_equal(stats$dispersion, rStats$dispersion)
- expect_equal(stats$null.deviance, rStats$null.deviance)
- expect_equal(stats$deviance, rStats$deviance)
- expect_equal(stats$df.null, rStats$df.null)
- expect_equal(stats$df.residual, rStats$df.residual)
- expect_equal(stats$aic, rStats$aic)
-
- # Test spark.glm works with weighted dataset
- a1 <- c(0, 1, 2, 3)
- a2 <- c(5, 2, 1, 3)
- w <- c(1, 2, 3, 4)
- b <- c(1, 0, 1, 0)
- data <- as.data.frame(cbind(a1, a2, w, b))
- df <- createDataFrame(data)
-
- stats <- summary(spark.glm(df, b ~ a1 + a2, family = "binomial", weightCol = "w"))
- rStats <- summary(glm(b ~ a1 + a2, family = "binomial", data = data, weights = w))
-
- coefs <- stats$coefficients
- rCoefs <- rStats$coefficients
- expect_true(all(abs(rCoefs - coefs) < 1e-3))
- expect_true(all(rownames(stats$coefficients) == c("(Intercept)", "a1", "a2")))
- expect_equal(stats$dispersion, rStats$dispersion)
- expect_equal(stats$null.deviance, rStats$null.deviance)
- expect_equal(stats$deviance, rStats$deviance)
- expect_equal(stats$df.null, rStats$df.null)
- expect_equal(stats$df.residual, rStats$df.residual)
- expect_equal(stats$aic, rStats$aic)
-
- # Test summary works on base GLM models
- baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
- baseSummary <- summary(baseModel)
- expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
-
- # Test spark.glm works with regularization parameter
- data <- as.data.frame(cbind(a1, a2, b))
- df <- suppressWarnings(createDataFrame(data))
- regStats <- summary(spark.glm(df, b ~ a1 + a2, regParam = 1.0))
- expect_equal(regStats$aic, 13.32836, tolerance = 1e-4) # 13.32836 is from summary() result
-
- # Test spark.glm works on collinear data
- A <- matrix(c(1, 2, 3, 4, 2, 4, 6, 8), 4, 2)
- b <- c(1, 2, 3, 4)
- data <- as.data.frame(cbind(A, b))
- df <- createDataFrame(data)
- stats <- summary(spark.glm(df, b ~ . - 1))
- coefs <- stats$coefficients
- expect_true(all(abs(c(0.5, 0.25) - coefs) < 1e-4))
-})
-
-test_that("spark.glm save/load", {
- skip_on_cran()
-
- training <- suppressWarnings(createDataFrame(iris))
- m <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species)
- s <- summary(m)
-
- modelPath <- tempfile(pattern = "spark-glm", fileext = ".tmp")
- write.ml(m, modelPath)
- expect_error(write.ml(m, modelPath))
- write.ml(m, modelPath, overwrite = TRUE)
- m2 <- read.ml(modelPath)
- s2 <- summary(m2)
-
- expect_equal(s$coefficients, s2$coefficients)
- expect_equal(rownames(s$coefficients), rownames(s2$coefficients))
- expect_equal(s$dispersion, s2$dispersion)
- expect_equal(s$null.deviance, s2$null.deviance)
- expect_equal(s$deviance, s2$deviance)
- expect_equal(s$df.null, s2$df.null)
- expect_equal(s$df.residual, s2$df.residual)
- expect_equal(s$aic, s2$aic)
- expect_equal(s$iter, s2$iter)
- expect_true(!s$is.loaded)
- expect_true(s2$is.loaded)
-
- unlink(modelPath)
-})
-
-test_that("formula of glm", {
- skip_on_cran()
-
- training <- suppressWarnings(createDataFrame(iris))
- # dot minus and intercept vs native glm
- model <- glm(Sepal_Width ~ . - Species + 0, data = training)
- vals <- collect(select(predict(model, training), "prediction"))
- rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
- expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
- # feature interaction vs native glm
- model <- glm(Sepal_Width ~ Species:Sepal_Length, data = training)
- vals <- collect(select(predict(model, training), "prediction"))
- rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
- expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
- # glm should work with long formula
- training <- suppressWarnings(createDataFrame(iris))
- training$LongLongLongLongLongName <- training$Sepal_Width
- training$VeryLongLongLongLonLongName <- training$Sepal_Length
- training$AnotherLongLongLongLongName <- training$Species
- model <- glm(LongLongLongLongLongName ~ VeryLongLongLongLonLongName + AnotherLongLongLongLongName,
- data = training)
- vals <- collect(select(predict(model, training), "prediction"))
- rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
- expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-})
-
-test_that("glm and predict", {
- skip_on_cran()
-
- training <- suppressWarnings(createDataFrame(iris))
- # gaussian family
- model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training)
- prediction <- predict(model, training)
- expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
- vals <- collect(select(prediction, "prediction"))
- rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
- expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
- # poisson family
- model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training,
- family = poisson(link = identity))
- prediction <- predict(model, training)
- expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
- vals <- collect(select(prediction, "prediction"))
- rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species,
- data = iris, family = poisson(link = identity)), iris))
- expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
- # tweedie family
- model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training,
- family = "tweedie", var.power = 1.2, link.power = 0.0)
- prediction <- predict(model, training)
- expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
- vals <- collect(select(prediction, "prediction"))
-
- # manual calculation of the R predicted values to avoid dependence on statmod
- #' library(statmod)
- #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
- #' family = tweedie(var.power = 1.2, link.power = 0.0))
- #' print(coef(rModel))
-
- rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174)
- rVals <- exp(as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species,
- data = iris) %*% rCoef))
- expect_true(all(abs(rVals - vals) < 1e-5), rVals - vals)
-
- # Test stats::predict is working
- x <- rnorm(15)
- y <- x + rnorm(15)
- expect_equal(length(predict(lm(y ~ x))), 15)
-})
-
-test_that("glm summary", {
- skip_on_cran()
-
- # gaussian family
- training <- suppressWarnings(createDataFrame(iris))
- stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training))
-
- rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
-
- coefs <- stats$coefficients
- rCoefs <- rStats$coefficients
- expect_true(all(abs(rCoefs - coefs) < 1e-4))
- expect_true(all(
- rownames(stats$coefficients) ==
- c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
- expect_equal(stats$dispersion, rStats$dispersion)
- expect_equal(stats$null.deviance, rStats$null.deviance)
- expect_equal(stats$deviance, rStats$deviance)
- expect_equal(stats$df.null, rStats$df.null)
- expect_equal(stats$df.residual, rStats$df.residual)
- expect_equal(stats$aic, rStats$aic)
-
- # binomial family
- df <- suppressWarnings(createDataFrame(iris))
- training <- df[df$Species %in% c("versicolor", "virginica"), ]
- stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training,
- family = binomial(link = "logit")))
-
- rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
- rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
- family = binomial(link = "logit")))
-
- coefs <- stats$coefficients
- rCoefs <- rStats$coefficients
- expect_true(all(abs(rCoefs - coefs) < 1e-4))
- expect_true(all(
- rownames(stats$coefficients) ==
- c("(Intercept)", "Sepal_Length", "Sepal_Width")))
- expect_equal(stats$dispersion, rStats$dispersion)
- expect_equal(stats$null.deviance, rStats$null.deviance)
- expect_equal(stats$deviance, rStats$deviance)
- expect_equal(stats$df.null, rStats$df.null)
- expect_equal(stats$df.residual, rStats$df.residual)
- expect_equal(stats$aic, rStats$aic)
-
- # Test summary works on base GLM models
- baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
- baseSummary <- summary(baseModel)
- expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
-})
-
-test_that("glm save/load", {
- skip_on_cran()
-
- training <- suppressWarnings(createDataFrame(iris))
- m <- glm(Sepal_Width ~ Sepal_Length + Species, data = training)
- s <- summary(m)
-
- modelPath <- tempfile(pattern = "glm", fileext = ".tmp")
- write.ml(m, modelPath)
- expect_error(write.ml(m, modelPath))
- write.ml(m, modelPath, overwrite = TRUE)
- m2 <- read.ml(modelPath)
- s2 <- summary(m2)
-
- expect_equal(s$coefficients, s2$coefficients)
- expect_equal(rownames(s$coefficients), rownames(s2$coefficients))
- expect_equal(s$dispersion, s2$dispersion)
- expect_equal(s$null.deviance, s2$null.deviance)
- expect_equal(s$deviance, s2$deviance)
- expect_equal(s$df.null, s2$df.null)
- expect_equal(s$df.residual, s2$df.residual)
- expect_equal(s$aic, s2$aic)
- expect_equal(s$iter, s2$iter)
- expect_true(!s$is.loaded)
- expect_true(s2$is.loaded)
-
- unlink(modelPath)
-})
-
-test_that("spark.isoreg", {
- label <- c(7.0, 5.0, 3.0, 5.0, 1.0)
- feature <- c(0.0, 1.0, 2.0, 3.0, 4.0)
- weight <- c(1.0, 1.0, 1.0, 1.0, 1.0)
- data <- as.data.frame(cbind(label, feature, weight))
- df <- createDataFrame(data)
-
- model <- spark.isoreg(df, label ~ feature, isotonic = FALSE,
- weightCol = "weight")
- # only allow one variable on the right hand side of the formula
- expect_error(model2 <- spark.isoreg(df, ~., isotonic = FALSE))
- result <- summary(model)
- expect_equal(result$predictions, list(7, 5, 4, 4, 1))
-
- # Test model prediction
- predict_data <- list(list(-2.0), list(-1.0), list(0.5),
- list(0.75), list(1.0), list(2.0), list(9.0))
- predict_df <- createDataFrame(predict_data, c("feature"))
- predict_result <- collect(select(predict(model, predict_df), "prediction"))
- expect_equal(predict_result$prediction, c(7.0, 7.0, 6.0, 5.5, 5.0, 4.0, 1.0))
-
- # Test model save/load
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-isoreg", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- expect_equal(result, summary(model2))
-
- unlink(modelPath)
- }
-})
-
-test_that("spark.survreg", {
- # R code to reproduce the result.
- #
- #' rData <- list(time = c(4, 3, 1, 1, 2, 2, 3), status = c(1, 1, 1, 0, 1, 1, 0),
- #' x = c(0, 2, 1, 1, 1, 0, 0), sex = c(0, 0, 0, 0, 1, 1, 1))
- #' library(survival)
- #' model <- survreg(Surv(time, status) ~ x + sex, rData)
- #' summary(model)
- #' predict(model, data)
- #
- # -- output of 'summary(model)'
- #
- # Value Std. Error z p
- # (Intercept) 1.315 0.270 4.88 1.07e-06
- # x -0.190 0.173 -1.10 2.72e-01
- # sex -0.253 0.329 -0.77 4.42e-01
- # Log(scale) -1.160 0.396 -2.93 3.41e-03
- #
- # -- output of 'predict(model, data)'
- #
- # 1 2 3 4 5 6 7
- # 3.724591 2.545368 3.079035 3.079035 2.390146 2.891269 2.891269
- #
- data <- list(list(4, 1, 0, 0), list(3, 1, 2, 0), list(1, 1, 1, 0),
- list(1, 0, 1, 0), list(2, 1, 1, 1), list(2, 1, 0, 1), list(3, 0, 0, 1))
- df <- createDataFrame(data, c("time", "status", "x", "sex"))
- model <- spark.survreg(df, Surv(time, status) ~ x + sex)
- stats <- summary(model)
- coefs <- as.vector(stats$coefficients[, 1])
- rCoefs <- c(1.3149571, -0.1903409, -0.2532618, -1.1599800)
- expect_equal(coefs, rCoefs, tolerance = 1e-4)
- expect_true(all(
- rownames(stats$coefficients) ==
- c("(Intercept)", "x", "sex", "Log(scale)")))
- p <- collect(select(predict(model, df), "prediction"))
- expect_equal(p$prediction, c(3.724591, 2.545368, 3.079035, 3.079035,
- 2.390146, 2.891269, 2.891269), tolerance = 1e-4)
-
- # Test model save/load
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-survreg", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- stats2 <- summary(model2)
- coefs2 <- as.vector(stats2$coefficients[, 1])
- expect_equal(coefs, coefs2)
- expect_equal(rownames(stats$coefficients), rownames(stats2$coefficients))
-
- unlink(modelPath)
- }
-
- # Test survival::survreg
- if (requireNamespace("survival", quietly = TRUE)) {
- rData <- list(time = c(4, 3, 1, 1, 2, 2, 3), status = c(1, 1, 1, 0, 1, 1, 0),
- x = c(0, 2, 1, 1, 1, 0, 0), sex = c(0, 0, 0, 0, 1, 1, 1))
- expect_error(
- model <- survival::survreg(formula = survival::Surv(time, status) ~ x + sex, data = rData),
- NA)
- expect_equal(predict(model, rData)[[1]], 3.724591, tolerance = 1e-4)
- }
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_mllib_stat.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_mllib_stat.R b/R/pkg/inst/tests/testthat/test_mllib_stat.R
deleted file mode 100644
index 1600833..0000000
--- a/R/pkg/inst/tests/testthat/test_mllib_stat.R
+++ /dev/null
@@ -1,53 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("MLlib statistics algorithms")
-
-# Tests for MLlib statistics algorithms in SparkR
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-test_that("spark.kstest", {
- data <- data.frame(test = c(0.1, 0.15, 0.2, 0.3, 0.25, -1, -0.5))
- df <- createDataFrame(data)
- testResult <- spark.kstest(df, "test", "norm")
- stats <- summary(testResult)
-
- rStats <- ks.test(data$test, "pnorm", alternative = "two.sided")
-
- expect_equal(stats$p.value, rStats$p.value, tolerance = 1e-4)
- expect_equal(stats$statistic, unname(rStats$statistic), tolerance = 1e-4)
- expect_match(capture.output(stats)[1], "Kolmogorov-Smirnov test summary:")
-
- testResult <- spark.kstest(df, "test", "norm", -0.5)
- stats <- summary(testResult)
-
- rStats <- ks.test(data$test, "pnorm", -0.5, 1, alternative = "two.sided")
-
- expect_equal(stats$p.value, rStats$p.value, tolerance = 1e-4)
- expect_equal(stats$statistic, unname(rStats$statistic), tolerance = 1e-4)
- expect_match(capture.output(stats)[1], "Kolmogorov-Smirnov test summary:")
-
- # Test print.summary.KSTest
- printStats <- capture.output(print.summary.KSTest(stats))
- expect_match(printStats[1], "Kolmogorov-Smirnov test summary:")
- expect_match(printStats[5],
- "Low presumption against null hypothesis: Sample follows theoretical distribution. ")
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_mllib_tree.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_mllib_tree.R b/R/pkg/inst/tests/testthat/test_mllib_tree.R
deleted file mode 100644
index 31427ee..0000000
--- a/R/pkg/inst/tests/testthat/test_mllib_tree.R
+++ /dev/null
@@ -1,320 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("MLlib tree-based algorithms")
-
-# Tests for MLlib tree-based algorithms in SparkR
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-absoluteSparkPath <- function(x) {
- sparkHome <- sparkR.conf("spark.home")
- file.path(sparkHome, x)
-}
-
-test_that("spark.gbt", {
- skip_on_cran()
-
- # regression
- data <- suppressWarnings(createDataFrame(longley))
- model <- spark.gbt(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, seed = 123)
- predictions <- collect(predict(model, data))
- expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
- 63.221, 63.639, 64.989, 63.761,
- 66.019, 67.857, 68.169, 66.513,
- 68.655, 69.564, 69.331, 70.551),
- tolerance = 1e-4)
- stats <- summary(model)
- expect_equal(stats$numTrees, 20)
- expect_equal(stats$maxDepth, 5)
- expect_equal(stats$formula, "Employed ~ .")
- expect_equal(stats$numFeatures, 6)
- expect_equal(length(stats$treeWeights), 20)
-
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-gbtRegression", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- stats2 <- summary(model2)
- expect_equal(stats$formula, stats2$formula)
- expect_equal(stats$numFeatures, stats2$numFeatures)
- expect_equal(stats$features, stats2$features)
- expect_equal(stats$featureImportances, stats2$featureImportances)
- expect_equal(stats$maxDepth, stats2$maxDepth)
- expect_equal(stats$numTrees, stats2$numTrees)
- expect_equal(stats$treeWeights, stats2$treeWeights)
-
- unlink(modelPath)
- }
-
- # classification
- # label must be binary - GBTClassifier currently only supports binary classification.
- iris2 <- iris[iris$Species != "virginica", ]
- data <- suppressWarnings(createDataFrame(iris2))
- model <- spark.gbt(data, Species ~ Petal_Length + Petal_Width, "classification")
- stats <- summary(model)
- expect_equal(stats$numFeatures, 2)
- expect_equal(stats$numTrees, 20)
- expect_equal(stats$maxDepth, 5)
- expect_error(capture.output(stats), NA)
- expect_true(length(capture.output(stats)) > 6)
- predictions <- collect(predict(model, data))$prediction
- # test string prediction values
- expect_equal(length(grep("setosa", predictions)), 50)
- expect_equal(length(grep("versicolor", predictions)), 50)
-
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-gbtClassification", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- stats2 <- summary(model2)
- expect_equal(stats$depth, stats2$depth)
- expect_equal(stats$numNodes, stats2$numNodes)
- expect_equal(stats$numClasses, stats2$numClasses)
-
- unlink(modelPath)
- }
-
- iris2$NumericSpecies <- ifelse(iris2$Species == "setosa", 0, 1)
- df <- suppressWarnings(createDataFrame(iris2))
- m <- spark.gbt(df, NumericSpecies ~ ., type = "classification")
- s <- summary(m)
- # test numeric prediction values
- expect_equal(iris2$NumericSpecies, as.double(collect(predict(m, df))$prediction))
- expect_equal(s$numFeatures, 5)
- expect_equal(s$numTrees, 20)
- expect_equal(stats$maxDepth, 5)
-
- # spark.gbt classification can work on libsvm data
- if (not_cran_or_windows_with_hadoop()) {
- data <- read.df(absoluteSparkPath("data/mllib/sample_binary_classification_data.txt"),
- source = "libsvm")
- model <- spark.gbt(data, label ~ features, "classification")
- expect_equal(summary(model)$numFeatures, 692)
- }
-})
-
-test_that("spark.randomForest", {
- # regression
- data <- suppressWarnings(createDataFrame(longley))
- model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
- numTrees = 1)
-
- predictions <- collect(predict(model, data))
- expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
- 63.221, 63.639, 64.989, 63.761,
- 66.019, 67.857, 68.169, 66.513,
- 68.655, 69.564, 69.331, 70.551),
- tolerance = 1e-4)
-
- stats <- summary(model)
- expect_equal(stats$numTrees, 1)
- expect_equal(stats$maxDepth, 5)
- expect_error(capture.output(stats), NA)
- expect_true(length(capture.output(stats)) > 6)
-
- model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
- numTrees = 20, seed = 123)
- predictions <- collect(predict(model, data))
- expect_equal(predictions$prediction, c(60.32820, 61.22315, 60.69025, 62.11070,
- 63.53160, 64.05470, 65.12710, 64.30450,
- 66.70910, 67.86125, 68.08700, 67.21865,
- 68.89275, 69.53180, 69.39640, 69.68250),
- tolerance = 1e-4)
- stats <- summary(model)
- expect_equal(stats$numTrees, 20)
- expect_equal(stats$maxDepth, 5)
-
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-randomForestRegression", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- stats2 <- summary(model2)
- expect_equal(stats$formula, stats2$formula)
- expect_equal(stats$numFeatures, stats2$numFeatures)
- expect_equal(stats$features, stats2$features)
- expect_equal(stats$featureImportances, stats2$featureImportances)
- expect_equal(stats$numTrees, stats2$numTrees)
- expect_equal(stats$maxDepth, stats2$maxDepth)
- expect_equal(stats$treeWeights, stats2$treeWeights)
-
- unlink(modelPath)
- }
-
- # classification
- data <- suppressWarnings(createDataFrame(iris))
- model <- spark.randomForest(data, Species ~ Petal_Length + Petal_Width, "classification",
- maxDepth = 5, maxBins = 16)
-
- stats <- summary(model)
- expect_equal(stats$numFeatures, 2)
- expect_equal(stats$numTrees, 20)
- expect_equal(stats$maxDepth, 5)
- expect_error(capture.output(stats), NA)
- expect_true(length(capture.output(stats)) > 6)
- # Test string prediction values
- predictions <- collect(predict(model, data))$prediction
- expect_equal(length(grep("setosa", predictions)), 50)
- expect_equal(length(grep("versicolor", predictions)), 50)
-
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-randomForestClassification", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- stats2 <- summary(model2)
- expect_equal(stats$depth, stats2$depth)
- expect_equal(stats$numNodes, stats2$numNodes)
- expect_equal(stats$numClasses, stats2$numClasses)
-
- unlink(modelPath)
- }
-
- # Test numeric response variable
- labelToIndex <- function(species) {
- switch(as.character(species),
- setosa = 0.0,
- versicolor = 1.0,
- virginica = 2.0
- )
- }
- iris$NumericSpecies <- lapply(iris$Species, labelToIndex)
- data <- suppressWarnings(createDataFrame(iris[-5]))
- model <- spark.randomForest(data, NumericSpecies ~ Petal_Length + Petal_Width, "classification",
- maxDepth = 5, maxBins = 16)
- stats <- summary(model)
- expect_equal(stats$numFeatures, 2)
- expect_equal(stats$numTrees, 20)
- expect_equal(stats$maxDepth, 5)
-
- # Test numeric prediction values
- predictions <- collect(predict(model, data))$prediction
- expect_equal(length(grep("1.0", predictions)), 50)
- expect_equal(length(grep("2.0", predictions)), 50)
-
- # spark.randomForest classification can work on libsvm data
- if (not_cran_or_windows_with_hadoop()) {
- data <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
- source = "libsvm")
- model <- spark.randomForest(data, label ~ features, "classification")
- expect_equal(summary(model)$numFeatures, 4)
- }
-})
-
-test_that("spark.decisionTree", {
- skip_on_cran()
-
- # regression
- data <- suppressWarnings(createDataFrame(longley))
- model <- spark.decisionTree(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16)
-
- predictions <- collect(predict(model, data))
- expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
- 63.221, 63.639, 64.989, 63.761,
- 66.019, 67.857, 68.169, 66.513,
- 68.655, 69.564, 69.331, 70.551),
- tolerance = 1e-4)
-
- stats <- summary(model)
- expect_equal(stats$maxDepth, 5)
- expect_error(capture.output(stats), NA)
- expect_true(length(capture.output(stats)) > 6)
-
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-decisionTreeRegression", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- stats2 <- summary(model2)
- expect_equal(stats$formula, stats2$formula)
- expect_equal(stats$numFeatures, stats2$numFeatures)
- expect_equal(stats$features, stats2$features)
- expect_equal(stats$featureImportances, stats2$featureImportances)
- expect_equal(stats$maxDepth, stats2$maxDepth)
-
- unlink(modelPath)
- }
-
- # classification
- data <- suppressWarnings(createDataFrame(iris))
- model <- spark.decisionTree(data, Species ~ Petal_Length + Petal_Width, "classification",
- maxDepth = 5, maxBins = 16)
-
- stats <- summary(model)
- expect_equal(stats$numFeatures, 2)
- expect_equal(stats$maxDepth, 5)
- expect_error(capture.output(stats), NA)
- expect_true(length(capture.output(stats)) > 6)
- # Test string prediction values
- predictions <- collect(predict(model, data))$prediction
- expect_equal(length(grep("setosa", predictions)), 50)
- expect_equal(length(grep("versicolor", predictions)), 50)
-
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-decisionTreeClassification", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- stats2 <- summary(model2)
- expect_equal(stats$depth, stats2$depth)
- expect_equal(stats$numNodes, stats2$numNodes)
- expect_equal(stats$numClasses, stats2$numClasses)
-
- unlink(modelPath)
- }
-
- # Test numeric response variable
- labelToIndex <- function(species) {
- switch(as.character(species),
- setosa = 0.0,
- versicolor = 1.0,
- virginica = 2.0
- )
- }
- iris$NumericSpecies <- lapply(iris$Species, labelToIndex)
- data <- suppressWarnings(createDataFrame(iris[-5]))
- model <- spark.decisionTree(data, NumericSpecies ~ Petal_Length + Petal_Width, "classification",
- maxDepth = 5, maxBins = 16)
- stats <- summary(model)
- expect_equal(stats$numFeatures, 2)
- expect_equal(stats$maxDepth, 5)
-
- # Test numeric prediction values
- predictions <- collect(predict(model, data))$prediction
- expect_equal(length(grep("1.0", predictions)), 50)
- expect_equal(length(grep("2.0", predictions)), 50)
-
- # spark.decisionTree classification can work on libsvm data
- if (not_cran_or_windows_with_hadoop()) {
- data <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
- source = "libsvm")
- model <- spark.decisionTree(data, label ~ features, "classification")
- expect_equal(summary(model)$numFeatures, 4)
- }
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_parallelize_collect.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_parallelize_collect.R b/R/pkg/inst/tests/testthat/test_parallelize_collect.R
deleted file mode 100644
index 52d4c93..0000000
--- a/R/pkg/inst/tests/testthat/test_parallelize_collect.R
+++ /dev/null
@@ -1,120 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("parallelize() and collect()")
-
-# Mock data
-numVector <- c(-10:97)
-numList <- list(sqrt(1), sqrt(2), sqrt(3), 4 ** 10)
-strVector <- c("Dexter Morgan: I suppose I should be upset, even feel",
- "violated, but I'm not. No, in fact, I think this is a friendly",
- "message, like \"Hey, wanna play?\" and yes, I want to play. ",
- "I really, really do.")
-strList <- list("Dexter Morgan: Blood. Sometimes it sets my teeth on edge, ",
- "other times it helps me control the chaos.",
- "Dexter Morgan: Harry and Dorris Morgan did a wonderful job ",
- "raising me. But they're both dead now. I didn't kill them. Honest.")
-
-numPairs <- list(list(1, 1), list(1, 2), list(2, 2), list(2, 3))
-strPairs <- list(list(strList, strList), list(strList, strList))
-
-# JavaSparkContext handle
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-jsc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-# Tests
-
-test_that("parallelize() on simple vectors and lists returns an RDD", {
- skip_on_cran()
-
- numVectorRDD <- parallelize(jsc, numVector, 1)
- numVectorRDD2 <- parallelize(jsc, numVector, 10)
- numListRDD <- parallelize(jsc, numList, 1)
- numListRDD2 <- parallelize(jsc, numList, 4)
- strVectorRDD <- parallelize(jsc, strVector, 2)
- strVectorRDD2 <- parallelize(jsc, strVector, 3)
- strListRDD <- parallelize(jsc, strList, 4)
- strListRDD2 <- parallelize(jsc, strList, 1)
-
- rdds <- c(numVectorRDD,
- numVectorRDD2,
- numListRDD,
- numListRDD2,
- strVectorRDD,
- strVectorRDD2,
- strListRDD,
- strListRDD2)
-
- for (rdd in rdds) {
- expect_is(rdd, "RDD")
- expect_true(.hasSlot(rdd, "jrdd")
- && inherits(rdd@jrdd, "jobj")
- && isInstanceOf(rdd@jrdd, "org.apache.spark.api.java.JavaRDD"))
- }
-})
-
-test_that("collect(), following a parallelize(), gives back the original collections", {
- skip_on_cran()
-
- numVectorRDD <- parallelize(jsc, numVector, 10)
- expect_equal(collectRDD(numVectorRDD), as.list(numVector))
-
- numListRDD <- parallelize(jsc, numList, 1)
- numListRDD2 <- parallelize(jsc, numList, 4)
- expect_equal(collectRDD(numListRDD), as.list(numList))
- expect_equal(collectRDD(numListRDD2), as.list(numList))
-
- strVectorRDD <- parallelize(jsc, strVector, 2)
- strVectorRDD2 <- parallelize(jsc, strVector, 3)
- expect_equal(collectRDD(strVectorRDD), as.list(strVector))
- expect_equal(collectRDD(strVectorRDD2), as.list(strVector))
-
- strListRDD <- parallelize(jsc, strList, 4)
- strListRDD2 <- parallelize(jsc, strList, 1)
- expect_equal(collectRDD(strListRDD), as.list(strList))
- expect_equal(collectRDD(strListRDD2), as.list(strList))
-})
-
-test_that("regression: collect() following a parallelize() does not drop elements", {
- skip_on_cran()
-
- # 10 %/% 6 = 1, ceiling(10 / 6) = 2
- collLen <- 10
- numPart <- 6
- expected <- runif(collLen)
- actual <- collectRDD(parallelize(jsc, expected, numPart))
- expect_equal(actual, as.list(expected))
-})
-
-test_that("parallelize() and collect() work for lists of pairs (pairwise data)", {
- skip_on_cran()
-
- # use the pairwise logical to indicate pairwise data
- numPairsRDDD1 <- parallelize(jsc, numPairs, 1)
- numPairsRDDD2 <- parallelize(jsc, numPairs, 2)
- numPairsRDDD3 <- parallelize(jsc, numPairs, 3)
- expect_equal(collectRDD(numPairsRDDD1), numPairs)
- expect_equal(collectRDD(numPairsRDDD2), numPairs)
- expect_equal(collectRDD(numPairsRDDD3), numPairs)
- # can also leave out the parameter name, if the params are supplied in order
- strPairsRDDD1 <- parallelize(jsc, strPairs, 1)
- strPairsRDDD2 <- parallelize(jsc, strPairs, 2)
- expect_equal(collectRDD(strPairsRDDD1), strPairs)
- expect_equal(collectRDD(strPairsRDDD2), strPairs)
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_rdd.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_rdd.R b/R/pkg/inst/tests/testthat/test_rdd.R
deleted file mode 100644
index fb244e1..0000000
--- a/R/pkg/inst/tests/testthat/test_rdd.R
+++ /dev/null
@@ -1,906 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("basic RDD functions")
-
-# JavaSparkContext handle
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-# Data
-nums <- 1:10
-rdd <- parallelize(sc, nums, 2L)
-
-intPairs <- list(list(1L, -1), list(2L, 100), list(2L, 1), list(1L, 200))
-intRdd <- parallelize(sc, intPairs, 2L)
-
-test_that("get number of partitions in RDD", {
- skip_on_cran()
-
- expect_equal(getNumPartitionsRDD(rdd), 2)
- expect_equal(getNumPartitionsRDD(intRdd), 2)
-})
-
-test_that("first on RDD", {
- skip_on_cran()
-
- expect_equal(firstRDD(rdd), 1)
- newrdd <- lapply(rdd, function(x) x + 1)
- expect_equal(firstRDD(newrdd), 2)
-})
-
-test_that("count and length on RDD", {
- skip_on_cran()
-
- expect_equal(countRDD(rdd), 10)
- expect_equal(lengthRDD(rdd), 10)
-})
-
-test_that("count by values and keys", {
- skip_on_cran()
-
- mods <- lapply(rdd, function(x) { x %% 3 })
- actual <- countByValue(mods)
- expected <- list(list(0, 3L), list(1, 4L), list(2, 3L))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-
- actual <- countByKey(intRdd)
- expected <- list(list(2L, 2L), list(1L, 2L))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-})
-
-test_that("lapply on RDD", {
- skip_on_cran()
-
- multiples <- lapply(rdd, function(x) { 2 * x })
- actual <- collectRDD(multiples)
- expect_equal(actual, as.list(nums * 2))
-})
-
-test_that("lapplyPartition on RDD", {
- skip_on_cran()
-
- sums <- lapplyPartition(rdd, function(part) { sum(unlist(part)) })
- actual <- collectRDD(sums)
- expect_equal(actual, list(15, 40))
-})
-
-test_that("mapPartitions on RDD", {
- skip_on_cran()
-
- sums <- mapPartitions(rdd, function(part) { sum(unlist(part)) })
- actual <- collectRDD(sums)
- expect_equal(actual, list(15, 40))
-})
-
-test_that("flatMap() on RDDs", {
- skip_on_cran()
-
- flat <- flatMap(intRdd, function(x) { list(x, x) })
- actual <- collectRDD(flat)
- expect_equal(actual, rep(intPairs, each = 2))
-})
-
-test_that("filterRDD on RDD", {
- skip_on_cran()
-
- filtered.rdd <- filterRDD(rdd, function(x) { x %% 2 == 0 })
- actual <- collectRDD(filtered.rdd)
- expect_equal(actual, list(2, 4, 6, 8, 10))
-
- filtered.rdd <- Filter(function(x) { x[[2]] < 0 }, intRdd)
- actual <- collectRDD(filtered.rdd)
- expect_equal(actual, list(list(1L, -1)))
-
- # Filter out all elements.
- filtered.rdd <- filterRDD(rdd, function(x) { x > 10 })
- actual <- collectRDD(filtered.rdd)
- expect_equal(actual, list())
-})
-
-test_that("lookup on RDD", {
- skip_on_cran()
-
- vals <- lookup(intRdd, 1L)
- expect_equal(vals, list(-1, 200))
-
- vals <- lookup(intRdd, 3L)
- expect_equal(vals, list())
-})
-
-test_that("several transformations on RDD (a benchmark on PipelinedRDD)", {
- skip_on_cran()
-
- rdd2 <- rdd
- for (i in 1:12)
- rdd2 <- lapplyPartitionsWithIndex(
- rdd2, function(partIndex, part) {
- part <- as.list(unlist(part) * partIndex + i)
- })
- rdd2 <- lapply(rdd2, function(x) x + x)
- actual <- collectRDD(rdd2)
- expected <- list(24, 24, 24, 24, 24,
- 168, 170, 172, 174, 176)
- expect_equal(actual, expected)
-})
-
-test_that("PipelinedRDD support actions: cache(), persist(), unpersist(), checkpoint()", {
- skip_on_cran()
-
- # RDD
- rdd2 <- rdd
- # PipelinedRDD
- rdd2 <- lapplyPartitionsWithIndex(
- rdd2,
- function(partIndex, part) {
- part <- as.list(unlist(part) * partIndex)
- })
-
- cacheRDD(rdd2)
- expect_true(rdd2@env$isCached)
- rdd2 <- lapply(rdd2, function(x) x)
- expect_false(rdd2@env$isCached)
-
- unpersistRDD(rdd2)
- expect_false(rdd2@env$isCached)
-
- persistRDD(rdd2, "MEMORY_AND_DISK")
- expect_true(rdd2@env$isCached)
- rdd2 <- lapply(rdd2, function(x) x)
- expect_false(rdd2@env$isCached)
-
- unpersistRDD(rdd2)
- expect_false(rdd2@env$isCached)
-
- tempDir <- tempfile(pattern = "checkpoint")
- setCheckpointDirSC(sc, tempDir)
- checkpointRDD(rdd2)
- expect_true(rdd2@env$isCheckpointed)
-
- rdd2 <- lapply(rdd2, function(x) x)
- expect_false(rdd2@env$isCached)
- expect_false(rdd2@env$isCheckpointed)
-
- # make sure the data is collectable
- collectRDD(rdd2)
-
- unlink(tempDir)
-})
-
-test_that("reduce on RDD", {
- skip_on_cran()
-
- sum <- reduce(rdd, "+")
- expect_equal(sum, 55)
-
- # Also test with an inline function
- sumInline <- reduce(rdd, function(x, y) { x + y })
- expect_equal(sumInline, 55)
-})
-
-test_that("lapply with dependency", {
- skip_on_cran()
-
- fa <- 5
- multiples <- lapply(rdd, function(x) { fa * x })
- actual <- collectRDD(multiples)
-
- expect_equal(actual, as.list(nums * 5))
-})
-
-test_that("lapplyPartitionsWithIndex on RDDs", {
- skip_on_cran()
-
- func <- function(partIndex, part) { list(partIndex, Reduce("+", part)) }
- actual <- collectRDD(lapplyPartitionsWithIndex(rdd, func), flatten = FALSE)
- expect_equal(actual, list(list(0, 15), list(1, 40)))
-
- pairsRDD <- parallelize(sc, list(list(1, 2), list(3, 4), list(4, 8)), 1L)
- partitionByParity <- function(key) { if (key %% 2 == 1) 0 else 1 }
- mkTup <- function(partIndex, part) { list(partIndex, part) }
- actual <- collectRDD(lapplyPartitionsWithIndex(
- partitionByRDD(pairsRDD, 2L, partitionByParity),
- mkTup),
- FALSE)
- expect_equal(actual, list(list(0, list(list(1, 2), list(3, 4))),
- list(1, list(list(4, 8)))))
-})
-
-test_that("sampleRDD() on RDDs", {
- skip_on_cran()
-
- expect_equal(unlist(collectRDD(sampleRDD(rdd, FALSE, 1.0, 2014L))), nums)
-})
-
-test_that("takeSample() on RDDs", {
- skip_on_cran()
-
- # ported from RDDSuite.scala, modified seeds
- data <- parallelize(sc, 1:100, 2L)
- for (seed in 4:5) {
- s <- takeSample(data, FALSE, 20L, seed)
- expect_equal(length(s), 20L)
- expect_equal(length(unique(s)), 20L)
- for (elem in s) {
- expect_true(elem >= 1 && elem <= 100)
- }
- }
- for (seed in 4:5) {
- s <- takeSample(data, FALSE, 200L, seed)
- expect_equal(length(s), 100L)
- expect_equal(length(unique(s)), 100L)
- for (elem in s) {
- expect_true(elem >= 1 && elem <= 100)
- }
- }
- for (seed in 4:5) {
- s <- takeSample(data, TRUE, 20L, seed)
- expect_equal(length(s), 20L)
- for (elem in s) {
- expect_true(elem >= 1 && elem <= 100)
- }
- }
- for (seed in 4:5) {
- s <- takeSample(data, TRUE, 100L, seed)
- expect_equal(length(s), 100L)
- # Chance of getting all distinct elements is astronomically low, so test we
- # got less than 100
- expect_true(length(unique(s)) < 100L)
- }
- for (seed in 4:5) {
- s <- takeSample(data, TRUE, 200L, seed)
- expect_equal(length(s), 200L)
- # Chance of getting all distinct elements is still quite low, so test we
- # got less than 100
- expect_true(length(unique(s)) < 100L)
- }
-})
-
-test_that("mapValues() on pairwise RDDs", {
- skip_on_cran()
-
- multiples <- mapValues(intRdd, function(x) { x * 2 })
- actual <- collectRDD(multiples)
- expected <- lapply(intPairs, function(x) {
- list(x[[1]], x[[2]] * 2)
- })
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-})
-
-test_that("flatMapValues() on pairwise RDDs", {
- skip_on_cran()
-
- l <- parallelize(sc, list(list(1, c(1, 2)), list(2, c(3, 4))))
- actual <- collectRDD(flatMapValues(l, function(x) { x }))
- expect_equal(actual, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
-
- # Generate x to x+1 for every value
- actual <- collectRDD(flatMapValues(intRdd, function(x) { x: (x + 1) }))
- expect_equal(actual,
- list(list(1L, -1), list(1L, 0), list(2L, 100), list(2L, 101),
- list(2L, 1), list(2L, 2), list(1L, 200), list(1L, 201)))
-})
-
-test_that("reduceByKeyLocally() on PairwiseRDDs", {
- skip_on_cran()
-
- pairs <- parallelize(sc, list(list(1, 2), list(1.1, 3), list(1, 4)), 2L)
- actual <- reduceByKeyLocally(pairs, "+")
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(list(list(1, 6), list(1.1, 3))))
-
- pairs <- parallelize(sc, list(list("abc", 1.2), list(1.1, 0), list("abc", 1.3),
- list("bb", 5)), 4L)
- actual <- reduceByKeyLocally(pairs, "+")
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(list(list("abc", 2.5), list(1.1, 0), list("bb", 5))))
-})
-
-test_that("distinct() on RDDs", {
- skip_on_cran()
-
- nums.rep2 <- rep(1:10, 2)
- rdd.rep2 <- parallelize(sc, nums.rep2, 2L)
- uniques <- distinctRDD(rdd.rep2)
- actual <- sort(unlist(collectRDD(uniques)))
- expect_equal(actual, nums)
-})
-
-test_that("maximum() on RDDs", {
- skip_on_cran()
-
- max <- maximum(rdd)
- expect_equal(max, 10)
-})
-
-test_that("minimum() on RDDs", {
- skip_on_cran()
-
- min <- minimum(rdd)
- expect_equal(min, 1)
-})
-
-test_that("sumRDD() on RDDs", {
- skip_on_cran()
-
- sum <- sumRDD(rdd)
- expect_equal(sum, 55)
-})
-
-test_that("keyBy on RDDs", {
- skip_on_cran()
-
- func <- function(x) { x * x }
- keys <- keyBy(rdd, func)
- actual <- collectRDD(keys)
- expect_equal(actual, lapply(nums, function(x) { list(func(x), x) }))
-})
-
-test_that("repartition/coalesce on RDDs", {
- skip_on_cran()
-
- rdd <- parallelize(sc, 1:20, 4L) # each partition contains 5 elements
-
- # repartition
- r1 <- repartitionRDD(rdd, 2)
- expect_equal(getNumPartitionsRDD(r1), 2L)
- count <- length(collectPartition(r1, 0L))
- expect_true(count >= 8 && count <= 12)
-
- r2 <- repartitionRDD(rdd, 6)
- expect_equal(getNumPartitionsRDD(r2), 6L)
- count <- length(collectPartition(r2, 0L))
- expect_true(count >= 0 && count <= 4)
-
- # coalesce
- r3 <- coalesceRDD(rdd, 1)
- expect_equal(getNumPartitionsRDD(r3), 1L)
- count <- length(collectPartition(r3, 0L))
- expect_equal(count, 20)
-})
-
-test_that("sortBy() on RDDs", {
- skip_on_cran()
-
- sortedRdd <- sortBy(rdd, function(x) { x * x }, ascending = FALSE)
- actual <- collectRDD(sortedRdd)
- expect_equal(actual, as.list(sort(nums, decreasing = TRUE)))
-
- rdd2 <- parallelize(sc, sort(nums, decreasing = TRUE), 2L)
- sortedRdd2 <- sortBy(rdd2, function(x) { x * x })
- actual <- collectRDD(sortedRdd2)
- expect_equal(actual, as.list(nums))
-})
-
-test_that("takeOrdered() on RDDs", {
- skip_on_cran()
-
- l <- list(10, 1, 2, 9, 3, 4, 5, 6, 7)
- rdd <- parallelize(sc, l)
- actual <- takeOrdered(rdd, 6L)
- expect_equal(actual, as.list(sort(unlist(l)))[1:6])
-
- l <- list("e", "d", "c", "d", "a")
- rdd <- parallelize(sc, l)
- actual <- takeOrdered(rdd, 3L)
- expect_equal(actual, as.list(sort(unlist(l)))[1:3])
-})
-
-test_that("top() on RDDs", {
- skip_on_cran()
-
- l <- list(10, 1, 2, 9, 3, 4, 5, 6, 7)
- rdd <- parallelize(sc, l)
- actual <- top(rdd, 6L)
- expect_equal(actual, as.list(sort(unlist(l), decreasing = TRUE))[1:6])
-
- l <- list("e", "d", "c", "d", "a")
- rdd <- parallelize(sc, l)
- actual <- top(rdd, 3L)
- expect_equal(actual, as.list(sort(unlist(l), decreasing = TRUE))[1:3])
-})
-
-test_that("fold() on RDDs", {
- skip_on_cran()
-
- actual <- fold(rdd, 0, "+")
- expect_equal(actual, Reduce("+", nums, 0))
-
- rdd <- parallelize(sc, list())
- actual <- fold(rdd, 0, "+")
- expect_equal(actual, 0)
-})
-
-test_that("aggregateRDD() on RDDs", {
- skip_on_cran()
-
- rdd <- parallelize(sc, list(1, 2, 3, 4))
- zeroValue <- list(0, 0)
- seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
- combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
- actual <- aggregateRDD(rdd, zeroValue, seqOp, combOp)
- expect_equal(actual, list(10, 4))
-
- rdd <- parallelize(sc, list())
- actual <- aggregateRDD(rdd, zeroValue, seqOp, combOp)
- expect_equal(actual, list(0, 0))
-})
-
-test_that("zipWithUniqueId() on RDDs", {
- skip_on_cran()
-
- rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 3L)
- actual <- collectRDD(zipWithUniqueId(rdd))
- expected <- list(list("a", 0), list("b", 1), list("c", 4),
- list("d", 2), list("e", 5))
- expect_equal(actual, expected)
-
- rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 1L)
- actual <- collectRDD(zipWithUniqueId(rdd))
- expected <- list(list("a", 0), list("b", 1), list("c", 2),
- list("d", 3), list("e", 4))
- expect_equal(actual, expected)
-})
-
-test_that("zipWithIndex() on RDDs", {
- skip_on_cran()
-
- rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 3L)
- actual <- collectRDD(zipWithIndex(rdd))
- expected <- list(list("a", 0), list("b", 1), list("c", 2),
- list("d", 3), list("e", 4))
- expect_equal(actual, expected)
-
- rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 1L)
- actual <- collectRDD(zipWithIndex(rdd))
- expected <- list(list("a", 0), list("b", 1), list("c", 2),
- list("d", 3), list("e", 4))
- expect_equal(actual, expected)
-})
-
-test_that("glom() on RDD", {
- skip_on_cran()
-
- rdd <- parallelize(sc, as.list(1:4), 2L)
- actual <- collectRDD(glom(rdd))
- expect_equal(actual, list(list(1, 2), list(3, 4)))
-})
-
-test_that("keys() on RDDs", {
- skip_on_cran()
-
- keys <- keys(intRdd)
- actual <- collectRDD(keys)
- expect_equal(actual, lapply(intPairs, function(x) { x[[1]] }))
-})
-
-test_that("values() on RDDs", {
- skip_on_cran()
-
- values <- values(intRdd)
- actual <- collectRDD(values)
- expect_equal(actual, lapply(intPairs, function(x) { x[[2]] }))
-})
-
-test_that("pipeRDD() on RDDs", {
- skip_on_cran()
-
- actual <- collectRDD(pipeRDD(rdd, "more"))
- expected <- as.list(as.character(1:10))
- expect_equal(actual, expected)
-
- trailed.rdd <- parallelize(sc, c("1", "", "2\n", "3\n\r\n"))
- actual <- collectRDD(pipeRDD(trailed.rdd, "sort"))
- expected <- list("", "1", "2", "3")
- expect_equal(actual, expected)
-
- rev.nums <- 9:0
- rev.rdd <- parallelize(sc, rev.nums, 2L)
- actual <- collectRDD(pipeRDD(rev.rdd, "sort"))
- expected <- as.list(as.character(c(5:9, 0:4)))
- expect_equal(actual, expected)
-})
-
-test_that("zipRDD() on RDDs", {
- skip_on_cran()
-
- rdd1 <- parallelize(sc, 0:4, 2)
- rdd2 <- parallelize(sc, 1000:1004, 2)
- actual <- collectRDD(zipRDD(rdd1, rdd2))
- expect_equal(actual,
- list(list(0, 1000), list(1, 1001), list(2, 1002), list(3, 1003), list(4, 1004)))
-
- mockFile <- c("Spark is pretty.", "Spark is awesome.")
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName)
-
- rdd <- textFile(sc, fileName, 1)
- actual <- collectRDD(zipRDD(rdd, rdd))
- expected <- lapply(mockFile, function(x) { list(x, x) })
- expect_equal(actual, expected)
-
- rdd1 <- parallelize(sc, 0:1, 1)
- actual <- collectRDD(zipRDD(rdd1, rdd))
- expected <- lapply(0:1, function(x) { list(x, mockFile[x + 1]) })
- expect_equal(actual, expected)
-
- rdd1 <- map(rdd, function(x) { x })
- actual <- collectRDD(zipRDD(rdd, rdd1))
- expected <- lapply(mockFile, function(x) { list(x, x) })
- expect_equal(actual, expected)
-
- unlink(fileName)
-})
-
-test_that("cartesian() on RDDs", {
- skip_on_cran()
-
- rdd <- parallelize(sc, 1:3)
- actual <- collectRDD(cartesian(rdd, rdd))
- expect_equal(sortKeyValueList(actual),
- list(
- list(1, 1), list(1, 2), list(1, 3),
- list(2, 1), list(2, 2), list(2, 3),
- list(3, 1), list(3, 2), list(3, 3)))
-
- # test case where one RDD is empty
- emptyRdd <- parallelize(sc, list())
- actual <- collectRDD(cartesian(rdd, emptyRdd))
- expect_equal(actual, list())
-
- mockFile <- c("Spark is pretty.", "Spark is awesome.")
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName)
-
- rdd <- textFile(sc, fileName)
- actual <- collectRDD(cartesian(rdd, rdd))
- expected <- list(
- list("Spark is awesome.", "Spark is pretty."),
- list("Spark is awesome.", "Spark is awesome."),
- list("Spark is pretty.", "Spark is pretty."),
- list("Spark is pretty.", "Spark is awesome."))
- expect_equal(sortKeyValueList(actual), expected)
-
- rdd1 <- parallelize(sc, 0:1)
- actual <- collectRDD(cartesian(rdd1, rdd))
- expect_equal(sortKeyValueList(actual),
- list(
- list(0, "Spark is pretty."),
- list(0, "Spark is awesome."),
- list(1, "Spark is pretty."),
- list(1, "Spark is awesome.")))
-
- rdd1 <- map(rdd, function(x) { x })
- actual <- collectRDD(cartesian(rdd, rdd1))
- expect_equal(sortKeyValueList(actual), expected)
-
- unlink(fileName)
-})
-
-test_that("subtract() on RDDs", {
- skip_on_cran()
-
- l <- list(1, 1, 2, 2, 3, 4)
- rdd1 <- parallelize(sc, l)
-
- # subtract by itself
- actual <- collectRDD(subtract(rdd1, rdd1))
- expect_equal(actual, list())
-
- # subtract by an empty RDD
- rdd2 <- parallelize(sc, list())
- actual <- collectRDD(subtract(rdd1, rdd2))
- expect_equal(as.list(sort(as.vector(actual, mode = "integer"))),
- l)
-
- rdd2 <- parallelize(sc, list(2, 4))
- actual <- collectRDD(subtract(rdd1, rdd2))
- expect_equal(as.list(sort(as.vector(actual, mode = "integer"))),
- list(1, 1, 3))
-
- l <- list("a", "a", "b", "b", "c", "d")
- rdd1 <- parallelize(sc, l)
- rdd2 <- parallelize(sc, list("b", "d"))
- actual <- collectRDD(subtract(rdd1, rdd2))
- expect_equal(as.list(sort(as.vector(actual, mode = "character"))),
- list("a", "a", "c"))
-})
-
-test_that("subtractByKey() on pairwise RDDs", {
- skip_on_cran()
-
- l <- list(list("a", 1), list("b", 4),
- list("b", 5), list("a", 2))
- rdd1 <- parallelize(sc, l)
-
- # subtractByKey by itself
- actual <- collectRDD(subtractByKey(rdd1, rdd1))
- expect_equal(actual, list())
-
- # subtractByKey by an empty RDD
- rdd2 <- parallelize(sc, list())
- actual <- collectRDD(subtractByKey(rdd1, rdd2))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(l))
-
- rdd2 <- parallelize(sc, list(list("a", 3), list("c", 1)))
- actual <- collectRDD(subtractByKey(rdd1, rdd2))
- expect_equal(actual,
- list(list("b", 4), list("b", 5)))
-
- l <- list(list(1, 1), list(2, 4),
- list(2, 5), list(1, 2))
- rdd1 <- parallelize(sc, l)
- rdd2 <- parallelize(sc, list(list(1, 3), list(3, 1)))
- actual <- collectRDD(subtractByKey(rdd1, rdd2))
- expect_equal(actual,
- list(list(2, 4), list(2, 5)))
-})
-
-test_that("intersection() on RDDs", {
- skip_on_cran()
-
- # intersection with self
- actual <- collectRDD(intersection(rdd, rdd))
- expect_equal(sort(as.integer(actual)), nums)
-
- # intersection with an empty RDD
- emptyRdd <- parallelize(sc, list())
- actual <- collectRDD(intersection(rdd, emptyRdd))
- expect_equal(actual, list())
-
- rdd1 <- parallelize(sc, list(1, 10, 2, 3, 4, 5))
- rdd2 <- parallelize(sc, list(1, 6, 2, 3, 7, 8))
- actual <- collectRDD(intersection(rdd1, rdd2))
- expect_equal(sort(as.integer(actual)), 1:3)
-})
-
-test_that("join() on pairwise RDDs", {
- skip_on_cran()
-
- rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4)))
- rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3)))
- actual <- collectRDD(joinRDD(rdd1, rdd2, 2L))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(list(list(1, list(1, 2)), list(1, list(1, 3)))))
-
- rdd1 <- parallelize(sc, list(list("a", 1), list("b", 4)))
- rdd2 <- parallelize(sc, list(list("a", 2), list("a", 3)))
- actual <- collectRDD(joinRDD(rdd1, rdd2, 2L))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(list(list("a", list(1, 2)), list("a", list(1, 3)))))
-
- rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2)))
- rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4)))
- actual <- collectRDD(joinRDD(rdd1, rdd2, 2L))
- expect_equal(actual, list())
-
- rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2)))
- rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4)))
- actual <- collectRDD(joinRDD(rdd1, rdd2, 2L))
- expect_equal(actual, list())
-})
-
-test_that("leftOuterJoin() on pairwise RDDs", {
- skip_on_cran()
-
- rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4)))
- rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3)))
- actual <- collectRDD(leftOuterJoin(rdd1, rdd2, 2L))
- expected <- list(list(1, list(1, 2)), list(1, list(1, 3)), list(2, list(4, NULL)))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(expected))
-
- rdd1 <- parallelize(sc, list(list("a", 1), list("b", 4)))
- rdd2 <- parallelize(sc, list(list("a", 2), list("a", 3)))
- actual <- collectRDD(leftOuterJoin(rdd1, rdd2, 2L))
- expected <- list(list("b", list(4, NULL)), list("a", list(1, 2)), list("a", list(1, 3)))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(expected))
-
- rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2)))
- rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4)))
- actual <- collectRDD(leftOuterJoin(rdd1, rdd2, 2L))
- expected <- list(list(1, list(1, NULL)), list(2, list(2, NULL)))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(expected))
-
- rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2)))
- rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4)))
- actual <- collectRDD(leftOuterJoin(rdd1, rdd2, 2L))
- expected <- list(list("b", list(2, NULL)), list("a", list(1, NULL)))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(expected))
-})
-
-test_that("rightOuterJoin() on pairwise RDDs", {
- skip_on_cran()
-
- rdd1 <- parallelize(sc, list(list(1, 2), list(1, 3)))
- rdd2 <- parallelize(sc, list(list(1, 1), list(2, 4)))
- actual <- collectRDD(rightOuterJoin(rdd1, rdd2, 2L))
- expected <- list(list(1, list(2, 1)), list(1, list(3, 1)), list(2, list(NULL, 4)))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-
- rdd1 <- parallelize(sc, list(list("a", 2), list("a", 3)))
- rdd2 <- parallelize(sc, list(list("a", 1), list("b", 4)))
- actual <- collectRDD(rightOuterJoin(rdd1, rdd2, 2L))
- expected <- list(list("b", list(NULL, 4)), list("a", list(2, 1)), list("a", list(3, 1)))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(expected))
-
- rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2)))
- rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4)))
- actual <- collectRDD(rightOuterJoin(rdd1, rdd2, 2L))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(list(list(3, list(NULL, 3)), list(4, list(NULL, 4)))))
-
- rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2)))
- rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4)))
- actual <- collectRDD(rightOuterJoin(rdd1, rdd2, 2L))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(list(list("d", list(NULL, 4)), list("c", list(NULL, 3)))))
-})
-
-test_that("fullOuterJoin() on pairwise RDDs", {
- skip_on_cran()
-
- rdd1 <- parallelize(sc, list(list(1, 2), list(1, 3), list(3, 3)))
- rdd2 <- parallelize(sc, list(list(1, 1), list(2, 4)))
- actual <- collectRDD(fullOuterJoin(rdd1, rdd2, 2L))
- expected <- list(list(1, list(2, 1)), list(1, list(3, 1)),
- list(2, list(NULL, 4)), list(3, list(3, NULL)))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-
- rdd1 <- parallelize(sc, list(list("a", 2), list("a", 3), list("c", 1)))
- rdd2 <- parallelize(sc, list(list("a", 1), list("b", 4)))
- actual <- collectRDD(fullOuterJoin(rdd1, rdd2, 2L))
- expected <- list(list("b", list(NULL, 4)), list("a", list(2, 1)),
- list("a", list(3, 1)), list("c", list(1, NULL)))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(expected))
-
- rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2)))
- rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4)))
- actual <- collectRDD(fullOuterJoin(rdd1, rdd2, 2L))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(list(list(1, list(1, NULL)), list(2, list(2, NULL)),
- list(3, list(NULL, 3)), list(4, list(NULL, 4)))))
-
- rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2)))
- rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4)))
- actual <- collectRDD(fullOuterJoin(rdd1, rdd2, 2L))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(list(list("a", list(1, NULL)), list("b", list(2, NULL)),
- list("d", list(NULL, 4)), list("c", list(NULL, 3)))))
-})
-
-test_that("sortByKey() on pairwise RDDs", {
- skip_on_cran()
-
- numPairsRdd <- map(rdd, function(x) { list (x, x) })
- sortedRdd <- sortByKey(numPairsRdd, ascending = FALSE)
- actual <- collectRDD(sortedRdd)
- numPairs <- lapply(nums, function(x) { list (x, x) })
- expect_equal(actual, sortKeyValueList(numPairs, decreasing = TRUE))
-
- rdd2 <- parallelize(sc, sort(nums, decreasing = TRUE), 2L)
- numPairsRdd2 <- map(rdd2, function(x) { list (x, x) })
- sortedRdd2 <- sortByKey(numPairsRdd2)
- actual <- collectRDD(sortedRdd2)
- expect_equal(actual, numPairs)
-
- # sort by string keys
- l <- list(list("a", 1), list("b", 2), list("1", 3), list("d", 4), list("2", 5))
- rdd3 <- parallelize(sc, l, 2L)
- sortedRdd3 <- sortByKey(rdd3)
- actual <- collectRDD(sortedRdd3)
- expect_equal(actual, list(list("1", 3), list("2", 5), list("a", 1), list("b", 2), list("d", 4)))
-
- # test on the boundary cases
-
- # boundary case 1: the RDD to be sorted has only 1 partition
- rdd4 <- parallelize(sc, l, 1L)
- sortedRdd4 <- sortByKey(rdd4)
- actual <- collectRDD(sortedRdd4)
- expect_equal(actual, list(list("1", 3), list("2", 5), list("a", 1), list("b", 2), list("d", 4)))
-
- # boundary case 2: the sorted RDD has only 1 partition
- rdd5 <- parallelize(sc, l, 2L)
- sortedRdd5 <- sortByKey(rdd5, numPartitions = 1L)
- actual <- collectRDD(sortedRdd5)
- expect_equal(actual, list(list("1", 3), list("2", 5), list("a", 1), list("b", 2), list("d", 4)))
-
- # boundary case 3: the RDD to be sorted has only 1 element
- l2 <- list(list("a", 1))
- rdd6 <- parallelize(sc, l2, 2L)
- sortedRdd6 <- sortByKey(rdd6)
- actual <- collectRDD(sortedRdd6)
- expect_equal(actual, l2)
-
- # boundary case 4: the RDD to be sorted has 0 element
- l3 <- list()
- rdd7 <- parallelize(sc, l3, 2L)
- sortedRdd7 <- sortByKey(rdd7)
- actual <- collectRDD(sortedRdd7)
- expect_equal(actual, l3)
-})
-
-test_that("collectAsMap() on a pairwise RDD", {
- skip_on_cran()
-
- rdd <- parallelize(sc, list(list(1, 2), list(3, 4)))
- vals <- collectAsMap(rdd)
- expect_equal(vals, list(`1` = 2, `3` = 4))
-
- rdd <- parallelize(sc, list(list("a", 1), list("b", 2)))
- vals <- collectAsMap(rdd)
- expect_equal(vals, list(a = 1, b = 2))
-
- rdd <- parallelize(sc, list(list(1.1, 2.2), list(1.2, 2.4)))
- vals <- collectAsMap(rdd)
- expect_equal(vals, list(`1.1` = 2.2, `1.2` = 2.4))
-
- rdd <- parallelize(sc, list(list(1, "a"), list(2, "b")))
- vals <- collectAsMap(rdd)
- expect_equal(vals, list(`1` = "a", `2` = "b"))
-})
-
-test_that("show()", {
- skip_on_cran()
-
- rdd <- parallelize(sc, list(1:10))
- expect_output(showRDD(rdd), "ParallelCollectionRDD\\[\\d+\\] at parallelize at RRDD\\.scala:\\d+")
-})
-
-test_that("sampleByKey() on pairwise RDDs", {
- skip_on_cran()
-
- rdd <- parallelize(sc, 1:2000)
- pairsRDD <- lapply(rdd, function(x) { if (x %% 2 == 0) list("a", x) else list("b", x) })
- fractions <- list(a = 0.2, b = 0.1)
- sample <- sampleByKey(pairsRDD, FALSE, fractions, 1618L)
- expect_equal(100 < length(lookup(sample, "a")) && 300 > length(lookup(sample, "a")), TRUE)
- expect_equal(50 < length(lookup(sample, "b")) && 150 > length(lookup(sample, "b")), TRUE)
- expect_equal(lookup(sample, "a")[which.min(lookup(sample, "a"))] >= 0, TRUE)
- expect_equal(lookup(sample, "a")[which.max(lookup(sample, "a"))] <= 2000, TRUE)
- expect_equal(lookup(sample, "b")[which.min(lookup(sample, "b"))] >= 0, TRUE)
- expect_equal(lookup(sample, "b")[which.max(lookup(sample, "b"))] <= 2000, TRUE)
-
- rdd <- parallelize(sc, 1:2000)
- pairsRDD <- lapply(rdd, function(x) { if (x %% 2 == 0) list(2, x) else list(3, x) })
- fractions <- list(`2` = 0.2, `3` = 0.1)
- sample <- sampleByKey(pairsRDD, TRUE, fractions, 1618L)
- expect_equal(100 < length(lookup(sample, 2)) && 300 > length(lookup(sample, 2)), TRUE)
- expect_equal(50 < length(lookup(sample, 3)) && 150 > length(lookup(sample, 3)), TRUE)
- expect_equal(lookup(sample, 2)[which.min(lookup(sample, 2))] >= 0, TRUE)
- expect_equal(lookup(sample, 2)[which.max(lookup(sample, 2))] <= 2000, TRUE)
- expect_equal(lookup(sample, 3)[which.min(lookup(sample, 3))] >= 0, TRUE)
- expect_equal(lookup(sample, 3)[which.max(lookup(sample, 3))] <= 2000, TRUE)
-})
-
-test_that("Test correct concurrency of RRDD.compute()", {
- skip_on_cran()
-
- rdd <- parallelize(sc, 1:1000, 100)
- jrdd <- getJRDD(lapply(rdd, function(x) { x }), "row")
- zrdd <- callJMethod(jrdd, "zip", jrdd)
- count <- callJMethod(zrdd, "count")
- expect_equal(count, 1000)
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_shuffle.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_shuffle.R b/R/pkg/inst/tests/testthat/test_shuffle.R
deleted file mode 100644
index 18320ea..0000000
--- a/R/pkg/inst/tests/testthat/test_shuffle.R
+++ /dev/null
@@ -1,248 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("partitionBy, groupByKey, reduceByKey etc.")
-
-# JavaSparkContext handle
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-# Data
-intPairs <- list(list(1L, -1), list(2L, 100), list(2L, 1), list(1L, 200))
-intRdd <- parallelize(sc, intPairs, 2L)
-
-doublePairs <- list(list(1.5, -1), list(2.5, 100), list(2.5, 1), list(1.5, 200))
-doubleRdd <- parallelize(sc, doublePairs, 2L)
-
-numPairs <- list(list(1L, 100), list(2L, 200), list(4L, -1), list(3L, 1),
- list(3L, 0))
-numPairsRdd <- parallelize(sc, numPairs, length(numPairs))
-
-strList <- list("Dexter Morgan: Blood. Sometimes it sets my teeth on edge and ",
- "Dexter Morgan: Harry and Dorris Morgan did a wonderful job ")
-strListRDD <- parallelize(sc, strList, 4)
-
-test_that("groupByKey for integers", {
- skip_on_cran()
-
- grouped <- groupByKey(intRdd, 2L)
-
- actual <- collectRDD(grouped)
-
- expected <- list(list(2L, list(100, 1)), list(1L, list(-1, 200)))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-})
-
-test_that("groupByKey for doubles", {
- skip_on_cran()
-
- grouped <- groupByKey(doubleRdd, 2L)
-
- actual <- collectRDD(grouped)
-
- expected <- list(list(1.5, list(-1, 200)), list(2.5, list(100, 1)))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-})
-
-test_that("reduceByKey for ints", {
- skip_on_cran()
-
- reduced <- reduceByKey(intRdd, "+", 2L)
-
- actual <- collectRDD(reduced)
-
- expected <- list(list(2L, 101), list(1L, 199))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-})
-
-test_that("reduceByKey for doubles", {
- skip_on_cran()
-
- reduced <- reduceByKey(doubleRdd, "+", 2L)
- actual <- collectRDD(reduced)
-
- expected <- list(list(1.5, 199), list(2.5, 101))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-})
-
-test_that("combineByKey for ints", {
- skip_on_cran()
-
- reduced <- combineByKey(intRdd, function(x) { x }, "+", "+", 2L)
-
- actual <- collectRDD(reduced)
-
- expected <- list(list(2L, 101), list(1L, 199))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-})
-
-test_that("combineByKey for doubles", {
- skip_on_cran()
-
- reduced <- combineByKey(doubleRdd, function(x) { x }, "+", "+", 2L)
- actual <- collectRDD(reduced)
-
- expected <- list(list(1.5, 199), list(2.5, 101))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-})
-
-test_that("combineByKey for characters", {
- skip_on_cran()
-
- stringKeyRDD <- parallelize(sc,
- list(list("max", 1L), list("min", 2L),
- list("other", 3L), list("max", 4L)), 2L)
- reduced <- combineByKey(stringKeyRDD,
- function(x) { x }, "+", "+", 2L)
- actual <- collectRDD(reduced)
-
- expected <- list(list("max", 5L), list("min", 2L), list("other", 3L))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-})
-
-test_that("aggregateByKey", {
- skip_on_cran()
-
- # test aggregateByKey for int keys
- rdd <- parallelize(sc, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
-
- zeroValue <- list(0, 0)
- seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
- combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
- aggregatedRDD <- aggregateByKey(rdd, zeroValue, seqOp, combOp, 2L)
-
- actual <- collectRDD(aggregatedRDD)
-
- expected <- list(list(1, list(3, 2)), list(2, list(7, 2)))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-
- # test aggregateByKey for string keys
- rdd <- parallelize(sc, list(list("a", 1), list("a", 2), list("b", 3), list("b", 4)))
-
- zeroValue <- list(0, 0)
- seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
- combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
- aggregatedRDD <- aggregateByKey(rdd, zeroValue, seqOp, combOp, 2L)
-
- actual <- collectRDD(aggregatedRDD)
-
- expected <- list(list("a", list(3, 2)), list("b", list(7, 2)))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-})
-
-test_that("foldByKey", {
- skip_on_cran()
-
- # test foldByKey for int keys
- folded <- foldByKey(intRdd, 0, "+", 2L)
-
- actual <- collectRDD(folded)
-
- expected <- list(list(2L, 101), list(1L, 199))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-
- # test foldByKey for double keys
- folded <- foldByKey(doubleRdd, 0, "+", 2L)
-
- actual <- collectRDD(folded)
-
- expected <- list(list(1.5, 199), list(2.5, 101))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-
- # test foldByKey for string keys
- stringKeyPairs <- list(list("a", -1), list("b", 100), list("b", 1), list("a", 200))
-
- stringKeyRDD <- parallelize(sc, stringKeyPairs)
- folded <- foldByKey(stringKeyRDD, 0, "+", 2L)
-
- actual <- collectRDD(folded)
-
- expected <- list(list("b", 101), list("a", 199))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-
- # test foldByKey for empty pair RDD
- rdd <- parallelize(sc, list())
- folded <- foldByKey(rdd, 0, "+", 2L)
- actual <- collectRDD(folded)
- expected <- list()
- expect_equal(actual, expected)
-
- # test foldByKey for RDD with only 1 pair
- rdd <- parallelize(sc, list(list(1, 1)))
- folded <- foldByKey(rdd, 0, "+", 2L)
- actual <- collectRDD(folded)
- expected <- list(list(1, 1))
- expect_equal(actual, expected)
-})
-
-test_that("partitionBy() partitions data correctly", {
- skip_on_cran()
-
- # Partition by magnitude
- partitionByMagnitude <- function(key) { if (key >= 3) 1 else 0 }
-
- resultRDD <- partitionByRDD(numPairsRdd, 2L, partitionByMagnitude)
-
- expected_first <- list(list(1, 100), list(2, 200)) # key less than 3
- expected_second <- list(list(4, -1), list(3, 1), list(3, 0)) # key greater than or equal 3
- actual_first <- collectPartition(resultRDD, 0L)
- actual_second <- collectPartition(resultRDD, 1L)
-
- expect_equal(sortKeyValueList(actual_first), sortKeyValueList(expected_first))
- expect_equal(sortKeyValueList(actual_second), sortKeyValueList(expected_second))
-})
-
-test_that("partitionBy works with dependencies", {
- skip_on_cran()
-
- kOne <- 1
- partitionByParity <- function(key) { if (key %% 2 == kOne) 7 else 4 }
-
- # Partition by parity
- resultRDD <- partitionByRDD(numPairsRdd, numPartitions = 2L, partitionByParity)
-
- # keys even; 100 %% 2 == 0
- expected_first <- list(list(2, 200), list(4, -1))
- # keys odd; 3 %% 2 == 1
- expected_second <- list(list(1, 100), list(3, 1), list(3, 0))
- actual_first <- collectPartition(resultRDD, 0L)
- actual_second <- collectPartition(resultRDD, 1L)
-
- expect_equal(sortKeyValueList(actual_first), sortKeyValueList(expected_first))
- expect_equal(sortKeyValueList(actual_second), sortKeyValueList(expected_second))
-})
-
-test_that("test partitionBy with string keys", {
- skip_on_cran()
-
- words <- flatMap(strListRDD, function(line) { strsplit(line, " ")[[1]] })
- wordCount <- lapply(words, function(word) { list(word, 1L) })
-
- resultRDD <- partitionByRDD(wordCount, 2L)
- expected_first <- list(list("Dexter", 1), list("Dexter", 1))
- expected_second <- list(list("and", 1), list("and", 1))
-
- actual_first <- Filter(function(item) { item[[1]] == "Dexter" },
- collectPartition(resultRDD, 0L))
- actual_second <- Filter(function(item) { item[[1]] == "and" },
- collectPartition(resultRDD, 1L))
-
- expect_equal(sortKeyValueList(actual_first), sortKeyValueList(expected_first))
- expect_equal(sortKeyValueList(actual_second), sortKeyValueList(expected_second))
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_sparkR.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_sparkR.R b/R/pkg/inst/tests/testthat/test_sparkR.R
deleted file mode 100644
index a40981c..0000000
--- a/R/pkg/inst/tests/testthat/test_sparkR.R
+++ /dev/null
@@ -1,48 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("functions in sparkR.R")
-
-test_that("sparkCheckInstall", {
- skip_on_cran()
-
- # "local, yarn-client, mesos-client" mode, SPARK_HOME was set correctly,
- # and the SparkR job was submitted by "spark-submit"
- sparkHome <- paste0(tempdir(), "/", "sparkHome")
- dir.create(sparkHome)
- master <- ""
- deployMode <- ""
- expect_true(is.null(sparkCheckInstall(sparkHome, master, deployMode)))
- unlink(sparkHome, recursive = TRUE)
-
- # "yarn-cluster, mesos-cluster" mode, SPARK_HOME was not set,
- # and the SparkR job was submitted by "spark-submit"
- sparkHome <- ""
- master <- ""
- deployMode <- ""
- expect_true(is.null(sparkCheckInstall(sparkHome, master, deployMode)))
-
- # "yarn-client, mesos-client" mode, SPARK_HOME was not set
- sparkHome <- ""
- master <- "yarn-client"
- deployMode <- ""
- expect_error(sparkCheckInstall(sparkHome, master, deployMode))
- sparkHome <- ""
- master <- ""
- deployMode <- "client"
- expect_error(sparkCheckInstall(sparkHome, master, deployMode))
-})
[7/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic
tests only for CRAN
Posted by fe...@apache.org.
[SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN
## What changes were proposed in this pull request?
Move all existing tests to a non-installed directory so that they will never run when the SparkR package is installed
For a follow-up PR:
- remove all skip_on_cran() calls in tests
- clean up test timer
- improve or change basic tests that do run on CRAN (if anyone has suggestions)
It looks like `R CMD build pkg` will still put pkg/tests (i.e. the full tests) into the source package, but `R CMD INSTALL` on such a source package does not install these tests (and so `R CMD check` does not run them)
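For illustration, a minimal sketch of the kind of installed-test entry point this layout relies on. This is an assumption for readability only: the real R/pkg/tests/run-all.R change is listed in the diffstat below as an 8-line addition, but its contents are not shown in this part of the diff.

```r
library(testthat)
library(SparkR)

# Only the lightweight suites that ship with the installed package
# (tests/testthat) are discovered here; the heavy suites moved to
# R/pkg/tests/fulltests remain in the source tarball but are never
# installed, so `R CMD check` on CRAN does not run them.

# Global master used by the individual test files (assumed default);
# a single local core keeps CRAN check machines lightly loaded.
sparkRTestMaster <- "local[1]"

test_package("SparkR")
```

The directory move does the work: `test_package()` can only discover tests that were installed with the package, so anything left under fulltests stays invisible to CRAN checks.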
## How was this patch tested?
- [x] unit tests, Jenkins
- [x] AppVeyor
- [x] make a source package, install it, `R CMD check` it; verify the full tests are not installed or run
Author: Felix Cheung <fe...@hotmail.com>
Closes #18264 from felixcheung/rtestset.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dc4c3518
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dc4c3518
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dc4c3518
Branch: refs/heads/master
Commit: dc4c351837879dab26ad8fb471dc51c06832a9e4
Parents: 5301a19
Author: Felix Cheung <fe...@hotmail.com>
Authored: Sun Jun 11 00:00:33 2017 -0700
Committer: Felix Cheung <fe...@apache.org>
Committed: Sun Jun 11 00:00:33 2017 -0700
----------------------------------------------------------------------
R/pkg/inst/tests/testthat/jarTest.R | 32 -
R/pkg/inst/tests/testthat/packageInAJarTest.R | 30 -
R/pkg/inst/tests/testthat/test_Serde.R | 85 -
R/pkg/inst/tests/testthat/test_Windows.R | 32 -
R/pkg/inst/tests/testthat/test_basic.R | 90 +
R/pkg/inst/tests/testthat/test_binaryFile.R | 100 -
.../inst/tests/testthat/test_binary_function.R | 110 -
R/pkg/inst/tests/testthat/test_broadcast.R | 55 -
R/pkg/inst/tests/testthat/test_client.R | 51 -
R/pkg/inst/tests/testthat/test_context.R | 226 --
R/pkg/inst/tests/testthat/test_includePackage.R | 64 -
R/pkg/inst/tests/testthat/test_jvm_api.R | 36 -
.../tests/testthat/test_mllib_classification.R | 396 --
.../inst/tests/testthat/test_mllib_clustering.R | 328 --
R/pkg/inst/tests/testthat/test_mllib_fpm.R | 85 -
.../tests/testthat/test_mllib_recommendation.R | 67 -
.../inst/tests/testthat/test_mllib_regression.R | 480 ---
R/pkg/inst/tests/testthat/test_mllib_stat.R | 53 -
R/pkg/inst/tests/testthat/test_mllib_tree.R | 320 --
.../tests/testthat/test_parallelize_collect.R | 120 -
R/pkg/inst/tests/testthat/test_rdd.R | 906 -----
R/pkg/inst/tests/testthat/test_shuffle.R | 248 --
R/pkg/inst/tests/testthat/test_sparkR.R | 48 -
R/pkg/inst/tests/testthat/test_sparkSQL.R | 3474 ------------------
R/pkg/inst/tests/testthat/test_streaming.R | 167 -
R/pkg/inst/tests/testthat/test_take.R | 71 -
R/pkg/inst/tests/testthat/test_textFile.R | 182 -
R/pkg/inst/tests/testthat/test_utils.R | 248 --
R/pkg/tests/fulltests/jarTest.R | 32 +
R/pkg/tests/fulltests/packageInAJarTest.R | 30 +
R/pkg/tests/fulltests/test_Serde.R | 85 +
R/pkg/tests/fulltests/test_Windows.R | 32 +
R/pkg/tests/fulltests/test_binaryFile.R | 100 +
R/pkg/tests/fulltests/test_binary_function.R | 110 +
R/pkg/tests/fulltests/test_broadcast.R | 55 +
R/pkg/tests/fulltests/test_client.R | 51 +
R/pkg/tests/fulltests/test_context.R | 226 ++
R/pkg/tests/fulltests/test_includePackage.R | 64 +
R/pkg/tests/fulltests/test_jvm_api.R | 36 +
.../tests/fulltests/test_mllib_classification.R | 396 ++
R/pkg/tests/fulltests/test_mllib_clustering.R | 328 ++
R/pkg/tests/fulltests/test_mllib_fpm.R | 85 +
.../tests/fulltests/test_mllib_recommendation.R | 67 +
R/pkg/tests/fulltests/test_mllib_regression.R | 480 +++
R/pkg/tests/fulltests/test_mllib_stat.R | 53 +
R/pkg/tests/fulltests/test_mllib_tree.R | 320 ++
.../tests/fulltests/test_parallelize_collect.R | 120 +
R/pkg/tests/fulltests/test_rdd.R | 906 +++++
R/pkg/tests/fulltests/test_shuffle.R | 248 ++
R/pkg/tests/fulltests/test_sparkR.R | 48 +
R/pkg/tests/fulltests/test_sparkSQL.R | 3474 ++++++++++++++++++
R/pkg/tests/fulltests/test_streaming.R | 167 +
R/pkg/tests/fulltests/test_take.R | 71 +
R/pkg/tests/fulltests/test_textFile.R | 182 +
R/pkg/tests/fulltests/test_utils.R | 248 ++
R/pkg/tests/run-all.R | 8 +
56 files changed, 8112 insertions(+), 8014 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/jarTest.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/jarTest.R b/R/pkg/inst/tests/testthat/jarTest.R
deleted file mode 100644
index e2241e0..0000000
--- a/R/pkg/inst/tests/testthat/jarTest.R
+++ /dev/null
@@ -1,32 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-library(SparkR)
-
-sc <- sparkR.session(master = "local[1]")
-
-helloTest <- SparkR:::callJStatic("sparkrtest.DummyClass",
- "helloWorld",
- "Dave")
-stopifnot(identical(helloTest, "Hello Dave"))
-
-basicFunction <- SparkR:::callJStatic("sparkrtest.DummyClass",
- "addStuff",
- 2L,
- 2L)
-stopifnot(basicFunction == 4L)
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/packageInAJarTest.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/packageInAJarTest.R b/R/pkg/inst/tests/testthat/packageInAJarTest.R
deleted file mode 100644
index ac70626..0000000
--- a/R/pkg/inst/tests/testthat/packageInAJarTest.R
+++ /dev/null
@@ -1,30 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-library(SparkR)
-library(sparkPackageTest)
-
-sparkR.session(master = "local[1]")
-
-run1 <- myfunc(5L)
-
-run2 <- myfunc(-4L)
-
-sparkR.session.stop()
-
-if (run1 != 6) quit(save = "no", status = 1)
-
-if (run2 != -3) quit(save = "no", status = 1)
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_Serde.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_Serde.R b/R/pkg/inst/tests/testthat/test_Serde.R
deleted file mode 100644
index 6e160fa..0000000
--- a/R/pkg/inst/tests/testthat/test_Serde.R
+++ /dev/null
@@ -1,85 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("SerDe functionality")
-
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-test_that("SerDe of primitive types", {
- skip_on_cran()
-
- x <- callJStatic("SparkRHandler", "echo", 1L)
- expect_equal(x, 1L)
- expect_equal(class(x), "integer")
-
- x <- callJStatic("SparkRHandler", "echo", 1)
- expect_equal(x, 1)
- expect_equal(class(x), "numeric")
-
- x <- callJStatic("SparkRHandler", "echo", TRUE)
- expect_true(x)
- expect_equal(class(x), "logical")
-
- x <- callJStatic("SparkRHandler", "echo", "abc")
- expect_equal(x, "abc")
- expect_equal(class(x), "character")
-})
-
-test_that("SerDe of list of primitive types", {
- skip_on_cran()
-
- x <- list(1L, 2L, 3L)
- y <- callJStatic("SparkRHandler", "echo", x)
- expect_equal(x, y)
- expect_equal(class(y[[1]]), "integer")
-
- x <- list(1, 2, 3)
- y <- callJStatic("SparkRHandler", "echo", x)
- expect_equal(x, y)
- expect_equal(class(y[[1]]), "numeric")
-
- x <- list(TRUE, FALSE)
- y <- callJStatic("SparkRHandler", "echo", x)
- expect_equal(x, y)
- expect_equal(class(y[[1]]), "logical")
-
- x <- list("a", "b", "c")
- y <- callJStatic("SparkRHandler", "echo", x)
- expect_equal(x, y)
- expect_equal(class(y[[1]]), "character")
-
- # Empty list
- x <- list()
- y <- callJStatic("SparkRHandler", "echo", x)
- expect_equal(x, y)
-})
-
-test_that("SerDe of list of lists", {
- skip_on_cran()
-
- x <- list(list(1L, 2L, 3L), list(1, 2, 3),
- list(TRUE, FALSE), list("a", "b", "c"))
- y <- callJStatic("SparkRHandler", "echo", x)
- expect_equal(x, y)
-
- # List of empty lists
- x <- list(list(), list())
- y <- callJStatic("SparkRHandler", "echo", x)
- expect_equal(x, y)
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_Windows.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_Windows.R b/R/pkg/inst/tests/testthat/test_Windows.R
deleted file mode 100644
index 00d684e..0000000
--- a/R/pkg/inst/tests/testthat/test_Windows.R
+++ /dev/null
@@ -1,32 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-context("Windows-specific tests")
-
-test_that("sparkJars tag in SparkContext", {
- skip_on_cran()
-
- if (.Platform$OS.type != "windows") {
- skip("This test is only for Windows, skipped")
- }
-
- testOutput <- launchScript("ECHO", "a/b/c", wait = TRUE)
- abcPath <- testOutput[1]
- expect_equal(abcPath, "a\\b\\c")
-})
-
-message("--- End test (Windows) ", as.POSIXct(Sys.time(), tz = "GMT"))
-message("elapsed ", (proc.time() - timer_ptm)[3])
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_basic.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_basic.R b/R/pkg/inst/tests/testthat/test_basic.R
new file mode 100644
index 0000000..de47162
--- /dev/null
+++ b/R/pkg/inst/tests/testthat/test_basic.R
@@ -0,0 +1,90 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("basic tests for CRAN")
+
+test_that("create DataFrame from list or data.frame", {
+ sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+ i <- 4
+ df <- createDataFrame(data.frame(dummy = 1:i))
+ expect_equal(count(df), i)
+
+ l <- list(list(a = 1, b = 2), list(a = 3, b = 4))
+ df <- createDataFrame(l)
+ expect_equal(columns(df), c("a", "b"))
+
+ a <- 1:3
+ b <- c("a", "b", "c")
+ ldf <- data.frame(a, b)
+ df <- createDataFrame(ldf)
+ expect_equal(columns(df), c("a", "b"))
+ expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
+ expect_equal(count(df), 3)
+ ldf2 <- collect(df)
+ expect_equal(ldf$a, ldf2$a)
+
+ mtcarsdf <- createDataFrame(mtcars)
+ expect_equivalent(collect(mtcarsdf), mtcars)
+
+ bytes <- as.raw(c(1, 2, 3))
+ df <- createDataFrame(list(list(bytes)))
+ expect_equal(collect(df)[[1]][[1]], bytes)
+
+ sparkR.session.stop()
+})
+
+test_that("spark.glm and predict", {
+ sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+ training <- suppressWarnings(createDataFrame(iris))
+ # gaussian family
+ model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species)
+ prediction <- predict(model, training)
+ expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+ vals <- collect(select(prediction, "prediction"))
+ rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
+ expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+
+ # Gamma family
+ x <- runif(100, -1, 1)
+ y <- rgamma(100, rate = 10 / exp(0.5 + 1.2 * x), shape = 10)
+ df <- as.DataFrame(as.data.frame(list(x = x, y = y)))
+ model <- glm(y ~ x, family = Gamma, df)
+ out <- capture.output(print(summary(model)))
+ expect_true(any(grepl("Dispersion parameter for gamma family", out)))
+
+ # tweedie family
+ model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
+ family = "tweedie", var.power = 1.2, link.power = 0.0)
+ prediction <- predict(model, training)
+ expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+ vals <- collect(select(prediction, "prediction"))
+
+ # manual calculation of the R predicted values to avoid dependence on statmod
+ #' library(statmod)
+ #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
+ #' family = tweedie(var.power = 1.2, link.power = 0.0))
+ #' print(coef(rModel))
+
+ rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174)
+ rVals <- exp(as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species,
+ data = iris) %*% rCoef))
+ expect_true(all(abs(rVals - vals) < 1e-5), rVals - vals)
+
+ sparkR.session.stop()
+})
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_binaryFile.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_binaryFile.R b/R/pkg/inst/tests/testthat/test_binaryFile.R
deleted file mode 100644
index 00954fa..0000000
--- a/R/pkg/inst/tests/testthat/test_binaryFile.R
+++ /dev/null
@@ -1,100 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("functions on binary files")
-
-# JavaSparkContext handle
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-mockFile <- c("Spark is pretty.", "Spark is awesome.")
-
-test_that("saveAsObjectFile()/objectFile() following textFile() works", {
- skip_on_cran()
-
- fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
- fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName1)
-
- rdd <- textFile(sc, fileName1, 1)
- saveAsObjectFile(rdd, fileName2)
- rdd <- objectFile(sc, fileName2)
- expect_equal(collectRDD(rdd), as.list(mockFile))
-
- unlink(fileName1)
- unlink(fileName2, recursive = TRUE)
-})
-
-test_that("saveAsObjectFile()/objectFile() works on a parallelized list", {
- skip_on_cran()
-
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
-
- l <- list(1, 2, 3)
- rdd <- parallelize(sc, l, 1)
- saveAsObjectFile(rdd, fileName)
- rdd <- objectFile(sc, fileName)
- expect_equal(collectRDD(rdd), l)
-
- unlink(fileName, recursive = TRUE)
-})
-
-test_that("saveAsObjectFile()/objectFile() following RDD transformations works", {
- skip_on_cran()
-
- fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
- fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName1)
-
- rdd <- textFile(sc, fileName1)
-
- words <- flatMap(rdd, function(line) { strsplit(line, " ")[[1]] })
- wordCount <- lapply(words, function(word) { list(word, 1L) })
-
- counts <- reduceByKey(wordCount, "+", 2L)
-
- saveAsObjectFile(counts, fileName2)
- counts <- objectFile(sc, fileName2)
-
- output <- collectRDD(counts)
- expected <- list(list("awesome.", 1), list("Spark", 2), list("pretty.", 1),
- list("is", 2))
- expect_equal(sortKeyValueList(output), sortKeyValueList(expected))
-
- unlink(fileName1)
- unlink(fileName2, recursive = TRUE)
-})
-
-test_that("saveAsObjectFile()/objectFile() works with multiple paths", {
- skip_on_cran()
-
- fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
- fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
-
- rdd1 <- parallelize(sc, "Spark is pretty.")
- saveAsObjectFile(rdd1, fileName1)
- rdd2 <- parallelize(sc, "Spark is awesome.")
- saveAsObjectFile(rdd2, fileName2)
-
- rdd <- objectFile(sc, c(fileName1, fileName2))
- expect_equal(countRDD(rdd), 2)
-
- unlink(fileName1, recursive = TRUE)
- unlink(fileName2, recursive = TRUE)
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_binary_function.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_binary_function.R b/R/pkg/inst/tests/testthat/test_binary_function.R
deleted file mode 100644
index 236cb38..0000000
--- a/R/pkg/inst/tests/testthat/test_binary_function.R
+++ /dev/null
@@ -1,110 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("binary functions")
-
-# JavaSparkContext handle
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-# Data
-nums <- 1:10
-rdd <- parallelize(sc, nums, 2L)
-
-# File content
-mockFile <- c("Spark is pretty.", "Spark is awesome.")
-
-test_that("union on two RDDs", {
- skip_on_cran()
-
- actual <- collectRDD(unionRDD(rdd, rdd))
- expect_equal(actual, as.list(rep(nums, 2)))
-
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName)
-
- text.rdd <- textFile(sc, fileName)
- union.rdd <- unionRDD(rdd, text.rdd)
- actual <- collectRDD(union.rdd)
- expect_equal(actual, c(as.list(nums), mockFile))
- expect_equal(getSerializedMode(union.rdd), "byte")
-
- rdd <- map(text.rdd, function(x) {x})
- union.rdd <- unionRDD(rdd, text.rdd)
- actual <- collectRDD(union.rdd)
- expect_equal(actual, as.list(c(mockFile, mockFile)))
- expect_equal(getSerializedMode(union.rdd), "byte")
-
- unlink(fileName)
-})
-
-test_that("cogroup on two RDDs", {
- skip_on_cran()
-
- rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4)))
- rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3)))
- cogroup.rdd <- cogroup(rdd1, rdd2, numPartitions = 2L)
- actual <- collectRDD(cogroup.rdd)
- expect_equal(actual,
- list(list(1, list(list(1), list(2, 3))), list(2, list(list(4), list()))))
-
- rdd1 <- parallelize(sc, list(list("a", 1), list("a", 4)))
- rdd2 <- parallelize(sc, list(list("b", 2), list("a", 3)))
- cogroup.rdd <- cogroup(rdd1, rdd2, numPartitions = 2L)
- actual <- collectRDD(cogroup.rdd)
-
- expected <- list(list("b", list(list(), list(2))), list("a", list(list(1, 4), list(3))))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(expected))
-})
-
-test_that("zipPartitions() on RDDs", {
- skip_on_cran()
-
- rdd1 <- parallelize(sc, 1:2, 2L) # 1, 2
- rdd2 <- parallelize(sc, 1:4, 2L) # 1:2, 3:4
- rdd3 <- parallelize(sc, 1:6, 2L) # 1:3, 4:6
- actual <- collectRDD(zipPartitions(rdd1, rdd2, rdd3,
- func = function(x, y, z) { list(list(x, y, z))} ))
- expect_equal(actual,
- list(list(1, c(1, 2), c(1, 2, 3)), list(2, c(3, 4), c(4, 5, 6))))
-
- mockFile <- c("Spark is pretty.", "Spark is awesome.")
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName)
-
- rdd <- textFile(sc, fileName, 1)
- actual <- collectRDD(zipPartitions(rdd, rdd,
- func = function(x, y) { list(paste(x, y, sep = "\n")) }))
- expected <- list(paste(mockFile, mockFile, sep = "\n"))
- expect_equal(actual, expected)
-
- rdd1 <- parallelize(sc, 0:1, 1)
- actual <- collectRDD(zipPartitions(rdd1, rdd,
- func = function(x, y) { list(x + nchar(y)) }))
- expected <- list(0:1 + nchar(mockFile))
- expect_equal(actual, expected)
-
- rdd <- map(rdd, function(x) { x })
- actual <- collectRDD(zipPartitions(rdd, rdd1,
- func = function(x, y) { list(y + nchar(x)) }))
- expect_equal(actual, expected)
-
- unlink(fileName)
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_broadcast.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_broadcast.R b/R/pkg/inst/tests/testthat/test_broadcast.R
deleted file mode 100644
index 2c96740..0000000
--- a/R/pkg/inst/tests/testthat/test_broadcast.R
+++ /dev/null
@@ -1,55 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("broadcast variables")
-
-# JavaSparkContext handle
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-# Partitioned data
-nums <- 1:2
-rrdd <- parallelize(sc, nums, 2L)
-
-test_that("using broadcast variable", {
- skip_on_cran()
-
- randomMat <- matrix(nrow = 10, ncol = 10, data = rnorm(100))
- randomMatBr <- broadcastRDD(sc, randomMat)
-
- useBroadcast <- function(x) {
- sum(SparkR:::value(randomMatBr) * x)
- }
- actual <- collectRDD(lapply(rrdd, useBroadcast))
- expected <- list(sum(randomMat) * 1, sum(randomMat) * 2)
- expect_equal(actual, expected)
-})
-
-test_that("without using broadcast variable", {
- skip_on_cran()
-
- randomMat <- matrix(nrow = 10, ncol = 10, data = rnorm(100))
-
- useBroadcast <- function(x) {
- sum(randomMat * x)
- }
- actual <- collectRDD(lapply(rrdd, useBroadcast))
- expected <- list(sum(randomMat) * 1, sum(randomMat) * 2)
- expect_equal(actual, expected)
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_client.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_client.R b/R/pkg/inst/tests/testthat/test_client.R
deleted file mode 100644
index 3d53beb..0000000
--- a/R/pkg/inst/tests/testthat/test_client.R
+++ /dev/null
@@ -1,51 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("functions in client.R")
-
-test_that("adding spark-testing-base as a package works", {
- skip_on_cran()
-
- args <- generateSparkSubmitArgs("", "", "", "",
- "holdenk:spark-testing-base:1.3.0_0.0.5")
- expect_equal(gsub("[[:space:]]", "", args),
- gsub("[[:space:]]", "",
- "--packages holdenk:spark-testing-base:1.3.0_0.0.5"))
-})
-
-test_that("no package specified doesn't add packages flag", {
- skip_on_cran()
-
- args <- generateSparkSubmitArgs("", "", "", "", "")
- expect_equal(gsub("[[:space:]]", "", args),
- "")
-})
-
-test_that("multiple packages don't produce a warning", {
- skip_on_cran()
-
- expect_warning(generateSparkSubmitArgs("", "", "", "", c("A", "B")), NA)
-})
-
-test_that("sparkJars sparkPackages as character vectors", {
- skip_on_cran()
-
- args <- generateSparkSubmitArgs("", "", c("one.jar", "two.jar", "three.jar"), "",
- c("com.databricks:spark-avro_2.10:2.0.1"))
- expect_match(args, "--jars one.jar,two.jar,three.jar")
- expect_match(args, "--packages com.databricks:spark-avro_2.10:2.0.1")
-})
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_context.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_context.R b/R/pkg/inst/tests/testthat/test_context.R
deleted file mode 100644
index f6d9f54..0000000
--- a/R/pkg/inst/tests/testthat/test_context.R
+++ /dev/null
@@ -1,226 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("test functions in sparkR.R")
-
-test_that("Check masked functions", {
- skip_on_cran()
-
- # Check that we are not masking any new function from base, stats, testthat unexpectedly
- # NOTE: We should avoid adding entries to *namesOfMaskedCompletely* as masked functions make it
- # hard for users to use base R functions. Please check when in doubt.
- namesOfMaskedCompletely <- c("cov", "filter", "sample", "not")
- namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var",
- "colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset",
- "summary", "transform", "drop", "window", "as.data.frame", "union", "not")
- if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) {
- namesOfMasked <- c("endsWith", "startsWith", namesOfMasked)
- }
- masked <- conflicts(detail = TRUE)$`package:SparkR`
- expect_true("describe" %in% masked) # only when with testthat..
- func <- lapply(masked, function(x) { capture.output(showMethods(x))[[1]] })
- funcSparkROrEmpty <- grepl("\\(package SparkR\\)$|^$", func)
- maskedBySparkR <- masked[funcSparkROrEmpty]
- expect_equal(length(maskedBySparkR), length(namesOfMasked))
- # make the 2 lists the same length so expect_equal will print their content
- l <- max(length(maskedBySparkR), length(namesOfMasked))
- length(maskedBySparkR) <- l
- length(namesOfMasked) <- l
- expect_equal(sort(maskedBySparkR, na.last = TRUE), sort(namesOfMasked, na.last = TRUE))
- # above are those reported as masked when `library(SparkR)`
- # note that many of these methods are still callable without base:: or stats:: prefix
-# there should be a test for each of these, except the following, which are currently "broken"
- funcHasAny <- unlist(lapply(masked, function(x) {
- any(grepl("=\"ANY\"", capture.output(showMethods(x)[-1])))
- }))
- maskedCompletely <- masked[!funcHasAny]
- expect_equal(length(maskedCompletely), length(namesOfMaskedCompletely))
- l <- max(length(maskedCompletely), length(namesOfMaskedCompletely))
- length(maskedCompletely) <- l
- length(namesOfMaskedCompletely) <- l
- expect_equal(sort(maskedCompletely, na.last = TRUE),
- sort(namesOfMaskedCompletely, na.last = TRUE))
-})
-
-test_that("repeatedly starting and stopping SparkR", {
- skip_on_cran()
-
- for (i in 1:4) {
- sc <- suppressWarnings(sparkR.init(master = sparkRTestMaster))
- rdd <- parallelize(sc, 1:20, 2L)
- expect_equal(countRDD(rdd), 20)
- suppressWarnings(sparkR.stop())
- }
-})
-
-test_that("repeatedly starting and stopping SparkSession", {
- for (i in 1:4) {
- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
- df <- createDataFrame(data.frame(dummy = 1:i))
- expect_equal(count(df), i)
- sparkR.session.stop()
- }
-})
-
-test_that("rdd GC across sparkR.stop", {
- skip_on_cran()
-
- sc <- sparkR.sparkContext(master = sparkRTestMaster) # sc should get id 0
- rdd1 <- parallelize(sc, 1:20, 2L) # rdd1 should get id 1
- rdd2 <- parallelize(sc, 1:10, 2L) # rdd2 should get id 2
- sparkR.session.stop()
-
- sc <- sparkR.sparkContext(master = sparkRTestMaster) # sc should get id 0 again
-
- # GC rdd1 before creating rdd3 and rdd2 after
- rm(rdd1)
- gc()
-
- rdd3 <- parallelize(sc, 1:20, 2L) # rdd3 should get id 1 now
- rdd4 <- parallelize(sc, 1:10, 2L) # rdd4 should get id 2 now
-
- rm(rdd2)
- gc()
-
- countRDD(rdd3)
- countRDD(rdd4)
- sparkR.session.stop()
-})
-
-test_that("job group functions can be called", {
- skip_on_cran()
-
- sc <- sparkR.sparkContext(master = sparkRTestMaster)
- setJobGroup("groupId", "job description", TRUE)
- cancelJobGroup("groupId")
- clearJobGroup()
-
- suppressWarnings(setJobGroup(sc, "groupId", "job description", TRUE))
- suppressWarnings(cancelJobGroup(sc, "groupId"))
- suppressWarnings(clearJobGroup(sc))
- sparkR.session.stop()
-})
-
-test_that("utility function can be called", {
- skip_on_cran()
-
- sparkR.sparkContext(master = sparkRTestMaster)
- setLogLevel("ERROR")
- sparkR.session.stop()
-})
-
-test_that("getClientModeSparkSubmitOpts() returns spark-submit args from whitelist", {
- skip_on_cran()
-
- e <- new.env()
- e[["spark.driver.memory"]] <- "512m"
- ops <- getClientModeSparkSubmitOpts("sparkrmain", e)
- expect_equal("--driver-memory \"512m\" sparkrmain", ops)
-
- e[["spark.driver.memory"]] <- "5g"
- e[["spark.driver.extraClassPath"]] <- "/opt/class_path" # nolint
- e[["spark.driver.extraJavaOptions"]] <- "-XX:+UseCompressedOops -XX:+UseCompressedStrings"
- e[["spark.driver.extraLibraryPath"]] <- "/usr/local/hadoop/lib" # nolint
- e[["random"]] <- "skipthis"
- ops2 <- getClientModeSparkSubmitOpts("sparkr-shell", e)
- # nolint start
- expect_equal(ops2, paste0("--driver-class-path \"/opt/class_path\" --driver-java-options \"",
- "-XX:+UseCompressedOops -XX:+UseCompressedStrings\" --driver-library-path \"",
- "/usr/local/hadoop/lib\" --driver-memory \"5g\" sparkr-shell"))
- # nolint end
-
- e[["spark.driver.extraClassPath"]] <- "/" # too short
- ops3 <- getClientModeSparkSubmitOpts("--driver-memory 4g sparkr-shell2", e)
- # nolint start
- expect_equal(ops3, paste0("--driver-java-options \"-XX:+UseCompressedOops ",
- "-XX:+UseCompressedStrings\" --driver-library-path \"/usr/local/hadoop/lib\"",
- " --driver-memory 4g sparkr-shell2"))
- # nolint end
-})
-
-test_that("sparkJars sparkPackages as comma-separated strings", {
- skip_on_cran()
-
- expect_warning(processSparkJars(" a, b "))
- jars <- suppressWarnings(processSparkJars(" a, b "))
- expect_equal(lapply(jars, basename), list("a", "b"))
-
- jars <- suppressWarnings(processSparkJars(" abc ,, def "))
- expect_equal(lapply(jars, basename), list("abc", "def"))
-
- jars <- suppressWarnings(processSparkJars(c(" abc ,, def ", "", "xyz", " ", "a,b")))
- expect_equal(lapply(jars, basename), list("abc", "def", "xyz", "a", "b"))
-
- p <- processSparkPackages(c("ghi", "lmn"))
- expect_equal(p, c("ghi", "lmn"))
-
- # check normalizePath
- f <- dir()[[1]]
- expect_warning(processSparkJars(f), NA)
- expect_match(processSparkJars(f), f)
-})
-
-test_that("spark.lapply should perform simple transforms", {
- sparkR.sparkContext(master = sparkRTestMaster)
- doubled <- spark.lapply(1:10, function(x) { 2 * x })
- expect_equal(doubled, as.list(2 * 1:10))
- sparkR.session.stop()
-})
-
-test_that("add and get file to be downloaded with Spark job on every node", {
- skip_on_cran()
-
- sparkR.sparkContext(master = sparkRTestMaster)
- # Test add file.
- path <- tempfile(pattern = "hello", fileext = ".txt")
- filename <- basename(path)
- words <- "Hello World!"
- writeLines(words, path)
- spark.addFile(path)
- download_path <- spark.getSparkFiles(filename)
- expect_equal(readLines(download_path), words)
-
- # Test spark.getSparkFiles works well on executors.
- seq <- seq(from = 1, to = 10, length.out = 5)
- f <- function(seq) { spark.getSparkFiles(filename) }
- results <- spark.lapply(seq, f)
- for (i in 1:5) { expect_equal(basename(results[[i]]), filename) }
-
- unlink(path)
-
- # Test add directory recursively.
- path <- paste0(tempdir(), "/", "recursive_dir")
- dir.create(path)
- dir_name <- basename(path)
- path1 <- paste0(path, "/", "hello.txt")
- file.create(path1)
- sub_path <- paste0(path, "/", "sub_hello")
- dir.create(sub_path)
- path2 <- paste0(sub_path, "/", "sub_hello.txt")
- file.create(path2)
- words <- "Hello World!"
- sub_words <- "Sub Hello World!"
- writeLines(words, path1)
- writeLines(sub_words, path2)
- spark.addFile(path, recursive = TRUE)
- download_path1 <- spark.getSparkFiles(paste0(dir_name, "/", "hello.txt"))
- expect_equal(readLines(download_path1), words)
- download_path2 <- spark.getSparkFiles(paste0(dir_name, "/", "sub_hello/sub_hello.txt"))
- expect_equal(readLines(download_path2), sub_words)
- unlink(path, recursive = TRUE)
- sparkR.session.stop()
-})
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_includePackage.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_includePackage.R b/R/pkg/inst/tests/testthat/test_includePackage.R
deleted file mode 100644
index d7d9eee..0000000
--- a/R/pkg/inst/tests/testthat/test_includePackage.R
+++ /dev/null
@@ -1,64 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("include R packages")
-
-# JavaSparkContext handle
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-# Partitioned data
-nums <- 1:2
-rdd <- parallelize(sc, nums, 2L)
-
-test_that("include inside function", {
- skip_on_cran()
-
- # Only run the test if plyr is installed.
- if ("plyr" %in% rownames(installed.packages())) {
- suppressPackageStartupMessages(library(plyr))
- generateData <- function(x) {
- suppressPackageStartupMessages(library(plyr))
- attach(airquality)
- result <- transform(Ozone, logOzone = log(Ozone))
- result
- }
-
- data <- lapplyPartition(rdd, generateData)
- actual <- collectRDD(data)
- }
-})
-
-test_that("use include package", {
- skip_on_cran()
-
- # Only run the test if plyr is installed.
- if ("plyr" %in% rownames(installed.packages())) {
- suppressPackageStartupMessages(library(plyr))
- generateData <- function(x) {
- attach(airquality)
- result <- transform(Ozone, logOzone = log(Ozone))
- result
- }
-
- includePackage(sc, plyr)
- data <- lapplyPartition(rdd, generateData)
- actual <- collectRDD(data)
- }
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_jvm_api.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_jvm_api.R b/R/pkg/inst/tests/testthat/test_jvm_api.R
deleted file mode 100644
index 8b3b4f7..0000000
--- a/R/pkg/inst/tests/testthat/test_jvm_api.R
+++ /dev/null
@@ -1,36 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("JVM API")
-
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-test_that("Create and call methods on object", {
- jarr <- sparkR.newJObject("java.util.ArrayList")
- # Add an element to the array
- sparkR.callJMethod(jarr, "add", 1L)
- # Check if get returns the same element
- expect_equal(sparkR.callJMethod(jarr, "get", 0L), 1L)
-})
-
-test_that("Call static methods", {
- # Convert a boolean to a string
- strTrue <- sparkR.callJStatic("java.lang.String", "valueOf", TRUE)
- expect_equal(strTrue, "true")
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_mllib_classification.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_mllib_classification.R b/R/pkg/inst/tests/testthat/test_mllib_classification.R
deleted file mode 100644
index 82e588d..0000000
--- a/R/pkg/inst/tests/testthat/test_mllib_classification.R
+++ /dev/null
@@ -1,396 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("MLlib classification algorithms, except for tree-based algorithms")
-
-# Tests for MLlib classification algorithms in SparkR
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-absoluteSparkPath <- function(x) {
- sparkHome <- sparkR.conf("spark.home")
- file.path(sparkHome, x)
-}
-
-test_that("spark.svmLinear", {
- skip_on_cran()
-
- df <- suppressWarnings(createDataFrame(iris))
- training <- df[df$Species %in% c("versicolor", "virginica"), ]
- model <- spark.svmLinear(training, Species ~ ., regParam = 0.01, maxIter = 10)
- summary <- summary(model)
-
- # test summary coefficients return matrix type
- expect_true(class(summary$coefficients) == "matrix")
- expect_true(class(summary$coefficients[, 1]) == "numeric")
-
- coefs <- summary$coefficients[, "Estimate"]
- expected_coefs <- c(-0.06004978, -0.1563083, -0.460648, 0.2276626, 1.055085)
- expect_true(all(abs(coefs - expected_coefs) < 0.1))
-
- # Test prediction with string label
- prediction <- predict(model, training)
- expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character")
- expected <- c("versicolor", "versicolor", "versicolor", "virginica", "virginica",
- "virginica", "virginica", "virginica", "virginica", "virginica")
- expect_equal(sort(as.list(take(select(prediction, "prediction"), 10))[[1]]), expected)
-
- # Test model save and load
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-svm-linear", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- coefs <- summary(model)$coefficients
- coefs2 <- summary(model2)$coefficients
- expect_equal(coefs, coefs2)
- unlink(modelPath)
- }
-
- # Test prediction with numeric label
- label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
- feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
- data <- as.data.frame(cbind(label, feature))
- df <- createDataFrame(data)
- model <- spark.svmLinear(df, label ~ feature, regParam = 0.1)
- prediction <- collect(select(predict(model, df), "prediction"))
- expect_equal(sort(prediction$prediction), c("0.0", "0.0", "0.0", "1.0", "1.0"))
-
-})
-
-test_that("spark.logit", {
- # R code to reproduce the result.
- # nolint start
- #' library(glmnet)
- #' iris.x = as.matrix(iris[, 1:4])
- #' iris.y = as.factor(as.character(iris[, 5]))
- #' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5)
- #' coef(logit)
- #
- # $setosa
- # 5 x 1 sparse Matrix of class "dgCMatrix"
- # s0
- # 1.0981324
- # Sepal.Length -0.2909860
- # Sepal.Width 0.5510907
- # Petal.Length -0.1915217
- # Petal.Width -0.4211946
- #
- # $versicolor
- # 5 x 1 sparse Matrix of class "dgCMatrix"
- # s0
- # 1.520061e+00
- # Sepal.Length 2.524501e-02
- # Sepal.Width -5.310313e-01
- # Petal.Length 3.656543e-02
- # Petal.Width -3.144464e-05
- #
- # $virginica
- # 5 x 1 sparse Matrix of class "dgCMatrix"
- # s0
- # -2.61819385
- # Sepal.Length 0.26574097
- # Sepal.Width -0.02005932
- # Petal.Length 0.15495629
- # Petal.Width 0.42122607
- # nolint end
-
- # Test multinomial logistic regression against three classes
- df <- suppressWarnings(createDataFrame(iris))
- model <- spark.logit(df, Species ~ ., regParam = 0.5)
- summary <- summary(model)
-
- # test summary coefficients return matrix type
- expect_true(class(summary$coefficients) == "matrix")
- expect_true(class(summary$coefficients[, 1]) == "numeric")
-
- versicolorCoefsR <- c(1.52, 0.03, -0.53, 0.04, 0.00)
- virginicaCoefsR <- c(-2.62, 0.27, -0.02, 0.16, 0.42)
- setosaCoefsR <- c(1.10, -0.29, 0.55, -0.19, -0.42)
- versicolorCoefs <- summary$coefficients[, "versicolor"]
- virginicaCoefs <- summary$coefficients[, "virginica"]
- setosaCoefs <- summary$coefficients[, "setosa"]
- expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
- expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
- expect_true(all(abs(setosaCoefsR - setosaCoefs) < 0.1))
-
- # Test model save and load
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- coefs <- summary(model)$coefficients
- coefs2 <- summary(model2)$coefficients
- expect_equal(coefs, coefs2)
- unlink(modelPath)
- }
-
- # R code to reproduce the result.
- # nolint start
- #' library(glmnet)
- #' iris2 <- iris[iris$Species %in% c("versicolor", "virginica"), ]
- #' iris.x = as.matrix(iris2[, 1:4])
- #' iris.y = as.factor(as.character(iris2[, 5]))
- #' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5)
- #' coef(logit)
- #
- # $versicolor
- # 5 x 1 sparse Matrix of class "dgCMatrix"
- # s0
- # 3.93844796
- # Sepal.Length -0.13538675
- # Sepal.Width -0.02386443
- # Petal.Length -0.35076451
- # Petal.Width -0.77971954
- #
- # $virginica
- # 5 x 1 sparse Matrix of class "dgCMatrix"
- # s0
- # -3.93844796
- # Sepal.Length 0.13538675
- # Sepal.Width 0.02386443
- # Petal.Length 0.35076451
- # Petal.Width 0.77971954
- #
- #' logit = glmnet(iris.x, iris.y, family="binomial", alpha=0, lambda=0.5)
- #' coef(logit)
- #
- # 5 x 1 sparse Matrix of class "dgCMatrix"
- # s0
- # (Intercept) -6.0824412
- # Sepal.Length 0.2458260
- # Sepal.Width 0.1642093
- # Petal.Length 0.4759487
- # Petal.Width 1.0383948
- #
- # nolint end
-
- # Test multinomial logistic regression against two classes
- df <- suppressWarnings(createDataFrame(iris))
- training <- df[df$Species %in% c("versicolor", "virginica"), ]
- model <- spark.logit(training, Species ~ ., regParam = 0.5, family = "multinomial")
- summary <- summary(model)
- versicolorCoefsR <- c(3.94, -0.16, -0.02, -0.35, -0.78)
- virginicaCoefsR <- c(-3.94, 0.16, -0.02, 0.35, 0.78)
- versicolorCoefs <- summary$coefficients[, "versicolor"]
- virginicaCoefs <- summary$coefficients[, "virginica"]
- expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
- expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
-
- # Test binomial logistic regression against two classes
- model <- spark.logit(training, Species ~ ., regParam = 0.5)
- summary <- summary(model)
- coefsR <- c(-6.08, 0.25, 0.16, 0.48, 1.04)
- coefs <- summary$coefficients[, "Estimate"]
- expect_true(all(abs(coefsR - coefs) < 0.1))
-
- # Test prediction with string label
- prediction <- predict(model, training)
- expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character")
- expected <- c("versicolor", "versicolor", "virginica", "versicolor", "versicolor",
- "versicolor", "versicolor", "versicolor", "versicolor", "versicolor")
- expect_equal(as.list(take(select(prediction, "prediction"), 10))[[1]], expected)
-
- # Test prediction with numeric label
- label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
- feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
- data <- as.data.frame(cbind(label, feature))
- df <- createDataFrame(data)
- model <- spark.logit(df, label ~ feature)
- prediction <- collect(select(predict(model, df), "prediction"))
- expect_equal(sort(prediction$prediction), c("0.0", "0.0", "0.0", "1.0", "1.0"))
-
- # Test prediction with weightCol
- weight <- c(2.0, 2.0, 2.0, 1.0, 1.0)
- data2 <- as.data.frame(cbind(label, feature, weight))
- df2 <- createDataFrame(data2)
- model2 <- spark.logit(df2, label ~ feature, weightCol = "weight")
- prediction2 <- collect(select(predict(model2, df2), "prediction"))
- expect_equal(sort(prediction2$prediction), c("0.0", "0.0", "0.0", "0.0", "0.0"))
-})
-
-test_that("spark.mlp", {
- skip_on_cran()
-
- df <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
- source = "libsvm")
- model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 5, 4, 3),
- solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1)
-
- # Test summary method
- summary <- summary(model)
- expect_equal(summary$numOfInputs, 4)
- expect_equal(summary$numOfOutputs, 3)
- expect_equal(summary$layers, c(4, 5, 4, 3))
- expect_equal(length(summary$weights), 64)
- expect_equal(head(summary$weights, 5), list(-0.878743, 0.2154151, -1.16304, -0.6583214, 1.009825),
- tolerance = 1e-6)
-
- # Test predict method
- mlpTestDF <- df
- mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
- expect_equal(head(mlpPredictions$prediction, 6), c("1.0", "0.0", "0.0", "0.0", "0.0", "0.0"))
-
- # Test model save/load
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-mlp", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- summary2 <- summary(model2)
-
- expect_equal(summary2$numOfInputs, 4)
- expect_equal(summary2$numOfOutputs, 3)
- expect_equal(summary2$layers, c(4, 5, 4, 3))
- expect_equal(length(summary2$weights), 64)
-
- unlink(modelPath)
- }
-
- # Test default parameter
- model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3))
- mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
- expect_equal(head(mlpPredictions$prediction, 10),
- c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
-
- # Test illegal parameter
- expect_error(spark.mlp(df, label ~ features, layers = NULL),
- "layers must be a integer vector with length > 1.")
- expect_error(spark.mlp(df, label ~ features, layers = c()),
- "layers must be a integer vector with length > 1.")
- expect_error(spark.mlp(df, label ~ features, layers = c(3)),
- "layers must be a integer vector with length > 1.")
-
- # Test random seed
- # default seed
- model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10)
- mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
- expect_equal(head(mlpPredictions$prediction, 10),
- c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
- # seed equals 10
- model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10, seed = 10)
- mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
- expect_equal(head(mlpPredictions$prediction, 10),
- c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
-
- # test initialWeights
- model <- spark.mlp(df, label ~ features, layers = c(4, 3), initialWeights =
- c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9))
- mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
- expect_equal(head(mlpPredictions$prediction, 10),
- c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
-
- # Test formula works well
- df <- suppressWarnings(createDataFrame(iris))
- model <- spark.mlp(df, Species ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width,
- layers = c(4, 3))
- summary <- summary(model)
- expect_equal(summary$numOfInputs, 4)
- expect_equal(summary$numOfOutputs, 3)
- expect_equal(summary$layers, c(4, 3))
- expect_equal(length(summary$weights), 15)
-})
-
-test_that("spark.naiveBayes", {
- # R code to reproduce the result.
- # We do not support instance weights yet. So we ignore the frequencies.
- #
- #' library(e1071)
- #' t <- as.data.frame(Titanic)
- #' t1 <- t[t$Freq > 0, -5]
- #' m <- naiveBayes(Survived ~ ., data = t1)
- #' m
- #' predict(m, t1)
- #
- # -- output of 'm'
- #
- # A-priori probabilities:
- # Y
- # No Yes
- # 0.4166667 0.5833333
- #
- # Conditional probabilities:
- # Class
- # Y 1st 2nd 3rd Crew
- # No 0.2000000 0.2000000 0.4000000 0.2000000
- # Yes 0.2857143 0.2857143 0.2857143 0.1428571
- #
- # Sex
- # Y Male Female
- # No 0.5 0.5
- # Yes 0.5 0.5
- #
- # Age
- # Y Child Adult
- # No 0.2000000 0.8000000
- # Yes 0.4285714 0.5714286
- #
- # -- output of 'predict(m, t1)'
- #
- # Yes Yes Yes Yes No No Yes Yes No No Yes Yes Yes Yes Yes Yes Yes Yes No No Yes Yes No No
- #
-
- t <- as.data.frame(Titanic)
- t1 <- t[t$Freq > 0, -5]
- df <- suppressWarnings(createDataFrame(t1))
- m <- spark.naiveBayes(df, Survived ~ ., smoothing = 0.0)
- s <- summary(m)
- expect_equal(as.double(s$apriori[1, "Yes"]), 0.5833333, tolerance = 1e-6)
- expect_equal(sum(s$apriori), 1)
- expect_equal(as.double(s$tables["Yes", "Age_Adult"]), 0.5714286, tolerance = 1e-6)
- p <- collect(select(predict(m, df), "prediction"))
- expect_equal(p$prediction, c("Yes", "Yes", "Yes", "Yes", "No", "No", "Yes", "Yes", "No", "No",
- "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "No", "No",
- "Yes", "Yes", "No", "No"))
-
- # Test model save/load
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-naiveBayes", fileext = ".tmp")
- write.ml(m, modelPath)
- expect_error(write.ml(m, modelPath))
- write.ml(m, modelPath, overwrite = TRUE)
- m2 <- read.ml(modelPath)
- s2 <- summary(m2)
- expect_equal(s$apriori, s2$apriori)
- expect_equal(s$tables, s2$tables)
-
- unlink(modelPath)
- }
-
- # Test e1071::naiveBayes
- if (requireNamespace("e1071", quietly = TRUE)) {
- expect_error(m <- e1071::naiveBayes(Survived ~ ., data = t1), NA)
- expect_equal(as.character(predict(m, t1[1, ])), "Yes")
- }
-
- # Test numeric response variable
- t1$NumericSurvived <- ifelse(t1$Survived == "No", 0, 1)
- t2 <- t1[-4]
- df <- suppressWarnings(createDataFrame(t2))
- m <- spark.naiveBayes(df, NumericSurvived ~ ., smoothing = 0.0)
- s <- summary(m)
- expect_equal(as.double(s$apriori[1, 1]), 0.5833333, tolerance = 1e-6)
- expect_equal(sum(s$apriori), 1)
- expect_equal(as.double(s$tables[1, "Age_Adult"]), 0.5714286, tolerance = 1e-6)
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_mllib_clustering.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_mllib_clustering.R b/R/pkg/inst/tests/testthat/test_mllib_clustering.R
deleted file mode 100644
index e827e96..0000000
--- a/R/pkg/inst/tests/testthat/test_mllib_clustering.R
+++ /dev/null
@@ -1,328 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("MLlib clustering algorithms")
-
-# Tests for MLlib clustering algorithms in SparkR
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-absoluteSparkPath <- function(x) {
- sparkHome <- sparkR.conf("spark.home")
- file.path(sparkHome, x)
-}
-
-test_that("spark.bisectingKmeans", {
- skip_on_cran()
-
- newIris <- iris
- newIris$Species <- NULL
- training <- suppressWarnings(createDataFrame(newIris))
-
- take(training, 1)
-
- model <- spark.bisectingKmeans(data = training, ~ .)
- sample <- take(select(predict(model, training), "prediction"), 1)
- expect_equal(typeof(sample$prediction), "integer")
- expect_equal(sample$prediction, 1)
-
- # Test fitted works on Bisecting KMeans
- fitted.model <- fitted(model)
- expect_equal(sort(collect(distinct(select(fitted.model, "prediction")))$prediction),
- c(0, 1, 2, 3))
-
- # Test summary works on KMeans
- summary.model <- summary(model)
- cluster <- summary.model$cluster
- k <- summary.model$k
- expect_equal(k, 4)
- expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction),
- c(0, 1, 2, 3))
-
- # Test model save/load
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-bisectingkmeans", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- summary2 <- summary(model2)
- expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size)))
- expect_equal(summary.model$coefficients, summary2$coefficients)
- expect_true(!summary.model$is.loaded)
- expect_true(summary2$is.loaded)
-
- unlink(modelPath)
- }
-})
-
-test_that("spark.gaussianMixture", {
- # R code to reproduce the result.
- # nolint start
- #' library(mvtnorm)
- #' set.seed(1)
- #' a <- rmvnorm(7, c(0, 0))
- #' b <- rmvnorm(8, c(10, 10))
- #' data <- rbind(a, b)
- #' model <- mvnormalmixEM(data, k = 2)
- #' model$lambda
- #
- # [1] 0.4666667 0.5333333
- #
- #' model$mu
- #
- # [1] 0.11731091 -0.06192351
- # [1] 10.363673 9.897081
- #
- #' model$sigma
- #
- # [[1]]
- # [,1] [,2]
- # [1,] 0.62049934 0.06880802
- # [2,] 0.06880802 1.27431874
- #
- # [[2]]
- # [,1] [,2]
- # [1,] 0.2961543 0.160783
- # [2,] 0.1607830 1.008878
- #
- #' model$loglik
- #
- # [1] -46.89499
- # nolint end
- data <- list(list(-0.6264538, 0.1836433), list(-0.8356286, 1.5952808),
- list(0.3295078, -0.8204684), list(0.4874291, 0.7383247),
- list(0.5757814, -0.3053884), list(1.5117812, 0.3898432),
- list(-0.6212406, -2.2146999), list(11.1249309, 9.9550664),
- list(9.9838097, 10.9438362), list(10.8212212, 10.5939013),
- list(10.9189774, 10.7821363), list(10.0745650, 8.0106483),
- list(10.6198257, 9.9438713), list(9.8442045, 8.5292476),
- list(9.5218499, 10.4179416))
- df <- createDataFrame(data, c("x1", "x2"))
- model <- spark.gaussianMixture(df, ~ x1 + x2, k = 2)
- stats <- summary(model)
- rLambda <- c(0.4666667, 0.5333333)
- rMu <- c(0.11731091, -0.06192351, 10.363673, 9.897081)
- rSigma <- c(0.62049934, 0.06880802, 0.06880802, 1.27431874,
- 0.2961543, 0.160783, 0.1607830, 1.008878)
- rLoglik <- -46.89499
- expect_equal(stats$lambda, rLambda, tolerance = 1e-3)
- expect_equal(unlist(stats$mu), rMu, tolerance = 1e-3)
- expect_equal(unlist(stats$sigma), rSigma, tolerance = 1e-3)
- expect_equal(unlist(stats$loglik), rLoglik, tolerance = 1e-3)
- p <- collect(select(predict(model, df), "prediction"))
- expect_equal(p$prediction, c(0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1))
-
- # Test model save/load
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-gaussianMixture", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- stats2 <- summary(model2)
- expect_equal(stats$lambda, stats2$lambda)
- expect_equal(unlist(stats$mu), unlist(stats2$mu))
- expect_equal(unlist(stats$sigma), unlist(stats2$sigma))
- expect_equal(unlist(stats$loglik), unlist(stats2$loglik))
-
- unlink(modelPath)
- }
-})
-
-test_that("spark.kmeans", {
- newIris <- iris
- newIris$Species <- NULL
- training <- suppressWarnings(createDataFrame(newIris))
-
- take(training, 1)
-
- model <- spark.kmeans(data = training, ~ ., k = 2, maxIter = 10, initMode = "random")
- sample <- take(select(predict(model, training), "prediction"), 1)
- expect_equal(typeof(sample$prediction), "integer")
- expect_equal(sample$prediction, 1)
-
- # Test stats::kmeans is working
- statsModel <- kmeans(x = newIris, centers = 2)
- expect_equal(sort(unique(statsModel$cluster)), c(1, 2))
-
- # Test fitted works on KMeans
- fitted.model <- fitted(model)
- expect_equal(sort(collect(distinct(select(fitted.model, "prediction")))$prediction), c(0, 1))
-
- # Test summary works on KMeans
- summary.model <- summary(model)
- cluster <- summary.model$cluster
- k <- summary.model$k
- expect_equal(k, 2)
- expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction), c(0, 1))
-
- # test summary coefficients return matrix type
- expect_true(class(summary.model$coefficients) == "matrix")
- expect_true(class(summary.model$coefficients[1, ]) == "numeric")
-
- # Test model save/load
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-kmeans", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- summary2 <- summary(model2)
- expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size)))
- expect_equal(summary.model$coefficients, summary2$coefficients)
- expect_true(!summary.model$is.loaded)
- expect_true(summary2$is.loaded)
-
- unlink(modelPath)
- }
-
- # Test Kmeans on dataset that is sensitive to seed value
- col1 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0)
- col2 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0)
- col3 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0)
- cols <- as.data.frame(cbind(col1, col2, col3))
- df <- createDataFrame(cols)
-
- model1 <- spark.kmeans(data = df, ~ ., k = 5, maxIter = 10,
- initMode = "random", seed = 1, tol = 1E-5)
- model2 <- spark.kmeans(data = df, ~ ., k = 5, maxIter = 10,
- initMode = "random", seed = 22222, tol = 1E-5)
-
- summary.model1 <- summary(model1)
- summary.model2 <- summary(model2)
- cluster1 <- summary.model1$cluster
- cluster2 <- summary.model2$cluster
- clusterSize1 <- summary.model1$clusterSize
- clusterSize2 <- summary.model2$clusterSize
-
- # The predicted clusters are different
- expect_equal(sort(collect(distinct(select(cluster1, "prediction")))$prediction),
- c(0, 1, 2, 3))
- expect_equal(sort(collect(distinct(select(cluster2, "prediction")))$prediction),
- c(0, 1, 2))
- expect_equal(clusterSize1, 4)
- expect_equal(clusterSize2, 3)
-})
-
-test_that("spark.lda with libsvm", {
- text <- read.df(absoluteSparkPath("data/mllib/sample_lda_libsvm_data.txt"), source = "libsvm")
- model <- spark.lda(text, optimizer = "em")
-
- stats <- summary(model, 10)
- isDistributed <- stats$isDistributed
- logLikelihood <- stats$logLikelihood
- logPerplexity <- stats$logPerplexity
- vocabSize <- stats$vocabSize
- topics <- stats$topicTopTerms
- weights <- stats$topicTopTermsWeights
- vocabulary <- stats$vocabulary
- trainingLogLikelihood <- stats$trainingLogLikelihood
- logPrior <- stats$logPrior
-
- expect_true(isDistributed)
- expect_true(logLikelihood <= 0 & is.finite(logLikelihood))
- expect_true(logPerplexity >= 0 & is.finite(logPerplexity))
- expect_equal(vocabSize, 11)
- expect_true(is.null(vocabulary))
- expect_true(trainingLogLikelihood <= 0 & !is.na(trainingLogLikelihood))
- expect_true(logPrior <= 0 & !is.na(logPrior))
-
- # Test model save/load
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-lda", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- stats2 <- summary(model2)
-
- expect_true(stats2$isDistributed)
- expect_equal(logLikelihood, stats2$logLikelihood)
- expect_equal(logPerplexity, stats2$logPerplexity)
- expect_equal(vocabSize, stats2$vocabSize)
- expect_equal(vocabulary, stats2$vocabulary)
- expect_equal(trainingLogLikelihood, stats2$trainingLogLikelihood)
- expect_equal(logPrior, stats2$logPrior)
-
- unlink(modelPath)
- }
-})
-
-test_that("spark.lda with text input", {
- skip_on_cran()
-
- text <- read.text(absoluteSparkPath("data/mllib/sample_lda_data.txt"))
- model <- spark.lda(text, optimizer = "online", features = "value")
-
- stats <- summary(model)
- isDistributed <- stats$isDistributed
- logLikelihood <- stats$logLikelihood
- logPerplexity <- stats$logPerplexity
- vocabSize <- stats$vocabSize
- topics <- stats$topicTopTerms
- weights <- stats$topicTopTermsWeights
- vocabulary <- stats$vocabulary
- trainingLogLikelihood <- stats$trainingLogLikelihood
- logPrior <- stats$logPrior
-
- expect_false(isDistributed)
- expect_true(logLikelihood <= 0 & is.finite(logLikelihood))
- expect_true(logPerplexity >= 0 & is.finite(logPerplexity))
- expect_equal(vocabSize, 10)
- expect_true(setequal(stats$vocabulary, c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")))
- expect_true(is.na(trainingLogLikelihood))
- expect_true(is.na(logPrior))
-
- # Test model save/load
- modelPath <- tempfile(pattern = "spark-lda-text", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- stats2 <- summary(model2)
-
- expect_false(stats2$isDistributed)
- expect_equal(logLikelihood, stats2$logLikelihood)
- expect_equal(logPerplexity, stats2$logPerplexity)
- expect_equal(vocabSize, stats2$vocabSize)
- expect_true(all.equal(vocabulary, stats2$vocabulary))
- expect_true(is.na(stats2$trainingLogLikelihood))
- expect_true(is.na(stats2$logPrior))
-
- unlink(modelPath)
-})
-
-test_that("spark.posterior and spark.perplexity", {
- skip_on_cran()
-
- text <- read.text(absoluteSparkPath("data/mllib/sample_lda_data.txt"))
- model <- spark.lda(text, features = "value", k = 3)
-
- # Assert perplexities are equal
- stats <- summary(model)
- logPerplexity <- spark.perplexity(model, text)
- expect_equal(logPerplexity, stats$logPerplexity)
-
- # Assert every topic distribution sums to 1, so the total equals the number of rows
- posterior <- spark.posterior(model, text)
- local.posterior <- collect(posterior)$topicDistribution
- expect_equal(length(local.posterior), sum(unlist(local.posterior)))
-})
-
-sparkR.session.stop()
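
For quick reference, here is a minimal standalone sketch of the spark.kmeans workflow that the
removed clustering tests above exercise. It is not part of this commit; it assumes only an active
SparkR session, and the data and temporary path are illustrative.

    library(SparkR)
    sparkR.session()                                    # local session for illustration
    training <- suppressWarnings(createDataFrame(iris[, 1:4]))
    model <- spark.kmeans(data = training, ~ ., k = 2, maxIter = 10, initMode = "random")
    summary(model)$k                                    # number of clusters, 2 here
    head(select(predict(model, training), "prediction"))
    modelPath <- tempfile(pattern = "spark-kmeans", fileext = ".tmp")
    write.ml(model, modelPath, overwrite = TRUE)        # persist the fitted model
    model2 <- read.ml(modelPath)                        # reload it for later reuse
    unlink(modelPath)
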
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_mllib_fpm.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_mllib_fpm.R b/R/pkg/inst/tests/testthat/test_mllib_fpm.R
deleted file mode 100644
index 4e10ca1..0000000
--- a/R/pkg/inst/tests/testthat/test_mllib_fpm.R
+++ /dev/null
@@ -1,85 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("MLlib frequent pattern mining")
-
-# Tests for MLlib frequent pattern mining algorithms in SparkR
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-test_that("spark.fpGrowth", {
- data <- selectExpr(createDataFrame(data.frame(items = c(
- "1,2",
- "1,2",
- "1,2,3",
- "1,3"
- ))), "split(items, ',') as items")
-
- model <- spark.fpGrowth(data, minSupport = 0.3, minConfidence = 0.8, numPartitions = 1)
-
- itemsets <- collect(spark.freqItemsets(model))
-
- expected_itemsets <- data.frame(
- items = I(list(list("3"), list("3", "1"), list("2"), list("2", "1"), list("1"))),
- freq = c(2, 2, 3, 3, 4)
- )
-
- expect_equivalent(expected_itemsets, itemsets)
-
- expected_association_rules <- data.frame(
- antecedent = I(list(list("2"), list("3"))),
- consequent = I(list(list("1"), list("1"))),
- confidence = c(1, 1)
- )
-
- expect_equivalent(expected_association_rules, collect(spark.associationRules(model)))
-
- new_data <- selectExpr(createDataFrame(data.frame(items = c(
- "1,2",
- "1,3",
- "2,3"
- ))), "split(items, ',') as items")
-
- expected_predictions <- data.frame(
- items = I(list(list("1", "2"), list("1", "3"), list("2", "3"))),
- prediction = I(list(list(), list(), list("1")))
- )
-
- expect_equivalent(expected_predictions, collect(predict(model, new_data)))
-
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-fpm", fileext = ".tmp")
- write.ml(model, modelPath, overwrite = TRUE)
- loaded_model <- read.ml(modelPath)
-
- expect_equivalent(
- itemsets,
- collect(spark.freqItemsets(loaded_model)))
-
- unlink(modelPath)
- }
-
- model_without_numpartitions <- spark.fpGrowth(data, minSupport = 0.3, minConfidence = 0.8)
- expect_equal(
- count(spark.freqItemsets(model_without_numpartitions)),
- count(spark.freqItemsets(model))
- )
-
-})
-
-sparkR.session.stop()
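
Similarly, a minimal sketch of the spark.fpGrowth calls covered by the removed frequent pattern
mining test; again not part of this commit, assuming an active SparkR session, with illustrative
baskets.

    library(SparkR)
    sparkR.session()
    baskets <- selectExpr(createDataFrame(data.frame(items = c("1,2", "1,2,3", "1,3"))),
                          "split(items, ',') as items")
    model <- spark.fpGrowth(baskets, minSupport = 0.3, minConfidence = 0.8)
    head(spark.freqItemsets(model))       # frequent itemsets and their counts
    head(spark.associationRules(model))   # antecedent, consequent, confidence
    head(predict(model, baskets))         # consequents suggested for each basket
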
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_mllib_recommendation.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_mllib_recommendation.R b/R/pkg/inst/tests/testthat/test_mllib_recommendation.R
deleted file mode 100644
index cc8064f..0000000
--- a/R/pkg/inst/tests/testthat/test_mllib_recommendation.R
+++ /dev/null
@@ -1,67 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("MLlib recommendation algorithms")
-
-# Tests for MLlib recommendation algorithms in SparkR
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-test_that("spark.als", {
- data <- list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0), list(1, 2, 4.0),
- list(2, 1, 1.0), list(2, 2, 5.0))
- df <- createDataFrame(data, c("user", "item", "score"))
- model <- spark.als(df, ratingCol = "score", userCol = "user", itemCol = "item",
- rank = 10, maxIter = 5, seed = 0, regParam = 0.1)
- stats <- summary(model)
- expect_equal(stats$rank, 10)
- test <- createDataFrame(list(list(0, 2), list(1, 0), list(2, 0)), c("user", "item"))
- predictions <- collect(predict(model, test))
-
- expect_equal(predictions$prediction, c(-0.1380762, 2.6258414, -1.5018409),
- tolerance = 1e-4)
-
- # Test model save/load
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-als", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- stats2 <- summary(model2)
- expect_equal(stats2$rating, "score")
- userFactors <- collect(stats$userFactors)
- itemFactors <- collect(stats$itemFactors)
- userFactors2 <- collect(stats2$userFactors)
- itemFactors2 <- collect(stats2$itemFactors)
-
- orderUser <- order(userFactors$id)
- orderUser2 <- order(userFactors2$id)
- expect_equal(userFactors$id[orderUser], userFactors2$id[orderUser2])
- expect_equal(userFactors$features[orderUser], userFactors2$features[orderUser2])
-
- orderItem <- order(itemFactors$id)
- orderItem2 <- order(itemFactors2$id)
- expect_equal(itemFactors$id[orderItem], itemFactors2$id[orderItem2])
- expect_equal(itemFactors$features[orderItem], itemFactors2$features[orderItem2])
-
- unlink(modelPath)
- }
-})
-
-sparkR.session.stop()
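
And a minimal sketch of the spark.als usage exercised by the removed recommendation test; not part
of this commit, assuming an active SparkR session, and using the same toy ratings as the test.

    library(SparkR)
    sparkR.session()
    ratings <- createDataFrame(list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0),
                                    list(1, 2, 4.0), list(2, 1, 1.0), list(2, 2, 5.0)),
                               c("user", "item", "score"))
    model <- spark.als(ratings, ratingCol = "score", userCol = "user", itemCol = "item",
                       rank = 10, maxIter = 5, seed = 0, regParam = 0.1)
    summary(model)$rank                   # latent factor rank used in the fit
    unseen <- createDataFrame(list(list(0, 2), list(1, 0), list(2, 0)), c("user", "item"))
    head(predict(model, unseen))          # predicted scores for unseen user/item pairs
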
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
deleted file mode 100644
index c790d02..0000000
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ /dev/null
@@ -1,3474 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("SparkSQL functions")
-
-# Utility function for easily checking the values of a StructField
-checkStructField <- function(actual, expectedName, expectedType, expectedNullable) {
- expect_equal(class(actual), "structField")
- expect_equal(actual$name(), expectedName)
- expect_equal(actual$dataType.toString(), expectedType)
- expect_equal(actual$nullable(), expectedNullable)
-}
-
-markUtf8 <- function(s) {
- Encoding(s) <- "UTF-8"
- s
-}
-
-setHiveContext <- function(sc) {
- if (exists(".testHiveSession", envir = .sparkREnv)) {
- hiveSession <- get(".testHiveSession", envir = .sparkREnv)
- } else {
- # initialize once and reuse
- ssc <- callJMethod(sc, "sc")
- hiveCtx <- tryCatch({
- newJObject("org.apache.spark.sql.hive.test.TestHiveContext", ssc, FALSE)
- },
- error = function(err) {
- skip("Hive is not built with SparkSQL, skipped")
- })
- hiveSession <- callJMethod(hiveCtx, "sparkSession")
- }
- previousSession <- get(".sparkRsession", envir = .sparkREnv)
- assign(".sparkRsession", hiveSession, envir = .sparkREnv)
- assign(".prevSparkRsession", previousSession, envir = .sparkREnv)
- hiveSession
-}
-
-unsetHiveContext <- function() {
- previousSession <- get(".prevSparkRsession", envir = .sparkREnv)
- assign(".sparkRsession", previousSession, envir = .sparkREnv)
- remove(".prevSparkRsession", envir = .sparkREnv)
-}
-
-# Tests for SparkSQL functions in SparkR
-
-filesBefore <- list.files(path = sparkRDir, all.files = TRUE)
-sparkSession <- if (not_cran_or_windows_with_hadoop()) {
- sparkR.session(master = sparkRTestMaster)
- } else {
- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
- }
-sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-mockLines <- c("{\"name\":\"Michael\"}",
- "{\"name\":\"Andy\", \"age\":30}",
- "{\"name\":\"Justin\", \"age\":19}")
-jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
-parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet")
-orcPath <- tempfile(pattern = "sparkr-test", fileext = ".orc")
-writeLines(mockLines, jsonPath)
-
- # For testing NA functions, like dropna(), fillna(), ...
-mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}",
- "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}",
- "{\"name\":\"David\",\"age\":60,\"height\":null}",
- "{\"name\":\"Amy\",\"age\":null,\"height\":null}",
- "{\"name\":null,\"age\":null,\"height\":null}")
-jsonPathNa <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
-writeLines(mockLinesNa, jsonPathNa)
-
- # For testing complex types in DataFrame
-mockLinesComplexType <-
- c("{\"c1\":[1, 2, 3], \"c2\":[\"a\", \"b\", \"c\"], \"c3\":[1.0, 2.0, 3.0]}",
- "{\"c1\":[4, 5, 6], \"c2\":[\"d\", \"e\", \"f\"], \"c3\":[4.0, 5.0, 6.0]}",
- "{\"c1\":[7, 8, 9], \"c2\":[\"g\", \"h\", \"i\"], \"c3\":[7.0, 8.0, 9.0]}")
-complexTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
-writeLines(mockLinesComplexType, complexTypeJsonPath)
-
- # For testing map type and struct type in DataFrame
-mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}",
- "{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}",
- "{\"name\":\"David\",\"info\":{\"age\":60,\"height\":180}}")
-mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
-writeLines(mockLinesMapType, mapTypeJsonPath)
-
-if (.Platform$OS.type == "windows") {
- Sys.setenv(TZ = "GMT")
-}
-
-test_that("calling sparkRSQL.init returns existing SQL context", {
- skip_on_cran()
-
- sqlContext <- suppressWarnings(sparkRSQL.init(sc))
- expect_equal(suppressWarnings(sparkRSQL.init(sc)), sqlContext)
-})
-
-test_that("calling sparkRSQL.init returns existing SparkSession", {
- skip_on_cran()
-
- expect_equal(suppressWarnings(sparkRSQL.init(sc)), sparkSession)
-})
-
-test_that("calling sparkR.session returns existing SparkSession", {
- skip_on_cran()
-
- expect_equal(sparkR.session(), sparkSession)
-})
-
-test_that("infer types and check types", {
- expect_equal(infer_type(1L), "integer")
- expect_equal(infer_type(1.0), "double")
- expect_equal(infer_type("abc"), "string")
- expect_equal(infer_type(TRUE), "boolean")
- expect_equal(infer_type(as.Date("2015-03-11")), "date")
- expect_equal(infer_type(as.POSIXlt("2015-03-11 12:13:04.043")), "timestamp")
- expect_equal(infer_type(c(1L, 2L)), "array<integer>")
- expect_equal(infer_type(list(1L, 2L)), "array<integer>")
- expect_equal(infer_type(listToStruct(list(a = 1L, b = "2"))), "struct<a:integer,b:string>")
- e <- new.env()
- assign("a", 1L, envir = e)
- expect_equal(infer_type(e), "map<string,integer>")
-
- expect_error(checkType("map<integer,integer>"), "Key type in a map must be string or character")
-
- expect_equal(infer_type(as.raw(c(1, 2, 3))), "binary")
-})
-
-test_that("structType and structField", {
- testField <- structField("a", "string")
- expect_is(testField, "structField")
- expect_equal(testField$name(), "a")
- expect_true(testField$nullable())
-
- testSchema <- structType(testField, structField("b", "integer"))
- expect_is(testSchema, "structType")
- expect_is(testSchema$fields()[[2]], "structField")
- expect_equal(testSchema$fields()[[1]]$dataType.toString(), "StringType")
-})
-
-test_that("structField type strings", {
- # positive cases
- primitiveTypes <- list(byte = "ByteType",
- integer = "IntegerType",
- float = "FloatType",
- double = "DoubleType",
- string = "StringType",
- binary = "BinaryType",
- boolean = "BooleanType",
- timestamp = "TimestampType",
- date = "DateType",
- tinyint = "ByteType",
- smallint = "ShortType",
- int = "IntegerType",
- bigint = "LongType",
- decimal = "DecimalType(10,0)")
-
- complexTypes <- list("map<string,integer>" = "MapType(StringType,IntegerType,true)",
- "array<string>" = "ArrayType(StringType,true)",
- "struct<a:string>" = "StructType(StructField(a,StringType,true))")
-
- typeList <- c(primitiveTypes, complexTypes)
- typeStrings <- names(typeList)
-
- for (i in seq_along(typeStrings)){
- typeString <- typeStrings[i]
- expected <- typeList[[i]]
- testField <- structField("_col", typeString)
- expect_is(testField, "structField")
- expect_true(testField$nullable())
- expect_equal(testField$dataType.toString(), expected)
- }
-
- # negative cases
- primitiveErrors <- list(Byte = "Byte",
- INTEGER = "INTEGER",
- numeric = "numeric",
- character = "character",
- raw = "raw",
- logical = "logical",
- short = "short",
- varchar = "varchar",
- long = "long",
- char = "char")
-
- complexErrors <- list("map<string, integer>" = " integer",
- "array<String>" = "String",
- "struct<a:string >" = "string ",
- "map <string,integer>" = "map <string,integer>",
- "array< string>" = " string",
- "struct<a: string>" = " string")
-
- errorList <- c(primitiveErrors, complexErrors)
- typeStrings <- names(errorList)
-
- for (i in seq_along(typeStrings)){
- typeString <- typeStrings[i]
- expected <- paste0("Unsupported type for SparkDataframe: ", errorList[[i]])
- expect_error(structField("_col", typeString), expected)
- }
-})
-
-test_that("create DataFrame from RDD", {
- skip_on_cran()
-
- rdd <- lapply(parallelize(sc, 1:10), function(x) { list(x, as.character(x)) })
- df <- createDataFrame(rdd, list("a", "b"))
- dfAsDF <- as.DataFrame(rdd, list("a", "b"))
- expect_is(df, "SparkDataFrame")
- expect_is(dfAsDF, "SparkDataFrame")
- expect_equal(count(df), 10)
- expect_equal(count(dfAsDF), 10)
- expect_equal(nrow(df), 10)
- expect_equal(nrow(dfAsDF), 10)
- expect_equal(ncol(df), 2)
- expect_equal(ncol(dfAsDF), 2)
- expect_equal(dim(df), c(10, 2))
- expect_equal(dim(dfAsDF), c(10, 2))
- expect_equal(columns(df), c("a", "b"))
- expect_equal(columns(dfAsDF), c("a", "b"))
- expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
- expect_equal(dtypes(dfAsDF), list(c("a", "int"), c("b", "string")))
-
- df <- createDataFrame(rdd)
- dfAsDF <- as.DataFrame(rdd)
- expect_is(df, "SparkDataFrame")
- expect_is(dfAsDF, "SparkDataFrame")
- expect_equal(columns(df), c("_1", "_2"))
- expect_equal(columns(dfAsDF), c("_1", "_2"))
-
- schema <- structType(structField(x = "a", type = "integer", nullable = TRUE),
- structField(x = "b", type = "string", nullable = TRUE))
- df <- createDataFrame(rdd, schema)
- expect_is(df, "SparkDataFrame")
- expect_equal(columns(df), c("a", "b"))
- expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
-
- rdd <- lapply(parallelize(sc, 1:10), function(x) { list(a = x, b = as.character(x)) })
- df <- createDataFrame(rdd)
- expect_is(df, "SparkDataFrame")
- expect_equal(count(df), 10)
- expect_equal(columns(df), c("a", "b"))
- expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
-
- schema <- structType(structField("name", "string"), structField("age", "integer"),
- structField("height", "float"))
- df <- read.df(jsonPathNa, "json", schema)
- df2 <- createDataFrame(toRDD(df), schema)
- df2AsDF <- as.DataFrame(toRDD(df), schema)
- expect_equal(columns(df2), c("name", "age", "height"))
- expect_equal(columns(df2AsDF), c("name", "age", "height"))
- expect_equal(dtypes(df2), list(c("name", "string"), c("age", "int"), c("height", "float")))
- expect_equal(dtypes(df2AsDF), list(c("name", "string"), c("age", "int"), c("height", "float")))
- expect_equal(as.list(collect(where(df2, df2$name == "Bob"))),
- list(name = "Bob", age = 16, height = 176.5))
- expect_equal(as.list(collect(where(df2AsDF, df2AsDF$name == "Bob"))),
- list(name = "Bob", age = 16, height = 176.5))
-
- localDF <- data.frame(name = c("John", "Smith", "Sarah"),
- age = c(19L, 23L, 18L),
- height = c(176.5, 181.4, 173.7))
- df <- createDataFrame(localDF, schema)
- expect_is(df, "SparkDataFrame")
- expect_equal(count(df), 3)
- expect_equal(columns(df), c("name", "age", "height"))
- expect_equal(dtypes(df), list(c("name", "string"), c("age", "int"), c("height", "float")))
- expect_equal(as.list(collect(where(df, df$name == "John"))),
- list(name = "John", age = 19L, height = 176.5))
- expect_equal(getNumPartitions(df), 1)
-
- df <- as.DataFrame(cars, numPartitions = 2)
- expect_equal(getNumPartitions(df), 2)
- df <- createDataFrame(cars, numPartitions = 3)
- expect_equal(getNumPartitions(df), 3)
- # validate limit by num of rows
- df <- createDataFrame(cars, numPartitions = 60)
- expect_equal(getNumPartitions(df), 50)
- # validate when 1 < (length(coll) / numSlices) << length(coll)
- df <- createDataFrame(cars, numPartitions = 20)
- expect_equal(getNumPartitions(df), 20)
-
- df <- as.DataFrame(data.frame(0))
- expect_is(df, "SparkDataFrame")
- df <- createDataFrame(list(list(1)))
- expect_is(df, "SparkDataFrame")
- df <- as.DataFrame(data.frame(0), numPartitions = 2)
- # no data to partition, so it falls back to 1 partition
- expect_equal(getNumPartitions(df), 1)
-
- setHiveContext(sc)
- sql("CREATE TABLE people (name string, age double, height float)")
- df <- read.df(jsonPathNa, "json", schema)
- insertInto(df, "people")
- expect_equal(collect(sql("SELECT age from people WHERE name = 'Bob'"))$age,
- c(16))
- expect_equal(collect(sql("SELECT height from people WHERE name ='Bob'"))$height,
- c(176.5))
- sql("DROP TABLE people")
- unsetHiveContext()
-})
-
-test_that("createDataFrame uses files for large objects", {
- skip_on_cran()
-
- # To simulate a large file scenario, we set spark.r.maxAllocationLimit to a smaller value
- conf <- callJMethod(sparkSession, "conf")
- callJMethod(conf, "set", "spark.r.maxAllocationLimit", "100")
- df <- suppressWarnings(createDataFrame(iris, numPartitions = 3))
- expect_equal(getNumPartitions(df), 3)
-
- # Resetting the conf back to default value
- callJMethod(conf, "set", "spark.r.maxAllocationLimit", toString(.Machine$integer.max / 10))
- expect_equal(dim(df), dim(iris))
-})
-
-test_that("read/write csv as DataFrame", {
- if (not_cran_or_windows_with_hadoop()) {
- csvPath <- tempfile(pattern = "sparkr-test", fileext = ".csv")
- mockLinesCsv <- c("year,make,model,comment,blank",
- "\"2012\",\"Tesla\",\"S\",\"No comment\",",
- "1997,Ford,E350,\"Go get one now they are going fast\",",
- "2015,Chevy,Volt",
- "NA,Dummy,Placeholder")
- writeLines(mockLinesCsv, csvPath)
-
- # default "header" is false; use inferSchema so "year" is read as "int"
- df <- read.df(csvPath, "csv", header = "true", inferSchema = "true")
- expect_equal(count(df), 4)
- expect_equal(columns(df), c("year", "make", "model", "comment", "blank"))
- expect_equal(sort(unlist(collect(where(df, df$year == 2015)))),
- sort(unlist(list(year = 2015, make = "Chevy", model = "Volt"))))
-
- # since "year" is "int", let's skip the NA values
- withoutna <- na.omit(df, how = "any", cols = "year")
- expect_equal(count(withoutna), 3)
-
- unlink(csvPath)
- csvPath <- tempfile(pattern = "sparkr-test", fileext = ".csv")
- mockLinesCsv <- c("year,make,model,comment,blank",
- "\"2012\",\"Tesla\",\"S\",\"No comment\",",
- "1997,Ford,E350,\"Go get one now they are going fast\",",
- "2015,Chevy,Volt",
- "Empty,Dummy,Placeholder")
- writeLines(mockLinesCsv, csvPath)
-
- df2 <- read.df(csvPath, "csv", header = "true", inferSchema = "true", na.strings = "Empty")
- expect_equal(count(df2), 4)
- withoutna2 <- na.omit(df2, how = "any", cols = "year")
- expect_equal(count(withoutna2), 3)
- expect_equal(count(where(withoutna2, withoutna2$make == "Dummy")), 0)
-
- # writing csv file
- csvPath2 <- tempfile(pattern = "csvtest2", fileext = ".csv")
- write.df(df2, path = csvPath2, "csv", header = "true")
- df3 <- read.df(csvPath2, "csv", header = "true")
- expect_equal(nrow(df3), nrow(df2))
- expect_equal(colnames(df3), colnames(df2))
- csv <- read.csv(file = list.files(csvPath2, pattern = "^part", full.names = T)[[1]])
- expect_equal(colnames(df3), colnames(csv))
-
- unlink(csvPath)
- unlink(csvPath2)
- }
-})
-
-test_that("Support other types for options", {
- skip_on_cran()
-
- csvPath <- tempfile(pattern = "sparkr-test", fileext = ".csv")
- mockLinesCsv <- c("year,make,model,comment,blank",
- "\"2012\",\"Tesla\",\"S\",\"No comment\",",
- "1997,Ford,E350,\"Go get one now they are going fast\",",
- "2015,Chevy,Volt",
- "NA,Dummy,Placeholder")
- writeLines(mockLinesCsv, csvPath)
-
- csvDf <- read.df(csvPath, "csv", header = "true", inferSchema = "true")
- expected <- read.df(csvPath, "csv", header = TRUE, inferSchema = TRUE)
- expect_equal(collect(csvDf), collect(expected))
-
- expect_error(read.df(csvPath, "csv", header = TRUE, maxColumns = 3))
- unlink(csvPath)
-})
-
-test_that("convert NAs to null type in DataFrames", {
- rdd <- parallelize(sc, list(list(1L, 2L), list(NA, 4L)))
- df <- createDataFrame(rdd, list("a", "b"))
- expect_true(is.na(collect(df)[2, "a"]))
- expect_equal(collect(df)[2, "b"], 4L)
-
- l <- data.frame(x = 1L, y = c(1L, NA_integer_, 3L))
- df <- createDataFrame(l)
- expect_equal(collect(df)[2, "x"], 1L)
- expect_true(is.na(collect(df)[2, "y"]))
-
- rdd <- parallelize(sc, list(list(1, 2), list(NA, 4)))
- df <- createDataFrame(rdd, list("a", "b"))
- expect_true(is.na(collect(df)[2, "a"]))
- expect_equal(collect(df)[2, "b"], 4)
-
- l <- data.frame(x = 1, y = c(1, NA_real_, 3))
- df <- createDataFrame(l)
- expect_equal(collect(df)[2, "x"], 1)
- expect_true(is.na(collect(df)[2, "y"]))
-
- l <- list("a", "b", NA, "d")
- df <- createDataFrame(l)
- expect_true(is.na(collect(df)[3, "_1"]))
- expect_equal(collect(df)[4, "_1"], "d")
-
- l <- list("a", "b", NA_character_, "d")
- df <- createDataFrame(l)
- expect_true(is.na(collect(df)[3, "_1"]))
- expect_equal(collect(df)[4, "_1"], "d")
-
- l <- list(TRUE, FALSE, NA, TRUE)
- df <- createDataFrame(l)
- expect_true(is.na(collect(df)[3, "_1"]))
- expect_equal(collect(df)[4, "_1"], TRUE)
-})
-
-test_that("toDF", {
- skip_on_cran()
-
- rdd <- lapply(parallelize(sc, 1:10), function(x) { list(x, as.character(x)) })
- df <- toDF(rdd, list("a", "b"))
- expect_is(df, "SparkDataFrame")
- expect_equal(count(df), 10)
- expect_equal(columns(df), c("a", "b"))
- expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
-
- df <- toDF(rdd)
- expect_is(df, "SparkDataFrame")
- expect_equal(columns(df), c("_1", "_2"))
-
- schema <- structType(structField(x = "a", type = "integer", nullable = TRUE),
- structField(x = "b", type = "string", nullable = TRUE))
- df <- toDF(rdd, schema)
- expect_is(df, "SparkDataFrame")
- expect_equal(columns(df), c("a", "b"))
- expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
-
- rdd <- lapply(parallelize(sc, 1:10), function(x) { list(a = x, b = as.character(x)) })
- df <- toDF(rdd)
- expect_is(df, "SparkDataFrame")
- expect_equal(count(df), 10)
- expect_equal(columns(df), c("a", "b"))
- expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
-})
-
-test_that("create DataFrame from list or data.frame", {
- l <- list(list(1, 2), list(3, 4))
- df <- createDataFrame(l, c("a", "b"))
- expect_equal(columns(df), c("a", "b"))
-
- l <- list(list(a = 1, b = 2), list(a = 3, b = 4))
- df <- createDataFrame(l)
- expect_equal(columns(df), c("a", "b"))
-
- a <- 1:3
- b <- c("a", "b", "c")
- ldf <- data.frame(a, b)
- df <- createDataFrame(ldf)
- expect_equal(columns(df), c("a", "b"))
- expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
- expect_equal(count(df), 3)
- ldf2 <- collect(df)
- expect_equal(ldf$a, ldf2$a)
-
- irisdf <- suppressWarnings(createDataFrame(iris))
- iris_collected <- collect(irisdf)
- expect_equivalent(iris_collected[, -5], iris[, -5])
- expect_equal(iris_collected$Species, as.character(iris$Species))
-
- mtcarsdf <- createDataFrame(mtcars)
- expect_equivalent(collect(mtcarsdf), mtcars)
-
- bytes <- as.raw(c(1, 2, 3))
- df <- createDataFrame(list(list(bytes)))
- expect_equal(collect(df)[[1]][[1]], bytes)
-})
-
-test_that("create DataFrame with different data types", {
- l <- list(a = 1L, b = 2, c = TRUE, d = "ss", e = as.Date("2012-12-13"),
- f = as.POSIXct("2015-03-15 12:13:14.056"))
- df <- createDataFrame(list(l))
- expect_equal(dtypes(df), list(c("a", "int"), c("b", "double"), c("c", "boolean"),
- c("d", "string"), c("e", "date"), c("f", "timestamp")))
- expect_equal(count(df), 1)
- expect_equal(collect(df), data.frame(l, stringsAsFactors = FALSE))
-})
-
-test_that("SPARK-17811: can create DataFrame containing NA as date and time", {
- df <- data.frame(
- id = 1:2,
- time = c(as.POSIXlt("2016-01-10"), NA),
- date = c(as.Date("2016-10-01"), NA))
-
- DF <- collect(createDataFrame(df))
- expect_true(is.na(DF$date[2]))
- expect_equal(DF$date[1], as.Date("2016-10-01"))
- expect_true(is.na(DF$time[2]))
- expect_equal(DF$time[1], as.POSIXlt("2016-01-10"))
-})
-
-test_that("create DataFrame with complex types", {
- e <- new.env()
- assign("n", 3L, envir = e)
-
- s <- listToStruct(list(a = "aa", b = 3L))
-
- l <- list(as.list(1:10), list("a", "b"), e, s)
- df <- createDataFrame(list(l), c("a", "b", "c", "d"))
- expect_equal(dtypes(df), list(c("a", "array<int>"),
- c("b", "array<string>"),
- c("c", "map<string,int>"),
- c("d", "struct<a:string,b:int>")))
- expect_equal(count(df), 1)
- ldf <- collect(df)
- expect_equal(names(ldf), c("a", "b", "c", "d"))
- expect_equal(ldf[1, 1][[1]], l[[1]])
- expect_equal(ldf[1, 2][[1]], l[[2]])
-
- e <- ldf$c[[1]]
- expect_equal(class(e), "environment")
- expect_equal(ls(e), "n")
- expect_equal(e$n, 3L)
-
- s <- ldf$d[[1]]
- expect_equal(class(s), "struct")
- expect_equal(s$a, "aa")
- expect_equal(s$b, 3L)
-})
-
-test_that("create DataFrame from a data.frame with complex types", {
- skip_on_cran()
-
- ldf <- data.frame(row.names = 1:2)
- ldf$a_list <- list(list(1, 2), list(3, 4))
- ldf$an_envir <- c(as.environment(list(a = 1, b = 2)), as.environment(list(c = 3)))
-
- sdf <- createDataFrame(ldf)
- collected <- collect(sdf)
-
- expect_identical(ldf[, 1, FALSE], collected[, 1, FALSE])
- expect_equal(ldf$an_envir, collected$an_envir)
-})
-
-test_that("Collect DataFrame with complex types", {
- skip_on_cran()
-
- # ArrayType
- df <- read.json(complexTypeJsonPath)
- ldf <- collect(df)
- expect_equal(nrow(ldf), 3)
- expect_equal(ncol(ldf), 3)
- expect_equal(names(ldf), c("c1", "c2", "c3"))
- expect_equal(ldf$c1, list(list(1, 2, 3), list(4, 5, 6), list(7, 8, 9)))
- expect_equal(ldf$c2, list(list("a", "b", "c"), list("d", "e", "f"), list("g", "h", "i")))
- expect_equal(ldf$c3, list(list(1.0, 2.0, 3.0), list(4.0, 5.0, 6.0), list(7.0, 8.0, 9.0)))
-
- # MapType
- schema <- structType(structField("name", "string"),
- structField("info", "map<string,double>"))
- df <- read.df(mapTypeJsonPath, "json", schema)
- expect_equal(dtypes(df), list(c("name", "string"),
- c("info", "map<string,double>")))
- ldf <- collect(df)
- expect_equal(nrow(ldf), 3)
- expect_equal(ncol(ldf), 2)
- expect_equal(names(ldf), c("name", "info"))
- expect_equal(ldf$name, c("Bob", "Alice", "David"))
- bob <- ldf$info[[1]]
- expect_equal(class(bob), "environment")
- expect_equal(bob$age, 16)
- expect_equal(bob$height, 176.5)
-
- # StructType
- df <- read.json(mapTypeJsonPath)
- expect_equal(dtypes(df), list(c("info", "struct<age:bigint,height:double>"),
- c("name", "string")))
- ldf <- collect(df)
- expect_equal(nrow(ldf), 3)
- expect_equal(ncol(ldf), 2)
- expect_equal(names(ldf), c("info", "name"))
- expect_equal(ldf$name, c("Bob", "Alice", "David"))
- bob <- ldf$info[[1]]
- expect_equal(class(bob), "struct")
- expect_equal(bob$age, 16)
- expect_equal(bob$height, 176.5)
-})
-
-test_that("read/write json files", {
- if (not_cran_or_windows_with_hadoop()) {
- # Test read.df
- df <- read.df(jsonPath, "json")
- expect_is(df, "SparkDataFrame")
- expect_equal(count(df), 3)
-
- # Test read.df with a user defined schema
- schema <- structType(structField("name", type = "string"),
- structField("age", type = "double"))
-
- df1 <- read.df(jsonPath, "json", schema)
- expect_is(df1, "SparkDataFrame")
- expect_equal(dtypes(df1), list(c("name", "string"), c("age", "double")))
-
- # Test loadDF
- df2 <- loadDF(jsonPath, "json", schema)
- expect_is(df2, "SparkDataFrame")
- expect_equal(dtypes(df2), list(c("name", "string"), c("age", "double")))
-
- # Test read.json
- df <- read.json(jsonPath)
- expect_is(df, "SparkDataFrame")
- expect_equal(count(df), 3)
-
- # Test write.df
- jsonPath2 <- tempfile(pattern = "jsonPath2", fileext = ".json")
- write.df(df, jsonPath2, "json", mode = "overwrite")
-
- # Test write.json
- jsonPath3 <- tempfile(pattern = "jsonPath3", fileext = ".json")
- write.json(df, jsonPath3)
-
- # Test read.json()/jsonFile() works with multiple input paths
- jsonDF1 <- read.json(c(jsonPath2, jsonPath3))
- expect_is(jsonDF1, "SparkDataFrame")
- expect_equal(count(jsonDF1), 6)
- # Suppress warnings because jsonFile is deprecated
- jsonDF2 <- suppressWarnings(jsonFile(c(jsonPath2, jsonPath3)))
- expect_is(jsonDF2, "SparkDataFrame")
- expect_equal(count(jsonDF2), 6)
-
- unlink(jsonPath2)
- unlink(jsonPath3)
- }
-})
-
-test_that("read/write json files - compression option", {
- skip_on_cran()
-
- df <- read.df(jsonPath, "json")
-
- jsonPath <- tempfile(pattern = "jsonPath", fileext = ".json")
- write.json(df, jsonPath, compression = "gzip")
- jsonDF <- read.json(jsonPath)
- expect_is(jsonDF, "SparkDataFrame")
- expect_equal(count(jsonDF), count(df))
- expect_true(length(list.files(jsonPath, pattern = ".gz")) > 0)
-
- unlink(jsonPath)
-})
-
-test_that("jsonRDD() on a RDD with json string", {
- skip_on_cran()
-
- sqlContext <- suppressWarnings(sparkRSQL.init(sc))
- rdd <- parallelize(sc, mockLines)
- expect_equal(countRDD(rdd), 3)
- df <- suppressWarnings(jsonRDD(sqlContext, rdd))
- expect_is(df, "SparkDataFrame")
- expect_equal(count(df), 3)
-
- rdd2 <- flatMap(rdd, function(x) c(x, x))
- df <- suppressWarnings(jsonRDD(sqlContext, rdd2))
- expect_is(df, "SparkDataFrame")
- expect_equal(count(df), 6)
-})
-
-test_that("test tableNames and tables", {
- count <- count(listTables())
-
- df <- read.json(jsonPath)
- createOrReplaceTempView(df, "table1")
- expect_equal(length(tableNames()), count + 1)
- expect_equal(length(tableNames("default")), count + 1)
-
- tables <- listTables()
- expect_equal(count(tables), count + 1)
- expect_equal(count(tables()), count(tables))
- expect_true("tableName" %in% colnames(tables()))
- expect_true(all(c("tableName", "database", "isTemporary") %in% colnames(tables())))
-
- suppressWarnings(registerTempTable(df, "table2"))
- tables <- listTables()
- expect_equal(count(tables), count + 2)
- suppressWarnings(dropTempTable("table1"))
- expect_true(dropTempView("table2"))
-
- tables <- listTables()
- expect_equal(count(tables), count + 0)
-})
-
-test_that(
- "createOrReplaceTempView() results in a queryable table and sql() results in a new DataFrame", {
- df <- read.json(jsonPath)
- createOrReplaceTempView(df, "table1")
- newdf <- sql("SELECT * FROM table1 where name = 'Michael'")
- expect_is(newdf, "SparkDataFrame")
- expect_equal(count(newdf), 1)
- expect_true(dropTempView("table1"))
-
- createOrReplaceTempView(df, "dfView")
- sqlCast <- collect(sql("select cast('2' as decimal) as x from dfView limit 1"))
- out <- capture.output(sqlCast)
- expect_true(is.data.frame(sqlCast))
- expect_equal(names(sqlCast)[1], "x")
- expect_equal(nrow(sqlCast), 1)
- expect_equal(ncol(sqlCast), 1)
- expect_equal(out[1], " x")
- expect_equal(out[2], "1 2")
- expect_true(dropTempView("dfView"))
-})
-
-test_that("test cache, uncache and clearCache", {
- skip_on_cran()
-
- df <- read.json(jsonPath)
- createOrReplaceTempView(df, "table1")
- cacheTable("table1")
- uncacheTable("table1")
- clearCache()
- expect_true(dropTempView("table1"))
-
- expect_error(uncacheTable("foo"),
- "Error in uncacheTable : no such table - Table or view 'foo' not found in database 'default'")
-})
-
-test_that("insertInto() on a registered table", {
- if (not_cran_or_windows_with_hadoop()) {
- df <- read.df(jsonPath, "json")
- write.df(df, parquetPath, "parquet", "overwrite")
- dfParquet <- read.df(parquetPath, "parquet")
-
- lines <- c("{\"name\":\"Bob\", \"age\":24}",
- "{\"name\":\"James\", \"age\":35}")
- jsonPath2 <- tempfile(pattern = "jsonPath2", fileext = ".tmp")
- parquetPath2 <- tempfile(pattern = "parquetPath2", fileext = ".parquet")
- writeLines(lines, jsonPath2)
- df2 <- read.df(jsonPath2, "json")
- write.df(df2, parquetPath2, "parquet", "overwrite")
- dfParquet2 <- read.df(parquetPath2, "parquet")
-
- createOrReplaceTempView(dfParquet, "table1")
- insertInto(dfParquet2, "table1")
- expect_equal(count(sql("select * from table1")), 5)
- expect_equal(first(sql("select * from table1 order by age"))$name, "Michael")
- expect_true(dropTempView("table1"))
-
- createOrReplaceTempView(dfParquet, "table1")
- insertInto(dfParquet2, "table1", overwrite = TRUE)
- expect_equal(count(sql("select * from table1")), 2)
- expect_equal(first(sql("select * from table1 order by age"))$name, "Bob")
- expect_true(dropTempView("table1"))
-
- unlink(jsonPath2)
- unlink(parquetPath2)
- }
-})
-
-test_that("tableToDF() returns a new DataFrame", {
- df <- read.json(jsonPath)
- createOrReplaceTempView(df, "table1")
- tabledf <- tableToDF("table1")
- expect_is(tabledf, "SparkDataFrame")
- expect_equal(count(tabledf), 3)
- tabledf2 <- tableToDF("table1")
- expect_equal(count(tabledf2), 3)
- expect_true(dropTempView("table1"))
-})
-
-test_that("toRDD() returns an RRDD", {
- skip_on_cran()
-
- df <- read.json(jsonPath)
- testRDD <- toRDD(df)
- expect_is(testRDD, "RDD")
- expect_equal(countRDD(testRDD), 3)
-})
-
-test_that("union on two RDDs created from DataFrames returns an RRDD", {
- skip_on_cran()
-
- df <- read.json(jsonPath)
- RDD1 <- toRDD(df)
- RDD2 <- toRDD(df)
- unioned <- unionRDD(RDD1, RDD2)
- expect_is(unioned, "RDD")
- expect_equal(getSerializedMode(unioned), "byte")
- expect_equal(collectRDD(unioned)[[2]]$name, "Andy")
-})
-
-test_that("union on mixed serialization types correctly returns a byte RRDD", {
- skip_on_cran()
-
- # Byte RDD
- nums <- 1:10
- rdd <- parallelize(sc, nums, 2L)
-
- # String RDD
- textLines <- c("Michael",
- "Andy, 30",
- "Justin, 19")
- textPath <- tempfile(pattern = "sparkr-textLines", fileext = ".tmp")
- writeLines(textLines, textPath)
- textRDD <- textFile(sc, textPath)
-
- df <- read.json(jsonPath)
- dfRDD <- toRDD(df)
-
- unionByte <- unionRDD(rdd, dfRDD)
- expect_is(unionByte, "RDD")
- expect_equal(getSerializedMode(unionByte), "byte")
- expect_equal(collectRDD(unionByte)[[1]], 1)
- expect_equal(collectRDD(unionByte)[[12]]$name, "Andy")
-
- unionString <- unionRDD(textRDD, dfRDD)
- expect_is(unionString, "RDD")
- expect_equal(getSerializedMode(unionString), "byte")
- expect_equal(collectRDD(unionString)[[1]], "Michael")
- expect_equal(collectRDD(unionString)[[5]]$name, "Andy")
-})
-
-test_that("objectFile() works with row serialization", {
- skip_on_cran()
-
- objectPath <- tempfile(pattern = "spark-test", fileext = ".tmp")
- df <- read.json(jsonPath)
- dfRDD <- toRDD(df)
- saveAsObjectFile(coalesceRDD(dfRDD, 1L), objectPath)
- objectIn <- objectFile(sc, objectPath)
-
- expect_is(objectIn, "RDD")
- expect_equal(getSerializedMode(objectIn), "byte")
- expect_equal(collectRDD(objectIn)[[2]]$age, 30)
-})
-
-test_that("lapply() on a DataFrame returns an RDD with the correct columns", {
- skip_on_cran()
-
- df <- read.json(jsonPath)
- testRDD <- lapply(df, function(row) {
- row$newCol <- row$age + 5
- row
- })
- expect_is(testRDD, "RDD")
- collected <- collectRDD(testRDD)
- expect_equal(collected[[1]]$name, "Michael")
- expect_equal(collected[[2]]$newCol, 35)
-})
-
-test_that("collect() returns a data.frame", {
- df <- read.json(jsonPath)
- rdf <- collect(df)
- expect_true(is.data.frame(rdf))
- expect_equal(names(rdf)[1], "age")
- expect_equal(nrow(rdf), 3)
- expect_equal(ncol(rdf), 2)
-
- # collect() returns data correctly from a DataFrame with 0 rows
- df0 <- limit(df, 0)
- rdf <- collect(df0)
- expect_true(is.data.frame(rdf))
- expect_equal(names(rdf)[1], "age")
- expect_equal(nrow(rdf), 0)
- expect_equal(ncol(rdf), 2)
-
- # collect() correctly handles multiple columns with same name
- df <- createDataFrame(list(list(1, 2)), schema = c("name", "name"))
- ldf <- collect(df)
- expect_equal(names(ldf), c("name", "name"))
-})
-
-test_that("limit() returns DataFrame with the correct number of rows", {
- df <- read.json(jsonPath)
- dfLimited <- limit(df, 2)
- expect_is(dfLimited, "SparkDataFrame")
- expect_equal(count(dfLimited), 2)
-})
-
-test_that("collect() and take() on a DataFrame return the same number of rows and columns", {
- df <- read.json(jsonPath)
- expect_equal(nrow(collect(df)), nrow(take(df, 10)))
- expect_equal(ncol(collect(df)), ncol(take(df, 10)))
-})
-
-test_that("collect() support Unicode characters", {
- lines <- c("{\"name\":\"안녕하세요\"}",
- "{\"name\":\"您好\", \"age\":30}",
- "{\"name\":\"こんにちは\", \"age\":19}",
- "{\"name\":\"Xin chào\"}")
-
- jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- writeLines(lines, jsonPath)
-
- df <- read.df(jsonPath, "json")
- rdf <- collect(df)
- expect_true(is.data.frame(rdf))
- expect_equal(rdf$name[1], markUtf8("안녕하세요"))
- expect_equal(rdf$name[2], markUtf8("您好"))
- expect_equal(rdf$name[3], markUtf8("こんにちは"))
- expect_equal(rdf$name[4], markUtf8("Xin chào"))
-
- df1 <- createDataFrame(rdf)
- expect_equal(collect(where(df1, df1$name == markUtf8("您好")))$name, markUtf8("您好"))
-})
-
-test_that("multiple pipeline transformations result in an RDD with the correct values", {
- skip_on_cran()
-
- df <- read.json(jsonPath)
- first <- lapply(df, function(row) {
- row$age <- row$age + 5
- row
- })
- second <- lapply(first, function(row) {
- row$testCol <- if (row$age == 35 && !is.na(row$age)) TRUE else FALSE
- row
- })
- expect_is(second, "RDD")
- expect_equal(countRDD(second), 3)
- expect_equal(collectRDD(second)[[2]]$age, 35)
- expect_true(collectRDD(second)[[2]]$testCol)
- expect_false(collectRDD(second)[[3]]$testCol)
-})
-
-test_that("cache(), storageLevel(), persist(), and unpersist() on a DataFrame", {
- df <- read.json(jsonPath)
- expect_false(df@env$isCached)
- cache(df)
- expect_true(df@env$isCached)
-
- unpersist(df)
- expect_false(df@env$isCached)
-
- persist(df, "MEMORY_AND_DISK")
- expect_true(df@env$isCached)
-
- expect_equal(storageLevel(df),
- "MEMORY_AND_DISK - StorageLevel(disk, memory, deserialized, 1 replicas)")
-
- unpersist(df)
- expect_false(df@env$isCached)
-
- # make sure the data is collectable
- expect_true(is.data.frame(collect(df)))
-})
-
-test_that("setCheckpointDir(), checkpoint() on a DataFrame", {
- if (not_cran_or_windows_with_hadoop()) {
- checkpointDir <- file.path(tempdir(), "cproot")
- expect_true(length(list.files(path = checkpointDir, all.files = TRUE)) == 0)
-
- setCheckpointDir(checkpointDir)
- df <- read.json(jsonPath)
- df <- checkpoint(df)
- expect_is(df, "SparkDataFrame")
- expect_false(length(list.files(path = checkpointDir, all.files = TRUE)) == 0)
- }
-})
-
-test_that("schema(), dtypes(), columns(), names() return the correct values/format", {
- df <- read.json(jsonPath)
- testSchema <- schema(df)
- expect_equal(length(testSchema$fields()), 2)
- expect_equal(testSchema$fields()[[1]]$dataType.toString(), "LongType")
- expect_equal(testSchema$fields()[[2]]$dataType.simpleString(), "string")
- expect_equal(testSchema$fields()[[1]]$name(), "age")
-
- testTypes <- dtypes(df)
- expect_equal(length(testTypes[[1]]), 2)
- expect_equal(testTypes[[1]][1], "age")
-
- testCols <- columns(df)
- expect_equal(length(testCols), 2)
- expect_equal(testCols[2], "name")
-
- testNames <- names(df)
- expect_equal(length(testNames), 2)
- expect_equal(testNames[2], "name")
-})
-
-test_that("names() colnames() set the column names", {
- df <- read.json(jsonPath)
- names(df) <- c("col1", "col2")
- expect_equal(colnames(df)[2], "col2")
-
- colnames(df) <- c("col3", "col4")
- expect_equal(names(df)[1], "col3")
-
- expect_error(names(df) <- NULL, "Invalid column names.")
- expect_error(names(df) <- c("sepal.length", "sepal_width"),
- "Column names cannot contain the '.' symbol.")
- expect_error(names(df) <- c(1, 2), "Invalid column names.")
- expect_error(names(df) <- c("a"),
- "Column names must have the same length as the number of columns in the dataset.")
- expect_error(names(df) <- c("1", NA), "Column names cannot be NA.")
-
- expect_error(colnames(df) <- c("sepal.length", "sepal_width"),
- "Column names cannot contain the '.' symbol.")
- expect_error(colnames(df) <- c(1, 2), "Invalid column names.")
- expect_error(colnames(df) <- c("a"),
- "Column names must have the same length as the number of columns in the dataset.")
- expect_error(colnames(df) <- c("1", NA), "Column names cannot be NA.")
-
- # Note: if this test is broken, remove check for "." character on colnames<- method
- irisDF <- suppressWarnings(createDataFrame(iris))
- expect_equal(names(irisDF)[1], "Sepal_Length")
-
- # Test base::colnames base::names
- m2 <- cbind(1, 1:4)
- expect_equal(colnames(m2, do.NULL = FALSE), c("col1", "col2"))
- colnames(m2) <- c("x", "Y")
- expect_equal(colnames(m2), c("x", "Y"))
-
- z <- list(a = 1, b = "c", c = 1:3)
- expect_equal(names(z)[3], "c")
- names(z)[3] <- "c2"
- expect_equal(names(z)[3], "c2")
-
- # Test subset assignment
- colnames(df)[1] <- "col5"
- expect_equal(colnames(df)[1], "col5")
- names(df)[2] <- "col6"
- expect_equal(names(df)[2], "col6")
-})
-
-test_that("head() and first() return the correct data", {
- df <- read.json(jsonPath)
- testHead <- head(df)
- expect_equal(nrow(testHead), 3)
- expect_equal(ncol(testHead), 2)
-
- testHead2 <- head(df, 2)
- expect_equal(nrow(testHead2), 2)
- expect_equal(ncol(testHead2), 2)
-
- testFirst <- first(df)
- expect_equal(nrow(testFirst), 1)
-
- # head() and first() return the correct data on
- # a DataFrame with 0 rows
- df0 <- limit(df, 0)
-
- testHead <- head(df0)
- expect_equal(nrow(testHead), 0)
- expect_equal(ncol(testHead), 2)
-
- testFirst <- first(df0)
- expect_equal(nrow(testFirst), 0)
- expect_equal(ncol(testFirst), 2)
-})
-
-test_that("distinct(), unique() and dropDuplicates() on DataFrames", {
- lines <- c("{\"name\":\"Michael\"}",
- "{\"name\":\"Andy\", \"age\":30}",
- "{\"name\":\"Justin\", \"age\":19}",
- "{\"name\":\"Justin\", \"age\":19}")
- jsonPathWithDup <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- writeLines(lines, jsonPathWithDup)
-
- df <- read.json(jsonPathWithDup)
- uniques <- distinct(df)
- expect_is(uniques, "SparkDataFrame")
- expect_equal(count(uniques), 3)
-
- uniques2 <- unique(df)
- expect_is(uniques2, "SparkDataFrame")
- expect_equal(count(uniques2), 3)
-
- # Test dropDuplicates()
- df <- createDataFrame(
- list(
- list(2, 1, 2), list(1, 1, 1),
- list(1, 2, 1), list(2, 1, 2),
- list(2, 2, 2), list(2, 2, 1),
- list(2, 1, 1), list(1, 1, 2),
- list(1, 2, 2), list(1, 2, 1)),
- schema = c("key", "value1", "value2"))
- result <- collect(dropDuplicates(df))
- expected <- rbind.data.frame(
- c(1, 1, 1), c(1, 1, 2), c(1, 2, 1),
- c(1, 2, 2), c(2, 1, 1), c(2, 1, 2),
- c(2, 2, 1), c(2, 2, 2))
- names(expected) <- c("key", "value1", "value2")
- expect_equivalent(
- result[order(result$key, result$value1, result$value2), ],
- expected)
-
- result <- collect(dropDuplicates(df, c("key", "value1")))
- expected <- rbind.data.frame(
- c(1, 1, 1), c(1, 2, 1), c(2, 1, 2), c(2, 2, 2))
- names(expected) <- c("key", "value1", "value2")
- expect_equivalent(
- result[order(result$key, result$value1, result$value2), ],
- expected)
-
- result <- collect(dropDuplicates(df, "key", "value1"))
- expected <- rbind.data.frame(
- c(1, 1, 1), c(1, 2, 1), c(2, 1, 2), c(2, 2, 2))
- names(expected) <- c("key", "value1", "value2")
- expect_equivalent(
- result[order(result$key, result$value1, result$value2), ],
- expected)
-
- result <- collect(dropDuplicates(df, "key"))
- expected <- rbind.data.frame(
- c(1, 1, 1), c(2, 1, 2))
- names(expected) <- c("key", "value1", "value2")
- expect_equivalent(
- result[order(result$key, result$value1, result$value2), ],
- expected)
-})
-
-test_that("sample on a DataFrame", {
- df <- read.json(jsonPath)
- sampled <- sample(df, FALSE, 1.0)
- expect_equal(nrow(collect(sampled)), count(df))
- expect_is(sampled, "SparkDataFrame")
- sampled2 <- sample(df, FALSE, 0.1, 0) # set seed for predictable result
- expect_true(count(sampled2) < 3)
-
- count1 <- count(sample(df, FALSE, 0.1, 0))
- count2 <- count(sample(df, FALSE, 0.1, 0))
- expect_equal(count1, count2)
-
- # Also test sample_frac
- sampled3 <- sample_frac(df, FALSE, 0.1, 0) # set seed for predictable result
- expect_true(count(sampled3) < 3)
-
- # nolint start
- # Test base::sample is working
- #expect_equal(length(sample(1:12)), 12)
- # nolint end
-})
-
-test_that("select operators", {
- df <- select(read.json(jsonPath), "name", "age")
- expect_is(df$name, "Column")
- expect_is(df[[2]], "Column")
- expect_is(df[["age"]], "Column")
-
- expect_warning(df[[1:2]],
- "Subset index has length > 1. Only the first index is used.")
- expect_is(suppressWarnings(df[[1:2]]), "Column")
- expect_warning(df[[c("name", "age")]],
- "Subset index has length > 1. Only the first index is used.")
- expect_is(suppressWarnings(df[[c("name", "age")]]), "Column")
-
- expect_warning(df[[1:2]] <- df[[1]],
- "Subset index has length > 1. Only the first index is used.")
- expect_warning(df[[c("name", "age")]] <- df[[1]],
- "Subset index has length > 1. Only the first index is used.")
-
- expect_is(df[, 1, drop = F], "SparkDataFrame")
- expect_equal(columns(df[, 1, drop = F]), c("name"))
- expect_equal(columns(df[, "age", drop = F]), c("age"))
-
- df2 <- df[, c("age", "name")]
- expect_is(df2, "SparkDataFrame")
- expect_equal(columns(df2), c("age", "name"))
-
- df$age2 <- df$age
- expect_equal(columns(df), c("name", "age", "age2"))
- expect_equal(count(where(df, df$age2 == df$age)), 2)
- df$age2 <- df$age * 2
- expect_equal(columns(df), c("name", "age", "age2"))
- expect_equal(count(where(df, df$age2 == df$age * 2)), 2)
- df$age2 <- df[["age"]] * 3
- expect_equal(columns(df), c("name", "age", "age2"))
- expect_equal(count(where(df, df$age2 == df$age * 3)), 2)
-
- df$age2 <- 21
- expect_equal(columns(df), c("name", "age", "age2"))
- expect_equal(count(where(df, df$age2 == 21)), 3)
-
- df$age2 <- c(22)
- expect_equal(columns(df), c("name", "age", "age2"))
- expect_equal(count(where(df, df$age2 == 22)), 3)
-
- expect_error(df$age3 <- c(22, NA),
- "value must be a Column, literal value as atomic in length of 1, or NULL")
-
- df[["age2"]] <- 23
- expect_equal(columns(df), c("name", "age", "age2"))
- expect_equal(count(where(df, df$age2 == 23)), 3)
-
- df[[3]] <- 24
- expect_equal(columns(df), c("name", "age", "age2"))
- expect_equal(count(where(df, df$age2 == 24)), 3)
-
- df[[3]] <- df$age
- expect_equal(count(where(df, df$age2 == df$age)), 2)
-
- df[["age2"]] <- df[["name"]]
- expect_equal(count(where(df, df$age2 == df$name)), 3)
-
- expect_error(df[["age3"]] <- c(22, 23),
- "value must be a Column, literal value as atomic in length of 1, or NULL")
-
- # Test parameter drop
- expect_equal(class(df[, 1]) == "SparkDataFrame", T)
- expect_equal(class(df[, 1, drop = T]) == "Column", T)
- expect_equal(class(df[, 1, drop = F]) == "SparkDataFrame", T)
- expect_equal(class(df[df$age > 4, 2, drop = T]) == "Column", T)
- expect_equal(class(df[df$age > 4, 2, drop = F]) == "SparkDataFrame", T)
-})
-
-test_that("select with column", {
- df <- read.json(jsonPath)
- df1 <- select(df, "name")
- expect_equal(columns(df1), c("name"))
- expect_equal(count(df1), 3)
-
- df2 <- select(df, df$age)
- expect_equal(columns(df2), c("age"))
- expect_equal(count(df2), 3)
-
- df3 <- select(df, lit("x"))
- expect_equal(columns(df3), c("x"))
- expect_equal(count(df3), 3)
- expect_equal(collect(select(df3, "x"))[[1, 1]], "x")
-
- df4 <- select(df, c("name", "age"))
- expect_equal(columns(df4), c("name", "age"))
- expect_equal(count(df4), 3)
-
- # Test select with alias
- df5 <- alias(df, "table")
-
- expect_equal(columns(select(df5, column("table.name"))), "name")
- expect_equal(columns(select(df5, "table.name")), "name")
-
- # Test that stats::alias is not masked
- expect_is(alias(aov(yield ~ block + N * P * K, npk)), "listof")
-
-
- expect_error(select(df, c("name", "age"), "name"),
- "To select multiple columns, use a character vector or list for col")
-})
-
-test_that("drop column", {
- df <- select(read.json(jsonPath), "name", "age")
- df1 <- drop(df, "name")
- expect_equal(columns(df1), c("age"))
-
- df$age2 <- df$age
- df1 <- drop(df, c("name", "age"))
- expect_equal(columns(df1), c("age2"))
-
- df1 <- drop(df, df$age)
- expect_equal(columns(df1), c("name", "age2"))
-
- df$age2 <- NULL
- expect_equal(columns(df), c("name", "age"))
- df$age3 <- NULL
- expect_equal(columns(df), c("name", "age"))
-
- # Test to make sure base::drop is not masked
- expect_equal(drop(1:3 %*% 2:4), 20)
-})
-
-test_that("subsetting", {
- # read.json returns columns in random order
- df <- select(read.json(jsonPath), "name", "age")
- filtered <- df[df$age > 20, ]
- expect_equal(count(filtered), 1)
- expect_equal(columns(filtered), c("name", "age"))
- expect_equal(collect(filtered)$name, "Andy")
-
- df2 <- df[df$age == 19, 1, drop = F]
- expect_is(df2, "SparkDataFrame")
- expect_equal(count(df2), 1)
- expect_equal(columns(df2), c("name"))
- expect_equal(collect(df2)$name, "Justin")
-
- df3 <- df[df$age > 20, 2, drop = F]
- expect_equal(count(df3), 1)
- expect_equal(columns(df3), c("age"))
-
- df4 <- df[df$age %in% c(19, 30), 1:2]
- expect_equal(count(df4), 2)
- expect_equal(columns(df4), c("name", "age"))
-
- df5 <- df[df$age %in% c(19), c(1, 2)]
- expect_equal(count(df5), 1)
- expect_equal(columns(df5), c("name", "age"))
-
- df6 <- subset(df, df$age %in% c(30), c(1, 2))
- expect_equal(count(df6), 1)
- expect_equal(columns(df6), c("name", "age"))
-
- df7 <- subset(df, select = "name", drop = F)
- expect_equal(count(df7), 3)
- expect_equal(columns(df7), c("name"))
-
- # Test base::subset is working
- expect_equal(nrow(subset(airquality, Temp > 80, select = c(Ozone, Temp))), 68)
-})
-
-test_that("selectExpr() on a DataFrame", {
- df <- read.json(jsonPath)
- selected <- selectExpr(df, "age * 2")
- expect_equal(names(selected), "(age * 2)")
- expect_equal(collect(selected), collect(select(df, df$age * 2L)))
-
- selected2 <- selectExpr(df, "name as newName", "abs(age) as age")
- expect_equal(names(selected2), c("newName", "age"))
- expect_equal(count(selected2), 3)
-})
-
-test_that("expr() on a DataFrame", {
- df <- read.json(jsonPath)
- expect_equal(collect(select(df, expr("abs(-123)")))[1, 1], 123)
-})
-
-test_that("column calculation", {
- df <- read.json(jsonPath)
- d <- collect(select(df, alias(df$age + 1, "age2")))
- expect_equal(names(d), c("age2"))
- df2 <- select(df, lower(df$name), abs(df$age))
- expect_is(df2, "SparkDataFrame")
- expect_equal(count(df2), 3)
-})
-
-test_that("test HiveContext", {
- if (not_cran_or_windows_with_hadoop()) {
- setHiveContext(sc)
-
- schema <- structType(structField("name", "string"), structField("age", "integer"),
- structField("height", "float"))
- createTable("people", source = "json", schema = schema)
- df <- read.df(jsonPathNa, "json", schema)
- insertInto(df, "people")
- expect_equal(collect(sql("SELECT age from people WHERE name = 'Bob'"))$age, c(16))
- sql("DROP TABLE people")
-
- df <- createTable("json", jsonPath, "json")
- expect_is(df, "SparkDataFrame")
- expect_equal(count(df), 3)
- df2 <- sql("select * from json")
- expect_is(df2, "SparkDataFrame")
- expect_equal(count(df2), 3)
-
- jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- saveAsTable(df, "json2", "json", "append", path = jsonPath2)
- df3 <- sql("select * from json2")
- expect_is(df3, "SparkDataFrame")
- expect_equal(count(df3), 3)
- unlink(jsonPath2)
-
- hivetestDataPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- saveAsTable(df, "hivetestbl", path = hivetestDataPath)
- df4 <- sql("select * from hivetestbl")
- expect_is(df4, "SparkDataFrame")
- expect_equal(count(df4), 3)
- unlink(hivetestDataPath)
-
- parquetDataPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- saveAsTable(df, "parquetest", "parquet", mode = "overwrite", path = parquetDataPath)
- df5 <- sql("select * from parquetest")
- expect_is(df5, "SparkDataFrame")
- expect_equal(count(df5), 3)
- unlink(parquetDataPath)
-
- unsetHiveContext()
- }
-})
-
-test_that("column operators", {
- c <- column("a")
- c2 <- (- c + 1 - 2) * 3 / 4.0
- c3 <- (c + c2 - c2) * c2 %% c2
- c4 <- (c > c2) & (c2 <= c3) | (c == c2) & (c2 != c3)
- c5 <- c2 ^ c3 ^ c4
- c6 <- c2 %<=>% c3
- c7 <- !c6
-})
-
-test_that("column functions", {
- skip_on_cran()
-
- c <- column("a")
- c1 <- abs(c) + acos(c) + approxCountDistinct(c) + ascii(c) + asin(c) + atan(c)
- c2 <- avg(c) + base64(c) + bin(c) + bitwiseNOT(c) + cbrt(c) + ceil(c) + cos(c)
- c3 <- cosh(c) + count(c) + crc32(c) + hash(c) + exp(c)
- c4 <- explode(c) + expm1(c) + factorial(c) + first(c) + floor(c) + hex(c)
- c5 <- hour(c) + initcap(c) + last(c) + last_day(c) + length(c)
- c6 <- log(c) + (c) + log1p(c) + log2(c) + lower(c) + ltrim(c) + max(c) + md5(c)
- c7 <- mean(c) + min(c) + month(c) + negate(c) + posexplode(c) + quarter(c)
- c8 <- reverse(c) + rint(c) + round(c) + rtrim(c) + sha1(c) + monotonically_increasing_id()
- c9 <- signum(c) + sin(c) + sinh(c) + size(c) + stddev(c) + soundex(c) + sqrt(c) + sum(c)
- c10 <- sumDistinct(c) + tan(c) + tanh(c) + toDegrees(c) + toRadians(c)
- c11 <- to_date(c) + trim(c) + unbase64(c) + unhex(c) + upper(c)
- c12 <- variance(c)
- c13 <- lead("col", 1) + lead(c, 1) + lag("col", 1) + lag(c, 1)
- c14 <- cume_dist() + ntile(1) + corr(c, c1)
- c15 <- dense_rank() + percent_rank() + rank() + row_number()
- c16 <- is.nan(c) + isnan(c) + isNaN(c)
- c17 <- cov(c, c1) + cov("c", "c1") + covar_samp(c, c1) + covar_samp("c", "c1")
- c18 <- covar_pop(c, c1) + covar_pop("c", "c1")
- c19 <- spark_partition_id() + coalesce(c) + coalesce(c1, c2, c3)
- c20 <- to_timestamp(c) + to_timestamp(c, "yyyy") + to_date(c, "yyyy")
- c21 <- posexplode_outer(c) + explode_outer(c)
- c22 <- not(c)
-
- # Test if base::is.nan() is exposed
- expect_equal(is.nan(c("a", "b")), c(FALSE, FALSE))
-
- # Test if base::rank() is exposed
- expect_equal(class(rank())[[1]], "Column")
- expect_equal(rank(1:3), as.numeric(c(1:3)))
-
- df <- read.json(jsonPath)
- df2 <- select(df, between(df$age, c(20, 30)), between(df$age, c(10, 20)))
- expect_equal(collect(df2)[[2, 1]], TRUE)
- expect_equal(collect(df2)[[2, 2]], FALSE)
- expect_equal(collect(df2)[[3, 1]], FALSE)
- expect_equal(collect(df2)[[3, 2]], TRUE)
-
- # Test that input_file_name() returns the name of the source file
- actual_names <- sort(collect(distinct(select(df, input_file_name()))))
- expect_equal(length(actual_names), 1)
- expect_equal(basename(actual_names[1, 1]), basename(jsonPath))
-
- df3 <- select(df, between(df$name, c("Apache", "Spark")))
- expect_equal(collect(df3)[[1, 1]], TRUE)
- expect_equal(collect(df3)[[2, 1]], FALSE)
- expect_equal(collect(df3)[[3, 1]], TRUE)
-
- df4 <- select(df, countDistinct(df$age, df$name))
- expect_equal(collect(df4)[[1, 1]], 2)
-
- expect_equal(collect(select(df, sum(df$age)))[1, 1], 49)
- expect_true(abs(collect(select(df, stddev(df$age)))[1, 1] - 7.778175) < 1e-6)
- expect_equal(collect(select(df, var_pop(df$age)))[1, 1], 30.25)
-
- df5 <- createDataFrame(list(list(a = "010101")))
- expect_equal(collect(select(df5, conv(df5$a, 2, 16)))[1, 1], "15")
-
- # Test array_contains() and sort_array()
- df <- createDataFrame(list(list(list(1L, 2L, 3L)), list(list(6L, 5L, 4L))))
- result <- collect(select(df, array_contains(df[[1]], 1L)))[[1]]
- expect_equal(result, c(TRUE, FALSE))
-
- result <- collect(select(df, sort_array(df[[1]], FALSE)))[[1]]
- expect_equal(result, list(list(3L, 2L, 1L), list(6L, 5L, 4L)))
- result <- collect(select(df, sort_array(df[[1]])))[[1]]
- expect_equal(result, list(list(1L, 2L, 3L), list(4L, 5L, 6L)))
-
- # Test that stats::lag is working
- expect_equal(length(lag(ldeaths, 12)), 72)
-
- # Test struct()
- df <- createDataFrame(list(list(1L, 2L, 3L), list(4L, 5L, 6L)),
- schema = c("a", "b", "c"))
- result <- collect(select(df, alias(struct("a", "c"), "d")))
- expected <- data.frame(row.names = 1:2)
- expected$"d" <- list(listToStruct(list(a = 1L, c = 3L)),
- listToStruct(list(a = 4L, c = 6L)))
- expect_equal(result, expected)
-
- result <- collect(select(df, alias(struct(df$a, df$b), "d")))
- expected <- data.frame(row.names = 1:2)
- expected$"d" <- list(listToStruct(list(a = 1L, b = 2L)),
- listToStruct(list(a = 4L, b = 5L)))
- expect_equal(result, expected)
-
- # Test encode(), decode()
- bytes <- as.raw(c(0xe5, 0xa4, 0xa7, 0xe5, 0x8d, 0x83, 0xe4, 0xb8, 0x96, 0xe7, 0x95, 0x8c))
- df <- createDataFrame(list(list(markUtf8("大千世界"), "utf-8", bytes)),
- schema = c("a", "b", "c"))
- result <- collect(select(df, encode(df$a, "utf-8"), decode(df$c, "utf-8")))
- expect_equal(result[[1]][[1]], bytes)
- expect_equal(result[[2]], markUtf8("大千世界"))
-
- # Test first(), last()
- df <- read.json(jsonPath)
- expect_equal(collect(select(df, first(df$age)))[[1]], NA_real_)
- expect_equal(collect(select(df, first(df$age, TRUE)))[[1]], 30)
- expect_equal(collect(select(df, first("age")))[[1]], NA_real_)
- expect_equal(collect(select(df, first("age", TRUE)))[[1]], 30)
- expect_equal(collect(select(df, last(df$age)))[[1]], 19)
- expect_equal(collect(select(df, last(df$age, TRUE)))[[1]], 19)
- expect_equal(collect(select(df, last("age")))[[1]], 19)
- expect_equal(collect(select(df, last("age", TRUE)))[[1]], 19)
-
- # Test bround()
- df <- createDataFrame(data.frame(x = c(2.5, 3.5)))
- expect_equal(collect(select(df, bround(df$x, 0)))[[1]][1], 2)
- expect_equal(collect(select(df, bround(df$x, 0)))[[1]][2], 4)
-
- # Test to_json(), from_json()
- df <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people")
- j <- collect(select(df, alias(to_json(df$people), "json")))
- expect_equal(j[order(j$json), ][1], "[{\"name\":\"Bob\"},{\"name\":\"Alice\"}]")
-
- df <- read.json(mapTypeJsonPath)
- j <- collect(select(df, alias(to_json(df$info), "json")))
- expect_equal(j[order(j$json), ][1], "{\"age\":16,\"height\":176.5}")
- df <- as.DataFrame(j)
- schema <- structType(structField("age", "integer"),
- structField("height", "double"))
- s <- collect(select(df, alias(from_json(df$json, schema), "structcol")))
- expect_equal(ncol(s), 1)
- expect_equal(nrow(s), 3)
- expect_is(s[[1]][[1]], "struct")
- expect_true(any(apply(s, 1, function(x) { x[[1]]$age == 16 } )))
-
- # passing option
- df <- as.DataFrame(list(list("col" = "{\"date\":\"21/10/2014\"}")))
- schema2 <- structType(structField("date", "date"))
- s <- collect(select(df, from_json(df$col, schema2)))
- expect_equal(s[[1]][[1]], NA)
- s <- collect(select(df, from_json(df$col, schema2, dateFormat = "dd/MM/yyyy")))
- expect_is(s[[1]][[1]]$date, "Date")
- expect_equal(as.character(s[[1]][[1]]$date), "2014-10-21")
-
- # check that unparseable input yields NA
- df <- as.DataFrame(list(list("a" = "")))
- expect_equal(collect(select(df, from_json(df$a, schema)))[[1]][[1]], NA)
-
- # check that a JSON array encoded in a string column is correctly supported.
- jsonArr <- "[{\"name\":\"Bob\"}, {\"name\":\"Alice\"}]"
- df <- as.DataFrame(list(list("people" = jsonArr)))
- schema <- structType(structField("name", "string"))
- arr <- collect(select(df, alias(from_json(df$people, schema, as.json.array = TRUE), "arrcol")))
- expect_equal(ncol(arr), 1)
- expect_equal(nrow(arr), 1)
- expect_is(arr[[1]][[1]], "list")
- expect_equal(length(arr$arrcol[[1]]), 2)
- expect_equal(arr$arrcol[[1]][[1]]$name, "Bob")
- expect_equal(arr$arrcol[[1]][[2]]$name, "Alice")
-
- # Test create_array() and create_map()
- df <- as.DataFrame(data.frame(
- x = c(1.0, 2.0), y = c(-1.0, 3.0), z = c(-2.0, 5.0)
- ))
-
- arrs <- collect(select(df, create_array(df$x, df$y, df$z)))
- expect_equal(arrs[, 1], list(list(1, -1, -2), list(2, 3, 5)))
-
- maps <- collect(select(
- df, create_map(lit("x"), df$x, lit("y"), df$y, lit("z"), df$z)))
-
- expect_equal(
- maps[, 1],
- lapply(
- list(list(x = 1, y = -1, z = -2), list(x = 2, y = 3, z = 5)),
- as.environment))
-
- df <- as.DataFrame(data.frame(is_true = c(TRUE, FALSE, NA)))
- expect_equal(
- collect(select(df, alias(not(df$is_true), "is_false"))),
- data.frame(is_false = c(FALSE, TRUE, NA))
- )
-})
-
-test_that("column binary mathfunctions", {
- lines <- c("{\"a\":1, \"b\":5}",
- "{\"a\":2, \"b\":6}",
- "{\"a\":3, \"b\":7}",
- "{\"a\":4, \"b\":8}")
- jsonPathWithDup <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- writeLines(lines, jsonPathWithDup)
- df <- read.json(jsonPathWithDup)
- expect_equal(collect(select(df, atan2(df$a, df$b)))[1, "ATAN2(a, b)"], atan2(1, 5))
- expect_equal(collect(select(df, atan2(df$a, df$b)))[2, "ATAN2(a, b)"], atan2(2, 6))
- expect_equal(collect(select(df, atan2(df$a, df$b)))[3, "ATAN2(a, b)"], atan2(3, 7))
- expect_equal(collect(select(df, atan2(df$a, df$b)))[4, "ATAN2(a, b)"], atan2(4, 8))
- ## nolint start
- expect_equal(collect(select(df, hypot(df$a, df$b)))[1, "HYPOT(a, b)"], sqrt(1^2 + 5^2))
- expect_equal(collect(select(df, hypot(df$a, df$b)))[2, "HYPOT(a, b)"], sqrt(2^2 + 6^2))
- expect_equal(collect(select(df, hypot(df$a, df$b)))[3, "HYPOT(a, b)"], sqrt(3^2 + 7^2))
- expect_equal(collect(select(df, hypot(df$a, df$b)))[4, "HYPOT(a, b)"], sqrt(4^2 + 8^2))
- ## nolint end
- expect_equal(collect(select(df, shiftLeft(df$b, 1)))[4, 1], 16)
- expect_equal(collect(select(df, shiftRight(df$b, 1)))[4, 1], 4)
- expect_equal(collect(select(df, shiftRightUnsigned(df$b, 1)))[4, 1], 4)
- expect_equal(class(collect(select(df, rand()))[2, 1]), "numeric")
- expect_equal(collect(select(df, rand(1)))[1, 1], 0.134, tolerance = 0.01)
- expect_equal(class(collect(select(df, randn()))[2, 1]), "numeric")
- expect_equal(collect(select(df, randn(1)))[1, 1], -1.03, tolerance = 0.01)
-})
-
-test_that("string operators", {
- df <- read.json(jsonPath)
- expect_equal(count(where(df, like(df$name, "A%"))), 1)
- expect_equal(count(where(df, startsWith(df$name, "A"))), 1)
- expect_true(first(select(df, startsWith(df$name, "M")))[[1]])
- expect_false(first(select(df, startsWith(df$name, "m")))[[1]])
- expect_true(first(select(df, endsWith(df$name, "el")))[[1]])
- expect_equal(first(select(df, substr(df$name, 1, 2)))[[1]], "Mi")
- if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) {
- expect_true(startsWith("Hello World", "Hello"))
- expect_false(endsWith("Hello World", "a"))
- }
- expect_equal(collect(select(df, cast(df$age, "string")))[[2, 1]], "30")
- expect_equal(collect(select(df, concat(df$name, lit(":"), df$age)))[[2, 1]], "Andy:30")
- expect_equal(collect(select(df, concat_ws(":", df$name)))[[2, 1]], "Andy")
- expect_equal(collect(select(df, concat_ws(":", df$name, df$age)))[[2, 1]], "Andy:30")
- expect_equal(collect(select(df, instr(df$name, "i")))[, 1], c(2, 0, 5))
- expect_equal(collect(select(df, format_number(df$age, 2)))[2, 1], "30.00")
- expect_equal(collect(select(df, sha1(df$name)))[2, 1],
- "ab5a000e88b5d9d0fa2575f5c6263eb93452405d")
- expect_equal(collect(select(df, sha2(df$name, 256)))[2, 1],
- "80f2aed3c618c423ddf05a2891229fba44942d907173152442cf6591441ed6dc")
- expect_equal(collect(select(df, format_string("Name:%s", df$name)))[2, 1], "Name:Andy")
- expect_equal(collect(select(df, format_string("%s, %d", df$name, df$age)))[2, 1], "Andy, 30")
- expect_equal(collect(select(df, regexp_extract(df$name, "(n.y)", 1)))[2, 1], "ndy")
- expect_equal(collect(select(df, regexp_replace(df$name, "(n.y)", "ydn")))[2, 1], "Aydn")
-
- l2 <- list(list(a = "aaads"))
- df2 <- createDataFrame(l2)
- expect_equal(collect(select(df2, locate("aa", df2$a)))[1, 1], 1)
- expect_equal(collect(select(df2, locate("aa", df2$a, 2)))[1, 1], 2)
- expect_equal(collect(select(df2, lpad(df2$a, 8, "#")))[1, 1], "###aaads") # nolint
- expect_equal(collect(select(df2, rpad(df2$a, 8, "#")))[1, 1], "aaads###") # nolint
-
- l3 <- list(list(a = "a.b.c.d"))
- df3 <- createDataFrame(l3)
- expect_equal(collect(select(df3, substring_index(df3$a, ".", 2)))[1, 1], "a.b")
- expect_equal(collect(select(df3, substring_index(df3$a, ".", -3)))[1, 1], "b.c.d")
- expect_equal(collect(select(df3, translate(df3$a, "bc", "12")))[1, 1], "a.1.2.d")
-
- l4 <- list(list(a = "a.b@c.d 1\\b"))
- df4 <- createDataFrame(l4)
- expect_equal(
- collect(select(df4, split_string(df4$a, "\\s+")))[1, 1],
- list(list("a.b@c.d", "1\\b"))
- )
- expect_equal(
- collect(select(df4, split_string(df4$a, "\\.")))[1, 1],
- list(list("a", "b@c", "d 1\\b"))
- )
- expect_equal(
- collect(select(df4, split_string(df4$a, "@")))[1, 1],
- list(list("a.b", "c.d 1\\b"))
- )
- expect_equal(
- collect(select(df4, split_string(df4$a, "\\\\")))[1, 1],
- list(list("a.b@c.d 1", "b"))
- )
-
- l5 <- list(list(a = "abc"))
- df5 <- createDataFrame(l5)
- expect_equal(
- collect(select(df5, repeat_string(df5$a, 1L)))[1, 1],
- "abc"
- )
- expect_equal(
- collect(select(df5, repeat_string(df5$a, 3)))[1, 1],
- "abcabcabc"
- )
- expect_equal(
- collect(select(df5, repeat_string(df5$a, -1)))[1, 1],
- ""
- )
-})
-
-test_that("date functions on a DataFrame", {
- .originalTimeZone <- Sys.getenv("TZ")
- Sys.setenv(TZ = "UTC")
- l <- list(list(a = 1L, b = as.Date("2012-12-13")),
- list(a = 2L, b = as.Date("2013-12-14")),
- list(a = 3L, b = as.Date("2014-12-15")))
- df <- createDataFrame(l)
- expect_equal(collect(select(df, dayofmonth(df$b)))[, 1], c(13, 14, 15))
- expect_equal(collect(select(df, dayofyear(df$b)))[, 1], c(348, 348, 349))
- expect_equal(collect(select(df, weekofyear(df$b)))[, 1], c(50, 50, 51))
- expect_equal(collect(select(df, year(df$b)))[, 1], c(2012, 2013, 2014))
- expect_equal(collect(select(df, month(df$b)))[, 1], c(12, 12, 12))
- expect_equal(collect(select(df, last_day(df$b)))[, 1],
- c(as.Date("2012-12-31"), as.Date("2013-12-31"), as.Date("2014-12-31")))
- expect_equal(collect(select(df, next_day(df$b, "MONDAY")))[, 1],
- c(as.Date("2012-12-17"), as.Date("2013-12-16"), as.Date("2014-12-22")))
- expect_equal(collect(select(df, date_format(df$b, "y")))[, 1], c("2012", "2013", "2014"))
- expect_equal(collect(select(df, add_months(df$b, 3)))[, 1],
- c(as.Date("2013-03-13"), as.Date("2014-03-14"), as.Date("2015-03-15")))
- expect_equal(collect(select(df, date_add(df$b, 1)))[, 1],
- c(as.Date("2012-12-14"), as.Date("2013-12-15"), as.Date("2014-12-16")))
- expect_equal(collect(select(df, date_sub(df$b, 1)))[, 1],
- c(as.Date("2012-12-12"), as.Date("2013-12-13"), as.Date("2014-12-14")))
-
- l2 <- list(list(a = 1L, b = as.POSIXlt("2012-12-13 12:34:00", tz = "UTC")),
- list(a = 2L, b = as.POSIXlt("2014-12-15 01:24:34", tz = "UTC")))
- df2 <- createDataFrame(l2)
- expect_equal(collect(select(df2, minute(df2$b)))[, 1], c(34, 24))
- expect_equal(collect(select(df2, second(df2$b)))[, 1], c(0, 34))
- expect_equal(collect(select(df2, from_utc_timestamp(df2$b, "JST")))[, 1],
- c(as.POSIXlt("2012-12-13 21:34:00 UTC"), as.POSIXlt("2014-12-15 10:24:34 UTC")))
- expect_equal(collect(select(df2, to_utc_timestamp(df2$b, "JST")))[, 1],
- c(as.POSIXlt("2012-12-13 03:34:00 UTC"), as.POSIXlt("2014-12-14 16:24:34 UTC")))
- expect_gt(collect(select(df2, unix_timestamp()))[1, 1], 0)
- expect_gt(collect(select(df2, unix_timestamp(df2$b)))[1, 1], 0)
- expect_gt(collect(select(df2, unix_timestamp(lit("2015-01-01"), "yyyy-MM-dd")))[1, 1], 0)
-
- l3 <- list(list(a = 1000), list(a = -1000))
- df3 <- createDataFrame(l3)
- result31 <- collect(select(df3, from_unixtime(df3$a)))
- expect_equal(grep("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", result31[, 1], perl = TRUE),
- c(1, 2))
- result32 <- collect(select(df3, from_unixtime(df3$a, "yyyy")))
- expect_equal(grep("\\d{4}", result32[, 1]), c(1, 2))
- Sys.setenv(TZ = .originalTimeZone)
-})
-
-test_that("greatest() and least() on a DataFrame", {
- l <- list(list(a = 1, b = 2), list(a = 3, b = 4))
- df <- createDataFrame(l)
- expect_equal(collect(select(df, greatest(df$a, df$b)))[, 1], c(2, 4))
- expect_equal(collect(select(df, least(df$a, df$b)))[, 1], c(1, 3))
-})
-
-test_that("time windowing (window()) with all inputs", {
- df <- createDataFrame(data.frame(t = c("2016-03-11 09:00:07"), v = c(1)))
- df$window <- window(df$t, "5 seconds", "5 seconds", "0 seconds")
- local <- collect(df)$v
- # Not checking time windows because of possible time zone issues. Just checking that the function
- # works
- expect_equal(local, c(1))
-})
-
-test_that("time windowing (window()) with slide duration", {
- df <- createDataFrame(data.frame(t = c("2016-03-11 09:00:07"), v = c(1)))
- df$window <- window(df$t, "5 seconds", "2 seconds")
- local <- collect(df)$v
- # Not checking time windows because of possible time zone issues. Just checking that the function
- # works
- expect_equal(local, c(1, 1))
-})
-
-test_that("time windowing (window()) with start time", {
- df <- createDataFrame(data.frame(t = c("2016-03-11 09:00:07"), v = c(1)))
- df$window <- window(df$t, "5 seconds", startTime = "2 seconds")
- local <- collect(df)$v
- # Not checking time windows because of possible time zone issues. Just checking that the function
- # works
- expect_equal(local, c(1))
-})
-
-test_that("time windowing (window()) with just window duration", {
- df <- createDataFrame(data.frame(t = c("2016-03-11 09:00:07"), v = c(1)))
- df$window <- window(df$t, "5 seconds")
- local <- collect(df)$v
- # Not checking time windows because of possible time zone issues. Just checking that the function
- # works
- expect_equal(local, c(1))
-})
-
-test_that("when(), otherwise() and ifelse() on a DataFrame", {
- l <- list(list(a = 1, b = 2), list(a = 3, b = 4))
- df <- createDataFrame(l)
- expect_equal(collect(select(df, when(df$a > 1 & df$b > 2, 1)))[, 1], c(NA, 1))
- expect_equal(collect(select(df, otherwise(when(df$a > 1, 1), 0)))[, 1], c(0, 1))
- expect_equal(collect(select(df, ifelse(df$a > 1 & df$b > 2, 0, 1)))[, 1], c(1, 0))
-})
-
-test_that("when(), otherwise() and ifelse() with column on a DataFrame", {
- l <- list(list(a = 1, b = 2), list(a = 3, b = 4))
- df <- createDataFrame(l)
- expect_equal(collect(select(df, when(df$a > 1 & df$b > 2, lit(1))))[, 1], c(NA, 1))
- expect_equal(collect(select(df, otherwise(when(df$a > 1, lit(1)), lit(0))))[, 1], c(0, 1))
- expect_equal(collect(select(df, ifelse(df$a > 1 & df$b > 2, lit(0), lit(1))))[, 1], c(1, 0))
-})
-
-test_that("group by, agg functions", {
- skip_on_cran()
-
- df <- read.json(jsonPath)
- df1 <- agg(df, name = "max", age = "sum")
- expect_equal(1, count(df1))
- df1 <- agg(df, age2 = max(df$age))
- expect_equal(1, count(df1))
- expect_equal(columns(df1), c("age2"))
-
- gd <- groupBy(df, "name")
- expect_is(gd, "GroupedData")
- df2 <- count(gd)
- expect_is(df2, "SparkDataFrame")
- expect_equal(3, count(df2))
-
- # Also test group_by, summarize, mean
- gd1 <- group_by(df, "name")
- expect_is(gd1, "GroupedData")
- df_summarized <- summarize(gd, mean_age = mean(df$age))
- expect_is(df_summarized, "SparkDataFrame")
- expect_equal(3, count(df_summarized))
-
- df3 <- agg(gd, age = "stddev")
- expect_is(df3, "SparkDataFrame")
- df3_local <- collect(df3)
- expect_true(is.nan(df3_local[df3_local$name == "Andy", ][1, 2]))
-
- df4 <- agg(gd, sumAge = sum(df$age))
- expect_is(df4, "SparkDataFrame")
- expect_equal(3, count(df4))
- expect_equal(columns(df4), c("name", "sumAge"))
-
- df5 <- sum(gd, "age")
- expect_is(df5, "SparkDataFrame")
- expect_equal(3, count(df5))
-
- expect_equal(3, count(mean(gd)))
- expect_equal(3, count(max(gd)))
- expect_equal(30, collect(max(gd))[2, 2])
- expect_equal(1, collect(count(gd))[1, 2])
-
- mockLines2 <- c("{\"name\":\"ID1\", \"value\": \"10\"}",
- "{\"name\":\"ID1\", \"value\": \"10\"}",
- "{\"name\":\"ID1\", \"value\": \"22\"}",
- "{\"name\":\"ID2\", \"value\": \"-3\"}")
- jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- writeLines(mockLines2, jsonPath2)
- gd2 <- groupBy(read.json(jsonPath2), "name")
- df6 <- agg(gd2, value = "sum")
- df6_local <- collect(df6)
- expect_equal(42, df6_local[df6_local$name == "ID1", ][1, 2])
- expect_equal(-3, df6_local[df6_local$name == "ID2", ][1, 2])
-
- df7 <- agg(gd2, value = "stddev")
- df7_local <- collect(df7)
- expect_true(abs(df7_local[df7_local$name == "ID1", ][1, 2] - 6.928203) < 1e-6)
- expect_true(is.nan(df7_local[df7_local$name == "ID2", ][1, 2]))
-
- mockLines3 <- c("{\"name\":\"Andy\", \"age\":30}",
- "{\"name\":\"Andy\", \"age\":30}",
- "{\"name\":\"Justin\", \"age\":19}",
- "{\"name\":\"Justin\", \"age\":1}")
- jsonPath3 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- writeLines(mockLines3, jsonPath3)
- df8 <- read.json(jsonPath3)
- gd3 <- groupBy(df8, "name")
- gd3_local <- collect(sum(gd3))
- expect_equal(60, gd3_local[gd3_local$name == "Andy", ][1, 2])
- expect_equal(20, gd3_local[gd3_local$name == "Justin", ][1, 2])
-
- expect_true(abs(collect(agg(df, sd(df$age)))[1, 1] - 7.778175) < 1e-6)
- gd3_local <- collect(agg(gd3, var(df8$age)))
- expect_equal(162, gd3_local[gd3_local$name == "Justin", ][1, 2])
-
- # Test stats::sd, stats::var are working
- expect_true(abs(sd(1:2) - 0.7071068) < 1e-6)
- expect_true(abs(var(1:5, 1:5) - 2.5) < 1e-6)
-
- # Test collect_list and collect_set
- gd3_collections_local <- collect(
- agg(gd3, collect_set(df8$age), collect_list(df8$age))
- )
-
- expect_equal(
- unlist(gd3_collections_local[gd3_collections_local$name == "Andy", 2]),
- c(30)
- )
-
- expect_equal(
- unlist(gd3_collections_local[gd3_collections_local$name == "Andy", 3]),
- c(30, 30)
- )
-
- expect_equal(
- sort(unlist(
- gd3_collections_local[gd3_collections_local$name == "Justin", 3]
- )),
- c(1, 19)
- )
-
- unlink(jsonPath2)
- unlink(jsonPath3)
-})
-
-test_that("pivot GroupedData column", {
- df <- createDataFrame(data.frame(
- earnings = c(10000, 10000, 11000, 15000, 12000, 20000, 21000, 22000),
- course = c("R", "Python", "R", "Python", "R", "Python", "R", "Python"),
- year = c(2013, 2013, 2014, 2014, 2015, 2015, 2016, 2016)
- ))
- sum1 <- collect(sum(pivot(groupBy(df, "year"), "course"), "earnings"))
- sum2 <- collect(sum(pivot(groupBy(df, "year"), "course", c("Python", "R")), "earnings"))
- sum3 <- collect(sum(pivot(groupBy(df, "year"), "course", list("Python", "R")), "earnings"))
- sum4 <- collect(sum(pivot(groupBy(df, "year"), "course", "R"), "earnings"))
-
- correct_answer <- data.frame(
- year = c(2013, 2014, 2015, 2016),
- Python = c(10000, 15000, 20000, 22000),
- R = c(10000, 11000, 12000, 21000)
- )
- expect_equal(sum1, correct_answer)
- expect_equal(sum2, correct_answer)
- expect_equal(sum3, correct_answer)
- expect_equal(sum4, correct_answer[, c("year", "R")])
-
- expect_error(collect(sum(pivot(groupBy(df, "year"), "course", c("R", "R")), "earnings")))
- expect_error(collect(sum(pivot(groupBy(df, "year"), "course", list("R", "R")), "earnings")))
-})
-
-test_that("test multi-dimensional aggregations with cube and rollup", {
- df <- createDataFrame(data.frame(
- id = 1:6,
- year = c(2016, 2016, 2016, 2017, 2017, 2017),
- salary = c(10000, 15000, 20000, 22000, 32000, 21000),
- department = c("management", "rnd", "sales", "management", "rnd", "sales")
- ))
-
- actual_cube <- collect(
- orderBy(
- agg(
- cube(df, "year", "department"),
- expr("sum(salary) AS total_salary"),
- expr("avg(salary) AS average_salary"),
- alias(grouping_bit(df$year), "grouping_year"),
- alias(grouping_bit(df$department), "grouping_department"),
- alias(grouping_id(df$year, df$department), "grouping_id")
- ),
- "year", "department"
- )
- )
-
- expected_cube <- data.frame(
- year = c(rep(NA, 4), rep(2016, 4), rep(2017, 4)),
- department = rep(c(NA, "management", "rnd", "sales"), times = 3),
- total_salary = c(
- 120000, # Total
- 10000 + 22000, 15000 + 32000, 20000 + 21000, # Department only
- 20000 + 15000 + 10000, # 2016
- 10000, 15000, 20000, # 2016 each department
- 21000 + 32000 + 22000, # 2017
- 22000, 32000, 21000 # 2017 each department
- ),
- average_salary = c(
- # Total
- mean(c(20000, 15000, 10000, 21000, 32000, 22000)),
- # Mean by department
- mean(c(10000, 22000)), mean(c(15000, 32000)), mean(c(20000, 21000)),
- mean(c(10000, 15000, 20000)), # 2016
- 10000, 15000, 20000, # 2016 each department
- mean(c(21000, 32000, 22000)), # 2017
- 22000, 32000, 21000 # 2017 each department
- ),
- grouping_year = c(
- 1, # global
- 1, 1, 1, # by department
- 0, # 2016
- 0, 0, 0, # 2016 by department
- 0, # 2017
- 0, 0, 0 # 2017 by department
- ),
- grouping_department = c(
- 1, # global
- 0, 0, 0, # by department
- 1, # 2016
- 0, 0, 0, # 2016 by department
- 1, # 2017
- 0, 0, 0 # 2017 by department
- ),
- grouping_id = c(
- 3, # 11
- 2, 2, 2, # 10
- 1, # 01
- 0, 0, 0, # 00
- 1, # 01
- 0, 0, 0 # 00
- ),
- stringsAsFactors = FALSE
- )
-
- expect_equal(actual_cube, expected_cube)
-
- # cube should accept column objects
- expect_equal(
- count(sum(cube(df, df$year, df$department), "salary")),
- 12
- )
-
- # cube without columns should result in a single aggregate
- expect_equal(
- collect(agg(cube(df), expr("sum(salary) as total_salary"))),
- data.frame(total_salary = 120000)
- )
-
- actual_rollup <- collect(
- orderBy(
- agg(
- rollup(df, "year", "department"),
- expr("sum(salary) AS total_salary"), expr("avg(salary) AS average_salary"),
- alias(grouping_bit(df$year), "grouping_year"),
- alias(grouping_bit(df$department), "grouping_department"),
- alias(grouping_id(df$year, df$department), "grouping_id")
- ),
- "year", "department"
- )
- )
-
- expected_rollup <- data.frame(
- year = c(NA, rep(2016, 4), rep(2017, 4)),
- department = c(NA, rep(c(NA, "management", "rnd", "sales"), times = 2)),
- total_salary = c(
- 120000, # Total
- 20000 + 15000 + 10000, # 2016
- 10000, 15000, 20000, # 2016 each department
- 21000 + 32000 + 22000, # 2017
- 22000, 32000, 21000 # 2017 each department
- ),
- average_salary = c(
- # Total
- mean(c(20000, 15000, 10000, 21000, 32000, 22000)),
- mean(c(10000, 15000, 20000)), # 2016
- 10000, 15000, 20000, # 2016 each department
- mean(c(21000, 32000, 22000)), # 2017
- 22000, 32000, 21000 # 2017 each department
- ),
- grouping_year = c(
- 1, # global
- 0, # 2016
- 0, 0, 0, # 2016 each department
- 0, # 2017
- 0, 0, 0 # 2017 each department
- ),
- grouping_department = c(
- 1, # global
- 1, # 2016
- 0, 0, 0, # 2016 each department
- 1, # 2017
- 0, 0, 0 # 2017 each department
- ),
- grouping_id = c(
- 3, # 11
- 1, # 01
- 0, 0, 0, # 00
- 1, # 01
- 0, 0, 0 # 00
- ),
- stringsAsFactors = FALSE
- )
-
- expect_equal(actual_rollup, expected_rollup)
-
- # cube should accept column objects
- expect_equal(
- count(sum(rollup(df, df$year, df$department), "salary")),
- 9
- )
-
- # rollup without columns should result in a single aggregate
- expect_equal(
- collect(agg(rollup(df), expr("sum(salary) as total_salary"))),
- data.frame(total_salary = 120000)
- )
-})
-
-test_that("arrange() and orderBy() on a DataFrame", {
- df <- read.json(jsonPath)
- sorted <- arrange(df, df$age)
- expect_equal(collect(sorted)[1, 2], "Michael")
-
- sorted2 <- arrange(df, "name", decreasing = FALSE)
- expect_equal(collect(sorted2)[2, "age"], 19)
-
- sorted3 <- orderBy(df, asc(df$age))
- expect_true(is.na(first(sorted3)$age))
- expect_equal(collect(sorted3)[2, "age"], 19)
-
- sorted4 <- orderBy(df, desc(df$name))
- expect_equal(first(sorted4)$name, "Michael")
- expect_equal(collect(sorted4)[3, "name"], "Andy")
-
- sorted5 <- arrange(df, "age", "name", decreasing = TRUE)
- expect_equal(collect(sorted5)[1, 2], "Andy")
-
- sorted6 <- arrange(df, "age", "name", decreasing = c(T, F))
- expect_equal(collect(sorted6)[1, 2], "Andy")
-
- sorted7 <- arrange(df, "name", decreasing = FALSE)
- expect_equal(collect(sorted7)[2, "age"], 19)
-})
-
-test_that("filter() on a DataFrame", {
- df <- read.json(jsonPath)
- filtered <- filter(df, "age > 20")
- expect_equal(count(filtered), 1)
- expect_equal(collect(filtered)$name, "Andy")
- filtered2 <- where(df, df$name != "Michael")
- expect_equal(count(filtered2), 2)
- expect_equal(collect(filtered2)$age[2], 19)
-
- # test suites for %in%
- filtered3 <- filter(df, "age in (19)")
- expect_equal(count(filtered3), 1)
- filtered4 <- filter(df, "age in (19, 30)")
- expect_equal(count(filtered4), 2)
- filtered5 <- where(df, df$age %in% c(19))
- expect_equal(count(filtered5), 1)
- filtered6 <- where(df, df$age %in% c(19, 30))
- expect_equal(count(filtered6), 2)
-
- # test suites for %<=>%
- dfNa <- read.json(jsonPathNa)
- expect_equal(count(filter(dfNa, dfNa$age %<=>% 60)), 1)
- expect_equal(count(filter(dfNa, !(dfNa$age %<=>% 60))), 5 - 1)
- expect_equal(count(filter(dfNa, dfNa$age %<=>% NULL)), 3)
- expect_equal(count(filter(dfNa, !(dfNa$age %<=>% NULL))), 5 - 3)
- # match NA from two columns
- expect_equal(count(filter(dfNa, dfNa$age %<=>% dfNa$height)), 2)
- expect_equal(count(filter(dfNa, !(dfNa$age %<=>% dfNa$height))), 5 - 2)
-
- # Test stats::filter is working
- #expect_true(is.ts(filter(1:100, rep(1, 3)))) # nolint
-})
-
-test_that("join(), crossJoin() and merge() on a DataFrame", {
- skip_on_cran()
-
- df <- read.json(jsonPath)
-
- mockLines2 <- c("{\"name\":\"Michael\", \"test\": \"yes\"}",
- "{\"name\":\"Andy\", \"test\": \"no\"}",
- "{\"name\":\"Justin\", \"test\": \"yes\"}",
- "{\"name\":\"Bob\", \"test\": \"yes\"}")
- jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- writeLines(mockLines2, jsonPath2)
- df2 <- read.json(jsonPath2)
-
- # inner join, not cartesian join
- expect_equal(count(where(join(df, df2), df$name == df2$name)), 3)
- # cartesian join
- expect_error(tryCatch(count(join(df, df2)), error = function(e) { stop(e) }),
- paste0(".*(org.apache.spark.sql.AnalysisException: Detected cartesian product for",
- " INNER join between logical plans).*"))
-
- joined <- crossJoin(df, df2)
- expect_equal(names(joined), c("age", "name", "name", "test"))
- expect_equal(count(joined), 12)
- expect_equal(names(collect(joined)), c("age", "name", "name", "test"))
-
- joined2 <- join(df, df2, df$name == df2$name)
- expect_equal(names(joined2), c("age", "name", "name", "test"))
- expect_equal(count(joined2), 3)
-
- joined3 <- join(df, df2, df$name == df2$name, "rightouter")
- expect_equal(names(joined3), c("age", "name", "name", "test"))
- expect_equal(count(joined3), 4)
- expect_true(is.na(collect(orderBy(joined3, joined3$age))$age[2]))
-
- joined4 <- select(join(df, df2, df$name == df2$name, "outer"),
- alias(df$age + 5, "newAge"), df$name, df2$test)
- expect_equal(names(joined4), c("newAge", "name", "test"))
- expect_equal(count(joined4), 4)
- expect_equal(collect(orderBy(joined4, joined4$name))$newAge[3], 24)
-
- joined5 <- join(df, df2, df$name == df2$name, "leftouter")
- expect_equal(names(joined5), c("age", "name", "name", "test"))
- expect_equal(count(joined5), 3)
- expect_true(is.na(collect(orderBy(joined5, joined5$age))$age[1]))
-
- joined6 <- join(df, df2, df$name == df2$name, "inner")
- expect_equal(names(joined6), c("age", "name", "name", "test"))
- expect_equal(count(joined6), 3)
-
- joined7 <- join(df, df2, df$name == df2$name, "leftsemi")
- expect_equal(names(joined7), c("age", "name"))
- expect_equal(count(joined7), 3)
-
- joined8 <- join(df, df2, df$name == df2$name, "left_outer")
- expect_equal(names(joined8), c("age", "name", "name", "test"))
- expect_equal(count(joined8), 3)
- expect_true(is.na(collect(orderBy(joined8, joined8$age))$age[1]))
-
- joined9 <- join(df, df2, df$name == df2$name, "right_outer")
- expect_equal(names(joined9), c("age", "name", "name", "test"))
- expect_equal(count(joined9), 4)
- expect_true(is.na(collect(orderBy(joined9, joined9$age))$age[2]))
-
- merged <- merge(df, df2, by.x = "name", by.y = "name", all.x = TRUE, all.y = TRUE)
- expect_equal(count(merged), 4)
- expect_equal(names(merged), c("age", "name_x", "name_y", "test"))
- expect_equal(collect(orderBy(merged, merged$name_x))$age[3], 19)
-
- merged <- merge(df, df2, suffixes = c("-X", "-Y"))
- expect_equal(count(merged), 3)
- expect_equal(names(merged), c("age", "name-X", "name-Y", "test"))
- expect_equal(collect(orderBy(merged, merged$"name-X"))$age[1], 30)
-
- merged <- merge(df, df2, by = "name", suffixes = c("-X", "-Y"), sort = FALSE)
- expect_equal(count(merged), 3)
- expect_equal(names(merged), c("age", "name-X", "name-Y", "test"))
- expect_equal(collect(orderBy(merged, merged$"name-Y"))$"name-X"[3], "Michael")
-
- merged <- merge(df, df2, by = "name", all = T, sort = T)
- expect_equal(count(merged), 4)
- expect_equal(names(merged), c("age", "name_x", "name_y", "test"))
- expect_equal(collect(orderBy(merged, merged$"name_y"))$"name_x"[1], "Andy")
-
- merged <- merge(df, df2, by = NULL)
- expect_equal(count(merged), 12)
- expect_equal(names(merged), c("age", "name", "name", "test"))
-
- mockLines3 <- c("{\"name\":\"Michael\", \"name_y\":\"Michael\", \"test\": \"yes\"}",
- "{\"name\":\"Andy\", \"name_y\":\"Andy\", \"test\": \"no\"}",
- "{\"name\":\"Justin\", \"name_y\":\"Justin\", \"test\": \"yes\"}",
- "{\"name\":\"Bob\", \"name_y\":\"Bob\", \"test\": \"yes\"}")
- jsonPath3 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- writeLines(mockLines3, jsonPath3)
- df3 <- read.json(jsonPath3)
- expect_error(merge(df, df3),
- paste("The following column name: name_y occurs more than once in the 'DataFrame'.",
- "Please use different suffixes for the intersected columns.", sep = ""))
-
- unlink(jsonPath2)
- unlink(jsonPath3)
-
- # Join with broadcast hint
- df1 <- sql("SELECT * FROM range(10e10)")
- df2 <- sql("SELECT * FROM range(10e10)")
-
- execution_plan <- capture.output(explain(join(df1, df2, df1$id == df2$id)))
- expect_false(any(grepl("BroadcastHashJoin", execution_plan)))
-
- execution_plan_hint <- capture.output(
- explain(join(df1, hint(df2, "broadcast"), df1$id == df2$id))
- )
- expect_true(any(grepl("BroadcastHashJoin", execution_plan_hint)))
-
- execution_plan_broadcast <- capture.output(
- explain(join(df1, broadcast(df2), df1$id == df2$id))
- )
- expect_true(any(grepl("BroadcastHashJoin", execution_plan_broadcast)))
-})
-
-test_that("toJSON() on DataFrame", {
- df <- as.DataFrame(cars)
- df_json <- toJSON(df)
- expect_is(df_json, "SparkDataFrame")
- expect_equal(colnames(df_json), c("value"))
- expect_equal(head(df_json, 1),
- data.frame(value = "{\"speed\":4.0,\"dist\":2.0}", stringsAsFactors = FALSE))
-})
-
-test_that("showDF()", {
- df <- read.json(jsonPath)
- expected <- paste("+----+-------+\n",
- "| age| name|\n",
- "+----+-------+\n",
- "|null|Michael|\n",
- "| 30| Andy|\n",
- "| 19| Justin|\n",
- "+----+-------+\n", sep = "")
- expected2 <- paste("+---+----+\n",
- "|age|name|\n",
- "+---+----+\n",
- "|nul| Mic|\n",
- "| 30| And|\n",
- "| 19| Jus|\n",
- "+---+----+\n", sep = "")
- expect_output(showDF(df), expected)
- expect_output(showDF(df, truncate = 3), expected2)
-})
-
-test_that("isLocal()", {
- df <- read.json(jsonPath)
- expect_false(isLocal(df))
-})
-
-test_that("union(), rbind(), except(), and intersect() on a DataFrame", {
- df <- read.json(jsonPath)
-
- lines <- c("{\"name\":\"Bob\", \"age\":24}",
- "{\"name\":\"Andy\", \"age\":30}",
- "{\"name\":\"James\", \"age\":35}")
- jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- writeLines(lines, jsonPath2)
- df2 <- read.df(jsonPath2, "json")
-
- unioned <- arrange(union(df, df2), df$age)
- expect_is(unioned, "SparkDataFrame")
- expect_equal(count(unioned), 6)
- expect_equal(first(unioned)$name, "Michael")
- expect_equal(count(arrange(suppressWarnings(unionAll(df, df2)), df$age)), 6)
-
- unioned2 <- arrange(rbind(unioned, df, df2), df$age)
- expect_is(unioned2, "SparkDataFrame")
- expect_equal(count(unioned2), 12)
- expect_equal(first(unioned2)$name, "Michael")
-
- df3 <- df2
- names(df3)[1] <- "newName"
- expect_error(rbind(df, df3),
- "Names of input data frames are different.")
- expect_error(rbind(df, df2, df3),
- "Names of input data frames are different.")
-
- excepted <- arrange(except(df, df2), desc(df$age))
- expect_is(unioned, "SparkDataFrame")
- expect_equal(count(excepted), 2)
- expect_equal(first(excepted)$name, "Justin")
-
- intersected <- arrange(intersect(df, df2), df$age)
- expect_is(unioned, "SparkDataFrame")
- expect_equal(count(intersected), 1)
- expect_equal(first(intersected)$name, "Andy")
-
- # Test base::union is working
- expect_equal(union(c(1:3), c(3:5)), c(1:5))
-
- # Test base::rbind is working
- expect_equal(length(rbind(1:4, c = 2, a = 10, 10, deparse.level = 0)), 16)
-
- # Test base::intersect is working
- expect_equal(length(intersect(1:20, 3:23)), 18)
-
- unlink(jsonPath2)
-})
-
-test_that("withColumn() and withColumnRenamed()", {
- df <- read.json(jsonPath)
- newDF <- withColumn(df, "newAge", df$age + 2)
- expect_equal(length(columns(newDF)), 3)
- expect_equal(columns(newDF)[3], "newAge")
- expect_equal(first(filter(newDF, df$name != "Michael"))$newAge, 32)
-
- # Replace existing column
- newDF <- withColumn(df, "age", df$age + 2)
- expect_equal(length(columns(newDF)), 2)
- expect_equal(first(filter(newDF, df$name != "Michael"))$age, 32)
-
- newDF <- withColumn(df, "age", 18)
- expect_equal(length(columns(newDF)), 2)
- expect_equal(first(newDF)$age, 18)
-
- expect_error(withColumn(df, "age", list("a")),
- "Literal value must be atomic in length of 1")
-
- newDF2 <- withColumnRenamed(df, "age", "newerAge")
- expect_equal(length(columns(newDF2)), 2)
- expect_equal(columns(newDF2)[1], "newerAge")
-})
-
-test_that("mutate(), transform(), rename() and names()", {
- df <- read.json(jsonPath)
- newDF <- mutate(df, newAge = df$age + 2)
- expect_equal(length(columns(newDF)), 3)
- expect_equal(columns(newDF)[3], "newAge")
- expect_equal(first(filter(newDF, df$name != "Michael"))$newAge, 32)
-
- newDF <- mut
<TRUNCATED>
[2/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic
tests only for CRAN
Posted by fe...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
new file mode 100644
index 0000000..c790d02
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -0,0 +1,3474 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("SparkSQL functions")
+
+# Utility function for easily checking the values of a StructField
+checkStructField <- function(actual, expectedName, expectedType, expectedNullable) {
+ expect_equal(class(actual), "structField")
+ expect_equal(actual$name(), expectedName)
+ expect_equal(actual$dataType.toString(), expectedType)
+ expect_equal(actual$nullable(), expectedNullable)
+}
+
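+# Mark a string as UTF-8 encoded so comparisons with values collected from Spark are encoding-consistent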
+markUtf8 <- function(s) {
+ Encoding(s) <- "UTF-8"
+ s
+}
+
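+# Swap the active SparkSession for a Hive-enabled test session so Hive-dependent tests can run;
+# the previous session is saved and can be restored with unsetHiveContext()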
+setHiveContext <- function(sc) {
+ if (exists(".testHiveSession", envir = .sparkREnv)) {
+ hiveSession <- get(".testHiveSession", envir = .sparkREnv)
+ } else {
+ # initialize once and reuse
+ ssc <- callJMethod(sc, "sc")
+ hiveCtx <- tryCatch({
+ newJObject("org.apache.spark.sql.hive.test.TestHiveContext", ssc, FALSE)
+ },
+ error = function(err) {
+ skip("Hive is not build with SparkSQL, skipped")
+ })
+ hiveSession <- callJMethod(hiveCtx, "sparkSession")
+ }
+ previousSession <- get(".sparkRsession", envir = .sparkREnv)
+ assign(".sparkRsession", hiveSession, envir = .sparkREnv)
+ assign(".prevSparkRsession", previousSession, envir = .sparkREnv)
+ hiveSession
+}
+
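+# Restore the SparkSession that was active before setHiveContext() was called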
+unsetHiveContext <- function() {
+ previousSession <- get(".prevSparkRsession", envir = .sparkREnv)
+ assign(".sparkRsession", previousSession, envir = .sparkREnv)
+ remove(".prevSparkRsession", envir = .sparkREnv)
+}
+
+# Tests for SparkSQL functions in SparkR
+
+filesBefore <- list.files(path = sparkRDir, all.files = TRUE)
+sparkSession <- if (not_cran_or_windows_with_hadoop()) {
+ sparkR.session(master = sparkRTestMaster)
+ } else {
+ sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+ }
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+mockLines <- c("{\"name\":\"Michael\"}",
+ "{\"name\":\"Andy\", \"age\":30}",
+ "{\"name\":\"Justin\", \"age\":19}")
+jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet")
+orcPath <- tempfile(pattern = "sparkr-test", fileext = ".orc")
+writeLines(mockLines, jsonPath)
+
+# For testing NA functions like dropna(), fillna(), ...
+mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}",
+ "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}",
+ "{\"name\":\"David\",\"age\":60,\"height\":null}",
+ "{\"name\":\"Amy\",\"age\":null,\"height\":null}",
+ "{\"name\":null,\"age\":null,\"height\":null}")
+jsonPathNa <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+writeLines(mockLinesNa, jsonPathNa)
+
+# For testing complex types in DataFrame
+mockLinesComplexType <-
+ c("{\"c1\":[1, 2, 3], \"c2\":[\"a\", \"b\", \"c\"], \"c3\":[1.0, 2.0, 3.0]}",
+ "{\"c1\":[4, 5, 6], \"c2\":[\"d\", \"e\", \"f\"], \"c3\":[4.0, 5.0, 6.0]}",
+ "{\"c1\":[7, 8, 9], \"c2\":[\"g\", \"h\", \"i\"], \"c3\":[7.0, 8.0, 9.0]}")
+complexTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+writeLines(mockLinesComplexType, complexTypeJsonPath)
+
+# For testing map type and struct type in DataFrame
+mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}",
+ "{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}",
+ "{\"name\":\"David\",\"info\":{\"age\":60,\"height\":180}}")
+mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+writeLines(mockLinesMapType, mapTypeJsonPath)
+
+if (.Platform$OS.type == "windows") {
+ Sys.setenv(TZ = "GMT")
+}
+
+test_that("calling sparkRSQL.init returns existing SQL context", {
+ skip_on_cran()
+
+ sqlContext <- suppressWarnings(sparkRSQL.init(sc))
+ expect_equal(suppressWarnings(sparkRSQL.init(sc)), sqlContext)
+})
+
+test_that("calling sparkRSQL.init returns existing SparkSession", {
+ skip_on_cran()
+
+ expect_equal(suppressWarnings(sparkRSQL.init(sc)), sparkSession)
+})
+
+test_that("calling sparkR.session returns existing SparkSession", {
+ skip_on_cran()
+
+ expect_equal(sparkR.session(), sparkSession)
+})
+
+test_that("infer types and check types", {
+ expect_equal(infer_type(1L), "integer")
+ expect_equal(infer_type(1.0), "double")
+ expect_equal(infer_type("abc"), "string")
+ expect_equal(infer_type(TRUE), "boolean")
+ expect_equal(infer_type(as.Date("2015-03-11")), "date")
+ expect_equal(infer_type(as.POSIXlt("2015-03-11 12:13:04.043")), "timestamp")
+ expect_equal(infer_type(c(1L, 2L)), "array<integer>")
+ expect_equal(infer_type(list(1L, 2L)), "array<integer>")
+ expect_equal(infer_type(listToStruct(list(a = 1L, b = "2"))), "struct<a:integer,b:string>")
+ e <- new.env()
+ assign("a", 1L, envir = e)
+ expect_equal(infer_type(e), "map<string,integer>")
+
+ expect_error(checkType("map<integer,integer>"), "Key type in a map must be string or character")
+
+ expect_equal(infer_type(as.raw(c(1, 2, 3))), "binary")
+})
+
+test_that("structType and structField", {
+ testField <- structField("a", "string")
+ expect_is(testField, "structField")
+ expect_equal(testField$name(), "a")
+ expect_true(testField$nullable())
+
+ testSchema <- structType(testField, structField("b", "integer"))
+ expect_is(testSchema, "structType")
+ expect_is(testSchema$fields()[[2]], "structField")
+ expect_equal(testSchema$fields()[[1]]$dataType.toString(), "StringType")
+})
+
+test_that("structField type strings", {
+ # positive cases
+ primitiveTypes <- list(byte = "ByteType",
+ integer = "IntegerType",
+ float = "FloatType",
+ double = "DoubleType",
+ string = "StringType",
+ binary = "BinaryType",
+ boolean = "BooleanType",
+ timestamp = "TimestampType",
+ date = "DateType",
+ tinyint = "ByteType",
+ smallint = "ShortType",
+ int = "IntegerType",
+ bigint = "LongType",
+ decimal = "DecimalType(10,0)")
+
+ complexTypes <- list("map<string,integer>" = "MapType(StringType,IntegerType,true)",
+ "array<string>" = "ArrayType(StringType,true)",
+ "struct<a:string>" = "StructType(StructField(a,StringType,true))")
+
+ typeList <- c(primitiveTypes, complexTypes)
+ typeStrings <- names(typeList)
+
+ for (i in seq_along(typeStrings)){
+ typeString <- typeStrings[i]
+ expected <- typeList[[i]]
+ testField <- structField("_col", typeString)
+ expect_is(testField, "structField")
+ expect_true(testField$nullable())
+ expect_equal(testField$dataType.toString(), expected)
+ }
+
+ # negative cases
+ primitiveErrors <- list(Byte = "Byte",
+ INTEGER = "INTEGER",
+ numeric = "numeric",
+ character = "character",
+ raw = "raw",
+ logical = "logical",
+ short = "short",
+ varchar = "varchar",
+ long = "long",
+ char = "char")
+
+ complexErrors <- list("map<string, integer>" = " integer",
+ "array<String>" = "String",
+ "struct<a:string >" = "string ",
+ "map <string,integer>" = "map <string,integer>",
+ "array< string>" = " string",
+ "struct<a: string>" = " string")
+
+ errorList <- c(primitiveErrors, complexErrors)
+ typeStrings <- names(errorList)
+
+ for (i in seq_along(typeStrings)){
+ typeString <- typeStrings[i]
+ expected <- paste0("Unsupported type for SparkDataframe: ", errorList[[i]])
+ expect_error(structField("_col", typeString), expected)
+ }
+})
+
+test_that("create DataFrame from RDD", {
+ skip_on_cran()
+
+ rdd <- lapply(parallelize(sc, 1:10), function(x) { list(x, as.character(x)) })
+ df <- createDataFrame(rdd, list("a", "b"))
+ dfAsDF <- as.DataFrame(rdd, list("a", "b"))
+ expect_is(df, "SparkDataFrame")
+ expect_is(dfAsDF, "SparkDataFrame")
+ expect_equal(count(df), 10)
+ expect_equal(count(dfAsDF), 10)
+ expect_equal(nrow(df), 10)
+ expect_equal(nrow(dfAsDF), 10)
+ expect_equal(ncol(df), 2)
+ expect_equal(ncol(dfAsDF), 2)
+ expect_equal(dim(df), c(10, 2))
+ expect_equal(dim(dfAsDF), c(10, 2))
+ expect_equal(columns(df), c("a", "b"))
+ expect_equal(columns(dfAsDF), c("a", "b"))
+ expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
+ expect_equal(dtypes(dfAsDF), list(c("a", "int"), c("b", "string")))
+
+ df <- createDataFrame(rdd)
+ dfAsDF <- as.DataFrame(rdd)
+ expect_is(df, "SparkDataFrame")
+ expect_is(dfAsDF, "SparkDataFrame")
+ expect_equal(columns(df), c("_1", "_2"))
+ expect_equal(columns(dfAsDF), c("_1", "_2"))
+
+ schema <- structType(structField(x = "a", type = "integer", nullable = TRUE),
+ structField(x = "b", type = "string", nullable = TRUE))
+ df <- createDataFrame(rdd, schema)
+ expect_is(df, "SparkDataFrame")
+ expect_equal(columns(df), c("a", "b"))
+ expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
+
+ rdd <- lapply(parallelize(sc, 1:10), function(x) { list(a = x, b = as.character(x)) })
+ df <- createDataFrame(rdd)
+ expect_is(df, "SparkDataFrame")
+ expect_equal(count(df), 10)
+ expect_equal(columns(df), c("a", "b"))
+ expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
+
+ schema <- structType(structField("name", "string"), structField("age", "integer"),
+ structField("height", "float"))
+ df <- read.df(jsonPathNa, "json", schema)
+ df2 <- createDataFrame(toRDD(df), schema)
+ df2AsDF <- as.DataFrame(toRDD(df), schema)
+ expect_equal(columns(df2), c("name", "age", "height"))
+ expect_equal(columns(df2AsDF), c("name", "age", "height"))
+ expect_equal(dtypes(df2), list(c("name", "string"), c("age", "int"), c("height", "float")))
+ expect_equal(dtypes(df2AsDF), list(c("name", "string"), c("age", "int"), c("height", "float")))
+ expect_equal(as.list(collect(where(df2, df2$name == "Bob"))),
+ list(name = "Bob", age = 16, height = 176.5))
+ expect_equal(as.list(collect(where(df2AsDF, df2AsDF$name == "Bob"))),
+ list(name = "Bob", age = 16, height = 176.5))
+
+ localDF <- data.frame(name = c("John", "Smith", "Sarah"),
+ age = c(19L, 23L, 18L),
+ height = c(176.5, 181.4, 173.7))
+ df <- createDataFrame(localDF, schema)
+ expect_is(df, "SparkDataFrame")
+ expect_equal(count(df), 3)
+ expect_equal(columns(df), c("name", "age", "height"))
+ expect_equal(dtypes(df), list(c("name", "string"), c("age", "int"), c("height", "float")))
+ expect_equal(as.list(collect(where(df, df$name == "John"))),
+ list(name = "John", age = 19L, height = 176.5))
+ expect_equal(getNumPartitions(df), 1)
+
+ df <- as.DataFrame(cars, numPartitions = 2)
+ expect_equal(getNumPartitions(df), 2)
+ df <- createDataFrame(cars, numPartitions = 3)
+ expect_equal(getNumPartitions(df), 3)
+ # validate that numPartitions is capped at the number of rows
+ df <- createDataFrame(cars, numPartitions = 60)
+ expect_equal(getNumPartitions(df), 50)
+ # validate when 1 < (length(coll) / numSlices) << length(coll)
+ df <- createDataFrame(cars, numPartitions = 20)
+ expect_equal(getNumPartitions(df), 20)
+
+ df <- as.DataFrame(data.frame(0))
+ expect_is(df, "SparkDataFrame")
+ df <- createDataFrame(list(list(1)))
+ expect_is(df, "SparkDataFrame")
+ df <- as.DataFrame(data.frame(0), numPartitions = 2)
+ # no data to partition, so the partition count falls back to 1
+ expect_equal(getNumPartitions(df), 1)
+
+ setHiveContext(sc)
+ sql("CREATE TABLE people (name string, age double, height float)")
+ df <- read.df(jsonPathNa, "json", schema)
+ insertInto(df, "people")
+ expect_equal(collect(sql("SELECT age from people WHERE name = 'Bob'"))$age,
+ c(16))
+ expect_equal(collect(sql("SELECT height from people WHERE name ='Bob'"))$height,
+ c(176.5))
+ sql("DROP TABLE people")
+ unsetHiveContext()
+})
+
+test_that("createDataFrame uses files for large objects", {
+ skip_on_cran()
+
+ # To simulate a large file scenario, we set spark.r.maxAllocationLimit to a smaller value
+ conf <- callJMethod(sparkSession, "conf")
+ callJMethod(conf, "set", "spark.r.maxAllocationLimit", "100")
+ df <- suppressWarnings(createDataFrame(iris, numPartitions = 3))
+ expect_equal(getNumPartitions(df), 3)
+
+ # Resetting the conf back to default value
+ callJMethod(conf, "set", "spark.r.maxAllocationLimit", toString(.Machine$integer.max / 10))
+ expect_equal(dim(df), dim(iris))
+})
+
+test_that("read/write csv as DataFrame", {
+ if (not_cran_or_windows_with_hadoop()) {
+ csvPath <- tempfile(pattern = "sparkr-test", fileext = ".csv")
+ mockLinesCsv <- c("year,make,model,comment,blank",
+ "\"2012\",\"Tesla\",\"S\",\"No comment\",",
+ "1997,Ford,E350,\"Go get one now they are going fast\",",
+ "2015,Chevy,Volt",
+ "NA,Dummy,Placeholder")
+ writeLines(mockLinesCsv, csvPath)
+
+ # default "header" is false, inferSchema to handle "year" as "int"
+ df <- read.df(csvPath, "csv", header = "true", inferSchema = "true")
+ expect_equal(count(df), 4)
+ expect_equal(columns(df), c("year", "make", "model", "comment", "blank"))
+ expect_equal(sort(unlist(collect(where(df, df$year == 2015)))),
+ sort(unlist(list(year = 2015, make = "Chevy", model = "Volt"))))
+
+ # since "year" is "int", let's skip the NA values
+ withoutna <- na.omit(df, how = "any", cols = "year")
+ expect_equal(count(withoutna), 3)
+
+ unlink(csvPath)
+ csvPath <- tempfile(pattern = "sparkr-test", fileext = ".csv")
+ mockLinesCsv <- c("year,make,model,comment,blank",
+ "\"2012\",\"Tesla\",\"S\",\"No comment\",",
+ "1997,Ford,E350,\"Go get one now they are going fast\",",
+ "2015,Chevy,Volt",
+ "Empty,Dummy,Placeholder")
+ writeLines(mockLinesCsv, csvPath)
+
+ df2 <- read.df(csvPath, "csv", header = "true", inferSchema = "true", na.strings = "Empty")
+ expect_equal(count(df2), 4)
+ withoutna2 <- na.omit(df2, how = "any", cols = "year")
+ expect_equal(count(withoutna2), 3)
+ expect_equal(count(where(withoutna2, withoutna2$make == "Dummy")), 0)
+
+ # writing csv file
+ csvPath2 <- tempfile(pattern = "csvtest2", fileext = ".csv")
+ write.df(df2, path = csvPath2, "csv", header = "true")
+ df3 <- read.df(csvPath2, "csv", header = "true")
+ expect_equal(nrow(df3), nrow(df2))
+ expect_equal(colnames(df3), colnames(df2))
+ csv <- read.csv(file = list.files(csvPath2, pattern = "^part", full.names = T)[[1]])
+ expect_equal(colnames(df3), colnames(csv))
+
+ unlink(csvPath)
+ unlink(csvPath2)
+ }
+})
+
+test_that("Support other types for options", {
+ skip_on_cran()
+
+ csvPath <- tempfile(pattern = "sparkr-test", fileext = ".csv")
+ mockLinesCsv <- c("year,make,model,comment,blank",
+ "\"2012\",\"Tesla\",\"S\",\"No comment\",",
+ "1997,Ford,E350,\"Go get one now they are going fast\",",
+ "2015,Chevy,Volt",
+ "NA,Dummy,Placeholder")
+ writeLines(mockLinesCsv, csvPath)
+
+ csvDf <- read.df(csvPath, "csv", header = "true", inferSchema = "true")
+ expected <- read.df(csvPath, "csv", header = TRUE, inferSchema = TRUE)
+ expect_equal(collect(csvDf), collect(expected))
+
+ expect_error(read.df(csvPath, "csv", header = TRUE, maxColumns = 3))
+ unlink(csvPath)
+})
+
+test_that("convert NAs to null type in DataFrames", {
+ rdd <- parallelize(sc, list(list(1L, 2L), list(NA, 4L)))
+ df <- createDataFrame(rdd, list("a", "b"))
+ expect_true(is.na(collect(df)[2, "a"]))
+ expect_equal(collect(df)[2, "b"], 4L)
+
+ l <- data.frame(x = 1L, y = c(1L, NA_integer_, 3L))
+ df <- createDataFrame(l)
+ expect_equal(collect(df)[2, "x"], 1L)
+ expect_true(is.na(collect(df)[2, "y"]))
+
+ rdd <- parallelize(sc, list(list(1, 2), list(NA, 4)))
+ df <- createDataFrame(rdd, list("a", "b"))
+ expect_true(is.na(collect(df)[2, "a"]))
+ expect_equal(collect(df)[2, "b"], 4)
+
+ l <- data.frame(x = 1, y = c(1, NA_real_, 3))
+ df <- createDataFrame(l)
+ expect_equal(collect(df)[2, "x"], 1)
+ expect_true(is.na(collect(df)[2, "y"]))
+
+ l <- list("a", "b", NA, "d")
+ df <- createDataFrame(l)
+ expect_true(is.na(collect(df)[3, "_1"]))
+ expect_equal(collect(df)[4, "_1"], "d")
+
+ l <- list("a", "b", NA_character_, "d")
+ df <- createDataFrame(l)
+ expect_true(is.na(collect(df)[3, "_1"]))
+ expect_equal(collect(df)[4, "_1"], "d")
+
+ l <- list(TRUE, FALSE, NA, TRUE)
+ df <- createDataFrame(l)
+ expect_true(is.na(collect(df)[3, "_1"]))
+ expect_equal(collect(df)[4, "_1"], TRUE)
+})
+
+test_that("toDF", {
+ skip_on_cran()
+
+ rdd <- lapply(parallelize(sc, 1:10), function(x) { list(x, as.character(x)) })
+ df <- toDF(rdd, list("a", "b"))
+ expect_is(df, "SparkDataFrame")
+ expect_equal(count(df), 10)
+ expect_equal(columns(df), c("a", "b"))
+ expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
+
+ df <- toDF(rdd)
+ expect_is(df, "SparkDataFrame")
+ expect_equal(columns(df), c("_1", "_2"))
+
+ schema <- structType(structField(x = "a", type = "integer", nullable = TRUE),
+ structField(x = "b", type = "string", nullable = TRUE))
+ df <- toDF(rdd, schema)
+ expect_is(df, "SparkDataFrame")
+ expect_equal(columns(df), c("a", "b"))
+ expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
+
+ rdd <- lapply(parallelize(sc, 1:10), function(x) { list(a = x, b = as.character(x)) })
+ df <- toDF(rdd)
+ expect_is(df, "SparkDataFrame")
+ expect_equal(count(df), 10)
+ expect_equal(columns(df), c("a", "b"))
+ expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
+})
+
+test_that("create DataFrame from list or data.frame", {
+ l <- list(list(1, 2), list(3, 4))
+ df <- createDataFrame(l, c("a", "b"))
+ expect_equal(columns(df), c("a", "b"))
+
+ l <- list(list(a = 1, b = 2), list(a = 3, b = 4))
+ df <- createDataFrame(l)
+ expect_equal(columns(df), c("a", "b"))
+
+ a <- 1:3
+ b <- c("a", "b", "c")
+ ldf <- data.frame(a, b)
+ df <- createDataFrame(ldf)
+ expect_equal(columns(df), c("a", "b"))
+ expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
+ expect_equal(count(df), 3)
+ ldf2 <- collect(df)
+ expect_equal(ldf$a, ldf2$a)
+
+ irisdf <- suppressWarnings(createDataFrame(iris))
+ iris_collected <- collect(irisdf)
+ expect_equivalent(iris_collected[, -5], iris[, -5])
+ expect_equal(iris_collected$Species, as.character(iris$Species))
+
+ mtcarsdf <- createDataFrame(mtcars)
+ expect_equivalent(collect(mtcarsdf), mtcars)
+
+ bytes <- as.raw(c(1, 2, 3))
+ df <- createDataFrame(list(list(bytes)))
+ expect_equal(collect(df)[[1]][[1]], bytes)
+})
+
+test_that("create DataFrame with different data types", {
+ l <- list(a = 1L, b = 2, c = TRUE, d = "ss", e = as.Date("2012-12-13"),
+ f = as.POSIXct("2015-03-15 12:13:14.056"))
+ df <- createDataFrame(list(l))
+ expect_equal(dtypes(df), list(c("a", "int"), c("b", "double"), c("c", "boolean"),
+ c("d", "string"), c("e", "date"), c("f", "timestamp")))
+ expect_equal(count(df), 1)
+ expect_equal(collect(df), data.frame(l, stringsAsFactors = FALSE))
+})
+
+test_that("SPARK-17811: can create DataFrame containing NA as date and time", {
+ df <- data.frame(
+ id = 1:2,
+ time = c(as.POSIXlt("2016-01-10"), NA),
+ date = c(as.Date("2016-10-01"), NA))
+
+ DF <- collect(createDataFrame(df))
+ expect_true(is.na(DF$date[2]))
+ expect_equal(DF$date[1], as.Date("2016-10-01"))
+ expect_true(is.na(DF$time[2]))
+ expect_equal(DF$time[1], as.POSIXlt("2016-01-10"))
+})
+
+test_that("create DataFrame with complex types", {
+ e <- new.env()
+ assign("n", 3L, envir = e)
+
+ s <- listToStruct(list(a = "aa", b = 3L))
+
+ l <- list(as.list(1:10), list("a", "b"), e, s)
+ df <- createDataFrame(list(l), c("a", "b", "c", "d"))
+ expect_equal(dtypes(df), list(c("a", "array<int>"),
+ c("b", "array<string>"),
+ c("c", "map<string,int>"),
+ c("d", "struct<a:string,b:int>")))
+ expect_equal(count(df), 1)
+ ldf <- collect(df)
+ expect_equal(names(ldf), c("a", "b", "c", "d"))
+ expect_equal(ldf[1, 1][[1]], l[[1]])
+ expect_equal(ldf[1, 2][[1]], l[[2]])
+
+ e <- ldf$c[[1]]
+ expect_equal(class(e), "environment")
+ expect_equal(ls(e), "n")
+ expect_equal(e$n, 3L)
+
+ s <- ldf$d[[1]]
+ expect_equal(class(s), "struct")
+ expect_equal(s$a, "aa")
+ expect_equal(s$b, 3L)
+})
+
+test_that("create DataFrame from a data.frame with complex types", {
+ skip_on_cran()
+
+ ldf <- data.frame(row.names = 1:2)
+ ldf$a_list <- list(list(1, 2), list(3, 4))
+ ldf$an_envir <- c(as.environment(list(a = 1, b = 2)), as.environment(list(c = 3)))
+
+ sdf <- createDataFrame(ldf)
+ collected <- collect(sdf)
+
+ expect_identical(ldf[, 1, FALSE], collected[, 1, FALSE])
+ expect_equal(ldf$an_envir, collected$an_envir)
+})
+
+test_that("Collect DataFrame with complex types", {
+ skip_on_cran()
+
+ # ArrayType
+ df <- read.json(complexTypeJsonPath)
+ ldf <- collect(df)
+ expect_equal(nrow(ldf), 3)
+ expect_equal(ncol(ldf), 3)
+ expect_equal(names(ldf), c("c1", "c2", "c3"))
+ expect_equal(ldf$c1, list(list(1, 2, 3), list(4, 5, 6), list(7, 8, 9)))
+ expect_equal(ldf$c2, list(list("a", "b", "c"), list("d", "e", "f"), list("g", "h", "i")))
+ expect_equal(ldf$c3, list(list(1.0, 2.0, 3.0), list(4.0, 5.0, 6.0), list(7.0, 8.0, 9.0)))
+
+ # MapType
+ schema <- structType(structField("name", "string"),
+ structField("info", "map<string,double>"))
+ df <- read.df(mapTypeJsonPath, "json", schema)
+ expect_equal(dtypes(df), list(c("name", "string"),
+ c("info", "map<string,double>")))
+ ldf <- collect(df)
+ expect_equal(nrow(ldf), 3)
+ expect_equal(ncol(ldf), 2)
+ expect_equal(names(ldf), c("name", "info"))
+ expect_equal(ldf$name, c("Bob", "Alice", "David"))
+ bob <- ldf$info[[1]]
+ expect_equal(class(bob), "environment")
+ expect_equal(bob$age, 16)
+ expect_equal(bob$height, 176.5)
+
+ # StructType
+ df <- read.json(mapTypeJsonPath)
+ expect_equal(dtypes(df), list(c("info", "struct<age:bigint,height:double>"),
+ c("name", "string")))
+ ldf <- collect(df)
+ expect_equal(nrow(ldf), 3)
+ expect_equal(ncol(ldf), 2)
+ expect_equal(names(ldf), c("info", "name"))
+ expect_equal(ldf$name, c("Bob", "Alice", "David"))
+ bob <- ldf$info[[1]]
+ expect_equal(class(bob), "struct")
+ expect_equal(bob$age, 16)
+ expect_equal(bob$height, 176.5)
+})
+
+test_that("read/write json files", {
+ if (not_cran_or_windows_with_hadoop()) {
+ # Test read.df
+ df <- read.df(jsonPath, "json")
+ expect_is(df, "SparkDataFrame")
+ expect_equal(count(df), 3)
+
+ # Test read.df with a user defined schema
+ schema <- structType(structField("name", type = "string"),
+ structField("age", type = "double"))
+
+ df1 <- read.df(jsonPath, "json", schema)
+ expect_is(df1, "SparkDataFrame")
+ expect_equal(dtypes(df1), list(c("name", "string"), c("age", "double")))
+
+ # Test loadDF
+ df2 <- loadDF(jsonPath, "json", schema)
+ expect_is(df2, "SparkDataFrame")
+ expect_equal(dtypes(df2), list(c("name", "string"), c("age", "double")))
+
+ # Test read.json
+ df <- read.json(jsonPath)
+ expect_is(df, "SparkDataFrame")
+ expect_equal(count(df), 3)
+
+ # Test write.df
+ jsonPath2 <- tempfile(pattern = "jsonPath2", fileext = ".json")
+ write.df(df, jsonPath2, "json", mode = "overwrite")
+
+ # Test write.json
+ jsonPath3 <- tempfile(pattern = "jsonPath3", fileext = ".json")
+ write.json(df, jsonPath3)
+
+ # Test read.json()/jsonFile() works with multiple input paths
+ jsonDF1 <- read.json(c(jsonPath2, jsonPath3))
+ expect_is(jsonDF1, "SparkDataFrame")
+ expect_equal(count(jsonDF1), 6)
+ # Suppress warnings because jsonFile is deprecated
+ jsonDF2 <- suppressWarnings(jsonFile(c(jsonPath2, jsonPath3)))
+ expect_is(jsonDF2, "SparkDataFrame")
+ expect_equal(count(jsonDF2), 6)
+
+ unlink(jsonPath2)
+ unlink(jsonPath3)
+ }
+})
+
+test_that("read/write json files - compression option", {
+ skip_on_cran()
+
+ df <- read.df(jsonPath, "json")
+
+ jsonPath <- tempfile(pattern = "jsonPath", fileext = ".json")
+ write.json(df, jsonPath, compression = "gzip")
+ jsonDF <- read.json(jsonPath)
+ expect_is(jsonDF, "SparkDataFrame")
+ expect_equal(count(jsonDF), count(df))
+ expect_true(length(list.files(jsonPath, pattern = ".gz")) > 0)
+
+ unlink(jsonPath)
+})
+
+test_that("jsonRDD() on a RDD with json string", {
+ skip_on_cran()
+
+ sqlContext <- suppressWarnings(sparkRSQL.init(sc))
+ rdd <- parallelize(sc, mockLines)
+ expect_equal(countRDD(rdd), 3)
+ df <- suppressWarnings(jsonRDD(sqlContext, rdd))
+ expect_is(df, "SparkDataFrame")
+ expect_equal(count(df), 3)
+
+ rdd2 <- flatMap(rdd, function(x) c(x, x))
+ df <- suppressWarnings(jsonRDD(sqlContext, rdd2))
+ expect_is(df, "SparkDataFrame")
+ expect_equal(count(df), 6)
+})
+
+test_that("test tableNames and tables", {
+ count <- count(listTables())
+
+ df <- read.json(jsonPath)
+ createOrReplaceTempView(df, "table1")
+ expect_equal(length(tableNames()), count + 1)
+ expect_equal(length(tableNames("default")), count + 1)
+
+ tables <- listTables()
+ expect_equal(count(tables), count + 1)
+ expect_equal(count(tables()), count(tables))
+ expect_true("tableName" %in% colnames(tables()))
+ expect_true(all(c("tableName", "database", "isTemporary") %in% colnames(tables())))
+
+ suppressWarnings(registerTempTable(df, "table2"))
+ tables <- listTables()
+ expect_equal(count(tables), count + 2)
+ suppressWarnings(dropTempTable("table1"))
+ expect_true(dropTempView("table2"))
+
+ tables <- listTables()
+ expect_equal(count(tables), count + 0)
+})
+
+test_that(
+ "createOrReplaceTempView() results in a queryable table and sql() results in a new DataFrame", {
+ df <- read.json(jsonPath)
+ createOrReplaceTempView(df, "table1")
+ newdf <- sql("SELECT * FROM table1 where name = 'Michael'")
+ expect_is(newdf, "SparkDataFrame")
+ expect_equal(count(newdf), 1)
+ expect_true(dropTempView("table1"))
+
+ createOrReplaceTempView(df, "dfView")
+ sqlCast <- collect(sql("select cast('2' as decimal) as x from dfView limit 1"))
+ out <- capture.output(sqlCast)
+ expect_true(is.data.frame(sqlCast))
+ expect_equal(names(sqlCast)[1], "x")
+ expect_equal(nrow(sqlCast), 1)
+ expect_equal(ncol(sqlCast), 1)
+ expect_equal(out[1], " x")
+ expect_equal(out[2], "1 2")
+ expect_true(dropTempView("dfView"))
+})
+
+test_that("test cache, uncache and clearCache", {
+ skip_on_cran()
+
+ df <- read.json(jsonPath)
+ createOrReplaceTempView(df, "table1")
+ cacheTable("table1")
+ uncacheTable("table1")
+ clearCache()
+ expect_true(dropTempView("table1"))
+
+ expect_error(uncacheTable("foo"),
+ "Error in uncacheTable : no such table - Table or view 'foo' not found in database 'default'")
+})
+
+test_that("insertInto() on a registered table", {
+ if (not_cran_or_windows_with_hadoop()) {
+ df <- read.df(jsonPath, "json")
+ write.df(df, parquetPath, "parquet", "overwrite")
+ dfParquet <- read.df(parquetPath, "parquet")
+
+ lines <- c("{\"name\":\"Bob\", \"age\":24}",
+ "{\"name\":\"James\", \"age\":35}")
+ jsonPath2 <- tempfile(pattern = "jsonPath2", fileext = ".tmp")
+ parquetPath2 <- tempfile(pattern = "parquetPath2", fileext = ".parquet")
+ writeLines(lines, jsonPath2)
+ df2 <- read.df(jsonPath2, "json")
+ write.df(df2, parquetPath2, "parquet", "overwrite")
+ dfParquet2 <- read.df(parquetPath2, "parquet")
+
+ createOrReplaceTempView(dfParquet, "table1")
+ insertInto(dfParquet2, "table1")
+ expect_equal(count(sql("select * from table1")), 5)
+ expect_equal(first(sql("select * from table1 order by age"))$name, "Michael")
+ expect_true(dropTempView("table1"))
+
+ createOrReplaceTempView(dfParquet, "table1")
+ insertInto(dfParquet2, "table1", overwrite = TRUE)
+ expect_equal(count(sql("select * from table1")), 2)
+ expect_equal(first(sql("select * from table1 order by age"))$name, "Bob")
+ expect_true(dropTempView("table1"))
+
+ unlink(jsonPath2)
+ unlink(parquetPath2)
+ }
+})
+
+test_that("tableToDF() returns a new DataFrame", {
+ df <- read.json(jsonPath)
+ createOrReplaceTempView(df, "table1")
+ tabledf <- tableToDF("table1")
+ expect_is(tabledf, "SparkDataFrame")
+ expect_equal(count(tabledf), 3)
+ tabledf2 <- tableToDF("table1")
+ expect_equal(count(tabledf2), 3)
+ expect_true(dropTempView("table1"))
+})
+
+test_that("toRDD() returns an RRDD", {
+ skip_on_cran()
+
+ df <- read.json(jsonPath)
+ testRDD <- toRDD(df)
+ expect_is(testRDD, "RDD")
+ expect_equal(countRDD(testRDD), 3)
+})
+
+test_that("union on two RDDs created from DataFrames returns an RRDD", {
+ skip_on_cran()
+
+ df <- read.json(jsonPath)
+ RDD1 <- toRDD(df)
+ RDD2 <- toRDD(df)
+ unioned <- unionRDD(RDD1, RDD2)
+ expect_is(unioned, "RDD")
+ expect_equal(getSerializedMode(unioned), "byte")
+ expect_equal(collectRDD(unioned)[[2]]$name, "Andy")
+})
+
+test_that("union on mixed serialization types correctly returns a byte RRDD", {
+ skip_on_cran()
+
+ # Byte RDD
+ nums <- 1:10
+ rdd <- parallelize(sc, nums, 2L)
+
+ # String RDD
+ textLines <- c("Michael",
+ "Andy, 30",
+ "Justin, 19")
+ textPath <- tempfile(pattern = "sparkr-textLines", fileext = ".tmp")
+ writeLines(textLines, textPath)
+ textRDD <- textFile(sc, textPath)
+
+ df <- read.json(jsonPath)
+ dfRDD <- toRDD(df)
+
+ unionByte <- unionRDD(rdd, dfRDD)
+ expect_is(unionByte, "RDD")
+ expect_equal(getSerializedMode(unionByte), "byte")
+ expect_equal(collectRDD(unionByte)[[1]], 1)
+ expect_equal(collectRDD(unionByte)[[12]]$name, "Andy")
+
+ unionString <- unionRDD(textRDD, dfRDD)
+ expect_is(unionString, "RDD")
+ expect_equal(getSerializedMode(unionString), "byte")
+ expect_equal(collectRDD(unionString)[[1]], "Michael")
+ expect_equal(collectRDD(unionString)[[5]]$name, "Andy")
+})
+
+test_that("objectFile() works with row serialization", {
+ skip_on_cran()
+
+ objectPath <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ df <- read.json(jsonPath)
+ dfRDD <- toRDD(df)
+ saveAsObjectFile(coalesceRDD(dfRDD, 1L), objectPath)
+ objectIn <- objectFile(sc, objectPath)
+
+ expect_is(objectIn, "RDD")
+ expect_equal(getSerializedMode(objectIn), "byte")
+ expect_equal(collectRDD(objectIn)[[2]]$age, 30)
+})
+
+test_that("lapply() on a DataFrame returns an RDD with the correct columns", {
+ skip_on_cran()
+
+ df <- read.json(jsonPath)
+ testRDD <- lapply(df, function(row) {
+ row$newCol <- row$age + 5
+ row
+ })
+ expect_is(testRDD, "RDD")
+ collected <- collectRDD(testRDD)
+ expect_equal(collected[[1]]$name, "Michael")
+ expect_equal(collected[[2]]$newCol, 35)
+})
+
+test_that("collect() returns a data.frame", {
+ df <- read.json(jsonPath)
+ rdf <- collect(df)
+ expect_true(is.data.frame(rdf))
+ expect_equal(names(rdf)[1], "age")
+ expect_equal(nrow(rdf), 3)
+ expect_equal(ncol(rdf), 2)
+
+ # collect() returns data correctly from a DataFrame with 0 rows
+ df0 <- limit(df, 0)
+ rdf <- collect(df0)
+ expect_true(is.data.frame(rdf))
+ expect_equal(names(rdf)[1], "age")
+ expect_equal(nrow(rdf), 0)
+ expect_equal(ncol(rdf), 2)
+
+ # collect() correctly handles multiple columns with same name
+ df <- createDataFrame(list(list(1, 2)), schema = c("name", "name"))
+ ldf <- collect(df)
+ expect_equal(names(ldf), c("name", "name"))
+})
+
+test_that("limit() returns DataFrame with the correct number of rows", {
+ df <- read.json(jsonPath)
+ dfLimited <- limit(df, 2)
+ expect_is(dfLimited, "SparkDataFrame")
+ expect_equal(count(dfLimited), 2)
+})
+
+test_that("collect() and take() on a DataFrame return the same number of rows and columns", {
+ df <- read.json(jsonPath)
+ expect_equal(nrow(collect(df)), nrow(take(df, 10)))
+ expect_equal(ncol(collect(df)), ncol(take(df, 10)))
+})
+
+test_that("collect() support Unicode characters", {
+ lines <- c("{\"name\":\"안녕하세요\"}",
+ "{\"name\":\"您好\", \"age\":30}",
+ "{\"name\":\"こんにちは\", \"age\":19}",
+ "{\"name\":\"Xin chào\"}")
+
+ jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ writeLines(lines, jsonPath)
+
+ df <- read.df(jsonPath, "json")
+ rdf <- collect(df)
+ expect_true(is.data.frame(rdf))
+ expect_equal(rdf$name[1], markUtf8("안녕하세요"))
+ expect_equal(rdf$name[2], markUtf8("您好"))
+ expect_equal(rdf$name[3], markUtf8("こんにちは"))
+ expect_equal(rdf$name[4], markUtf8("Xin chào"))
+
+ df1 <- createDataFrame(rdf)
+ expect_equal(collect(where(df1, df1$name == markUtf8("您好")))$name, markUtf8("您好"))
+})
+
+test_that("multiple pipeline transformations result in an RDD with the correct values", {
+ skip_on_cran()
+
+ df <- read.json(jsonPath)
+ first <- lapply(df, function(row) {
+ row$age <- row$age + 5
+ row
+ })
+ second <- lapply(first, function(row) {
+ row$testCol <- if (row$age == 35 && !is.na(row$age)) TRUE else FALSE
+ row
+ })
+ expect_is(second, "RDD")
+ expect_equal(countRDD(second), 3)
+ expect_equal(collectRDD(second)[[2]]$age, 35)
+ expect_true(collectRDD(second)[[2]]$testCol)
+ expect_false(collectRDD(second)[[3]]$testCol)
+})
+
+test_that("cache(), storageLevel(), persist(), and unpersist() on a DataFrame", {
+ df <- read.json(jsonPath)
+ expect_false(df@env$isCached)
+ cache(df)
+ expect_true(df@env$isCached)
+
+ unpersist(df)
+ expect_false(df@env$isCached)
+
+ persist(df, "MEMORY_AND_DISK")
+ expect_true(df@env$isCached)
+
+ expect_equal(storageLevel(df),
+ "MEMORY_AND_DISK - StorageLevel(disk, memory, deserialized, 1 replicas)")
+
+ unpersist(df)
+ expect_false(df@env$isCached)
+
+ # make sure the data is collectable
+ expect_true(is.data.frame(collect(df)))
+})
+
+test_that("setCheckpointDir(), checkpoint() on a DataFrame", {
+ if (not_cran_or_windows_with_hadoop()) {
+ checkpointDir <- file.path(tempdir(), "cproot")
+ expect_true(length(list.files(path = checkpointDir, all.files = TRUE)) == 0)
+
+ setCheckpointDir(checkpointDir)
+ df <- read.json(jsonPath)
+ df <- checkpoint(df)
+ expect_is(df, "SparkDataFrame")
+ expect_false(length(list.files(path = checkpointDir, all.files = TRUE)) == 0)
+ }
+})
+
+test_that("schema(), dtypes(), columns(), names() return the correct values/format", {
+ df <- read.json(jsonPath)
+ testSchema <- schema(df)
+ expect_equal(length(testSchema$fields()), 2)
+ expect_equal(testSchema$fields()[[1]]$dataType.toString(), "LongType")
+ expect_equal(testSchema$fields()[[2]]$dataType.simpleString(), "string")
+ expect_equal(testSchema$fields()[[1]]$name(), "age")
+
+ testTypes <- dtypes(df)
+ expect_equal(length(testTypes[[1]]), 2)
+ expect_equal(testTypes[[1]][1], "age")
+
+ testCols <- columns(df)
+ expect_equal(length(testCols), 2)
+ expect_equal(testCols[2], "name")
+
+ testNames <- names(df)
+ expect_equal(length(testNames), 2)
+ expect_equal(testNames[2], "name")
+})
+
+test_that("names() colnames() set the column names", {
+ df <- read.json(jsonPath)
+ names(df) <- c("col1", "col2")
+ expect_equal(colnames(df)[2], "col2")
+
+ colnames(df) <- c("col3", "col4")
+ expect_equal(names(df)[1], "col3")
+
+ expect_error(names(df) <- NULL, "Invalid column names.")
+ expect_error(names(df) <- c("sepal.length", "sepal_width"),
+ "Column names cannot contain the '.' symbol.")
+ expect_error(names(df) <- c(1, 2), "Invalid column names.")
+ expect_error(names(df) <- c("a"),
+ "Column names must have the same length as the number of columns in the dataset.")
+ expect_error(names(df) <- c("1", NA), "Column names cannot be NA.")
+
+ expect_error(colnames(df) <- c("sepal.length", "sepal_width"),
+ "Column names cannot contain the '.' symbol.")
+ expect_error(colnames(df) <- c(1, 2), "Invalid column names.")
+ expect_error(colnames(df) <- c("a"),
+ "Column names must have the same length as the number of columns in the dataset.")
+ expect_error(colnames(df) <- c("1", NA), "Column names cannot be NA.")
+
+ # Note: if this test breaks, remove the check for the "." character in the colnames<- method
+ irisDF <- suppressWarnings(createDataFrame(iris))
+ expect_equal(names(irisDF)[1], "Sepal_Length")
+
+ # Test base::colnames base::names
+ m2 <- cbind(1, 1:4)
+ expect_equal(colnames(m2, do.NULL = FALSE), c("col1", "col2"))
+ colnames(m2) <- c("x", "Y")
+ expect_equal(colnames(m2), c("x", "Y"))
+
+ z <- list(a = 1, b = "c", c = 1:3)
+ expect_equal(names(z)[3], "c")
+ names(z)[3] <- "c2"
+ expect_equal(names(z)[3], "c2")
+
+ # Test subset assignment
+ colnames(df)[1] <- "col5"
+ expect_equal(colnames(df)[1], "col5")
+ names(df)[2] <- "col6"
+ expect_equal(names(df)[2], "col6")
+})
+
+test_that("head() and first() return the correct data", {
+ df <- read.json(jsonPath)
+ testHead <- head(df)
+ expect_equal(nrow(testHead), 3)
+ expect_equal(ncol(testHead), 2)
+
+ testHead2 <- head(df, 2)
+ expect_equal(nrow(testHead2), 2)
+ expect_equal(ncol(testHead2), 2)
+
+ testFirst <- first(df)
+ expect_equal(nrow(testFirst), 1)
+
+ # head() and first() return the correct data on
+ # a DataFrame with 0 rows
+ df0 <- limit(df, 0)
+
+ testHead <- head(df0)
+ expect_equal(nrow(testHead), 0)
+ expect_equal(ncol(testHead), 2)
+
+ testFirst <- first(df0)
+ expect_equal(nrow(testFirst), 0)
+ expect_equal(ncol(testFirst), 2)
+})
+
+test_that("distinct(), unique() and dropDuplicates() on DataFrames", {
+ lines <- c("{\"name\":\"Michael\"}",
+ "{\"name\":\"Andy\", \"age\":30}",
+ "{\"name\":\"Justin\", \"age\":19}",
+ "{\"name\":\"Justin\", \"age\":19}")
+ jsonPathWithDup <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ writeLines(lines, jsonPathWithDup)
+
+ df <- read.json(jsonPathWithDup)
+ uniques <- distinct(df)
+ expect_is(uniques, "SparkDataFrame")
+ expect_equal(count(uniques), 3)
+
+ uniques2 <- unique(df)
+ expect_is(uniques2, "SparkDataFrame")
+ expect_equal(count(uniques2), 3)
+
+ # Test dropDuplicates()
+ df <- createDataFrame(
+ list(
+ list(2, 1, 2), list(1, 1, 1),
+ list(1, 2, 1), list(2, 1, 2),
+ list(2, 2, 2), list(2, 2, 1),
+ list(2, 1, 1), list(1, 1, 2),
+ list(1, 2, 2), list(1, 2, 1)),
+ schema = c("key", "value1", "value2"))
+ result <- collect(dropDuplicates(df))
+ expected <- rbind.data.frame(
+ c(1, 1, 1), c(1, 1, 2), c(1, 2, 1),
+ c(1, 2, 2), c(2, 1, 1), c(2, 1, 2),
+ c(2, 2, 1), c(2, 2, 2))
+ names(expected) <- c("key", "value1", "value2")
+ expect_equivalent(
+ result[order(result$key, result$value1, result$value2), ],
+ expected)
+
+ result <- collect(dropDuplicates(df, c("key", "value1")))
+ expected <- rbind.data.frame(
+ c(1, 1, 1), c(1, 2, 1), c(2, 1, 2), c(2, 2, 2))
+ names(expected) <- c("key", "value1", "value2")
+ expect_equivalent(
+ result[order(result$key, result$value1, result$value2), ],
+ expected)
+
+ result <- collect(dropDuplicates(df, "key", "value1"))
+ expected <- rbind.data.frame(
+ c(1, 1, 1), c(1, 2, 1), c(2, 1, 2), c(2, 2, 2))
+ names(expected) <- c("key", "value1", "value2")
+ expect_equivalent(
+ result[order(result$key, result$value1, result$value2), ],
+ expected)
+
+ result <- collect(dropDuplicates(df, "key"))
+ expected <- rbind.data.frame(
+ c(1, 1, 1), c(2, 1, 2))
+ names(expected) <- c("key", "value1", "value2")
+ expect_equivalent(
+ result[order(result$key, result$value1, result$value2), ],
+ expected)
+})
+
+test_that("sample on a DataFrame", {
+ df <- read.json(jsonPath)
+ sampled <- sample(df, FALSE, 1.0)
+ expect_equal(nrow(collect(sampled)), count(df))
+ expect_is(sampled, "SparkDataFrame")
+ sampled2 <- sample(df, FALSE, 0.1, 0) # set seed for predictable result
+ expect_true(count(sampled2) < 3)
+
+ count1 <- count(sample(df, FALSE, 0.1, 0))
+ count2 <- count(sample(df, FALSE, 0.1, 0))
+ expect_equal(count1, count2)
+
+ # Also test sample_frac
+ sampled3 <- sample_frac(df, FALSE, 0.1, 0) # set seed for predictable result
+ expect_true(count(sampled3) < 3)
+
+ # nolint start
+ # Test base::sample is working
+ #expect_equal(length(sample(1:12)), 12)
+ # nolint end
+})
+
+test_that("select operators", {
+ df <- select(read.json(jsonPath), "name", "age")
+ expect_is(df$name, "Column")
+ expect_is(df[[2]], "Column")
+ expect_is(df[["age"]], "Column")
+
+ expect_warning(df[[1:2]],
+ "Subset index has length > 1. Only the first index is used.")
+ expect_is(suppressWarnings(df[[1:2]]), "Column")
+ expect_warning(df[[c("name", "age")]],
+ "Subset index has length > 1. Only the first index is used.")
+ expect_is(suppressWarnings(df[[c("name", "age")]]), "Column")
+
+ expect_warning(df[[1:2]] <- df[[1]],
+ "Subset index has length > 1. Only the first index is used.")
+ expect_warning(df[[c("name", "age")]] <- df[[1]],
+ "Subset index has length > 1. Only the first index is used.")
+
+ expect_is(df[, 1, drop = F], "SparkDataFrame")
+ expect_equal(columns(df[, 1, drop = F]), c("name"))
+ expect_equal(columns(df[, "age", drop = F]), c("age"))
+
+ df2 <- df[, c("age", "name")]
+ expect_is(df2, "SparkDataFrame")
+ expect_equal(columns(df2), c("age", "name"))
+
+ df$age2 <- df$age
+ expect_equal(columns(df), c("name", "age", "age2"))
+ expect_equal(count(where(df, df$age2 == df$age)), 2)
+ df$age2 <- df$age * 2
+ expect_equal(columns(df), c("name", "age", "age2"))
+ expect_equal(count(where(df, df$age2 == df$age * 2)), 2)
+ df$age2 <- df[["age"]] * 3
+ expect_equal(columns(df), c("name", "age", "age2"))
+ expect_equal(count(where(df, df$age2 == df$age * 3)), 2)
+
+ df$age2 <- 21
+ expect_equal(columns(df), c("name", "age", "age2"))
+ expect_equal(count(where(df, df$age2 == 21)), 3)
+
+ df$age2 <- c(22)
+ expect_equal(columns(df), c("name", "age", "age2"))
+ expect_equal(count(where(df, df$age2 == 22)), 3)
+
+ expect_error(df$age3 <- c(22, NA),
+ "value must be a Column, literal value as atomic in length of 1, or NULL")
+
+ df[["age2"]] <- 23
+ expect_equal(columns(df), c("name", "age", "age2"))
+ expect_equal(count(where(df, df$age2 == 23)), 3)
+
+ df[[3]] <- 24
+ expect_equal(columns(df), c("name", "age", "age2"))
+ expect_equal(count(where(df, df$age2 == 24)), 3)
+
+ df[[3]] <- df$age
+ expect_equal(count(where(df, df$age2 == df$age)), 2)
+
+ df[["age2"]] <- df[["name"]]
+ expect_equal(count(where(df, df$age2 == df$name)), 3)
+
+ expect_error(df[["age3"]] <- c(22, 23),
+ "value must be a Column, literal value as atomic in length of 1, or NULL")
+
+ # Test parameter drop
+ expect_true(class(df[, 1]) == "SparkDataFrame")
+ expect_true(class(df[, 1, drop = T]) == "Column")
+ expect_true(class(df[, 1, drop = F]) == "SparkDataFrame")
+ expect_true(class(df[df$age > 4, 2, drop = T]) == "Column")
+ expect_true(class(df[df$age > 4, 2, drop = F]) == "SparkDataFrame")
+})
+
+test_that("select with column", {
+ df <- read.json(jsonPath)
+ df1 <- select(df, "name")
+ expect_equal(columns(df1), c("name"))
+ expect_equal(count(df1), 3)
+
+ df2 <- select(df, df$age)
+ expect_equal(columns(df2), c("age"))
+ expect_equal(count(df2), 3)
+
+ df3 <- select(df, lit("x"))
+ expect_equal(columns(df3), c("x"))
+ expect_equal(count(df3), 3)
+ expect_equal(collect(select(df3, "x"))[[1, 1]], "x")
+
+ df4 <- select(df, c("name", "age"))
+ expect_equal(columns(df4), c("name", "age"))
+ expect_equal(count(df4), 3)
+
+ # Test select with alias
+ df5 <- alias(df, "table")
+
+ expect_equal(columns(select(df5, column("table.name"))), "name")
+ expect_equal(columns(select(df5, "table.name")), "name")
+
+ # Test that stats::alias is not masked
+ expect_is(alias(aov(yield ~ block + N * P * K, npk)), "listof")
+
+ expect_error(select(df, c("name", "age"), "name"),
+ "To select multiple columns, use a character vector or list for col")
+})
+
+test_that("drop column", {
+ df <- select(read.json(jsonPath), "name", "age")
+ df1 <- drop(df, "name")
+ expect_equal(columns(df1), c("age"))
+
+ df$age2 <- df$age
+ df1 <- drop(df, c("name", "age"))
+ expect_equal(columns(df1), c("age2"))
+
+ df1 <- drop(df, df$age)
+ expect_equal(columns(df1), c("name", "age2"))
+
+ df$age2 <- NULL
+ expect_equal(columns(df), c("name", "age"))
+ df$age3 <- NULL
+ expect_equal(columns(df), c("name", "age"))
+
+ # Test to make sure base::drop is not masked
+ expect_equal(drop(1:3 %*% 2:4), 20)
+})
+
+test_that("subsetting", {
+ # read.json does not guarantee column order, so select the columns explicitly
+ df <- select(read.json(jsonPath), "name", "age")
+ filtered <- df[df$age > 20, ]
+ expect_equal(count(filtered), 1)
+ expect_equal(columns(filtered), c("name", "age"))
+ expect_equal(collect(filtered)$name, "Andy")
+
+ df2 <- df[df$age == 19, 1, drop = F]
+ expect_is(df2, "SparkDataFrame")
+ expect_equal(count(df2), 1)
+ expect_equal(columns(df2), c("name"))
+ expect_equal(collect(df2)$name, "Justin")
+
+ df3 <- df[df$age > 20, 2, drop = F]
+ expect_equal(count(df3), 1)
+ expect_equal(columns(df3), c("age"))
+
+ df4 <- df[df$age %in% c(19, 30), 1:2]
+ expect_equal(count(df4), 2)
+ expect_equal(columns(df4), c("name", "age"))
+
+ df5 <- df[df$age %in% c(19), c(1, 2)]
+ expect_equal(count(df5), 1)
+ expect_equal(columns(df5), c("name", "age"))
+
+ df6 <- subset(df, df$age %in% c(30), c(1, 2))
+ expect_equal(count(df6), 1)
+ expect_equal(columns(df6), c("name", "age"))
+
+ df7 <- subset(df, select = "name", drop = F)
+ expect_equal(count(df7), 3)
+ expect_equal(columns(df7), c("name"))
+
+ # Test base::subset is working
+ expect_equal(nrow(subset(airquality, Temp > 80, select = c(Ozone, Temp))), 68)
+})
+
+test_that("selectExpr() on a DataFrame", {
+ df <- read.json(jsonPath)
+ selected <- selectExpr(df, "age * 2")
+ expect_equal(names(selected), "(age * 2)")
+ expect_equal(collect(selected), collect(select(df, df$age * 2L)))
+
+ selected2 <- selectExpr(df, "name as newName", "abs(age) as age")
+ expect_equal(names(selected2), c("newName", "age"))
+ expect_equal(count(selected2), 3)
+})
+
+test_that("expr() on a DataFrame", {
+ df <- read.json(jsonPath)
+ expect_equal(collect(select(df, expr("abs(-123)")))[1, 1], 123)
+})
+
+test_that("column calculation", {
+ df <- read.json(jsonPath)
+ d <- collect(select(df, alias(df$age + 1, "age2")))
+ expect_equal(names(d), c("age2"))
+ df2 <- select(df, lower(df$name), abs(df$age))
+ expect_is(df2, "SparkDataFrame")
+ expect_equal(count(df2), 3)
+})
+
+test_that("test HiveContext", {
+ if (not_cran_or_windows_with_hadoop()) {
+ setHiveContext(sc)
+
+ schema <- structType(structField("name", "string"), structField("age", "integer"),
+ structField("height", "float"))
+ createTable("people", source = "json", schema = schema)
+ df <- read.df(jsonPathNa, "json", schema)
+ insertInto(df, "people")
+ expect_equal(collect(sql("SELECT age from people WHERE name = 'Bob'"))$age, c(16))
+ sql("DROP TABLE people")
+
+ df <- createTable("json", jsonPath, "json")
+ expect_is(df, "SparkDataFrame")
+ expect_equal(count(df), 3)
+ df2 <- sql("select * from json")
+ expect_is(df2, "SparkDataFrame")
+ expect_equal(count(df2), 3)
+
+ jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ saveAsTable(df, "json2", "json", "append", path = jsonPath2)
+ df3 <- sql("select * from json2")
+ expect_is(df3, "SparkDataFrame")
+ expect_equal(count(df3), 3)
+ unlink(jsonPath2)
+
+ hivetestDataPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ saveAsTable(df, "hivetestbl", path = hivetestDataPath)
+ df4 <- sql("select * from hivetestbl")
+ expect_is(df4, "SparkDataFrame")
+ expect_equal(count(df4), 3)
+ unlink(hivetestDataPath)
+
+ parquetDataPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ saveAsTable(df, "parquetest", "parquet", mode = "overwrite", path = parquetDataPath)
+ df5 <- sql("select * from parquetest")
+ expect_is(df5, "SparkDataFrame")
+ expect_equal(count(df5), 3)
+ unlink(parquetDataPath)
+
+ unsetHiveContext()
+ }
+})
+
+test_that("column operators", {
+ c <- column("a")
+ c2 <- (- c + 1 - 2) * 3 / 4.0
+ c3 <- (c + c2 - c2) * c2 %% c2
+ c4 <- (c > c2) & (c2 <= c3) | (c == c2) & (c2 != c3)
+ c5 <- c2 ^ c3 ^ c4
+ c6 <- c2 %<=>% c3
+ c7 <- !c6
+})
+
+test_that("column functions", {
+ skip_on_cran()
+
+ c <- column("a")
+ c1 <- abs(c) + acos(c) + approxCountDistinct(c) + ascii(c) + asin(c) + atan(c)
+ c2 <- avg(c) + base64(c) + bin(c) + bitwiseNOT(c) + cbrt(c) + ceil(c) + cos(c)
+ c3 <- cosh(c) + count(c) + crc32(c) + hash(c) + exp(c)
+ c4 <- explode(c) + expm1(c) + factorial(c) + first(c) + floor(c) + hex(c)
+ c5 <- hour(c) + initcap(c) + last(c) + last_day(c) + length(c)
+ c6 <- log(c) + (c) + log1p(c) + log2(c) + lower(c) + ltrim(c) + max(c) + md5(c)
+ c7 <- mean(c) + min(c) + month(c) + negate(c) + posexplode(c) + quarter(c)
+ c8 <- reverse(c) + rint(c) + round(c) + rtrim(c) + sha1(c) + monotonically_increasing_id()
+ c9 <- signum(c) + sin(c) + sinh(c) + size(c) + stddev(c) + soundex(c) + sqrt(c) + sum(c)
+ c10 <- sumDistinct(c) + tan(c) + tanh(c) + toDegrees(c) + toRadians(c)
+ c11 <- to_date(c) + trim(c) + unbase64(c) + unhex(c) + upper(c)
+ c12 <- variance(c)
+ c13 <- lead("col", 1) + lead(c, 1) + lag("col", 1) + lag(c, 1)
+ c14 <- cume_dist() + ntile(1) + corr(c, c1)
+ c15 <- dense_rank() + percent_rank() + rank() + row_number()
+ c16 <- is.nan(c) + isnan(c) + isNaN(c)
+ c17 <- cov(c, c1) + cov("c", "c1") + covar_samp(c, c1) + covar_samp("c", "c1")
+ c18 <- covar_pop(c, c1) + covar_pop("c", "c1")
+ c19 <- spark_partition_id() + coalesce(c) + coalesce(c1, c2, c3)
+ c20 <- to_timestamp(c) + to_timestamp(c, "yyyy") + to_date(c, "yyyy")
+ c21 <- posexplode_outer(c) + explode_outer(c)
+ c22 <- not(c)
+
+ # Test if base::is.nan() is exposed
+ expect_equal(is.nan(c("a", "b")), c(FALSE, FALSE))
+
+ # Test if base::rank() is exposed
+ expect_equal(class(rank())[[1]], "Column")
+ expect_equal(rank(1:3), as.numeric(c(1:3)))
+
+ df <- read.json(jsonPath)
+ df2 <- select(df, between(df$age, c(20, 30)), between(df$age, c(10, 20)))
+ expect_equal(collect(df2)[[2, 1]], TRUE)
+ expect_equal(collect(df2)[[2, 2]], FALSE)
+ expect_equal(collect(df2)[[3, 1]], FALSE)
+ expect_equal(collect(df2)[[3, 2]], TRUE)
+
+ # Test input_file_name()
+ actual_names <- sort(collect(distinct(select(df, input_file_name()))))
+ expect_equal(length(actual_names), 1)
+ expect_equal(basename(actual_names[1, 1]), basename(jsonPath))
+
+ df3 <- select(df, between(df$name, c("Apache", "Spark")))
+ expect_equal(collect(df3)[[1, 1]], TRUE)
+ expect_equal(collect(df3)[[2, 1]], FALSE)
+ expect_equal(collect(df3)[[3, 1]], TRUE)
+
+ df4 <- select(df, countDistinct(df$age, df$name))
+ expect_equal(collect(df4)[[1, 1]], 2)
+
+ expect_equal(collect(select(df, sum(df$age)))[1, 1], 49)
+ expect_true(abs(collect(select(df, stddev(df$age)))[1, 1] - 7.778175) < 1e-6)
+ expect_equal(collect(select(df, var_pop(df$age)))[1, 1], 30.25)
+
+ df5 <- createDataFrame(list(list(a = "010101")))
+ expect_equal(collect(select(df5, conv(df5$a, 2, 16)))[1, 1], "15")
+
+ # Test array_contains() and sort_array()
+ df <- createDataFrame(list(list(list(1L, 2L, 3L)), list(list(6L, 5L, 4L))))
+ result <- collect(select(df, array_contains(df[[1]], 1L)))[[1]]
+ expect_equal(result, c(TRUE, FALSE))
+
+ result <- collect(select(df, sort_array(df[[1]], FALSE)))[[1]]
+ expect_equal(result, list(list(3L, 2L, 1L), list(6L, 5L, 4L)))
+ result <- collect(select(df, sort_array(df[[1]])))[[1]]
+ expect_equal(result, list(list(1L, 2L, 3L), list(4L, 5L, 6L)))
+
+ # Test that stats::lag is working
+ expect_equal(length(lag(ldeaths, 12)), 72)
+
+ # Test struct()
+ df <- createDataFrame(list(list(1L, 2L, 3L), list(4L, 5L, 6L)),
+ schema = c("a", "b", "c"))
+ result <- collect(select(df, alias(struct("a", "c"), "d")))
+ expected <- data.frame(row.names = 1:2)
+ expected$"d" <- list(listToStruct(list(a = 1L, c = 3L)),
+ listToStruct(list(a = 4L, c = 6L)))
+ expect_equal(result, expected)
+
+ result <- collect(select(df, alias(struct(df$a, df$b), "d")))
+ expected <- data.frame(row.names = 1:2)
+ expected$"d" <- list(listToStruct(list(a = 1L, b = 2L)),
+ listToStruct(list(a = 4L, b = 5L)))
+ expect_equal(result, expected)
+
+ # Test encode(), decode()
+ bytes <- as.raw(c(0xe5, 0xa4, 0xa7, 0xe5, 0x8d, 0x83, 0xe4, 0xb8, 0x96, 0xe7, 0x95, 0x8c))
+ df <- createDataFrame(list(list(markUtf8("大千世界"), "utf-8", bytes)),
+ schema = c("a", "b", "c"))
+ result <- collect(select(df, encode(df$a, "utf-8"), decode(df$c, "utf-8")))
+ expect_equal(result[[1]][[1]], bytes)
+ expect_equal(result[[2]], markUtf8("大千世界"))
+
+ # Test first(), last()
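+ # (the second, logical argument is na.rm: when TRUE, null/NA values are skipped)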
+ df <- read.json(jsonPath)
+ expect_equal(collect(select(df, first(df$age)))[[1]], NA_real_)
+ expect_equal(collect(select(df, first(df$age, TRUE)))[[1]], 30)
+ expect_equal(collect(select(df, first("age")))[[1]], NA_real_)
+ expect_equal(collect(select(df, first("age", TRUE)))[[1]], 30)
+ expect_equal(collect(select(df, last(df$age)))[[1]], 19)
+ expect_equal(collect(select(df, last(df$age, TRUE)))[[1]], 19)
+ expect_equal(collect(select(df, last("age")))[[1]], 19)
+ expect_equal(collect(select(df, last("age", TRUE)))[[1]], 19)
+
+ # Test bround()
+ df <- createDataFrame(data.frame(x = c(2.5, 3.5)))
+ expect_equal(collect(select(df, bround(df$x, 0)))[[1]][1], 2)
+ expect_equal(collect(select(df, bround(df$x, 0)))[[1]][2], 4)
+
+ # Test to_json(), from_json()
+ df <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people")
+ j <- collect(select(df, alias(to_json(df$people), "json")))
+ expect_equal(j[order(j$json), ][1], "[{\"name\":\"Bob\"},{\"name\":\"Alice\"}]")
+
+ df <- read.json(mapTypeJsonPath)
+ j <- collect(select(df, alias(to_json(df$info), "json")))
+ expect_equal(j[order(j$json), ][1], "{\"age\":16,\"height\":176.5}")
+ df <- as.DataFrame(j)
+ schema <- structType(structField("age", "integer"),
+ structField("height", "double"))
+ s <- collect(select(df, alias(from_json(df$json, schema), "structcol")))
+ expect_equal(ncol(s), 1)
+ expect_equal(nrow(s), 3)
+ expect_is(s[[1]][[1]], "struct")
+ expect_true(any(apply(s, 1, function(x) { x[[1]]$age == 16 } )))
+
+ # Test passing an option (dateFormat) to from_json()
+ df <- as.DataFrame(list(list("col" = "{\"date\":\"21/10/2014\"}")))
+ schema2 <- structType(structField("date", "date"))
+ s <- collect(select(df, from_json(df$col, schema2)))
+ expect_equal(s[[1]][[1]], NA)
+ s <- collect(select(df, from_json(df$col, schema2, dateFormat = "dd/MM/yyyy")))
+ expect_is(s[[1]][[1]]$date, "Date")
+ expect_equal(as.character(s[[1]][[1]]$date), "2014-10-21")
+
+ # check that unparseable input yields NA
+ df <- as.DataFrame(list(list("a" = "")))
+ expect_equal(collect(select(df, from_json(df$a, schema)))[[1]][[1]], NA)
+
+ # check if array type in string is correctly supported.
+ jsonArr <- "[{\"name\":\"Bob\"}, {\"name\":\"Alice\"}]"
+ df <- as.DataFrame(list(list("people" = jsonArr)))
+ schema <- structType(structField("name", "string"))
+ arr <- collect(select(df, alias(from_json(df$people, schema, as.json.array = TRUE), "arrcol")))
+ expect_equal(ncol(arr), 1)
+ expect_equal(nrow(arr), 1)
+ expect_is(arr[[1]][[1]], "list")
+ expect_equal(length(arr$arrcol[[1]]), 2)
+ expect_equal(arr$arrcol[[1]][[1]]$name, "Bob")
+ expect_equal(arr$arrcol[[1]][[2]]$name, "Alice")
+
+ # Test create_array() and create_map()
+ df <- as.DataFrame(data.frame(
+ x = c(1.0, 2.0), y = c(-1.0, 3.0), z = c(-2.0, 5.0)
+ ))
+
+ arrs <- collect(select(df, create_array(df$x, df$y, df$z)))
+ expect_equal(arrs[, 1], list(list(1, -1, -2), list(2, 3, 5)))
+
+ maps <- collect(select(
+ df, create_map(lit("x"), df$x, lit("y"), df$y, lit("z"), df$z)))
+
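+ # map columns collect as R environments, hence the as.environment conversion below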
+ expect_equal(
+ maps[, 1],
+ lapply(
+ list(list(x = 1, y = -1, z = -2), list(x = 2, y = 3, z = 5)),
+ as.environment))
+
+ df <- as.DataFrame(data.frame(is_true = c(TRUE, FALSE, NA)))
+ expect_equal(
+ collect(select(df, alias(not(df$is_true), "is_false"))),
+ data.frame(is_false = c(FALSE, TRUE, NA))
+ )
+})
+
+test_that("column binary mathfunctions", {
+ lines <- c("{\"a\":1, \"b\":5}",
+ "{\"a\":2, \"b\":6}",
+ "{\"a\":3, \"b\":7}",
+ "{\"a\":4, \"b\":8}")
+ jsonPathWithDup <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ writeLines(lines, jsonPathWithDup)
+ df <- read.json(jsonPathWithDup)
+ expect_equal(collect(select(df, atan2(df$a, df$b)))[1, "ATAN2(a, b)"], atan2(1, 5))
+ expect_equal(collect(select(df, atan2(df$a, df$b)))[2, "ATAN2(a, b)"], atan2(2, 6))
+ expect_equal(collect(select(df, atan2(df$a, df$b)))[3, "ATAN2(a, b)"], atan2(3, 7))
+ expect_equal(collect(select(df, atan2(df$a, df$b)))[4, "ATAN2(a, b)"], atan2(4, 8))
+ ## nolint start
+ expect_equal(collect(select(df, hypot(df$a, df$b)))[1, "HYPOT(a, b)"], sqrt(1^2 + 5^2))
+ expect_equal(collect(select(df, hypot(df$a, df$b)))[2, "HYPOT(a, b)"], sqrt(2^2 + 6^2))
+ expect_equal(collect(select(df, hypot(df$a, df$b)))[3, "HYPOT(a, b)"], sqrt(3^2 + 7^2))
+ expect_equal(collect(select(df, hypot(df$a, df$b)))[4, "HYPOT(a, b)"], sqrt(4^2 + 8^2))
+ ## nolint end
+ expect_equal(collect(select(df, shiftLeft(df$b, 1)))[4, 1], 16)
+ expect_equal(collect(select(df, shiftRight(df$b, 1)))[4, 1], 4)
+ expect_equal(collect(select(df, shiftRightUnsigned(df$b, 1)))[4, 1], 4)
+ expect_equal(class(collect(select(df, rand()))[2, 1]), "numeric")
+ expect_equal(collect(select(df, rand(1)))[1, 1], 0.134, tolerance = 0.01)
+ expect_equal(class(collect(select(df, randn()))[2, 1]), "numeric")
+ expect_equal(collect(select(df, randn(1)))[1, 1], -1.03, tolerance = 0.01)
+})
+
+test_that("string operators", {
+ df <- read.json(jsonPath)
+ expect_equal(count(where(df, like(df$name, "A%"))), 1)
+ expect_equal(count(where(df, startsWith(df$name, "A"))), 1)
+ expect_true(first(select(df, startsWith(df$name, "M")))[[1]])
+ expect_false(first(select(df, startsWith(df$name, "m")))[[1]])
+ expect_true(first(select(df, endsWith(df$name, "el")))[[1]])
+ expect_equal(first(select(df, substr(df$name, 1, 2)))[[1]], "Mi")
+ if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) {
+ expect_true(startsWith("Hello World", "Hello"))
+ expect_false(endsWith("Hello World", "a"))
+ }
+ expect_equal(collect(select(df, cast(df$age, "string")))[[2, 1]], "30")
+ expect_equal(collect(select(df, concat(df$name, lit(":"), df$age)))[[2, 1]], "Andy:30")
+ expect_equal(collect(select(df, concat_ws(":", df$name)))[[2, 1]], "Andy")
+ expect_equal(collect(select(df, concat_ws(":", df$name, df$age)))[[2, 1]], "Andy:30")
+ expect_equal(collect(select(df, instr(df$name, "i")))[, 1], c(2, 0, 5))
+ expect_equal(collect(select(df, format_number(df$age, 2)))[2, 1], "30.00")
+ expect_equal(collect(select(df, sha1(df$name)))[2, 1],
+ "ab5a000e88b5d9d0fa2575f5c6263eb93452405d")
+ expect_equal(collect(select(df, sha2(df$name, 256)))[2, 1],
+ "80f2aed3c618c423ddf05a2891229fba44942d907173152442cf6591441ed6dc")
+ expect_equal(collect(select(df, format_string("Name:%s", df$name)))[2, 1], "Name:Andy")
+ expect_equal(collect(select(df, format_string("%s, %d", df$name, df$age)))[2, 1], "Andy, 30")
+ expect_equal(collect(select(df, regexp_extract(df$name, "(n.y)", 1)))[2, 1], "ndy")
+ expect_equal(collect(select(df, regexp_replace(df$name, "(n.y)", "ydn")))[2, 1], "Aydn")
+
+ l2 <- list(list(a = "aaads"))
+ df2 <- createDataFrame(l2)
+ expect_equal(collect(select(df2, locate("aa", df2$a)))[1, 1], 1)
+ expect_equal(collect(select(df2, locate("aa", df2$a, 2)))[1, 1], 2)
+ expect_equal(collect(select(df2, lpad(df2$a, 8, "#")))[1, 1], "###aaads") # nolint
+ expect_equal(collect(select(df2, rpad(df2$a, 8, "#")))[1, 1], "aaads###") # nolint
+
+ l3 <- list(list(a = "a.b.c.d"))
+ df3 <- createDataFrame(l3)
+ expect_equal(collect(select(df3, substring_index(df3$a, ".", 2)))[1, 1], "a.b")
+ expect_equal(collect(select(df3, substring_index(df3$a, ".", -3)))[1, 1], "b.c.d")
+ expect_equal(collect(select(df3, translate(df3$a, "bc", "12")))[1, 1], "a.1.2.d")
+
+ l4 <- list(list(a = "a.b@c.d 1\\b"))
+ df4 <- createDataFrame(l4)
+ expect_equal(
+ collect(select(df4, split_string(df4$a, "\\s+")))[1, 1],
+ list(list("a.b@c.d", "1\\b"))
+ )
+ expect_equal(
+ collect(select(df4, split_string(df4$a, "\\.")))[1, 1],
+ list(list("a", "b@c", "d 1\\b"))
+ )
+ expect_equal(
+ collect(select(df4, split_string(df4$a, "@")))[1, 1],
+ list(list("a.b", "c.d 1\\b"))
+ )
+ expect_equal(
+ collect(select(df4, split_string(df4$a, "\\\\")))[1, 1],
+ list(list("a.b@c.d 1", "b"))
+ )
+
+ l5 <- list(list(a = "abc"))
+ df5 <- createDataFrame(l5)
+ expect_equal(
+ collect(select(df5, repeat_string(df5$a, 1L)))[1, 1],
+ "abc"
+ )
+ expect_equal(
+ collect(select(df5, repeat_string(df5$a, 3)))[1, 1],
+ "abcabcabc"
+ )
+ expect_equal(
+ collect(select(df5, repeat_string(df5$a, -1)))[1, 1],
+ ""
+ )
+})
+
+test_that("date functions on a DataFrame", {
+ .originalTimeZone <- Sys.getenv("TZ")
+ Sys.setenv(TZ = "UTC")
+ l <- list(list(a = 1L, b = as.Date("2012-12-13")),
+ list(a = 2L, b = as.Date("2013-12-14")),
+ list(a = 3L, b = as.Date("2014-12-15")))
+ df <- createDataFrame(l)
+ expect_equal(collect(select(df, dayofmonth(df$b)))[, 1], c(13, 14, 15))
+ expect_equal(collect(select(df, dayofyear(df$b)))[, 1], c(348, 348, 349))
+ expect_equal(collect(select(df, weekofyear(df$b)))[, 1], c(50, 50, 51))
+ expect_equal(collect(select(df, year(df$b)))[, 1], c(2012, 2013, 2014))
+ expect_equal(collect(select(df, month(df$b)))[, 1], c(12, 12, 12))
+ expect_equal(collect(select(df, last_day(df$b)))[, 1],
+ c(as.Date("2012-12-31"), as.Date("2013-12-31"), as.Date("2014-12-31")))
+ expect_equal(collect(select(df, next_day(df$b, "MONDAY")))[, 1],
+ c(as.Date("2012-12-17"), as.Date("2013-12-16"), as.Date("2014-12-22")))
+ expect_equal(collect(select(df, date_format(df$b, "y")))[, 1], c("2012", "2013", "2014"))
+ expect_equal(collect(select(df, add_months(df$b, 3)))[, 1],
+ c(as.Date("2013-03-13"), as.Date("2014-03-14"), as.Date("2015-03-15")))
+ expect_equal(collect(select(df, date_add(df$b, 1)))[, 1],
+ c(as.Date("2012-12-14"), as.Date("2013-12-15"), as.Date("2014-12-16")))
+ expect_equal(collect(select(df, date_sub(df$b, 1)))[, 1],
+ c(as.Date("2012-12-12"), as.Date("2013-12-13"), as.Date("2014-12-14")))
+
+ l2 <- list(list(a = 1L, b = as.POSIXlt("2012-12-13 12:34:00", tz = "UTC")),
+ list(a = 2L, b = as.POSIXlt("2014-12-15 01:24:34", tz = "UTC")))
+ df2 <- createDataFrame(l2)
+ expect_equal(collect(select(df2, minute(df2$b)))[, 1], c(34, 24))
+ expect_equal(collect(select(df2, second(df2$b)))[, 1], c(0, 34))
+ expect_equal(collect(select(df2, from_utc_timestamp(df2$b, "JST")))[, 1],
+ c(as.POSIXlt("2012-12-13 21:34:00 UTC"), as.POSIXlt("2014-12-15 10:24:34 UTC")))
+ expect_equal(collect(select(df2, to_utc_timestamp(df2$b, "JST")))[, 1],
+ c(as.POSIXlt("2012-12-13 03:34:00 UTC"), as.POSIXlt("2014-12-14 16:24:34 UTC")))
+ expect_gt(collect(select(df2, unix_timestamp()))[1, 1], 0)
+ expect_gt(collect(select(df2, unix_timestamp(df2$b)))[1, 1], 0)
+ expect_gt(collect(select(df2, unix_timestamp(lit("2015-01-01"), "yyyy-MM-dd")))[1, 1], 0)
+
+ l3 <- list(list(a = 1000), list(a = -1000))
+ df3 <- createDataFrame(l3)
+ result31 <- collect(select(df3, from_unixtime(df3$a)))
+ expect_equal(grep("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", result31[, 1], perl = TRUE),
+ c(1, 2))
+ result32 <- collect(select(df3, from_unixtime(df3$a, "yyyy")))
+ expect_equal(grep("\\d{4}", result32[, 1]), c(1, 2))
+ Sys.setenv(TZ = .originalTimeZone)
+})
+
+test_that("greatest() and least() on a DataFrame", {
+ l <- list(list(a = 1, b = 2), list(a = 3, b = 4))
+ df <- createDataFrame(l)
+ expect_equal(collect(select(df, greatest(df$a, df$b)))[, 1], c(2, 4))
+ expect_equal(collect(select(df, least(df$a, df$b)))[, 1], c(1, 3))
+})
+
+test_that("time windowing (window()) with all inputs", {
+ df <- createDataFrame(data.frame(t = c("2016-03-11 09:00:07"), v = c(1)))
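+ # window() arguments: windowDuration, slideDuration, startTime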
+ df$window <- window(df$t, "5 seconds", "5 seconds", "0 seconds")
+ local <- collect(df)$v
+ # Not checking time windows because of possible time zone issues. Just checking that the function
+ # works
+ expect_equal(local, c(1))
+})
+
+test_that("time windowing (window()) with slide duration", {
+ df <- createDataFrame(data.frame(t = c("2016-03-11 09:00:07"), v = c(1)))
+ df$window <- window(df$t, "5 seconds", "2 seconds")
+ local <- collect(df)$v
+ # Not checking time windows because of possible time zone issues. Just checking that the function
+ # works
+ expect_equal(local, c(1, 1))
+})
+
+test_that("time windowing (window()) with start time", {
+ df <- createDataFrame(data.frame(t = c("2016-03-11 09:00:07"), v = c(1)))
+ df$window <- window(df$t, "5 seconds", startTime = "2 seconds")
+ local <- collect(df)$v
+ # Not checking time windows because of possible time zone issues. Just checking that the function
+ # works
+ expect_equal(local, c(1))
+})
+
+test_that("time windowing (window()) with just window duration", {
+ df <- createDataFrame(data.frame(t = c("2016-03-11 09:00:07"), v = c(1)))
+ df$window <- window(df$t, "5 seconds")
+ local <- collect(df)$v
+ # Not checking time windows because of possible time zone issues. Just checking that the function
+ # works
+ expect_equal(local, c(1))
+})
+
+test_that("when(), otherwise() and ifelse() on a DataFrame", {
+ l <- list(list(a = 1, b = 2), list(a = 3, b = 4))
+ df <- createDataFrame(l)
+ expect_equal(collect(select(df, when(df$a > 1 & df$b > 2, 1)))[, 1], c(NA, 1))
+ expect_equal(collect(select(df, otherwise(when(df$a > 1, 1), 0)))[, 1], c(0, 1))
+ expect_equal(collect(select(df, ifelse(df$a > 1 & df$b > 2, 0, 1)))[, 1], c(1, 0))
+})
+
+test_that("when(), otherwise() and ifelse() with column on a DataFrame", {
+ l <- list(list(a = 1, b = 2), list(a = 3, b = 4))
+ df <- createDataFrame(l)
+ expect_equal(collect(select(df, when(df$a > 1 & df$b > 2, lit(1))))[, 1], c(NA, 1))
+ expect_equal(collect(select(df, otherwise(when(df$a > 1, lit(1)), lit(0))))[, 1], c(0, 1))
+ expect_equal(collect(select(df, ifelse(df$a > 1 & df$b > 2, lit(0), lit(1))))[, 1], c(1, 0))
+})
+
+test_that("group by, agg functions", {
+ skip_on_cran()
+
+ df <- read.json(jsonPath)
+ df1 <- agg(df, name = "max", age = "sum")
+ expect_equal(1, count(df1))
+ df1 <- agg(df, age2 = max(df$age))
+ expect_equal(1, count(df1))
+ expect_equal(columns(df1), c("age2"))
+
+ gd <- groupBy(df, "name")
+ expect_is(gd, "GroupedData")
+ df2 <- count(gd)
+ expect_is(df2, "SparkDataFrame")
+ expect_equal(3, count(df2))
+
+ # Also test group_by, summarize, mean
+ gd1 <- group_by(df, "name")
+ expect_is(gd1, "GroupedData")
+ df_summarized <- summarize(gd, mean_age = mean(df$age))
+ expect_is(df_summarized, "SparkDataFrame")
+ expect_equal(3, count(df_summarized))
+
+ df3 <- agg(gd, age = "stddev")
+ expect_is(df3, "SparkDataFrame")
+ df3_local <- collect(df3)
+ expect_true(is.nan(df3_local[df3_local$name == "Andy", ][1, 2]))
+
+ df4 <- agg(gd, sumAge = sum(df$age))
+ expect_is(df4, "SparkDataFrame")
+ expect_equal(3, count(df4))
+ expect_equal(columns(df4), c("name", "sumAge"))
+
+ df5 <- sum(gd, "age")
+ expect_is(df5, "SparkDataFrame")
+ expect_equal(3, count(df5))
+
+ expect_equal(3, count(mean(gd)))
+ expect_equal(3, count(max(gd)))
+ expect_equal(30, collect(max(gd))[2, 2])
+ expect_equal(1, collect(count(gd))[1, 2])
+
+ mockLines2 <- c("{\"name\":\"ID1\", \"value\": \"10\"}",
+ "{\"name\":\"ID1\", \"value\": \"10\"}",
+ "{\"name\":\"ID1\", \"value\": \"22\"}",
+ "{\"name\":\"ID2\", \"value\": \"-3\"}")
+ jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ writeLines(mockLines2, jsonPath2)
+ gd2 <- groupBy(read.json(jsonPath2), "name")
+ df6 <- agg(gd2, value = "sum")
+ df6_local <- collect(df6)
+ expect_equal(42, df6_local[df6_local$name == "ID1", ][1, 2])
+ expect_equal(-3, df6_local[df6_local$name == "ID2", ][1, 2])
+
+ df7 <- agg(gd2, value = "stddev")
+ df7_local <- collect(df7)
+ expect_true(abs(df7_local[df7_local$name == "ID1", ][1, 2] - 6.928203) < 1e-6)
+ expect_true(is.nan(df7_local[df7_local$name == "ID2", ][1, 2]))
+
+ mockLines3 <- c("{\"name\":\"Andy\", \"age\":30}",
+ "{\"name\":\"Andy\", \"age\":30}",
+ "{\"name\":\"Justin\", \"age\":19}",
+ "{\"name\":\"Justin\", \"age\":1}")
+ jsonPath3 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ writeLines(mockLines3, jsonPath3)
+ df8 <- read.json(jsonPath3)
+ gd3 <- groupBy(df8, "name")
+ gd3_local <- collect(sum(gd3))
+ expect_equal(60, gd3_local[gd3_local$name == "Andy", ][1, 2])
+ expect_equal(20, gd3_local[gd3_local$name == "Justin", ][1, 2])
+
+ expect_true(abs(collect(agg(df, sd(df$age)))[1, 1] - 7.778175) < 1e-6)
+ gd3_local <- collect(agg(gd3, var(df8$age)))
+ expect_equal(162, gd3_local[gd3_local$name == "Justin", ][1, 2])
+
+ # Test stats::sd, stats::var are working
+ expect_true(abs(sd(1:2) - 0.7071068) < 1e-6)
+ expect_true(abs(var(1:5, 1:5) - 2.5) < 1e-6)
+
+ # Test collect_list and collect_set
+ gd3_collections_local <- collect(
+ agg(gd3, collect_set(df8$age), collect_list(df8$age))
+ )
+
+ expect_equal(
+ unlist(gd3_collections_local[gd3_collections_local$name == "Andy", 2]),
+ c(30)
+ )
+
+ expect_equal(
+ unlist(gd3_collections_local[gd3_collections_local$name == "Andy", 3]),
+ c(30, 30)
+ )
+
+ expect_equal(
+ sort(unlist(
+ gd3_collections_local[gd3_collections_local$name == "Justin", 3]
+ )),
+ c(1, 19)
+ )
+
+ unlink(jsonPath2)
+ unlink(jsonPath3)
+})
+
+test_that("pivot GroupedData column", {
+ df <- createDataFrame(data.frame(
+ earnings = c(10000, 10000, 11000, 15000, 12000, 20000, 21000, 22000),
+ course = c("R", "Python", "R", "Python", "R", "Python", "R", "Python"),
+ year = c(2013, 2013, 2014, 2014, 2015, 2015, 2016, 2016)
+ ))
+ sum1 <- collect(sum(pivot(groupBy(df, "year"), "course"), "earnings"))
+ sum2 <- collect(sum(pivot(groupBy(df, "year"), "course", c("Python", "R")), "earnings"))
+ sum3 <- collect(sum(pivot(groupBy(df, "year"), "course", list("Python", "R")), "earnings"))
+ sum4 <- collect(sum(pivot(groupBy(df, "year"), "course", "R"), "earnings"))
+
+ correct_answer <- data.frame(
+ year = c(2013, 2014, 2015, 2016),
+ Python = c(10000, 15000, 20000, 22000),
+ R = c(10000, 11000, 12000, 21000)
+ )
+ expect_equal(sum1, correct_answer)
+ expect_equal(sum2, correct_answer)
+ expect_equal(sum3, correct_answer)
+ expect_equal(sum4, correct_answer[, c("year", "R")])
+
+ expect_error(collect(sum(pivot(groupBy(df, "year"), "course", c("R", "R")), "earnings")))
+ expect_error(collect(sum(pivot(groupBy(df, "year"), "course", list("R", "R")), "earnings")))
+})
+
+test_that("test multi-dimensional aggregations with cube and rollup", {
+ df <- createDataFrame(data.frame(
+ id = 1:6,
+ year = c(2016, 2016, 2016, 2017, 2017, 2017),
+ salary = c(10000, 15000, 20000, 22000, 32000, 21000),
+ department = c("management", "rnd", "sales", "management", "rnd", "sales")
+ ))
+
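+ # grouping_bit() is 1 when a column is aggregated away (NULL in that cube row);
+ # grouping_id() packs those bits into one integer, with the first column as the high bit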
+ actual_cube <- collect(
+ orderBy(
+ agg(
+ cube(df, "year", "department"),
+ expr("sum(salary) AS total_salary"),
+ expr("avg(salary) AS average_salary"),
+ alias(grouping_bit(df$year), "grouping_year"),
+ alias(grouping_bit(df$department), "grouping_department"),
+ alias(grouping_id(df$year, df$department), "grouping_id")
+ ),
+ "year", "department"
+ )
+ )
+
+ expected_cube <- data.frame(
+ year = c(rep(NA, 4), rep(2016, 4), rep(2017, 4)),
+ department = rep(c(NA, "management", "rnd", "sales"), times = 3),
+ total_salary = c(
+ 120000, # Total
+ 10000 + 22000, 15000 + 32000, 20000 + 21000, # Department only
+ 20000 + 15000 + 10000, # 2016
+ 10000, 15000, 20000, # 2016 each department
+ 21000 + 32000 + 22000, # 2017
+ 22000, 32000, 21000 # 2017 each department
+ ),
+ average_salary = c(
+ # Total
+ mean(c(20000, 15000, 10000, 21000, 32000, 22000)),
+ # Mean by department
+ mean(c(10000, 22000)), mean(c(15000, 32000)), mean(c(20000, 21000)),
+ mean(c(10000, 15000, 20000)), # 2016
+ 10000, 15000, 20000, # 2016 each department
+ mean(c(21000, 32000, 22000)), # 2017
+ 22000, 32000, 21000 # 2017 each department
+ ),
+ grouping_year = c(
+ 1, # global
+ 1, 1, 1, # by department
+ 0, # 2016
+ 0, 0, 0, # 2016 by department
+ 0, # 2017
+ 0, 0, 0 # 2017 by department
+ ),
+ grouping_department = c(
+ 1, # global
+ 0, 0, 0, # by department
+ 1, # 2016
+ 0, 0, 0, # 2016 by department
+ 1, # 2017
+ 0, 0, 0 # 2017 by department
+ ),
+ grouping_id = c(
+ 3, # 11
+ 2, 2, 2, # 10
+ 1, # 01
+ 0, 0, 0, # 00
+ 1, # 01
+ 0, 0, 0 # 00
+ ),
+ stringsAsFactors = FALSE
+ )
+
+ expect_equal(actual_cube, expected_cube)
+
+ # cube should accept column objects
+ expect_equal(
+ count(sum(cube(df, df$year, df$department), "salary")),
+ 12
+ )
+
+ # cube without columns should result in a single aggregate
+ expect_equal(
+ collect(agg(cube(df), expr("sum(salary) as total_salary"))),
+ data.frame(total_salary = 120000)
+ )
+
+ actual_rollup <- collect(
+ orderBy(
+ agg(
+ rollup(df, "year", "department"),
+ expr("sum(salary) AS total_salary"), expr("avg(salary) AS average_salary"),
+ alias(grouping_bit(df$year), "grouping_year"),
+ alias(grouping_bit(df$department), "grouping_department"),
+ alias(grouping_id(df$year, df$department), "grouping_id")
+ ),
+ "year", "department"
+ )
+ )
+
+ expected_rollup <- data.frame(
+ year = c(NA, rep(2016, 4), rep(2017, 4)),
+ department = c(NA, rep(c(NA, "management", "rnd", "sales"), times = 2)),
+ total_salary = c(
+ 120000, # Total
+ 20000 + 15000 + 10000, # 2016
+ 10000, 15000, 20000, # 2016 each department
+ 21000 + 32000 + 22000, # 2017
+ 22000, 32000, 21000 # 2017 each department
+ ),
+ average_salary = c(
+ # Total
+ mean(c(20000, 15000, 10000, 21000, 32000, 22000)),
+ mean(c(10000, 15000, 20000)), # 2016
+ 10000, 15000, 20000, # 2016 each department
+ mean(c(21000, 32000, 22000)), # 2017
+ 22000, 32000, 21000 # 2017 each department
+ ),
+ grouping_year = c(
+ 1, # global
+ 0, # 2016
+ 0, 0, 0, # 2016 each department
+ 0, # 2017
+ 0, 0, 0 # 2017 each department
+ ),
+ grouping_department = c(
+ 1, # global
+ 1, # 2016
+ 0, 0, 0, # 2016 each department
+ 1, # 2017
+ 0, 0, 0 # 2017 each department
+ ),
+ grouping_id = c(
+ 3, # 11
+ 1, # 01
+ 0, 0, 0, # 00
+ 1, # 01
+ 0, 0, 0 # 00
+ ),
+ stringsAsFactors = FALSE
+ )
+
+ expect_equal(actual_rollup, expected_rollup)
+
+ # rollup should accept column objects
+ expect_equal(
+ count(sum(rollup(df, df$year, df$department), "salary")),
+ 9
+ )
+
+ # rollup without columns should result in a single aggregate
+ expect_equal(
+ collect(agg(rollup(df), expr("sum(salary) as total_salary"))),
+ data.frame(total_salary = 120000)
+ )
+})
+
+test_that("arrange() and orderBy() on a DataFrame", {
+ df <- read.json(jsonPath)
+ sorted <- arrange(df, df$age)
+ expect_equal(collect(sorted)[1, 2], "Michael")
+
+ sorted2 <- arrange(df, "name", decreasing = FALSE)
+ expect_equal(collect(sorted2)[2, "age"], 19)
+
+ sorted3 <- orderBy(df, asc(df$age))
+ expect_true(is.na(first(sorted3)$age))
+ expect_equal(collect(sorted3)[2, "age"], 19)
+
+ sorted4 <- orderBy(df, desc(df$name))
+ expect_equal(first(sorted4)$name, "Michael")
+ expect_equal(collect(sorted4)[3, "name"], "Andy")
+
+ sorted5 <- arrange(df, "age", "name", decreasing = TRUE)
+ expect_equal(collect(sorted5)[1, 2], "Andy")
+
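+ # decreasing may also be a logical vector giving a per-column sort direction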
+ sorted6 <- arrange(df, "age", "name", decreasing = c(T, F))
+ expect_equal(collect(sorted6)[1, 2], "Andy")
+
+ sorted7 <- arrange(df, "name", decreasing = FALSE)
+ expect_equal(collect(sorted7)[2, "age"], 19)
+})
+
+test_that("filter() on a DataFrame", {
+ df <- read.json(jsonPath)
+ filtered <- filter(df, "age > 20")
+ expect_equal(count(filtered), 1)
+ expect_equal(collect(filtered)$name, "Andy")
+ filtered2 <- where(df, df$name != "Michael")
+ expect_equal(count(filtered2), 2)
+ expect_equal(collect(filtered2)$age[2], 19)
+
+ # test suites for %in%
+ filtered3 <- filter(df, "age in (19)")
+ expect_equal(count(filtered3), 1)
+ filtered4 <- filter(df, "age in (19, 30)")
+ expect_equal(count(filtered4), 2)
+ filtered5 <- where(df, df$age %in% c(19))
+ expect_equal(count(filtered5), 1)
+ filtered6 <- where(df, df$age %in% c(19, 30))
+ expect_equal(count(filtered6), 2)
+
+ # test suites for %<=>%
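+ # %<=>% is the null-safe equality operator: NULL %<=>% NULL evaluates to TRUE instead of NA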
+ dfNa <- read.json(jsonPathNa)
+ expect_equal(count(filter(dfNa, dfNa$age %<=>% 60)), 1)
+ expect_equal(count(filter(dfNa, !(dfNa$age %<=>% 60))), 5 - 1)
+ expect_equal(count(filter(dfNa, dfNa$age %<=>% NULL)), 3)
+ expect_equal(count(filter(dfNa, !(dfNa$age %<=>% NULL))), 5 - 3)
+ # match NA from two columns
+ expect_equal(count(filter(dfNa, dfNa$age %<=>% dfNa$height)), 2)
+ expect_equal(count(filter(dfNa, !(dfNa$age %<=>% dfNa$height))), 5 - 2)
+
+ # Test stats::filter is working
+ #expect_true(is.ts(filter(1:100, rep(1, 3)))) # nolint
+})
+
+test_that("join(), crossJoin() and merge() on a DataFrame", {
+ skip_on_cran()
+
+ df <- read.json(jsonPath)
+
+ mockLines2 <- c("{\"name\":\"Michael\", \"test\": \"yes\"}",
+ "{\"name\":\"Andy\", \"test\": \"no\"}",
+ "{\"name\":\"Justin\", \"test\": \"yes\"}",
+ "{\"name\":\"Bob\", \"test\": \"yes\"}")
+ jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ writeLines(mockLines2, jsonPath2)
+ df2 <- read.json(jsonPath2)
+
+ # inner join, not cartesian join
+ expect_equal(count(where(join(df, df2), df$name == df2$name)), 3)
+ # cartesian join
+ expect_error(tryCatch(count(join(df, df2)), error = function(e) { stop(e) }),
+ paste0(".*(org.apache.spark.sql.AnalysisException: Detected cartesian product for",
+ " INNER join between logical plans).*"))
+
+ joined <- crossJoin(df, df2)
+ expect_equal(names(joined), c("age", "name", "name", "test"))
+ expect_equal(count(joined), 12)
+ expect_equal(names(collect(joined)), c("age", "name", "name", "test"))
+
+ joined2 <- join(df, df2, df$name == df2$name)
+ expect_equal(names(joined2), c("age", "name", "name", "test"))
+ expect_equal(count(joined2), 3)
+
+ joined3 <- join(df, df2, df$name == df2$name, "rightouter")
+ expect_equal(names(joined3), c("age", "name", "name", "test"))
+ expect_equal(count(joined3), 4)
+ expect_true(is.na(collect(orderBy(joined3, joined3$age))$age[2]))
+
+ joined4 <- select(join(df, df2, df$name == df2$name, "outer"),
+ alias(df$age + 5, "newAge"), df$name, df2$test)
+ expect_equal(names(joined4), c("newAge", "name", "test"))
+ expect_equal(count(joined4), 4)
+ expect_equal(collect(orderBy(joined4, joined4$name))$newAge[3], 24)
+
+ joined5 <- join(df, df2, df$name == df2$name, "leftouter")
+ expect_equal(names(joined5), c("age", "name", "name", "test"))
+ expect_equal(count(joined5), 3)
+ expect_true(is.na(collect(orderBy(joined5, joined5$age))$age[1]))
+
+ joined6 <- join(df, df2, df$name == df2$name, "inner")
+ expect_equal(names(joined6), c("age", "name", "name", "test"))
+ expect_equal(count(joined6), 3)
+
+ joined7 <- join(df, df2, df$name == df2$name, "leftsemi")
+ expect_equal(names(joined7), c("age", "name"))
+ expect_equal(count(joined7), 3)
+
+ joined8 <- join(df, df2, df$name == df2$name, "left_outer")
+ expect_equal(names(joined8), c("age", "name", "name", "test"))
+ expect_equal(count(joined8), 3)
+ expect_true(is.na(collect(orderBy(joined8, joined8$age))$age[1]))
+
+ joined9 <- join(df, df2, df$name == df2$name, "right_outer")
+ expect_equal(names(joined9), c("age", "name", "name", "test"))
+ expect_equal(count(joined9), 4)
+ expect_true(is.na(collect(orderBy(joined9, joined9$age))$age[2]))
+
+ merged <- merge(df, df2, by.x = "name", by.y = "name", all.x = TRUE, all.y = TRUE)
+ expect_equal(count(merged), 4)
+ expect_equal(names(merged), c("age", "name_x", "name_y", "test"))
+ expect_equal(collect(orderBy(merged, merged$name_x))$age[3], 19)
+
+ merged <- merge(df, df2, suffixes = c("-X", "-Y"))
+ expect_equal(count(merged), 3)
+ expect_equal(names(merged), c("age", "name-X", "name-Y", "test"))
+ expect_equal(collect(orderBy(merged, merged$"name-X"))$age[1], 30)
+
+ merged <- merge(df, df2, by = "name", suffixes = c("-X", "-Y"), sort = FALSE)
+ expect_equal(count(merged), 3)
+ expect_equal(names(merged), c("age", "name-X", "name-Y", "test"))
+ expect_equal(collect(orderBy(merged, merged$"name-Y"))$"name-X"[3], "Michael")
+
+ merged <- merge(df, df2, by = "name", all = T, sort = T)
+ expect_equal(count(merged), 4)
+ expect_equal(names(merged), c("age", "name_x", "name_y", "test"))
+ expect_equal(collect(orderBy(merged, merged$"name_y"))$"name_x"[1], "Andy")
+
+ merged <- merge(df, df2, by = NULL)
+ expect_equal(count(merged), 12)
+ expect_equal(names(merged), c("age", "name", "name", "test"))
+
+ mockLines3 <- c("{\"name\":\"Michael\", \"name_y\":\"Michael\", \"test\": \"yes\"}",
+ "{\"name\":\"Andy\", \"name_y\":\"Andy\", \"test\": \"no\"}",
+ "{\"name\":\"Justin\", \"name_y\":\"Justin\", \"test\": \"yes\"}",
+ "{\"name\":\"Bob\", \"name_y\":\"Bob\", \"test\": \"yes\"}")
+ jsonPath3 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ writeLines(mockLines3, jsonPath3)
+ df3 <- read.json(jsonPath3)
+ expect_error(merge(df, df3),
+ paste("The following column name: name_y occurs more than once in the 'DataFrame'.",
+ "Please use different suffixes for the intersected columns.", sep = ""))
+
+ unlink(jsonPath2)
+ unlink(jsonPath3)
+
+ # Join with broadcast hint
+ df1 <- sql("SELECT * FROM range(10e10)")
+ df2 <- sql("SELECT * FROM range(10e10)")
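+ # both inputs are far larger than spark.sql.autoBroadcastJoinThreshold (10MB by default),
+ # so the planner should not pick a broadcast join unless hinted below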
+
+ execution_plan <- capture.output(explain(join(df1, df2, df1$id == df2$id)))
+ expect_false(any(grepl("BroadcastHashJoin", execution_plan)))
+
+ execution_plan_hint <- capture.output(
+ explain(join(df1, hint(df2, "broadcast"), df1$id == df2$id))
+ )
+ expect_true(any(grepl("BroadcastHashJoin", execution_plan_hint)))
+
+ execution_plan_broadcast <- capture.output(
+ explain(join(df1, broadcast(df2), df1$id == df2$id))
+ )
+ expect_true(any(grepl("BroadcastHashJoin", execution_plan_broadcast)))
+})
+
+test_that("toJSON() on DataFrame", {
+ df <- as.DataFrame(cars)
+ df_json <- toJSON(df)
+ expect_is(df_json, "SparkDataFrame")
+ expect_equal(colnames(df_json), c("value"))
+ expect_equal(head(df_json, 1),
+ data.frame(value = "{\"speed\":4.0,\"dist\":2.0}", stringsAsFactors = FALSE))
+})
+
+test_that("showDF()", {
+ df <- read.json(jsonPath)
+ expected <- paste("+----+-------+\n",
+ "| age| name|\n",
+ "+----+-------+\n",
+ "|null|Michael|\n",
+ "| 30| Andy|\n",
+ "| 19| Justin|\n",
+ "+----+-------+\n", sep = "")
+ expected2 <- paste("+---+----+\n",
+ "|age|name|\n",
+ "+---+----+\n",
+ "|nul| Mic|\n",
+ "| 30| And|\n",
+ "| 19| Jus|\n",
+ "+---+----+\n", sep = "")
+ expect_output(showDF(df), expected)
+ expect_output(showDF(df, truncate = 3), expected2)
+})
+
+test_that("isLocal()", {
+ df <- read.json(jsonPath)
+ expect_false(isLocal(df))
+})
+
+test_that("union(), rbind(), except(), and intersect() on a DataFrame", {
+ df <- read.json(jsonPath)
+
+ lines <- c("{\"name\":\"Bob\", \"age\":24}",
+ "{\"name\":\"Andy\", \"age\":30}",
+ "{\"name\":\"James\", \"age\":35}")
+ jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ writeLines(lines, jsonPath2)
+ df2 <- read.df(jsonPath2, "json")
+
+ unioned <- arrange(union(df, df2), df$age)
+ expect_is(unioned, "SparkDataFrame")
+ expect_equal(count(unioned), 6)
+ expect_equal(first(unioned)$name, "Michael")
+ expect_equal(count(arrange(suppressWarnings(unionAll(df, df2)), df$age)), 6)
+
+ unioned2 <- arrange(rbind(unioned, df, df2), df$age)
+ expect_is(unioned2, "SparkDataFrame")
+ expect_equal(count(unioned2), 12)
+ expect_equal(first(unioned2)$name, "Michael")
+
+ df3 <- df2
+ names(df3)[1] <- "newName"
+ expect_error(rbind(df, df3),
+ "Names of input data frames are different.")
+ expect_error(rbind(df, df2, df3),
+ "Names of input data frames are different.")
+
+ excepted <- arrange(except(df, df2), desc(df$age))
+ expect_is(unioned, "SparkDataFrame")
+ expect_equal(count(excepted), 2)
+ expect_equal(first(excepted)$name, "Justin")
+
+ intersected <- arrange(intersect(df, df2), df$age)
+ expect_is(unioned, "SparkDataFrame")
+ expect_equal(count(intersected), 1)
+ expect_equal(first(intersected)$name, "Andy")
+
+ # Test base::union is working
+ expect_equal(union(c(1:3), c(3:5)), c(1:5))
+
+ # Test base::rbind is working
+ expect_equal(length(rbind(1:4, c = 2, a = 10, 10, deparse.level = 0)), 16)
+
+ # Test base::intersect is working
+ expect_equal(length(intersect(1:20, 3:23)), 18)
+
+ unlink(jsonPath2)
+})
+
+test_that("withColumn() and withColumnRenamed()", {
+ df <- read.json(jsonPath)
+ newDF <- withColumn(df, "newAge", df$age + 2)
+ expect_equal(length(columns(newDF)), 3)
+ expect_equal(columns(newDF)[3], "newAge")
+ expect_equal(first(filter(newDF, df$name != "Michael"))$newAge, 32)
+
+ # Replace existing column
+ newDF <- withColumn(df, "age", df$age + 2)
+ expect_equal(length(columns(newDF)), 2)
+ expect_equal(first(filter(newDF, df$name != "Michael"))$age, 32)
+
+ newDF <- withColumn(df, "age", 18)
+ expect_equal(length(columns(newDF)), 2)
+ expect_equal(first(newDF)$age, 18)
+
+ expect_error(withColumn(df, "age", list("a")),
+ "Literal value must be atomic in length of 1")
+
+ newDF2 <- withColumnRenamed(df, "age", "newerAge")
+ expect_equal(length(columns(newDF2)), 2)
+ expect_equal(columns(newDF2)[1], "newerAge")
+})
+
+test_that("mutate(), transform(), rename() and names()", {
+ df <- read.json(jsonPath)
+ newDF <- mutate(df, newAge = df$age + 2)
+ expect_equal(length(columns(newDF)), 3)
+ expect_equal(columns(newDF)[3], "newAge")
+ expect_equal(first(filter(newDF, df$name != "Michael"))$newAge, 32)
+
+ newDF <- mutate(df, age = df$age
<TRUNCATED>
[3/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic
tests only for CRAN
Posted by fe...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_mllib_fpm.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_mllib_fpm.R b/R/pkg/tests/fulltests/test_mllib_fpm.R
new file mode 100644
index 0000000..4e10ca1
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_mllib_fpm.R
@@ -0,0 +1,85 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("MLlib frequent pattern mining")
+
+# Tests for MLlib frequent pattern mining algorithms in SparkR
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+test_that("spark.fpGrowth", {
+ data <- selectExpr(createDataFrame(data.frame(items = c(
+ "1,2",
+ "1,2",
+ "1,2,3",
+ "1,3"
+ ))), "split(items, ',') as items")
+
+ model <- spark.fpGrowth(data, minSupport = 0.3, minConfidence = 0.8, numPartitions = 1)
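+ # with 4 transactions, minSupport = 0.3 keeps itemsets that occur at least twice;
+ # minConfidence = 0.8 filters the association rules mined from those itemsets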
+
+ itemsets <- collect(spark.freqItemsets(model))
+
+ expected_itemsets <- data.frame(
+ items = I(list(list("3"), list("3", "1"), list("2"), list("2", "1"), list("1"))),
+ freq = c(2, 2, 3, 3, 4)
+ )
+
+ expect_equivalent(expected_itemsets, itemsets)
+
+ expected_association_rules <- data.frame(
+ antecedent = I(list(list("2"), list("3"))),
+ consequent = I(list(list("1"), list("1"))),
+ confidence = c(1, 1)
+ )
+
+ expect_equivalent(expected_association_rules, collect(spark.associationRules(model)))
+
+ new_data <- selectExpr(createDataFrame(data.frame(items = c(
+ "1,2",
+ "1,3",
+ "2,3"
+ ))), "split(items, ',') as items")
+
+ expected_predictions <- data.frame(
+ items = I(list(list("1", "2"), list("1", "3"), list("2", "3"))),
+ prediction = I(list(list(), list(), list("1")))
+ )
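+ # predict() applies the mined rules to each basket: {1,2} and {1,3} already contain the only
+ # consequent "1", so nothing is predicted, while {2,3} matches 2 => 1 and 3 => 1 and yields "1"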
+
+ expect_equivalent(expected_predictions, collect(predict(model, new_data)))
+
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-fpm", fileext = ".tmp")
+ write.ml(model, modelPath, overwrite = TRUE)
+ loaded_model <- read.ml(modelPath)
+
+ expect_equivalent(
+ itemsets,
+ collect(spark.freqItemsets(loaded_model)))
+
+ unlink(modelPath)
+ }
+
+ model_without_numpartitions <- spark.fpGrowth(data, minSupport = 0.3, minConfidence = 0.8)
+ expect_equal(
+ count(spark.freqItemsets(model_without_numpartitions)),
+ count(spark.freqItemsets(model))
+ )
+
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_mllib_recommendation.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_mllib_recommendation.R b/R/pkg/tests/fulltests/test_mllib_recommendation.R
new file mode 100644
index 0000000..cc8064f
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_mllib_recommendation.R
@@ -0,0 +1,67 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("MLlib recommendation algorithms")
+
+# Tests for MLlib recommendation algorithms in SparkR
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+test_that("spark.als", {
+ data <- list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0), list(1, 2, 4.0),
+ list(2, 1, 1.0), list(2, 2, 5.0))
+ df <- createDataFrame(data, c("user", "item", "score"))
+ model <- spark.als(df, ratingCol = "score", userCol = "user", itemCol = "item",
+ rank = 10, maxIter = 5, seed = 0, regParam = 0.1)
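+ # summary() should echo the latent factor dimension requested via rank = 10 above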
+ stats <- summary(model)
+ expect_equal(stats$rank, 10)
+ test <- createDataFrame(list(list(0, 2), list(1, 0), list(2, 0)), c("user", "item"))
+ predictions <- collect(predict(model, test))
+
+ expect_equal(predictions$prediction, c(-0.1380762, 2.6258414, -1.5018409),
+ tolerance = 1e-4)
+
+ # Test model save/load
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-als", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ stats2 <- summary(model2)
+ expect_equal(stats2$rating, "score")
+ userFactors <- collect(stats$userFactors)
+ itemFactors <- collect(stats$itemFactors)
+ userFactors2 <- collect(stats2$userFactors)
+ itemFactors2 <- collect(stats2$itemFactors)
+
+ orderUser <- order(userFactors$id)
+ orderUser2 <- order(userFactors2$id)
+ expect_equal(userFactors$id[orderUser], userFactors2$id[orderUser2])
+ expect_equal(userFactors$features[orderUser], userFactors2$features[orderUser2])
+
+ orderItem <- order(itemFactors$id)
+ orderItem2 <- order(itemFactors2$id)
+ expect_equal(itemFactors$id[orderItem], itemFactors2$id[orderItem2])
+ expect_equal(itemFactors$features[orderItem], itemFactors2$features[orderItem2])
+
+ unlink(modelPath)
+ }
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_mllib_regression.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_mllib_regression.R b/R/pkg/tests/fulltests/test_mllib_regression.R
new file mode 100644
index 0000000..b05fdd3
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_mllib_regression.R
@@ -0,0 +1,480 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("MLlib regression algorithms, except for tree-based algorithms")
+
+# Tests for MLlib regression algorithms in SparkR
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+test_that("formula of spark.glm", {
+ skip_on_cran()
+
+ training <- suppressWarnings(createDataFrame(iris))
+ # directly calling the spark API
+ # dot minus and intercept vs native glm
+ model <- spark.glm(training, Sepal_Width ~ . - Species + 0)
+ vals <- collect(select(predict(model, training), "prediction"))
+ rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
+ expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+
+ # feature interaction vs native glm
+ model <- spark.glm(training, Sepal_Width ~ Species:Sepal_Length)
+ vals <- collect(select(predict(model, training), "prediction"))
+ rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
+ expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+
+ # glm should work with long formula
+ training <- suppressWarnings(createDataFrame(iris))
+ training$LongLongLongLongLongName <- training$Sepal_Width
+ training$VeryLongLongLongLonLongName <- training$Sepal_Length
+ training$AnotherLongLongLongLongName <- training$Species
+ model <- spark.glm(training, LongLongLongLongLongName ~ VeryLongLongLongLonLongName +
+ AnotherLongLongLongLongName)
+ vals <- collect(select(predict(model, training), "prediction"))
+ rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
+ expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+})
+
+test_that("spark.glm and predict", {
+ training <- suppressWarnings(createDataFrame(iris))
+ # gaussian family
+ model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species)
+ prediction <- predict(model, training)
+ expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+ vals <- collect(select(prediction, "prediction"))
+ rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
+ expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+
+ # poisson family
+ model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
+ family = poisson(link = identity))
+ prediction <- predict(model, training)
+ expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+ vals <- collect(select(prediction, "prediction"))
+ rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species,
+ data = iris, family = poisson(link = identity)), iris))
+ expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+
+ # Gamma family
+ x <- runif(100, -1, 1)
+ y <- rgamma(100, rate = 10 / exp(0.5 + 1.2 * x), shape = 10)
+ df <- as.DataFrame(as.data.frame(list(x = x, y = y)))
+ model <- glm(y ~ x, family = Gamma, df)
+ out <- capture.output(print(summary(model)))
+ expect_true(any(grepl("Dispersion parameter for gamma family", out)))
+
+ # tweedie family
+ model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
+ family = "tweedie", var.power = 1.2, link.power = 0.0)
+ prediction <- predict(model, training)
+ expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+ vals <- collect(select(prediction, "prediction"))
+
+ # manual calculation of the R predicted values to avoid dependence on statmod
+ #' library(statmod)
+ #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
+ #' family = tweedie(var.power = 1.2, link.power = 0.0))
+ #' print(coef(rModel))
+
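+ # with link.power = 0.0 the tweedie link is log, so the reference predictions are
+ # exp(model matrix %*% rCoef) as computed below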
+ rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174)
+ rVals <- exp(as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species,
+ data = iris) %*% rCoef))
+ expect_true(all(abs(rVals - vals) < 1e-5), rVals - vals)
+
+ # Test stats::predict is working
+ x <- rnorm(15)
+ y <- x + rnorm(15)
+ expect_equal(length(predict(lm(y ~ x))), 15)
+})
+
+test_that("spark.glm summary", {
+ # gaussian family
+ training <- suppressWarnings(createDataFrame(iris))
+ stats <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + Species))
+ rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
+
+ # test summary coefficients return matrix type
+ expect_true(is.matrix(stats$coefficients))
+ expect_true(is.numeric(stats$coefficients[, 1]))
+
+ coefs <- stats$coefficients
+ rCoefs <- rStats$coefficients
+ expect_true(all(abs(rCoefs - coefs) < 1e-4))
+ expect_true(all(
+ rownames(stats$coefficients) ==
+ c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
+ expect_equal(stats$dispersion, rStats$dispersion)
+ expect_equal(stats$null.deviance, rStats$null.deviance)
+ expect_equal(stats$deviance, rStats$deviance)
+ expect_equal(stats$df.null, rStats$df.null)
+ expect_equal(stats$df.residual, rStats$df.residual)
+ expect_equal(stats$aic, rStats$aic)
+
+ out <- capture.output(print(stats))
+ expect_match(out[2], "Deviance Residuals:")
+ expect_true(any(grepl("AIC: 59.22", out)))
+
+ # binomial family
+ df <- suppressWarnings(createDataFrame(iris))
+ training <- df[df$Species %in% c("versicolor", "virginica"), ]
+ stats <- summary(spark.glm(training, Species ~ Sepal_Length + Sepal_Width,
+ family = binomial(link = "logit")))
+
+ rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
+ rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
+ family = binomial(link = "logit")))
+
+ coefs <- stats$coefficients
+ rCoefs <- rStats$coefficients
+ expect_true(all(abs(rCoefs - coefs) < 1e-4))
+ expect_true(all(
+ rownames(stats$coefficients) ==
+ c("(Intercept)", "Sepal_Length", "Sepal_Width")))
+ expect_equal(stats$dispersion, rStats$dispersion)
+ expect_equal(stats$null.deviance, rStats$null.deviance)
+ expect_equal(stats$deviance, rStats$deviance)
+ expect_equal(stats$df.null, rStats$df.null)
+ expect_equal(stats$df.residual, rStats$df.residual)
+ expect_equal(stats$aic, rStats$aic)
+
+ # Test spark.glm works with weighted dataset
+ a1 <- c(0, 1, 2, 3)
+ a2 <- c(5, 2, 1, 3)
+ w <- c(1, 2, 3, 4)
+ b <- c(1, 0, 1, 0)
+ data <- as.data.frame(cbind(a1, a2, w, b))
+ df <- createDataFrame(data)
+
+ stats <- summary(spark.glm(df, b ~ a1 + a2, family = "binomial", weightCol = "w"))
+ rStats <- summary(glm(b ~ a1 + a2, family = "binomial", data = data, weights = w))
+
+ coefs <- stats$coefficients
+ rCoefs <- rStats$coefficients
+ expect_true(all(abs(rCoefs - coefs) < 1e-3))
+ expect_true(all(rownames(stats$coefficients) == c("(Intercept)", "a1", "a2")))
+ expect_equal(stats$dispersion, rStats$dispersion)
+ expect_equal(stats$null.deviance, rStats$null.deviance)
+ expect_equal(stats$deviance, rStats$deviance)
+ expect_equal(stats$df.null, rStats$df.null)
+ expect_equal(stats$df.residual, rStats$df.residual)
+ expect_equal(stats$aic, rStats$aic)
+
+ # Test summary works on base GLM models
+ baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
+ baseSummary <- summary(baseModel)
+ expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
+
+ # Test spark.glm works with regularization parameter
+ data <- as.data.frame(cbind(a1, a2, b))
+ df <- suppressWarnings(createDataFrame(data))
+ regStats <- summary(spark.glm(df, b ~ a1 + a2, regParam = 1.0))
+ expect_equal(regStats$aic, 13.32836, tolerance = 1e-4) # 13.32836 is from summary() result
+
+ # Test spark.glm works on collinear data
+ A <- matrix(c(1, 2, 3, 4, 2, 4, 6, 8), 4, 2)
+ b <- c(1, 2, 3, 4)
+ data <- as.data.frame(cbind(A, b))
+ df <- createDataFrame(data)
+ stats <- summary(spark.glm(df, b ~ . - 1))
+ coefs <- stats$coefficients
+ expect_true(all(abs(c(0.5, 0.25) - coefs) < 1e-4))
+})
+
+test_that("spark.glm save/load", {
+ skip_on_cran()
+
+ training <- suppressWarnings(createDataFrame(iris))
+ m <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species)
+ s <- summary(m)
+
+ modelPath <- tempfile(pattern = "spark-glm", fileext = ".tmp")
+ write.ml(m, modelPath)
+ expect_error(write.ml(m, modelPath))
+ write.ml(m, modelPath, overwrite = TRUE)
+ m2 <- read.ml(modelPath)
+ s2 <- summary(m2)
+
+ expect_equal(s$coefficients, s2$coefficients)
+ expect_equal(rownames(s$coefficients), rownames(s2$coefficients))
+ expect_equal(s$dispersion, s2$dispersion)
+ expect_equal(s$null.deviance, s2$null.deviance)
+ expect_equal(s$deviance, s2$deviance)
+ expect_equal(s$df.null, s2$df.null)
+ expect_equal(s$df.residual, s2$df.residual)
+ expect_equal(s$aic, s2$aic)
+ expect_equal(s$iter, s2$iter)
+ expect_true(!s$is.loaded)
+ expect_true(s2$is.loaded)
+
+ unlink(modelPath)
+})
+
+test_that("formula of glm", {
+ skip_on_cran()
+
+ training <- suppressWarnings(createDataFrame(iris))
+ # dot minus and intercept vs native glm
+ model <- glm(Sepal_Width ~ . - Species + 0, data = training)
+ vals <- collect(select(predict(model, training), "prediction"))
+ rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
+ expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+
+ # feature interaction vs native glm
+ model <- glm(Sepal_Width ~ Species:Sepal_Length, data = training)
+ vals <- collect(select(predict(model, training), "prediction"))
+ rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
+ expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+
+ # glm should work with long formula
+ training <- suppressWarnings(createDataFrame(iris))
+ training$LongLongLongLongLongName <- training$Sepal_Width
+ training$VeryLongLongLongLonLongName <- training$Sepal_Length
+ training$AnotherLongLongLongLongName <- training$Species
+ model <- glm(LongLongLongLongLongName ~ VeryLongLongLongLonLongName + AnotherLongLongLongLongName,
+ data = training)
+ vals <- collect(select(predict(model, training), "prediction"))
+ rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
+ expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+})
+
+test_that("glm and predict", {
+ skip_on_cran()
+
+ training <- suppressWarnings(createDataFrame(iris))
+ # gaussian family
+ model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training)
+ prediction <- predict(model, training)
+ expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+ vals <- collect(select(prediction, "prediction"))
+ rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
+ expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+
+ # poisson family
+ model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training,
+ family = poisson(link = identity))
+ prediction <- predict(model, training)
+ expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+ vals <- collect(select(prediction, "prediction"))
+ rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species,
+ data = iris, family = poisson(link = identity)), iris))
+ expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+
+ # tweedie family
+ model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training,
+ family = "tweedie", var.power = 1.2, link.power = 0.0)
+ prediction <- predict(model, training)
+ expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+ vals <- collect(select(prediction, "prediction"))
+
+ # manual calculation of the R predicted values to avoid dependence on statmod
+ #' library(statmod)
+ #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
+ #' family = tweedie(var.power = 1.2, link.power = 0.0))
+ #' print(coef(rModel))
+
+ rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174)
+ rVals <- exp(as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species,
+ data = iris) %*% rCoef))
+ expect_true(all(abs(rVals - vals) < 1e-5), rVals - vals)
+
+ # Test stats::predict is working
+ x <- rnorm(15)
+ y <- x + rnorm(15)
+ expect_equal(length(predict(lm(y ~ x))), 15)
+})
+
+test_that("glm summary", {
+ skip_on_cran()
+
+ # gaussian family
+ training <- suppressWarnings(createDataFrame(iris))
+ stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training))
+
+ rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
+
+ coefs <- stats$coefficients
+ rCoefs <- rStats$coefficients
+ expect_true(all(abs(rCoefs - coefs) < 1e-4))
+ expect_true(all(
+ rownames(stats$coefficients) ==
+ c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
+ expect_equal(stats$dispersion, rStats$dispersion)
+ expect_equal(stats$null.deviance, rStats$null.deviance)
+ expect_equal(stats$deviance, rStats$deviance)
+ expect_equal(stats$df.null, rStats$df.null)
+ expect_equal(stats$df.residual, rStats$df.residual)
+ expect_equal(stats$aic, rStats$aic)
+
+ # binomial family
+ df <- suppressWarnings(createDataFrame(iris))
+ training <- df[df$Species %in% c("versicolor", "virginica"), ]
+ stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training,
+ family = binomial(link = "logit")))
+
+ rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
+ rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
+ family = binomial(link = "logit")))
+
+ coefs <- stats$coefficients
+ rCoefs <- rStats$coefficients
+ expect_true(all(abs(rCoefs - coefs) < 1e-4))
+ expect_true(all(
+ rownames(stats$coefficients) ==
+ c("(Intercept)", "Sepal_Length", "Sepal_Width")))
+ expect_equal(stats$dispersion, rStats$dispersion)
+ expect_equal(stats$null.deviance, rStats$null.deviance)
+ expect_equal(stats$deviance, rStats$deviance)
+ expect_equal(stats$df.null, rStats$df.null)
+ expect_equal(stats$df.residual, rStats$df.residual)
+ expect_equal(stats$aic, rStats$aic)
+
+ # Test summary works on base GLM models
+ baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
+ baseSummary <- summary(baseModel)
+ expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
+})
+
+test_that("glm save/load", {
+ skip_on_cran()
+
+ training <- suppressWarnings(createDataFrame(iris))
+ m <- glm(Sepal_Width ~ Sepal_Length + Species, data = training)
+ s <- summary(m)
+
+ modelPath <- tempfile(pattern = "glm", fileext = ".tmp")
+ write.ml(m, modelPath)
+ expect_error(write.ml(m, modelPath))
+ write.ml(m, modelPath, overwrite = TRUE)
+ m2 <- read.ml(modelPath)
+ s2 <- summary(m2)
+
+ expect_equal(s$coefficients, s2$coefficients)
+ expect_equal(rownames(s$coefficients), rownames(s2$coefficients))
+ expect_equal(s$dispersion, s2$dispersion)
+ expect_equal(s$null.deviance, s2$null.deviance)
+ expect_equal(s$deviance, s2$deviance)
+ expect_equal(s$df.null, s2$df.null)
+ expect_equal(s$df.residual, s2$df.residual)
+ expect_equal(s$aic, s2$aic)
+ expect_equal(s$iter, s2$iter)
+ expect_true(!s$is.loaded)
+ expect_true(s2$is.loaded)
+
+ unlink(modelPath)
+})
+
+test_that("spark.isoreg", {
+ label <- c(7.0, 5.0, 3.0, 5.0, 1.0)
+ feature <- c(0.0, 1.0, 2.0, 3.0, 4.0)
+ weight <- c(1.0, 1.0, 1.0, 1.0, 1.0)
+ data <- as.data.frame(cbind(label, feature, weight))
+ df <- createDataFrame(data)
+
+ model <- spark.isoreg(df, label ~ feature, isotonic = FALSE,
+ weightCol = "weight")
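+ # isotonic = FALSE requests an antitonic (non-increasing) fit, which is why the
+ # fitted values decrease as the feature increases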
+ # only allow one variable on the right hand side of the formula
+ expect_error(model2 <- spark.isoreg(df, ~., isotonic = FALSE))
+ result <- summary(model)
+ expect_equal(result$predictions, list(7, 5, 4, 4, 1))
+
+ # Test model prediction
+ predict_data <- list(list(-2.0), list(-1.0), list(0.5),
+ list(0.75), list(1.0), list(2.0), list(9.0))
+ predict_df <- createDataFrame(predict_data, c("feature"))
+ predict_result <- collect(select(predict(model, predict_df), "prediction"))
+ expect_equal(predict_result$prediction, c(7.0, 7.0, 6.0, 5.5, 5.0, 4.0, 1.0))
+
+ # Test model save/load
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-isoreg", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ expect_equal(result, summary(model2))
+
+ unlink(modelPath)
+ }
+})
+
+test_that("spark.survreg", {
+ # R code to reproduce the result.
+ #
+ #' rData <- list(time = c(4, 3, 1, 1, 2, 2, 3), status = c(1, 1, 1, 0, 1, 1, 0),
+ #' x = c(0, 2, 1, 1, 1, 0, 0), sex = c(0, 0, 0, 0, 1, 1, 1))
+ #' library(survival)
+ #' model <- survreg(Surv(time, status) ~ x + sex, rData)
+ #' summary(model)
+ #' predict(model, data)
+ #
+ # -- output of 'summary(model)'
+ #
+ # Value Std. Error z p
+ # (Intercept) 1.315 0.270 4.88 1.07e-06
+ # x -0.190 0.173 -1.10 2.72e-01
+ # sex -0.253 0.329 -0.77 4.42e-01
+ # Log(scale) -1.160 0.396 -2.93 3.41e-03
+ #
+ # -- output of 'predict(model, data)'
+ #
+ # 1 2 3 4 5 6 7
+ # 3.724591 2.545368 3.079035 3.079035 2.390146 2.891269 2.891269
+ #
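+ # spark.survreg fits the same accelerated failure time (AFT) model as survival::survreg;
+ # its coefficient vector includes Log(scale) as the last entry, matching the summary above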
+ data <- list(list(4, 1, 0, 0), list(3, 1, 2, 0), list(1, 1, 1, 0),
+ list(1, 0, 1, 0), list(2, 1, 1, 1), list(2, 1, 0, 1), list(3, 0, 0, 1))
+ df <- createDataFrame(data, c("time", "status", "x", "sex"))
+ model <- spark.survreg(df, Surv(time, status) ~ x + sex)
+ stats <- summary(model)
+ coefs <- as.vector(stats$coefficients[, 1])
+ rCoefs <- c(1.3149571, -0.1903409, -0.2532618, -1.1599800)
+ expect_equal(coefs, rCoefs, tolerance = 1e-4)
+ expect_true(all(
+ rownames(stats$coefficients) ==
+ c("(Intercept)", "x", "sex", "Log(scale)")))
+ p <- collect(select(predict(model, df), "prediction"))
+ expect_equal(p$prediction, c(3.724591, 2.545368, 3.079035, 3.079035,
+ 2.390146, 2.891269, 2.891269), tolerance = 1e-4)
+
+ # Test model save/load
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-survreg", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ stats2 <- summary(model2)
+ coefs2 <- as.vector(stats2$coefficients[, 1])
+ expect_equal(coefs, coefs2)
+ expect_equal(rownames(stats$coefficients), rownames(stats2$coefficients))
+
+ unlink(modelPath)
+ }
+
+ # Test survival::survreg
+ if (requireNamespace("survival", quietly = TRUE)) {
+ rData <- list(time = c(4, 3, 1, 1, 2, 2, 3), status = c(1, 1, 1, 0, 1, 1, 0),
+ x = c(0, 2, 1, 1, 1, 0, 0), sex = c(0, 0, 0, 0, 1, 1, 1))
+ expect_error(
+ model <- survival::survreg(formula = survival::Surv(time, status) ~ x + sex, data = rData),
+ NA)
+ expect_equal(predict(model, rData)[[1]], 3.724591, tolerance = 1e-4)
+ }
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_mllib_stat.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_mllib_stat.R b/R/pkg/tests/fulltests/test_mllib_stat.R
new file mode 100644
index 0000000..1600833
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_mllib_stat.R
@@ -0,0 +1,53 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("MLlib statistics algorithms")
+
+# Tests for MLlib statistics algorithms in SparkR
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+test_that("spark.kstest", {
+ data <- data.frame(test = c(0.1, 0.15, 0.2, 0.3, 0.25, -1, -0.5))
+ df <- createDataFrame(data)
+ testResult <- spark.kstest(df, "test", "norm")
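+ # with no distribution parameters, "norm" defaults to a standard normal (mean 0, sd 1),
+ # matching ks.test(..., "pnorm") below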
+ stats <- summary(testResult)
+
+ rStats <- ks.test(data$test, "pnorm", alternative = "two.sided")
+
+ expect_equal(stats$p.value, rStats$p.value, tolerance = 1e-4)
+ expect_equal(stats$statistic, unname(rStats$statistic), tolerance = 1e-4)
+ expect_match(capture.output(stats)[1], "Kolmogorov-Smirnov test summary:")
+
+ testResult <- spark.kstest(df, "test", "norm", -0.5)
+ stats <- summary(testResult)
+
+ rStats <- ks.test(data$test, "pnorm", -0.5, 1, alternative = "two.sided")
+
+ expect_equal(stats$p.value, rStats$p.value, tolerance = 1e-4)
+ expect_equal(stats$statistic, unname(rStats$statistic), tolerance = 1e-4)
+ expect_match(capture.output(stats)[1], "Kolmogorov-Smirnov test summary:")
+
+ # Test print.summary.KSTest
+ printStats <- capture.output(print.summary.KSTest(stats))
+ expect_match(printStats[1], "Kolmogorov-Smirnov test summary:")
+ expect_match(printStats[5],
+ "Low presumption against null hypothesis: Sample follows theoretical distribution. ")
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_mllib_tree.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_mllib_tree.R b/R/pkg/tests/fulltests/test_mllib_tree.R
new file mode 100644
index 0000000..31427ee
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_mllib_tree.R
@@ -0,0 +1,320 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("MLlib tree-based algorithms")
+
+# Tests for MLlib tree-based algorithms in SparkR
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+absoluteSparkPath <- function(x) {
+ sparkHome <- sparkR.conf("spark.home")
+ file.path(sparkHome, x)
+}
+
+test_that("spark.gbt", {
+ skip_on_cran()
+
+ # regression
+ data <- suppressWarnings(createDataFrame(longley))
+ model <- spark.gbt(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, seed = 123)
+ predictions <- collect(predict(model, data))
+ expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
+ 63.221, 63.639, 64.989, 63.761,
+ 66.019, 67.857, 68.169, 66.513,
+ 68.655, 69.564, 69.331, 70.551),
+ tolerance = 1e-4)
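+ # the depth-5 trees fit the 16-row longley data exactly, so the expected predictions
+ # are simply the original Employed values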
+ stats <- summary(model)
+ expect_equal(stats$numTrees, 20)
+ expect_equal(stats$maxDepth, 5)
+ expect_equal(stats$formula, "Employed ~ .")
+ expect_equal(stats$numFeatures, 6)
+ expect_equal(length(stats$treeWeights), 20)
+
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-gbtRegression", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ stats2 <- summary(model2)
+ expect_equal(stats$formula, stats2$formula)
+ expect_equal(stats$numFeatures, stats2$numFeatures)
+ expect_equal(stats$features, stats2$features)
+ expect_equal(stats$featureImportances, stats2$featureImportances)
+ expect_equal(stats$maxDepth, stats2$maxDepth)
+ expect_equal(stats$numTrees, stats2$numTrees)
+ expect_equal(stats$treeWeights, stats2$treeWeights)
+
+ unlink(modelPath)
+ }
+
+ # classification
+ # label must be binary - GBTClassifier currently only supports binary classification.
+ iris2 <- iris[iris$Species != "virginica", ]
+ data <- suppressWarnings(createDataFrame(iris2))
+ model <- spark.gbt(data, Species ~ Petal_Length + Petal_Width, "classification")
+ stats <- summary(model)
+ expect_equal(stats$numFeatures, 2)
+ expect_equal(stats$numTrees, 20)
+ expect_equal(stats$maxDepth, 5)
+ expect_error(capture.output(stats), NA)
+ expect_true(length(capture.output(stats)) > 6)
+ predictions <- collect(predict(model, data))$prediction
+ # test string prediction values
+ expect_equal(length(grep("setosa", predictions)), 50)
+ expect_equal(length(grep("versicolor", predictions)), 50)
+
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-gbtClassification", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ stats2 <- summary(model2)
+ expect_equal(stats$depth, stats2$depth)
+ expect_equal(stats$numNodes, stats2$numNodes)
+ expect_equal(stats$numClasses, stats2$numClasses)
+
+ unlink(modelPath)
+ }
+
+ iris2$NumericSpecies <- ifelse(iris2$Species == "setosa", 0, 1)
+ df <- suppressWarnings(createDataFrame(iris2))
+ m <- spark.gbt(df, NumericSpecies ~ ., type = "classification")
+ s <- summary(m)
+ # test numeric prediction values
+ expect_equal(iris2$NumericSpecies, as.double(collect(predict(m, df))$prediction))
+ expect_equal(s$numFeatures, 5)
+ expect_equal(s$numTrees, 20)
+ expect_equal(s$maxDepth, 5)
+
+ # spark.gbt classification can work on libsvm data
+ if (not_cran_or_windows_with_hadoop()) {
+ data <- read.df(absoluteSparkPath("data/mllib/sample_binary_classification_data.txt"),
+ source = "libsvm")
+ model <- spark.gbt(data, label ~ features, "classification")
+ expect_equal(summary(model)$numFeatures, 692)
+ }
+})
+
+test_that("spark.randomForest", {
+ # regression
+ data <- suppressWarnings(createDataFrame(longley))
+ model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
+ numTrees = 1)
+
+ predictions <- collect(predict(model, data))
+ expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
+ 63.221, 63.639, 64.989, 63.761,
+ 66.019, 67.857, 68.169, 66.513,
+ 68.655, 69.564, 69.331, 70.551),
+ tolerance = 1e-4)
+
+ stats <- summary(model)
+ expect_equal(stats$numTrees, 1)
+ expect_equal(stats$maxDepth, 5)
+ expect_error(capture.output(stats), NA)
+ expect_true(length(capture.output(stats)) > 6)
+
+ model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
+ numTrees = 20, seed = 123)
+ predictions <- collect(predict(model, data))
+ expect_equal(predictions$prediction, c(60.32820, 61.22315, 60.69025, 62.11070,
+ 63.53160, 64.05470, 65.12710, 64.30450,
+ 66.70910, 67.86125, 68.08700, 67.21865,
+ 68.89275, 69.53180, 69.39640, 69.68250),
+ tolerance = 1e-4)
+ stats <- summary(model)
+ expect_equal(stats$numTrees, 20)
+ expect_equal(stats$maxDepth, 5)
+
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-randomForestRegression", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ stats2 <- summary(model2)
+ expect_equal(stats$formula, stats2$formula)
+ expect_equal(stats$numFeatures, stats2$numFeatures)
+ expect_equal(stats$features, stats2$features)
+ expect_equal(stats$featureImportances, stats2$featureImportances)
+ expect_equal(stats$numTrees, stats2$numTrees)
+ expect_equal(stats$maxDepth, stats2$maxDepth)
+ expect_equal(stats$treeWeights, stats2$treeWeights)
+
+ unlink(modelPath)
+ }
+
+ # classification
+ data <- suppressWarnings(createDataFrame(iris))
+ model <- spark.randomForest(data, Species ~ Petal_Length + Petal_Width, "classification",
+ maxDepth = 5, maxBins = 16)
+
+ stats <- summary(model)
+ expect_equal(stats$numFeatures, 2)
+ expect_equal(stats$numTrees, 20)
+ expect_equal(stats$maxDepth, 5)
+ expect_error(capture.output(stats), NA)
+ expect_true(length(capture.output(stats)) > 6)
+ # Test string prediction values
+ predictions <- collect(predict(model, data))$prediction
+ expect_equal(length(grep("setosa", predictions)), 50)
+ expect_equal(length(grep("versicolor", predictions)), 50)
+
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-randomForestClassification", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ stats2 <- summary(model2)
+ expect_equal(stats$depth, stats2$depth)
+ expect_equal(stats$numNodes, stats2$numNodes)
+ expect_equal(stats$numClasses, stats2$numClasses)
+
+ unlink(modelPath)
+ }
+
+ # Test numeric response variable
+ labelToIndex <- function(species) {
+ switch(as.character(species),
+ setosa = 0.0,
+ versicolor = 1.0,
+ virginica = 2.0
+ )
+ }
+ iris$NumericSpecies <- lapply(iris$Species, labelToIndex)
+ data <- suppressWarnings(createDataFrame(iris[-5]))
+ model <- spark.randomForest(data, NumericSpecies ~ Petal_Length + Petal_Width, "classification",
+ maxDepth = 5, maxBins = 16)
+ stats <- summary(model)
+ expect_equal(stats$numFeatures, 2)
+ expect_equal(stats$numTrees, 20)
+ expect_equal(stats$maxDepth, 5)
+
+ # Test numeric prediction values
+ predictions <- collect(predict(model, data))$prediction
+ expect_equal(length(grep("1.0", predictions)), 50)
+ expect_equal(length(grep("2.0", predictions)), 50)
+
+ # spark.randomForest classification can work on libsvm data
+ if (not_cran_or_windows_with_hadoop()) {
+ data <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
+ source = "libsvm")
+ model <- spark.randomForest(data, label ~ features, "classification")
+ expect_equal(summary(model)$numFeatures, 4)
+ }
+})
+
+test_that("spark.decisionTree", {
+ skip_on_cran()
+
+ # regression
+ data <- suppressWarnings(createDataFrame(longley))
+ model <- spark.decisionTree(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16)
+
+ predictions <- collect(predict(model, data))
+ expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
+ 63.221, 63.639, 64.989, 63.761,
+ 66.019, 67.857, 68.169, 66.513,
+ 68.655, 69.564, 69.331, 70.551),
+ tolerance = 1e-4)
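+ # a depth-5 tree memorizes the 16-row longley data, so the expected predictions
+ # are again the original Employed values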
+
+ stats <- summary(model)
+ expect_equal(stats$maxDepth, 5)
+ expect_error(capture.output(stats), NA)
+ expect_true(length(capture.output(stats)) > 6)
+
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-decisionTreeRegression", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ stats2 <- summary(model2)
+ expect_equal(stats$formula, stats2$formula)
+ expect_equal(stats$numFeatures, stats2$numFeatures)
+ expect_equal(stats$features, stats2$features)
+ expect_equal(stats$featureImportances, stats2$featureImportances)
+ expect_equal(stats$maxDepth, stats2$maxDepth)
+
+ unlink(modelPath)
+ }
+
+ # classification
+ data <- suppressWarnings(createDataFrame(iris))
+ model <- spark.decisionTree(data, Species ~ Petal_Length + Petal_Width, "classification",
+ maxDepth = 5, maxBins = 16)
+
+ stats <- summary(model)
+ expect_equal(stats$numFeatures, 2)
+ expect_equal(stats$maxDepth, 5)
+ expect_error(capture.output(stats), NA)
+ expect_true(length(capture.output(stats)) > 6)
+ # Test string prediction values
+ predictions <- collect(predict(model, data))$prediction
+ expect_equal(length(grep("setosa", predictions)), 50)
+ expect_equal(length(grep("versicolor", predictions)), 50)
+
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-decisionTreeClassification", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ stats2 <- summary(model2)
+ expect_equal(stats$depth, stats2$depth)
+ expect_equal(stats$numNodes, stats2$numNodes)
+ expect_equal(stats$numClasses, stats2$numClasses)
+
+ unlink(modelPath)
+ }
+
+ # Test numeric response variable
+ labelToIndex <- function(species) {
+ switch(as.character(species),
+ setosa = 0.0,
+ versicolor = 1.0,
+ virginica = 2.0
+ )
+ }
+ iris$NumericSpecies <- lapply(iris$Species, labelToIndex)
+ data <- suppressWarnings(createDataFrame(iris[-5]))
+ model <- spark.decisionTree(data, NumericSpecies ~ Petal_Length + Petal_Width, "classification",
+ maxDepth = 5, maxBins = 16)
+ stats <- summary(model)
+ expect_equal(stats$numFeatures, 2)
+ expect_equal(stats$maxDepth, 5)
+
+ # Test numeric prediction values
+ predictions <- collect(predict(model, data))$prediction
+ expect_equal(length(grep("1.0", predictions)), 50)
+ expect_equal(length(grep("2.0", predictions)), 50)
+
+ # spark.decisionTree classification can work on libsvm data
+ if (not_cran_or_windows_with_hadoop()) {
+ data <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
+ source = "libsvm")
+ model <- spark.decisionTree(data, label ~ features, "classification")
+ expect_equal(summary(model)$numFeatures, 4)
+ }
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_parallelize_collect.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_parallelize_collect.R b/R/pkg/tests/fulltests/test_parallelize_collect.R
new file mode 100644
index 0000000..52d4c93
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_parallelize_collect.R
@@ -0,0 +1,120 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("parallelize() and collect()")
+
+# Mock data
+numVector <- c(-10:97)
+numList <- list(sqrt(1), sqrt(2), sqrt(3), 4 ** 10)
+strVector <- c("Dexter Morgan: I suppose I should be upset, even feel",
+ "violated, but I'm not. No, in fact, I think this is a friendly",
+ "message, like \"Hey, wanna play?\" and yes, I want to play. ",
+ "I really, really do.")
+strList <- list("Dexter Morgan: Blood. Sometimes it sets my teeth on edge, ",
+ "other times it helps me control the chaos.",
+ "Dexter Morgan: Harry and Dorris Morgan did a wonderful job ",
+ "raising me. But they're both dead now. I didn't kill them. Honest.")
+
+numPairs <- list(list(1, 1), list(1, 2), list(2, 2), list(2, 3))
+strPairs <- list(list(strList, strList), list(strList, strList))
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+jsc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
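+# the RDD API is internal to SparkR (not exported), so these tests call the *RDD helpers
+# directly through the JavaSparkContext handle obtained above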
+
+# Tests
+
+test_that("parallelize() on simple vectors and lists returns an RDD", {
+ skip_on_cran()
+
+ numVectorRDD <- parallelize(jsc, numVector, 1)
+ numVectorRDD2 <- parallelize(jsc, numVector, 10)
+ numListRDD <- parallelize(jsc, numList, 1)
+ numListRDD2 <- parallelize(jsc, numList, 4)
+ strVectorRDD <- parallelize(jsc, strVector, 2)
+ strVectorRDD2 <- parallelize(jsc, strVector, 3)
+ strListRDD <- parallelize(jsc, strList, 4)
+ strListRDD2 <- parallelize(jsc, strList, 1)
+
+ rdds <- c(numVectorRDD,
+ numVectorRDD2,
+ numListRDD,
+ numListRDD2,
+ strVectorRDD,
+ strVectorRDD2,
+ strListRDD,
+ strListRDD2)
+
+ for (rdd in rdds) {
+ expect_is(rdd, "RDD")
+ expect_true(.hasSlot(rdd, "jrdd")
+ && inherits(rdd@jrdd, "jobj")
+ && isInstanceOf(rdd@jrdd, "org.apache.spark.api.java.JavaRDD"))
+ }
+})
+
+test_that("collect(), following a parallelize(), gives back the original collections", {
+ skip_on_cran()
+
+ numVectorRDD <- parallelize(jsc, numVector, 10)
+ expect_equal(collectRDD(numVectorRDD), as.list(numVector))
+
+ numListRDD <- parallelize(jsc, numList, 1)
+ numListRDD2 <- parallelize(jsc, numList, 4)
+ expect_equal(collectRDD(numListRDD), as.list(numList))
+ expect_equal(collectRDD(numListRDD2), as.list(numList))
+
+ strVectorRDD <- parallelize(jsc, strVector, 2)
+ strVectorRDD2 <- parallelize(jsc, strVector, 3)
+ expect_equal(collectRDD(strVectorRDD), as.list(strVector))
+ expect_equal(collectRDD(strVectorRDD2), as.list(strVector))
+
+ strListRDD <- parallelize(jsc, strList, 4)
+ strListRDD2 <- parallelize(jsc, strList, 1)
+ expect_equal(collectRDD(strListRDD), as.list(strList))
+ expect_equal(collectRDD(strListRDD2), as.list(strList))
+})
+
+test_that("regression: collect() following a parallelize() does not drop elements", {
+ skip_on_cran()
+
+ # with 10 elements over 6 partitions, each partition holds 1 or 2 elements
+ # (10 %/% 6 = 1, ceiling(10 / 6) = 2); verify that none are dropped
+ collLen <- 10
+ numPart <- 6
+ expected <- runif(collLen)
+ actual <- collectRDD(parallelize(jsc, expected, numPart))
+ expect_equal(actual, as.list(expected))
+})
+
+test_that("parallelize() and collect() work for lists of pairs (pairwise data)", {
+ skip_on_cran()
+
+ # use the pairwise logical to indicate pairwise data
+ numPairsRDD1 <- parallelize(jsc, numPairs, 1)
+ numPairsRDD2 <- parallelize(jsc, numPairs, 2)
+ numPairsRDD3 <- parallelize(jsc, numPairs, 3)
+ expect_equal(collectRDD(numPairsRDD1), numPairs)
+ expect_equal(collectRDD(numPairsRDD2), numPairs)
+ expect_equal(collectRDD(numPairsRDD3), numPairs)
+ # the parameter names can also be omitted when the arguments are supplied in order
+ strPairsRDD1 <- parallelize(jsc, strPairs, 1)
+ strPairsRDD2 <- parallelize(jsc, strPairs, 2)
+ expect_equal(collectRDD(strPairsRDD1), strPairs)
+ expect_equal(collectRDD(strPairsRDD2), strPairs)
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_rdd.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_rdd.R b/R/pkg/tests/fulltests/test_rdd.R
new file mode 100644
index 0000000..fb244e1
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_rdd.R
@@ -0,0 +1,906 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("basic RDD functions")
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+# Data
+nums <- 1:10
+rdd <- parallelize(sc, nums, 2L)
+
+intPairs <- list(list(1L, -1), list(2L, 100), list(2L, 1), list(1L, 200))
+intRdd <- parallelize(sc, intPairs, 2L)
+
+test_that("get number of partitions in RDD", {
+ skip_on_cran()
+
+ expect_equal(getNumPartitionsRDD(rdd), 2)
+ expect_equal(getNumPartitionsRDD(intRdd), 2)
+})
+
+test_that("first on RDD", {
+ skip_on_cran()
+
+ expect_equal(firstRDD(rdd), 1)
+ newrdd <- lapply(rdd, function(x) x + 1)
+ expect_equal(firstRDD(newrdd), 2)
+})
+
+test_that("count and length on RDD", {
+ skip_on_cran()
+
+ expect_equal(countRDD(rdd), 10)
+ expect_equal(lengthRDD(rdd), 10)
+})
+
+test_that("count by values and keys", {
+ skip_on_cran()
+
+ mods <- lapply(rdd, function(x) { x %% 3 })
+ actual <- countByValue(mods)
+ expected <- list(list(0, 3L), list(1, 4L), list(2, 3L))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+
+ actual <- countByKey(intRdd)
+ expected <- list(list(2L, 2L), list(1L, 2L))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+})
+
+test_that("lapply on RDD", {
+ skip_on_cran()
+
+ multiples <- lapply(rdd, function(x) { 2 * x })
+ actual <- collectRDD(multiples)
+ expect_equal(actual, as.list(nums * 2))
+})
+
+test_that("lapplyPartition on RDD", {
+ skip_on_cran()
+
+ sums <- lapplyPartition(rdd, function(part) { sum(unlist(part)) })
+ actual <- collectRDD(sums)
+ expect_equal(actual, list(15, 40))
+})
+
+test_that("mapPartitions on RDD", {
+ skip_on_cran()
+
+ sums <- mapPartitions(rdd, function(part) { sum(unlist(part)) })
+ actual <- collectRDD(sums)
+ expect_equal(actual, list(15, 40))
+})
+
+test_that("flatMap() on RDDs", {
+ skip_on_cran()
+
+ flat <- flatMap(intRdd, function(x) { list(x, x) })
+ actual <- collectRDD(flat)
+ expect_equal(actual, rep(intPairs, each = 2))
+})
+
+test_that("filterRDD on RDD", {
+ skip_on_cran()
+
+ filtered.rdd <- filterRDD(rdd, function(x) { x %% 2 == 0 })
+ actual <- collectRDD(filtered.rdd)
+ expect_equal(actual, list(2, 4, 6, 8, 10))
+
+ filtered.rdd <- Filter(function(x) { x[[2]] < 0 }, intRdd)
+ actual <- collectRDD(filtered.rdd)
+ expect_equal(actual, list(list(1L, -1)))
+
+ # Filter out all elements.
+ filtered.rdd <- filterRDD(rdd, function(x) { x > 10 })
+ actual <- collectRDD(filtered.rdd)
+ expect_equal(actual, list())
+})
+
+test_that("lookup on RDD", {
+ skip_on_cran()
+
+ vals <- lookup(intRdd, 1L)
+ expect_equal(vals, list(-1, 200))
+
+ vals <- lookup(intRdd, 3L)
+ expect_equal(vals, list())
+})
+
+test_that("several transformations on RDD (a benchmark on PipelinedRDD)", {
+ skip_on_cran()
+
+ rdd2 <- rdd
+ for (i in 1:12)
+ rdd2 <- lapplyPartitionsWithIndex(
+ rdd2, function(partIndex, part) {
+ part <- as.list(unlist(part) * partIndex + i)
+ })
+ rdd2 <- lapply(rdd2, function(x) x + x)
+ actual <- collectRDD(rdd2)
+ expected <- list(24, 24, 24, 24, 24,
+ 168, 170, 172, 174, 176)
+ expect_equal(actual, expected)
+})
+
+test_that("PipelinedRDD support actions: cache(), persist(), unpersist(), checkpoint()", {
+ skip_on_cran()
+
+ # RDD
+ rdd2 <- rdd
+ # PipelinedRDD
+ rdd2 <- lapplyPartitionsWithIndex(
+ rdd2,
+ function(partIndex, part) {
+ part <- as.list(unlist(part) * partIndex)
+ })
+
+ cacheRDD(rdd2)
+ expect_true(rdd2@env$isCached)
+ rdd2 <- lapply(rdd2, function(x) x)
+ expect_false(rdd2@env$isCached)
+
+ unpersistRDD(rdd2)
+ expect_false(rdd2@env$isCached)
+
+ persistRDD(rdd2, "MEMORY_AND_DISK")
+ expect_true(rdd2@env$isCached)
+ rdd2 <- lapply(rdd2, function(x) x)
+ expect_false(rdd2@env$isCached)
+
+ unpersistRDD(rdd2)
+ expect_false(rdd2@env$isCached)
+
+ tempDir <- tempfile(pattern = "checkpoint")
+ setCheckpointDirSC(sc, tempDir)
+ checkpointRDD(rdd2)
+ expect_true(rdd2@env$isCheckpointed)
+
+ rdd2 <- lapply(rdd2, function(x) x)
+ expect_false(rdd2@env$isCached)
+ expect_false(rdd2@env$isCheckpointed)
+
+ # make sure the data is collectable
+ collectRDD(rdd2)
+
+ unlink(tempDir)
+})
+
+test_that("reduce on RDD", {
+ skip_on_cran()
+
+ sum <- reduce(rdd, "+")
+ expect_equal(sum, 55)
+
+ # Also test with an inline function
+ sumInline <- reduce(rdd, function(x, y) { x + y })
+ expect_equal(sumInline, 55)
+})
+
+test_that("lapply with dependency", {
+ skip_on_cran()
+
+ fa <- 5
+ multiples <- lapply(rdd, function(x) { fa * x })
+ actual <- collectRDD(multiples)
+
+ expect_equal(actual, as.list(nums * 5))
+})
+
+test_that("lapplyPartitionsWithIndex on RDDs", {
+ skip_on_cran()
+
+ func <- function(partIndex, part) { list(partIndex, Reduce("+", part)) }
+ actual <- collectRDD(lapplyPartitionsWithIndex(rdd, func), flatten = FALSE)
+ expect_equal(actual, list(list(0, 15), list(1, 40)))
+
+ pairsRDD <- parallelize(sc, list(list(1, 2), list(3, 4), list(4, 8)), 1L)
+ partitionByParity <- function(key) { if (key %% 2 == 1) 0 else 1 }
+ mkTup <- function(partIndex, part) { list(partIndex, part) }
+ actual <- collectRDD(lapplyPartitionsWithIndex(
+ partitionByRDD(pairsRDD, 2L, partitionByParity),
+ mkTup),
+ FALSE)
+ expect_equal(actual, list(list(0, list(list(1, 2), list(3, 4))),
+ list(1, list(list(4, 8)))))
+})
+
+test_that("sampleRDD() on RDDs", {
+ skip_on_cran()
+
+ expect_equal(unlist(collectRDD(sampleRDD(rdd, FALSE, 1.0, 2014L))), nums)
+})
+
+test_that("takeSample() on RDDs", {
+ skip_on_cran()
+
+ # ported from RDDSuite.scala, modified seeds
+ data <- parallelize(sc, 1:100, 2L)
+ for (seed in 4:5) {
+ s <- takeSample(data, FALSE, 20L, seed)
+ expect_equal(length(s), 20L)
+ expect_equal(length(unique(s)), 20L)
+ for (elem in s) {
+ expect_true(elem >= 1 && elem <= 100)
+ }
+ }
+ for (seed in 4:5) {
+ s <- takeSample(data, FALSE, 200L, seed)
+ expect_equal(length(s), 100L)
+ expect_equal(length(unique(s)), 100L)
+ for (elem in s) {
+ expect_true(elem >= 1 && elem <= 100)
+ }
+ }
+ for (seed in 4:5) {
+ s <- takeSample(data, TRUE, 20L, seed)
+ expect_equal(length(s), 20L)
+ for (elem in s) {
+ expect_true(elem >= 1 && elem <= 100)
+ }
+ }
+ for (seed in 4:5) {
+ s <- takeSample(data, TRUE, 100L, seed)
+ expect_equal(length(s), 100L)
+ # Chance of getting all distinct elements is astronomically low, so test that
+ # we got fewer than 100
+ expect_true(length(unique(s)) < 100L)
+ }
+ for (seed in 4:5) {
+ s <- takeSample(data, TRUE, 200L, seed)
+ expect_equal(length(s), 200L)
+ # Chance of getting all distinct elements is still quite low, so test that
+ # we got fewer than 100
+ expect_true(length(unique(s)) < 100L)
+ }
+})
+
+test_that("mapValues() on pairwise RDDs", {
+ skip_on_cran()
+
+ multiples <- mapValues(intRdd, function(x) { x * 2 })
+ actual <- collectRDD(multiples)
+ expected <- lapply(intPairs, function(x) {
+ list(x[[1]], x[[2]] * 2)
+ })
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+})
+
+test_that("flatMapValues() on pairwise RDDs", {
+ skip_on_cran()
+
+ l <- parallelize(sc, list(list(1, c(1, 2)), list(2, c(3, 4))))
+ actual <- collectRDD(flatMapValues(l, function(x) { x }))
+ expect_equal(actual, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
+
+ # Generate x to x+1 for every value
+ actual <- collectRDD(flatMapValues(intRdd, function(x) { x: (x + 1) }))
+ expect_equal(actual,
+ list(list(1L, -1), list(1L, 0), list(2L, 100), list(2L, 101),
+ list(2L, 1), list(2L, 2), list(1L, 200), list(1L, 201)))
+})
+
+test_that("reduceByKeyLocally() on PairwiseRDDs", {
+ skip_on_cran()
+
+ pairs <- parallelize(sc, list(list(1, 2), list(1.1, 3), list(1, 4)), 2L)
+ actual <- reduceByKeyLocally(pairs, "+")
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(list(list(1, 6), list(1.1, 3))))
+
+ pairs <- parallelize(sc, list(list("abc", 1.2), list(1.1, 0), list("abc", 1.3),
+ list("bb", 5)), 4L)
+ actual <- reduceByKeyLocally(pairs, "+")
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(list(list("abc", 2.5), list(1.1, 0), list("bb", 5))))
+})
+
+test_that("distinct() on RDDs", {
+ skip_on_cran()
+
+ nums.rep2 <- rep(1:10, 2)
+ rdd.rep2 <- parallelize(sc, nums.rep2, 2L)
+ uniques <- distinctRDD(rdd.rep2)
+ actual <- sort(unlist(collectRDD(uniques)))
+ expect_equal(actual, nums)
+})
+
+test_that("maximum() on RDDs", {
+ skip_on_cran()
+
+ max <- maximum(rdd)
+ expect_equal(max, 10)
+})
+
+test_that("minimum() on RDDs", {
+ skip_on_cran()
+
+ min <- minimum(rdd)
+ expect_equal(min, 1)
+})
+
+test_that("sumRDD() on RDDs", {
+ skip_on_cran()
+
+ sum <- sumRDD(rdd)
+ expect_equal(sum, 55)
+})
+
+test_that("keyBy on RDDs", {
+ skip_on_cran()
+
+ func <- function(x) { x * x }
+ keys <- keyBy(rdd, func)
+ actual <- collectRDD(keys)
+ expect_equal(actual, lapply(nums, function(x) { list(func(x), x) }))
+})
+
+test_that("repartition/coalesce on RDDs", {
+ skip_on_cran()
+
+ rdd <- parallelize(sc, 1:20, 4L) # each partition contains 5 elements
+
+ # repartition
+ r1 <- repartitionRDD(rdd, 2)
+ expect_equal(getNumPartitionsRDD(r1), 2L)
+ count <- length(collectPartition(r1, 0L))
+ expect_true(count >= 8 && count <= 12)
+
+ r2 <- repartitionRDD(rdd, 6)
+ expect_equal(getNumPartitionsRDD(r2), 6L)
+ count <- length(collectPartition(r2, 0L))
+ expect_true(count >= 0 && count <= 4)
+
+ # coalesce
+ r3 <- coalesceRDD(rdd, 1)
+ expect_equal(getNumPartitionsRDD(r3), 1L)
+ count <- length(collectPartition(r3, 0L))
+ expect_equal(count, 20)
+})
+
+test_that("sortBy() on RDDs", {
+ skip_on_cran()
+
+ sortedRdd <- sortBy(rdd, function(x) { x * x }, ascending = FALSE)
+ actual <- collectRDD(sortedRdd)
+ expect_equal(actual, as.list(sort(nums, decreasing = TRUE)))
+
+ rdd2 <- parallelize(sc, sort(nums, decreasing = TRUE), 2L)
+ sortedRdd2 <- sortBy(rdd2, function(x) { x * x })
+ actual <- collectRDD(sortedRdd2)
+ expect_equal(actual, as.list(nums))
+})
+
+test_that("takeOrdered() on RDDs", {
+ skip_on_cran()
+
+ l <- list(10, 1, 2, 9, 3, 4, 5, 6, 7)
+ rdd <- parallelize(sc, l)
+ actual <- takeOrdered(rdd, 6L)
+ expect_equal(actual, as.list(sort(unlist(l)))[1:6])
+
+ l <- list("e", "d", "c", "d", "a")
+ rdd <- parallelize(sc, l)
+ actual <- takeOrdered(rdd, 3L)
+ expect_equal(actual, as.list(sort(unlist(l)))[1:3])
+})
+
+test_that("top() on RDDs", {
+ skip_on_cran()
+
+ l <- list(10, 1, 2, 9, 3, 4, 5, 6, 7)
+ rdd <- parallelize(sc, l)
+ actual <- top(rdd, 6L)
+ expect_equal(actual, as.list(sort(unlist(l), decreasing = TRUE))[1:6])
+
+ l <- list("e", "d", "c", "d", "a")
+ rdd <- parallelize(sc, l)
+ actual <- top(rdd, 3L)
+ expect_equal(actual, as.list(sort(unlist(l), decreasing = TRUE))[1:3])
+})
+
+test_that("fold() on RDDs", {
+ skip_on_cran()
+
+ actual <- fold(rdd, 0, "+")
+ expect_equal(actual, Reduce("+", nums, 0))
+
+ rdd <- parallelize(sc, list())
+ actual <- fold(rdd, 0, "+")
+ expect_equal(actual, 0)
+})
+
+test_that("aggregateRDD() on RDDs", {
+ skip_on_cran()
+
+ rdd <- parallelize(sc, list(1, 2, 3, 4))
+ zeroValue <- list(0, 0)
+ seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
+ combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
+ actual <- aggregateRDD(rdd, zeroValue, seqOp, combOp)
+ expect_equal(actual, list(10, 4))
+
+ rdd <- parallelize(sc, list())
+ actual <- aggregateRDD(rdd, zeroValue, seqOp, combOp)
+ expect_equal(actual, list(0, 0))
+})
+
+test_that("zipWithUniqueId() on RDDs", {
+ skip_on_cran()
+
+ rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 3L)
+ actual <- collectRDD(zipWithUniqueId(rdd))
+ expected <- list(list("a", 0), list("b", 1), list("c", 4),
+ list("d", 2), list("e", 5))
+ expect_equal(actual, expected)
+
+ rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 1L)
+ actual <- collectRDD(zipWithUniqueId(rdd))
+ expected <- list(list("a", 0), list("b", 1), list("c", 2),
+ list("d", 3), list("e", 4))
+ expect_equal(actual, expected)
+})
+
+test_that("zipWithIndex() on RDDs", {
+ skip_on_cran()
+
+ rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 3L)
+ actual <- collectRDD(zipWithIndex(rdd))
+ expected <- list(list("a", 0), list("b", 1), list("c", 2),
+ list("d", 3), list("e", 4))
+ expect_equal(actual, expected)
+
+ rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 1L)
+ actual <- collectRDD(zipWithIndex(rdd))
+ expected <- list(list("a", 0), list("b", 1), list("c", 2),
+ list("d", 3), list("e", 4))
+ expect_equal(actual, expected)
+})
+
+test_that("glom() on RDD", {
+ skip_on_cran()
+
+ rdd <- parallelize(sc, as.list(1:4), 2L)
+ actual <- collectRDD(glom(rdd))
+ expect_equal(actual, list(list(1, 2), list(3, 4)))
+})
+
+test_that("keys() on RDDs", {
+ skip_on_cran()
+
+ keys <- keys(intRdd)
+ actual <- collectRDD(keys)
+ expect_equal(actual, lapply(intPairs, function(x) { x[[1]] }))
+})
+
+test_that("values() on RDDs", {
+ skip_on_cran()
+
+ values <- values(intRdd)
+ actual <- collectRDD(values)
+ expect_equal(actual, lapply(intPairs, function(x) { x[[2]] }))
+})
+
+test_that("pipeRDD() on RDDs", {
+ skip_on_cran()
+
+ actual <- collectRDD(pipeRDD(rdd, "more"))
+ expected <- as.list(as.character(1:10))
+ expect_equal(actual, expected)
+
+ trailed.rdd <- parallelize(sc, c("1", "", "2\n", "3\n\r\n"))
+ actual <- collectRDD(pipeRDD(trailed.rdd, "sort"))
+ expected <- list("", "1", "2", "3")
+ expect_equal(actual, expected)
+
+ rev.nums <- 9:0
+ rev.rdd <- parallelize(sc, rev.nums, 2L)
+ actual <- collectRDD(pipeRDD(rev.rdd, "sort"))
+ expected <- as.list(as.character(c(5:9, 0:4)))
+ expect_equal(actual, expected)
+})
+
+test_that("zipRDD() on RDDs", {
+ skip_on_cran()
+
+ rdd1 <- parallelize(sc, 0:4, 2)
+ rdd2 <- parallelize(sc, 1000:1004, 2)
+ actual <- collectRDD(zipRDD(rdd1, rdd2))
+ expect_equal(actual,
+ list(list(0, 1000), list(1, 1001), list(2, 1002), list(3, 1003), list(4, 1004)))
+
+ mockFile <- c("Spark is pretty.", "Spark is awesome.")
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName)
+
+ rdd <- textFile(sc, fileName, 1)
+ actual <- collectRDD(zipRDD(rdd, rdd))
+ expected <- lapply(mockFile, function(x) { list(x, x) })
+ expect_equal(actual, expected)
+
+ rdd1 <- parallelize(sc, 0:1, 1)
+ actual <- collectRDD(zipRDD(rdd1, rdd))
+ expected <- lapply(0:1, function(x) { list(x, mockFile[x + 1]) })
+ expect_equal(actual, expected)
+
+ rdd1 <- map(rdd, function(x) { x })
+ actual <- collectRDD(zipRDD(rdd, rdd1))
+ expected <- lapply(mockFile, function(x) { list(x, x) })
+ expect_equal(actual, expected)
+
+ unlink(fileName)
+})
+
+test_that("cartesian() on RDDs", {
+ skip_on_cran()
+
+ rdd <- parallelize(sc, 1:3)
+ actual <- collectRDD(cartesian(rdd, rdd))
+ expect_equal(sortKeyValueList(actual),
+ list(
+ list(1, 1), list(1, 2), list(1, 3),
+ list(2, 1), list(2, 2), list(2, 3),
+ list(3, 1), list(3, 2), list(3, 3)))
+
+ # test case where one RDD is empty
+ emptyRdd <- parallelize(sc, list())
+ actual <- collectRDD(cartesian(rdd, emptyRdd))
+ expect_equal(actual, list())
+
+ mockFile <- c("Spark is pretty.", "Spark is awesome.")
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName)
+
+ rdd <- textFile(sc, fileName)
+ actual <- collectRDD(cartesian(rdd, rdd))
+ expected <- list(
+ list("Spark is awesome.", "Spark is pretty."),
+ list("Spark is awesome.", "Spark is awesome."),
+ list("Spark is pretty.", "Spark is pretty."),
+ list("Spark is pretty.", "Spark is awesome."))
+ expect_equal(sortKeyValueList(actual), expected)
+
+ rdd1 <- parallelize(sc, 0:1)
+ actual <- collectRDD(cartesian(rdd1, rdd))
+ expect_equal(sortKeyValueList(actual),
+ list(
+ list(0, "Spark is pretty."),
+ list(0, "Spark is awesome."),
+ list(1, "Spark is pretty."),
+ list(1, "Spark is awesome.")))
+
+ rdd1 <- map(rdd, function(x) { x })
+ actual <- collectRDD(cartesian(rdd, rdd1))
+ expect_equal(sortKeyValueList(actual), expected)
+
+ unlink(fileName)
+})
+
+test_that("subtract() on RDDs", {
+ skip_on_cran()
+
+ l <- list(1, 1, 2, 2, 3, 4)
+ rdd1 <- parallelize(sc, l)
+
+ # subtract by itself
+ actual <- collectRDD(subtract(rdd1, rdd1))
+ expect_equal(actual, list())
+
+ # subtract by an empty RDD
+ rdd2 <- parallelize(sc, list())
+ actual <- collectRDD(subtract(rdd1, rdd2))
+ expect_equal(as.list(sort(as.vector(actual, mode = "integer"))),
+ l)
+
+ rdd2 <- parallelize(sc, list(2, 4))
+ actual <- collectRDD(subtract(rdd1, rdd2))
+ expect_equal(as.list(sort(as.vector(actual, mode = "integer"))),
+ list(1, 1, 3))
+
+ l <- list("a", "a", "b", "b", "c", "d")
+ rdd1 <- parallelize(sc, l)
+ rdd2 <- parallelize(sc, list("b", "d"))
+ actual <- collectRDD(subtract(rdd1, rdd2))
+ expect_equal(as.list(sort(as.vector(actual, mode = "character"))),
+ list("a", "a", "c"))
+})
+
+test_that("subtractByKey() on pairwise RDDs", {
+ skip_on_cran()
+
+ l <- list(list("a", 1), list("b", 4),
+ list("b", 5), list("a", 2))
+ rdd1 <- parallelize(sc, l)
+
+ # subtractByKey by itself
+ actual <- collectRDD(subtractByKey(rdd1, rdd1))
+ expect_equal(actual, list())
+
+ # subtractByKey by an empty RDD
+ rdd2 <- parallelize(sc, list())
+ actual <- collectRDD(subtractByKey(rdd1, rdd2))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(l))
+
+ rdd2 <- parallelize(sc, list(list("a", 3), list("c", 1)))
+ actual <- collectRDD(subtractByKey(rdd1, rdd2))
+ expect_equal(actual,
+ list(list("b", 4), list("b", 5)))
+
+ l <- list(list(1, 1), list(2, 4),
+ list(2, 5), list(1, 2))
+ rdd1 <- parallelize(sc, l)
+ rdd2 <- parallelize(sc, list(list(1, 3), list(3, 1)))
+ actual <- collectRDD(subtractByKey(rdd1, rdd2))
+ expect_equal(actual,
+ list(list(2, 4), list(2, 5)))
+})
+
+test_that("intersection() on RDDs", {
+ skip_on_cran()
+
+ # intersection with self
+ actual <- collectRDD(intersection(rdd, rdd))
+ expect_equal(sort(as.integer(actual)), nums)
+
+ # intersection with an empty RDD
+ emptyRdd <- parallelize(sc, list())
+ actual <- collectRDD(intersection(rdd, emptyRdd))
+ expect_equal(actual, list())
+
+ rdd1 <- parallelize(sc, list(1, 10, 2, 3, 4, 5))
+ rdd2 <- parallelize(sc, list(1, 6, 2, 3, 7, 8))
+ actual <- collectRDD(intersection(rdd1, rdd2))
+ expect_equal(sort(as.integer(actual)), 1:3)
+})
+
+test_that("join() on pairwise RDDs", {
+ skip_on_cran()
+
+ rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4)))
+ rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3)))
+ actual <- collectRDD(joinRDD(rdd1, rdd2, 2L))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(list(list(1, list(1, 2)), list(1, list(1, 3)))))
+
+ rdd1 <- parallelize(sc, list(list("a", 1), list("b", 4)))
+ rdd2 <- parallelize(sc, list(list("a", 2), list("a", 3)))
+ actual <- collectRDD(joinRDD(rdd1, rdd2, 2L))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(list(list("a", list(1, 2)), list("a", list(1, 3)))))
+
+ rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2)))
+ rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4)))
+ actual <- collectRDD(joinRDD(rdd1, rdd2, 2L))
+ expect_equal(actual, list())
+
+ rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2)))
+ rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4)))
+ actual <- collectRDD(joinRDD(rdd1, rdd2, 2L))
+ expect_equal(actual, list())
+})
+
+test_that("leftOuterJoin() on pairwise RDDs", {
+ skip_on_cran()
+
+ rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4)))
+ rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3)))
+ actual <- collectRDD(leftOuterJoin(rdd1, rdd2, 2L))
+ expected <- list(list(1, list(1, 2)), list(1, list(1, 3)), list(2, list(4, NULL)))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(expected))
+
+ rdd1 <- parallelize(sc, list(list("a", 1), list("b", 4)))
+ rdd2 <- parallelize(sc, list(list("a", 2), list("a", 3)))
+ actual <- collectRDD(leftOuterJoin(rdd1, rdd2, 2L))
+ expected <- list(list("b", list(4, NULL)), list("a", list(1, 2)), list("a", list(1, 3)))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(expected))
+
+ rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2)))
+ rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4)))
+ actual <- collectRDD(leftOuterJoin(rdd1, rdd2, 2L))
+ expected <- list(list(1, list(1, NULL)), list(2, list(2, NULL)))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(expected))
+
+ rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2)))
+ rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4)))
+ actual <- collectRDD(leftOuterJoin(rdd1, rdd2, 2L))
+ expected <- list(list("b", list(2, NULL)), list("a", list(1, NULL)))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(expected))
+})
+
+test_that("rightOuterJoin() on pairwise RDDs", {
+ skip_on_cran()
+
+ rdd1 <- parallelize(sc, list(list(1, 2), list(1, 3)))
+ rdd2 <- parallelize(sc, list(list(1, 1), list(2, 4)))
+ actual <- collectRDD(rightOuterJoin(rdd1, rdd2, 2L))
+ expected <- list(list(1, list(2, 1)), list(1, list(3, 1)), list(2, list(NULL, 4)))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+
+ rdd1 <- parallelize(sc, list(list("a", 2), list("a", 3)))
+ rdd2 <- parallelize(sc, list(list("a", 1), list("b", 4)))
+ actual <- collectRDD(rightOuterJoin(rdd1, rdd2, 2L))
+ expected <- list(list("b", list(NULL, 4)), list("a", list(2, 1)), list("a", list(3, 1)))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(expected))
+
+ rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2)))
+ rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4)))
+ actual <- collectRDD(rightOuterJoin(rdd1, rdd2, 2L))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(list(list(3, list(NULL, 3)), list(4, list(NULL, 4)))))
+
+ rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2)))
+ rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4)))
+ actual <- collectRDD(rightOuterJoin(rdd1, rdd2, 2L))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(list(list("d", list(NULL, 4)), list("c", list(NULL, 3)))))
+})
+
+test_that("fullOuterJoin() on pairwise RDDs", {
+ skip_on_cran()
+
+ rdd1 <- parallelize(sc, list(list(1, 2), list(1, 3), list(3, 3)))
+ rdd2 <- parallelize(sc, list(list(1, 1), list(2, 4)))
+ actual <- collectRDD(fullOuterJoin(rdd1, rdd2, 2L))
+ expected <- list(list(1, list(2, 1)), list(1, list(3, 1)),
+ list(2, list(NULL, 4)), list(3, list(3, NULL)))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+
+ rdd1 <- parallelize(sc, list(list("a", 2), list("a", 3), list("c", 1)))
+ rdd2 <- parallelize(sc, list(list("a", 1), list("b", 4)))
+ actual <- collectRDD(fullOuterJoin(rdd1, rdd2, 2L))
+ expected <- list(list("b", list(NULL, 4)), list("a", list(2, 1)),
+ list("a", list(3, 1)), list("c", list(1, NULL)))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(expected))
+
+ rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2)))
+ rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4)))
+ actual <- collectRDD(fullOuterJoin(rdd1, rdd2, 2L))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(list(list(1, list(1, NULL)), list(2, list(2, NULL)),
+ list(3, list(NULL, 3)), list(4, list(NULL, 4)))))
+
+ rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2)))
+ rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4)))
+ actual <- collectRDD(fullOuterJoin(rdd1, rdd2, 2L))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(list(list("a", list(1, NULL)), list("b", list(2, NULL)),
+ list("d", list(NULL, 4)), list("c", list(NULL, 3)))))
+})
+
+test_that("sortByKey() on pairwise RDDs", {
+ skip_on_cran()
+
+ numPairsRdd <- map(rdd, function(x) { list (x, x) })
+ sortedRdd <- sortByKey(numPairsRdd, ascending = FALSE)
+ actual <- collectRDD(sortedRdd)
+ numPairs <- lapply(nums, function(x) { list (x, x) })
+ expect_equal(actual, sortKeyValueList(numPairs, decreasing = TRUE))
+
+ rdd2 <- parallelize(sc, sort(nums, decreasing = TRUE), 2L)
+ numPairsRdd2 <- map(rdd2, function(x) { list (x, x) })
+ sortedRdd2 <- sortByKey(numPairsRdd2)
+ actual <- collectRDD(sortedRdd2)
+ expect_equal(actual, numPairs)
+
+ # sort by string keys
+ l <- list(list("a", 1), list("b", 2), list("1", 3), list("d", 4), list("2", 5))
+ rdd3 <- parallelize(sc, l, 2L)
+ sortedRdd3 <- sortByKey(rdd3)
+ actual <- collectRDD(sortedRdd3)
+ expect_equal(actual, list(list("1", 3), list("2", 5), list("a", 1), list("b", 2), list("d", 4)))
+
+ # test on the boundary cases
+
+ # boundary case 1: the RDD to be sorted has only 1 partition
+ rdd4 <- parallelize(sc, l, 1L)
+ sortedRdd4 <- sortByKey(rdd4)
+ actual <- collectRDD(sortedRdd4)
+ expect_equal(actual, list(list("1", 3), list("2", 5), list("a", 1), list("b", 2), list("d", 4)))
+
+ # boundary case 2: the sorted RDD has only 1 partition
+ rdd5 <- parallelize(sc, l, 2L)
+ sortedRdd5 <- sortByKey(rdd5, numPartitions = 1L)
+ actual <- collectRDD(sortedRdd5)
+ expect_equal(actual, list(list("1", 3), list("2", 5), list("a", 1), list("b", 2), list("d", 4)))
+
+ # boundary case 3: the RDD to be sorted has only 1 element
+ l2 <- list(list("a", 1))
+ rdd6 <- parallelize(sc, l2, 2L)
+ sortedRdd6 <- sortByKey(rdd6)
+ actual <- collectRDD(sortedRdd6)
+ expect_equal(actual, l2)
+
+ # boundary case 4: the RDD to be sorted has 0 element
+ l3 <- list()
+ rdd7 <- parallelize(sc, l3, 2L)
+ sortedRdd7 <- sortByKey(rdd7)
+ actual <- collectRDD(sortedRdd7)
+ expect_equal(actual, l3)
+})
+
+test_that("collectAsMap() on a pairwise RDD", {
+ skip_on_cran()
+
+ rdd <- parallelize(sc, list(list(1, 2), list(3, 4)))
+ vals <- collectAsMap(rdd)
+ expect_equal(vals, list(`1` = 2, `3` = 4))
+
+ rdd <- parallelize(sc, list(list("a", 1), list("b", 2)))
+ vals <- collectAsMap(rdd)
+ expect_equal(vals, list(a = 1, b = 2))
+
+ rdd <- parallelize(sc, list(list(1.1, 2.2), list(1.2, 2.4)))
+ vals <- collectAsMap(rdd)
+ expect_equal(vals, list(`1.1` = 2.2, `1.2` = 2.4))
+
+ rdd <- parallelize(sc, list(list(1, "a"), list(2, "b")))
+ vals <- collectAsMap(rdd)
+ expect_equal(vals, list(`1` = "a", `2` = "b"))
+})
+
+test_that("show()", {
+ skip_on_cran()
+
+ rdd <- parallelize(sc, list(1:10))
+ expect_output(showRDD(rdd), "ParallelCollectionRDD\\[\\d+\\] at parallelize at RRDD\\.scala:\\d+")
+})
+
+test_that("sampleByKey() on pairwise RDDs", {
+ skip_on_cran()
+
+ rdd <- parallelize(sc, 1:2000)
+ pairsRDD <- lapply(rdd, function(x) { if (x %% 2 == 0) list("a", x) else list("b", x) })
+ fractions <- list(a = 0.2, b = 0.1)
+ sample <- sampleByKey(pairsRDD, FALSE, fractions, 1618L)
+ expect_equal(100 < length(lookup(sample, "a")) && 300 > length(lookup(sample, "a")), TRUE)
+ expect_equal(50 < length(lookup(sample, "b")) && 150 > length(lookup(sample, "b")), TRUE)
+ expect_equal(lookup(sample, "a")[which.min(lookup(sample, "a"))] >= 0, TRUE)
+ expect_equal(lookup(sample, "a")[which.max(lookup(sample, "a"))] <= 2000, TRUE)
+ expect_equal(lookup(sample, "b")[which.min(lookup(sample, "b"))] >= 0, TRUE)
+ expect_equal(lookup(sample, "b")[which.max(lookup(sample, "b"))] <= 2000, TRUE)
+
+ rdd <- parallelize(sc, 1:2000)
+ pairsRDD <- lapply(rdd, function(x) { if (x %% 2 == 0) list(2, x) else list(3, x) })
+ fractions <- list(`2` = 0.2, `3` = 0.1)
+ sample <- sampleByKey(pairsRDD, TRUE, fractions, 1618L)
+ expect_equal(100 < length(lookup(sample, 2)) && 300 > length(lookup(sample, 2)), TRUE)
+ expect_equal(50 < length(lookup(sample, 3)) && 150 > length(lookup(sample, 3)), TRUE)
+ expect_equal(lookup(sample, 2)[which.min(lookup(sample, 2))] >= 0, TRUE)
+ expect_equal(lookup(sample, 2)[which.max(lookup(sample, 2))] <= 2000, TRUE)
+ expect_equal(lookup(sample, 3)[which.min(lookup(sample, 3))] >= 0, TRUE)
+ expect_equal(lookup(sample, 3)[which.max(lookup(sample, 3))] <= 2000, TRUE)
+})
+
+test_that("Test correct concurrency of RRDD.compute()", {
+ skip_on_cran()
+
+ rdd <- parallelize(sc, 1:1000, 100)
+ jrdd <- getJRDD(lapply(rdd, function(x) { x }), "row")
+ zrdd <- callJMethod(jrdd, "zip", jrdd)
+ count <- callJMethod(zrdd, "count")
+ expect_equal(count, 1000)
+})
+
+sparkR.session.stop()
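Many of the pair-RDD assertions in the file above compare results through sortKeyValueList, because output order depends on partitioning. A rough plain-R sketch of that kind of helper (hypothetical, not the actual SparkR test utility) looks like this:

sortKeyValueListSketch <- function(kvList, decreasing = FALSE) {
  # Order a list of list(key, value) pairs by the string form of their keys,
  # so comparisons do not depend on which partition produced each pair.
  keys <- vapply(kvList, function(kv) as.character(kv[[1]]), character(1))
  kvList[order(keys, decreasing = decreasing)]
}

sortKeyValueListSketch(list(list("b", 4), list("a", 1)))
# list(list("a", 1), list("b", 4))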
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_shuffle.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_shuffle.R b/R/pkg/tests/fulltests/test_shuffle.R
new file mode 100644
index 0000000..18320ea
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_shuffle.R
@@ -0,0 +1,248 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("partitionBy, groupByKey, reduceByKey etc.")
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+# Data
+intPairs <- list(list(1L, -1), list(2L, 100), list(2L, 1), list(1L, 200))
+intRdd <- parallelize(sc, intPairs, 2L)
+
+doublePairs <- list(list(1.5, -1), list(2.5, 100), list(2.5, 1), list(1.5, 200))
+doubleRdd <- parallelize(sc, doublePairs, 2L)
+
+numPairs <- list(list(1L, 100), list(2L, 200), list(4L, -1), list(3L, 1),
+ list(3L, 0))
+numPairsRdd <- parallelize(sc, numPairs, length(numPairs))
+
+strList <- list("Dexter Morgan: Blood. Sometimes it sets my teeth on edge and ",
+ "Dexter Morgan: Harry and Dorris Morgan did a wonderful job ")
+strListRDD <- parallelize(sc, strList, 4)
+
+test_that("groupByKey for integers", {
+ skip_on_cran()
+
+ grouped <- groupByKey(intRdd, 2L)
+
+ actual <- collectRDD(grouped)
+
+ expected <- list(list(2L, list(100, 1)), list(1L, list(-1, 200)))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+})
+
+test_that("groupByKey for doubles", {
+ skip_on_cran()
+
+ grouped <- groupByKey(doubleRdd, 2L)
+
+ actual <- collectRDD(grouped)
+
+ expected <- list(list(1.5, list(-1, 200)), list(2.5, list(100, 1)))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+})
+
+test_that("reduceByKey for ints", {
+ skip_on_cran()
+
+ reduced <- reduceByKey(intRdd, "+", 2L)
+
+ actual <- collectRDD(reduced)
+
+ expected <- list(list(2L, 101), list(1L, 199))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+})
+
+test_that("reduceByKey for doubles", {
+ skip_on_cran()
+
+ reduced <- reduceByKey(doubleRdd, "+", 2L)
+ actual <- collectRDD(reduced)
+
+ expected <- list(list(1.5, 199), list(2.5, 101))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+})
+
+test_that("combineByKey for ints", {
+ skip_on_cran()
+
+ reduced <- combineByKey(intRdd, function(x) { x }, "+", "+", 2L)
+
+ actual <- collectRDD(reduced)
+
+ expected <- list(list(2L, 101), list(1L, 199))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+})
+
+test_that("combineByKey for doubles", {
+ skip_on_cran()
+
+ reduced <- combineByKey(doubleRdd, function(x) { x }, "+", "+", 2L)
+ actual <- collectRDD(reduced)
+
+ expected <- list(list(1.5, 199), list(2.5, 101))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+})
+
+test_that("combineByKey for characters", {
+ skip_on_cran()
+
+ stringKeyRDD <- parallelize(sc,
+ list(list("max", 1L), list("min", 2L),
+ list("other", 3L), list("max", 4L)), 2L)
+ reduced <- combineByKey(stringKeyRDD,
+ function(x) { x }, "+", "+", 2L)
+ actual <- collectRDD(reduced)
+
+ expected <- list(list("max", 5L), list("min", 2L), list("other", 3L))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+})
+
+test_that("aggregateByKey", {
+ skip_on_cran()
+
+ # test aggregateByKey for int keys
+ rdd <- parallelize(sc, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
+
+ zeroValue <- list(0, 0)
+ seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
+ combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
+ aggregatedRDD <- aggregateByKey(rdd, zeroValue, seqOp, combOp, 2L)
+
+ actual <- collectRDD(aggregatedRDD)
+
+ expected <- list(list(1, list(3, 2)), list(2, list(7, 2)))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+
+ # test aggregateByKey for string keys
+ rdd <- parallelize(sc, list(list("a", 1), list("a", 2), list("b", 3), list("b", 4)))
+
+ zeroValue <- list(0, 0)
+ seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
+ combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
+ aggregatedRDD <- aggregateByKey(rdd, zeroValue, seqOp, combOp, 2L)
+
+ actual <- collectRDD(aggregatedRDD)
+
+ expected <- list(list("a", list(3, 2)), list("b", list(7, 2)))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+})
+
+test_that("foldByKey", {
+ skip_on_cran()
+
+ # test foldByKey for int keys
+ folded <- foldByKey(intRdd, 0, "+", 2L)
+
+ actual <- collectRDD(folded)
+
+ expected <- list(list(2L, 101), list(1L, 199))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+
+ # test foldByKey for double keys
+ folded <- foldByKey(doubleRdd, 0, "+", 2L)
+
+ actual <- collectRDD(folded)
+
+ expected <- list(list(1.5, 199), list(2.5, 101))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+
+ # test foldByKey for string keys
+ stringKeyPairs <- list(list("a", -1), list("b", 100), list("b", 1), list("a", 200))
+
+ stringKeyRDD <- parallelize(sc, stringKeyPairs)
+ folded <- foldByKey(stringKeyRDD, 0, "+", 2L)
+
+ actual <- collectRDD(folded)
+
+ expected <- list(list("b", 101), list("a", 199))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+
+ # test foldByKey for empty pair RDD
+ rdd <- parallelize(sc, list())
+ folded <- foldByKey(rdd, 0, "+", 2L)
+ actual <- collectRDD(folded)
+ expected <- list()
+ expect_equal(actual, expected)
+
+ # test foldByKey for RDD with only 1 pair
+ rdd <- parallelize(sc, list(list(1, 1)))
+ folded <- foldByKey(rdd, 0, "+", 2L)
+ actual <- collectRDD(folded)
+ expected <- list(list(1, 1))
+ expect_equal(actual, expected)
+})
+
+test_that("partitionBy() partitions data correctly", {
+ skip_on_cran()
+
+ # Partition by magnitude
+ partitionByMagnitude <- function(key) { if (key >= 3) 1 else 0 }
+
+ resultRDD <- partitionByRDD(numPairsRdd, 2L, partitionByMagnitude)
+
+ expected_first <- list(list(1, 100), list(2, 200)) # key less than 3
+ expected_second <- list(list(4, -1), list(3, 1), list(3, 0)) # key greater than or equal to 3
+ actual_first <- collectPartition(resultRDD, 0L)
+ actual_second <- collectPartition(resultRDD, 1L)
+
+ expect_equal(sortKeyValueList(actual_first), sortKeyValueList(expected_first))
+ expect_equal(sortKeyValueList(actual_second), sortKeyValueList(expected_second))
+})
+
+test_that("partitionBy works with dependencies", {
+ skip_on_cran()
+
+ kOne <- 1
+ partitionByParity <- function(key) { if (key %% 2 == kOne) 7 else 4 }
+
+ # Partition by parity
+ resultRDD <- partitionByRDD(numPairsRdd, numPartitions = 2L, partitionByParity)
+
+ # keys even; e.g. 4 %% 2 == 0
+ expected_first <- list(list(2, 200), list(4, -1))
+ # keys odd; 3 %% 2 == 1
+ expected_second <- list(list(1, 100), list(3, 1), list(3, 0))
+ actual_first <- collectPartition(resultRDD, 0L)
+ actual_second <- collectPartition(resultRDD, 1L)
+
+ expect_equal(sortKeyValueList(actual_first), sortKeyValueList(expected_first))
+ expect_equal(sortKeyValueList(actual_second), sortKeyValueList(expected_second))
+})
+
+test_that("test partitionBy with string keys", {
+ skip_on_cran()
+
+ words <- flatMap(strListRDD, function(line) { strsplit(line, " ")[[1]] })
+ wordCount <- lapply(words, function(word) { list(word, 1L) })
+
+ resultRDD <- partitionByRDD(wordCount, 2L)
+ expected_first <- list(list("Dexter", 1), list("Dexter", 1))
+ expected_second <- list(list("and", 1), list("and", 1))
+
+ actual_first <- Filter(function(item) { item[[1]] == "Dexter" },
+ collectPartition(resultRDD, 0L))
+ actual_second <- Filter(function(item) { item[[1]] == "and" },
+ collectPartition(resultRDD, 1L))
+
+ expect_equal(sortKeyValueList(actual_first), sortKeyValueList(expected_first))
+ expect_equal(sortKeyValueList(actual_second), sortKeyValueList(expected_second))
+})
+
+sparkR.session.stop()
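The two partitionBy tests above can be checked in plain R. The sketch below only reproduces what the expectations assert, assuming the value returned by the partition function is taken modulo the number of partitions (the actual routing happens on the JVM side):

numPairs <- list(list(1L, 100), list(2L, 200), list(4L, -1), list(3L, 1), list(3L, 0))
partitionByParity <- function(key) { if (key %% 2 == 1) 7 else 4 }

# Partition index each pair would land in with 2 partitions.
partitionIndex <- vapply(numPairs, function(kv) partitionByParity(kv[[1]]) %% 2, numeric(1))
split(numPairs, partitionIndex)
# $`0` holds the even keys (2 and 4); $`1` holds the odd keys (1, 3, 3)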
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_sparkR.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_sparkR.R b/R/pkg/tests/fulltests/test_sparkR.R
new file mode 100644
index 0000000..a40981c
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_sparkR.R
@@ -0,0 +1,48 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("functions in sparkR.R")
+
+test_that("sparkCheckInstall", {
+ skip_on_cran()
+
+ # "local, yarn-client, mesos-client" mode, SPARK_HOME was set correctly,
+ # and the SparkR job was submitted by "spark-submit"
+ sparkHome <- paste0(tempdir(), "/", "sparkHome")
+ dir.create(sparkHome)
+ master <- ""
+ deployMode <- ""
+ expect_true(is.null(sparkCheckInstall(sparkHome, master, deployMode)))
+ unlink(sparkHome, recursive = TRUE)
+
+ # "yarn-cluster, mesos-cluster" mode, SPARK_HOME was not set,
+ # and the SparkR job was submitted by "spark-submit"
+ sparkHome <- ""
+ master <- ""
+ deployMode <- ""
+ expect_true(is.null(sparkCheckInstall(sparkHome, master, deployMode)))
+
+ # "yarn-client, mesos-client" mode, SPARK_HOME was not set
+ sparkHome <- ""
+ master <- "yarn-client"
+ deployMode <- ""
+ expect_error(sparkCheckInstall(sparkHome, master, deployMode))
+ sparkHome <- ""
+ master <- ""
+ deployMode <- "client"
+ expect_error(sparkCheckInstall(sparkHome, master, deployMode))
+})
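The sparkCheckInstall cases above reduce to one decision: fail only when SPARK_HOME is unset and a client deploy mode is requested. A toy reconstruction of just those four assertions (not the actual SparkR implementation):

checkInstallSketch <- function(sparkHome, master, deployMode) {
  # Hypothetical logic covering only the cases exercised by the test above.
  clientMode <- grepl("-client$", master) || identical(deployMode, "client")
  if (nzchar(sparkHome) || !clientMode) {
    NULL  # SPARK_HOME is set, or the job runs in cluster mode: nothing to check
  } else {
    stop("SPARK_HOME is not set for a client-mode job")
  }
}

is.null(checkInstallSketch(tempdir(), "", ""))  # TRUE
is.null(checkInstallSketch("", "", ""))         # TRUE
inherits(try(checkInstallSketch("", "yarn-client", ""), silent = TRUE), "try-error")  # TRUE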
[4/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic
tests only for CRAN
Posted by fe...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_streaming.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_streaming.R b/R/pkg/inst/tests/testthat/test_streaming.R
deleted file mode 100644
index b20b431..0000000
--- a/R/pkg/inst/tests/testthat/test_streaming.R
+++ /dev/null
@@ -1,167 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("Structured Streaming")
-
-# Tests for Structured Streaming functions in SparkR
-
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-jsonSubDir <- file.path("sparkr-test", "json", "")
-if (.Platform$OS.type == "windows") {
- # file.path removes the empty separator on Windows, adds it back
- jsonSubDir <- paste0(jsonSubDir, .Platform$file.sep)
-}
-jsonDir <- file.path(tempdir(), jsonSubDir)
-dir.create(jsonDir, recursive = TRUE)
-
-mockLines <- c("{\"name\":\"Michael\"}",
- "{\"name\":\"Andy\", \"age\":30}",
- "{\"name\":\"Justin\", \"age\":19}")
-jsonPath <- tempfile(pattern = jsonSubDir, fileext = ".tmp")
-writeLines(mockLines, jsonPath)
-
-mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}",
- "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}",
- "{\"name\":\"David\",\"age\":60,\"height\":null}")
-jsonPathNa <- tempfile(pattern = jsonSubDir, fileext = ".tmp")
-
-schema <- structType(structField("name", "string"),
- structField("age", "integer"),
- structField("count", "double"))
-
-test_that("read.stream, write.stream, awaitTermination, stopQuery", {
- skip_on_cran()
-
- df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1)
- expect_true(isStreaming(df))
- counts <- count(group_by(df, "name"))
- q <- write.stream(counts, "memory", queryName = "people", outputMode = "complete")
-
- expect_false(awaitTermination(q, 5 * 1000))
- callJMethod(q@ssq, "processAllAvailable")
- expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 3)
-
- writeLines(mockLinesNa, jsonPathNa)
- awaitTermination(q, 5 * 1000)
- callJMethod(q@ssq, "processAllAvailable")
- expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 6)
-
- stopQuery(q)
- expect_true(awaitTermination(q, 1))
- expect_error(awaitTermination(q), NA)
-})
-
-test_that("print from explain, lastProgress, status, isActive", {
- skip_on_cran()
-
- df <- read.stream("json", path = jsonDir, schema = schema)
- expect_true(isStreaming(df))
- counts <- count(group_by(df, "name"))
- q <- write.stream(counts, "memory", queryName = "people2", outputMode = "complete")
-
- awaitTermination(q, 5 * 1000)
- callJMethod(q@ssq, "processAllAvailable")
-
- expect_equal(capture.output(explain(q))[[1]], "== Physical Plan ==")
- expect_true(any(grepl("\"description\" : \"MemorySink\"", capture.output(lastProgress(q)))))
- expect_true(any(grepl("\"isTriggerActive\" : ", capture.output(status(q)))))
-
- expect_equal(queryName(q), "people2")
- expect_true(isActive(q))
-
- stopQuery(q)
-})
-
-test_that("Stream other format", {
- skip_on_cran()
-
- parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet")
- df <- read.df(jsonPath, "json", schema)
- write.df(df, parquetPath, "parquet", "overwrite")
-
- df <- read.stream(path = parquetPath, schema = schema)
- expect_true(isStreaming(df))
- counts <- count(group_by(df, "name"))
- q <- write.stream(counts, "memory", queryName = "people3", outputMode = "complete")
-
- expect_false(awaitTermination(q, 5 * 1000))
- callJMethod(q@ssq, "processAllAvailable")
- expect_equal(head(sql("SELECT count(*) FROM people3"))[[1]], 3)
-
- expect_equal(queryName(q), "people3")
- expect_true(any(grepl("\"description\" : \"FileStreamSource[[:print:]]+parquet",
- capture.output(lastProgress(q)))))
- expect_true(isActive(q))
-
- stopQuery(q)
- expect_true(awaitTermination(q, 1))
- expect_false(isActive(q))
-
- unlink(parquetPath)
-})
-
-test_that("Non-streaming DataFrame", {
- skip_on_cran()
-
- c <- as.DataFrame(cars)
- expect_false(isStreaming(c))
-
- expect_error(write.stream(c, "memory", queryName = "people", outputMode = "complete"),
- paste0(".*(writeStream : analysis error - 'writeStream' can be called only on ",
- "streaming Dataset/DataFrame).*"))
-})
-
-test_that("Unsupported operation", {
- skip_on_cran()
-
- # memory sink without aggregation
- df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1)
- expect_error(write.stream(df, "memory", queryName = "people", outputMode = "complete"),
- paste0(".*(start : analysis error - Complete output mode not supported when there ",
- "are no streaming aggregations on streaming DataFrames/Datasets).*"))
-})
-
-test_that("Terminated by error", {
- skip_on_cran()
-
- df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = -1)
- counts <- count(group_by(df, "name"))
- # This would not fail before returning with a StreamingQuery,
- # but could dump error log at just about the same time
- expect_error(q <- write.stream(counts, "memory", queryName = "people4", outputMode = "complete"),
- NA)
-
- expect_error(awaitTermination(q, 5 * 1000),
- paste0(".*(awaitTermination : streaming query error - Invalid value '-1' for option",
- " 'maxFilesPerTrigger', must be a positive integer).*"))
-
- expect_true(any(grepl("\"message\" : \"Terminated with exception: Invalid value",
- capture.output(status(q)))))
- expect_true(any(grepl("Streaming query has no progress", capture.output(lastProgress(q)))))
- expect_equal(queryName(q), "people4")
- expect_false(isActive(q))
-
- stopQuery(q)
-})
-
-unlink(jsonPath)
-unlink(jsonPathNa)
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_take.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_take.R b/R/pkg/inst/tests/testthat/test_take.R
deleted file mode 100644
index c00723b..0000000
--- a/R/pkg/inst/tests/testthat/test_take.R
+++ /dev/null
@@ -1,71 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("tests RDD function take()")
-
-# Mock data
-numVector <- c(-10:97)
-numList <- list(sqrt(1), sqrt(2), sqrt(3), 4 ** 10)
-strVector <- c("Dexter Morgan: I suppose I should be upset, even feel",
- "violated, but I'm not. No, in fact, I think this is a friendly",
- "message, like \"Hey, wanna play?\" and yes, I want to play. ",
- "I really, really do.")
-strList <- list("Dexter Morgan: Blood. Sometimes it sets my teeth on edge, ",
- "other times it helps me control the chaos.",
- "Dexter Morgan: Harry and Dorris Morgan did a wonderful job ",
- "raising me. But they're both dead now. I didn't kill them. Honest.")
-
-# JavaSparkContext handle
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-test_that("take() gives back the original elements in correct count and order", {
- skip_on_cran()
-
- numVectorRDD <- parallelize(sc, numVector, 10)
- # case: number of elements to take is less than the size of the first partition
- expect_equal(takeRDD(numVectorRDD, 1), as.list(head(numVector, n = 1)))
- # case: number of elements to take is the same as the size of the first partition
- expect_equal(takeRDD(numVectorRDD, 11), as.list(head(numVector, n = 11)))
- # case: number of elements to take is greater than all elements
- expect_equal(takeRDD(numVectorRDD, length(numVector)), as.list(numVector))
- expect_equal(takeRDD(numVectorRDD, length(numVector) + 1), as.list(numVector))
-
- numListRDD <- parallelize(sc, numList, 1)
- numListRDD2 <- parallelize(sc, numList, 4)
- expect_equal(takeRDD(numListRDD, 3), takeRDD(numListRDD2, 3))
- expect_equal(takeRDD(numListRDD, 5), takeRDD(numListRDD2, 5))
- expect_equal(takeRDD(numListRDD, 1), as.list(head(numList, n = 1)))
- expect_equal(takeRDD(numListRDD2, 999), numList)
-
- strVectorRDD <- parallelize(sc, strVector, 2)
- strVectorRDD2 <- parallelize(sc, strVector, 3)
- expect_equal(takeRDD(strVectorRDD, 4), as.list(strVector))
- expect_equal(takeRDD(strVectorRDD2, 2), as.list(head(strVector, n = 2)))
-
- strListRDD <- parallelize(sc, strList, 4)
- strListRDD2 <- parallelize(sc, strList, 1)
- expect_equal(takeRDD(strListRDD, 3), as.list(head(strList, n = 3)))
- expect_equal(takeRDD(strListRDD2, 1), as.list(head(strList, n = 1)))
-
- expect_equal(length(takeRDD(strListRDD, 0)), 0)
- expect_equal(length(takeRDD(strVectorRDD, 0)), 0)
- expect_equal(length(takeRDD(numListRDD, 0)), 0)
- expect_equal(length(takeRDD(numVectorRDD, 0)), 0)
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_textFile.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_textFile.R b/R/pkg/inst/tests/testthat/test_textFile.R
deleted file mode 100644
index e8a961c..0000000
--- a/R/pkg/inst/tests/testthat/test_textFile.R
+++ /dev/null
@@ -1,182 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("the textFile() function")
-
-# JavaSparkContext handle
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-mockFile <- c("Spark is pretty.", "Spark is awesome.")
-
-test_that("textFile() on a local file returns an RDD", {
- skip_on_cran()
-
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName)
-
- rdd <- textFile(sc, fileName)
- expect_is(rdd, "RDD")
- expect_true(countRDD(rdd) > 0)
- expect_equal(countRDD(rdd), 2)
-
- unlink(fileName)
-})
-
-test_that("textFile() followed by a collect() returns the same content", {
- skip_on_cran()
-
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName)
-
- rdd <- textFile(sc, fileName)
- expect_equal(collectRDD(rdd), as.list(mockFile))
-
- unlink(fileName)
-})
-
-test_that("textFile() word count works as expected", {
- skip_on_cran()
-
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName)
-
- rdd <- textFile(sc, fileName)
-
- words <- flatMap(rdd, function(line) { strsplit(line, " ")[[1]] })
- wordCount <- lapply(words, function(word) { list(word, 1L) })
-
- counts <- reduceByKey(wordCount, "+", 2L)
- output <- collectRDD(counts)
- expected <- list(list("pretty.", 1), list("is", 2), list("awesome.", 1),
- list("Spark", 2))
- expect_equal(sortKeyValueList(output), sortKeyValueList(expected))
-
- unlink(fileName)
-})
-
-test_that("several transformations on RDD created by textFile()", {
- skip_on_cran()
-
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName)
-
- rdd <- textFile(sc, fileName) # RDD
- for (i in 1:10) {
- # PipelinedRDD initially created from RDD
- rdd <- lapply(rdd, function(x) paste(x, x))
- }
- collectRDD(rdd)
-
- unlink(fileName)
-})
-
-test_that("textFile() followed by a saveAsTextFile() returns the same content", {
- skip_on_cran()
-
- fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
- fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName1)
-
- rdd <- textFile(sc, fileName1, 1L)
- saveAsTextFile(rdd, fileName2)
- rdd <- textFile(sc, fileName2)
- expect_equal(collectRDD(rdd), as.list(mockFile))
-
- unlink(fileName1)
- unlink(fileName2)
-})
-
-test_that("saveAsTextFile() on a parallelized list works as expected", {
- skip_on_cran()
-
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
- l <- list(1, 2, 3)
- rdd <- parallelize(sc, l, 1L)
- saveAsTextFile(rdd, fileName)
- rdd <- textFile(sc, fileName)
- expect_equal(collectRDD(rdd), lapply(l, function(x) {toString(x)}))
-
- unlink(fileName)
-})
-
-test_that("textFile() and saveAsTextFile() word count works as expected", {
- skip_on_cran()
-
- fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
- fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName1)
-
- rdd <- textFile(sc, fileName1)
-
- words <- flatMap(rdd, function(line) { strsplit(line, " ")[[1]] })
- wordCount <- lapply(words, function(word) { list(word, 1L) })
-
- counts <- reduceByKey(wordCount, "+", 2L)
-
- saveAsTextFile(counts, fileName2)
- rdd <- textFile(sc, fileName2)
-
- output <- collectRDD(rdd)
- expected <- list(list("awesome.", 1), list("Spark", 2),
- list("pretty.", 1), list("is", 2))
- expectedStr <- lapply(expected, function(x) { toString(x) })
- expect_equal(sortKeyValueList(output), sortKeyValueList(expectedStr))
-
- unlink(fileName1)
- unlink(fileName2)
-})
-
-test_that("textFile() on multiple paths", {
- skip_on_cran()
-
- fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
- fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines("Spark is pretty.", fileName1)
- writeLines("Spark is awesome.", fileName2)
-
- rdd <- textFile(sc, c(fileName1, fileName2))
- expect_equal(countRDD(rdd), 2)
-
- unlink(fileName1)
- unlink(fileName2)
-})
-
-test_that("Pipelined operations on RDDs created using textFile", {
- skip_on_cran()
-
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName)
-
- rdd <- textFile(sc, fileName)
-
- lengths <- lapply(rdd, function(x) { length(x) })
- expect_equal(collectRDD(lengths), list(1, 1))
-
- lengthsPipelined <- lapply(lengths, function(x) { x + 10 })
- expect_equal(collectRDD(lengthsPipelined), list(11, 11))
-
- lengths30 <- lapply(lengthsPipelined, function(x) { x + 20 })
- expect_equal(collectRDD(lengths30), list(31, 31))
-
- lengths20 <- lapply(lengths, function(x) { x + 20 })
- expect_equal(collectRDD(lengths20), list(21, 21))
-
- unlink(fileName)
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_utils.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_utils.R b/R/pkg/inst/tests/testthat/test_utils.R
deleted file mode 100644
index 6197ae7..0000000
--- a/R/pkg/inst/tests/testthat/test_utils.R
+++ /dev/null
@@ -1,248 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("functions in utils.R")
-
-# JavaSparkContext handle
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-test_that("convertJListToRList() gives back (deserializes) the original JLists
- of strings and integers", {
- skip_on_cran()
- # It's hard to manually create a Java List using rJava, since it does not
- # support generics well. Instead, we rely on collectRDD() returning a
- # JList.
- nums <- as.list(1:10)
- rdd <- parallelize(sc, nums, 1L)
- jList <- callJMethod(rdd@jrdd, "collect")
- rList <- convertJListToRList(jList, flatten = TRUE)
- expect_equal(rList, nums)
-
- strs <- as.list("hello", "spark")
- rdd <- parallelize(sc, strs, 2L)
- jList <- callJMethod(rdd@jrdd, "collect")
- rList <- convertJListToRList(jList, flatten = TRUE)
- expect_equal(rList, strs)
-})
-
-test_that("serializeToBytes on RDD", {
- skip_on_cran()
- # File content
- mockFile <- c("Spark is pretty.", "Spark is awesome.")
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName)
-
- text.rdd <- textFile(sc, fileName)
- expect_equal(getSerializedMode(text.rdd), "string")
- ser.rdd <- serializeToBytes(text.rdd)
- expect_equal(collectRDD(ser.rdd), as.list(mockFile))
- expect_equal(getSerializedMode(ser.rdd), "byte")
-
- unlink(fileName)
-})
-
-test_that("cleanClosure on R functions", {
- y <- c(1, 2, 3)
- g <- function(x) { x + 1 }
- f <- function(x) { g(x) + y }
- newF <- cleanClosure(f)
- env <- environment(newF)
- expect_equal(length(ls(env)), 2) # y, g
- actual <- get("y", envir = env, inherits = FALSE)
- expect_equal(actual, y)
- actual <- get("g", envir = env, inherits = FALSE)
- expect_equal(actual, g)
-
- # Test for nested enclosures and package variables.
- env2 <- new.env()
- funcEnv <- new.env(parent = env2)
- f <- function(x) { log(g(x) + y) }
- environment(f) <- funcEnv # enclosing relationship: f -> funcEnv -> env2 -> .GlobalEnv
- newF <- cleanClosure(f)
- env <- environment(newF)
- expect_equal(length(ls(env)), 2) # "min" should not be included
- actual <- get("y", envir = env, inherits = FALSE)
- expect_equal(actual, y)
- actual <- get("g", envir = env, inherits = FALSE)
- expect_equal(actual, g)
-
- base <- c(1, 2, 3)
- l <- list(field = matrix(1))
- field <- matrix(2)
- defUse <- 3
- g <- function(x) { x + y }
- f <- function(x) {
- defUse <- base::as.integer(x) + 1 # Test for access operators `::`.
- lapply(x, g) + 1 # Test for capturing function call "g"'s closure as an argument of lapply.
- l$field[1, 1] <- 3 # Test for access operators `$`.
- res <- defUse + l$field[1, ] # Test for def-use chain of "defUse", and "" symbol.
- f(res) # Test for recursive calls.
- }
- newF <- cleanClosure(f)
- env <- environment(newF)
- # TODO(shivaram): length(ls(env)) is 4 here for some reason and `lapply` is included in `env`.
- # Disabling this test till we debug this.
- #
- # nolint start
- # expect_equal(length(ls(env)), 3) # Only "g", "l" and "f". No "base", "field" or "defUse".
- # nolint end
- expect_true("g" %in% ls(env))
- expect_true("l" %in% ls(env))
- expect_true("f" %in% ls(env))
- expect_equal(get("l", envir = env, inherits = FALSE), l)
- # "y" should be in the environemnt of g.
- newG <- get("g", envir = env, inherits = FALSE)
- env <- environment(newG)
- expect_equal(length(ls(env)), 1)
- actual <- get("y", envir = env, inherits = FALSE)
- expect_equal(actual, y)
-
- # Test for function (and variable) definitions.
- f <- function(x) {
- g <- function(y) { y * 2 }
- g(x)
- }
- newF <- cleanClosure(f)
- env <- environment(newF)
- expect_equal(length(ls(env)), 0) # "y" and "g" should not be included.
-
- # Test for overriding variables in base namespace (Issue: SparkR-196).
- nums <- as.list(1:10)
- rdd <- parallelize(sc, nums, 2L)
- t <- 4 # Override base::t in .GlobalEnv.
- f <- function(x) { x > t }
- newF <- cleanClosure(f)
- env <- environment(newF)
- expect_equal(ls(env), "t")
- expect_equal(get("t", envir = env, inherits = FALSE), t)
- actual <- collectRDD(lapply(rdd, f))
- expected <- as.list(c(rep(FALSE, 4), rep(TRUE, 6)))
- expect_equal(actual, expected)
-
- # Test for broadcast variables.
- a <- matrix(nrow = 10, ncol = 10, data = rnorm(100))
- aBroadcast <- broadcastRDD(sc, a)
- normMultiply <- function(x) { norm(aBroadcast$value) * x }
- newnormMultiply <- SparkR:::cleanClosure(normMultiply)
- env <- environment(newnormMultiply)
- expect_equal(ls(env), "aBroadcast")
- expect_equal(get("aBroadcast", envir = env, inherits = FALSE), aBroadcast)
-})
-
-test_that("varargsToJProperties", {
- jprops <- newJObject("java.util.Properties")
- expect_true(class(jprops) == "jobj")
-
- jprops <- varargsToJProperties(abc = "123")
- expect_true(class(jprops) == "jobj")
- expect_equal(callJMethod(jprops, "getProperty", "abc"), "123")
-
- jprops <- varargsToJProperties(abc = "abc", b = 1)
- expect_equal(callJMethod(jprops, "getProperty", "abc"), "abc")
- expect_equal(callJMethod(jprops, "getProperty", "b"), "1")
-
- jprops <- varargsToJProperties()
- expect_equal(callJMethod(jprops, "size"), 0L)
-})
-
-test_that("convertToJSaveMode", {
- s <- convertToJSaveMode("error")
- expect_true(class(s) == "jobj")
- expect_match(capture.output(print.jobj(s)), "Java ref type org.apache.spark.sql.SaveMode id ")
- expect_error(convertToJSaveMode("foo"),
- 'mode should be one of "append", "overwrite", "error", "ignore"') #nolint
-})
-
-test_that("captureJVMException", {
- skip_on_cran()
-
- method <- "createStructField"
- expect_error(tryCatch(callJStatic("org.apache.spark.sql.api.r.SQLUtils", method,
- "col", "unknown", TRUE),
- error = function(e) {
- captureJVMException(e, method)
- }),
- "parse error - .*DataType unknown.*not supported.")
-})
-
-test_that("hashCode", {
- skip_on_cran()
-
- expect_error(hashCode("bc53d3605e8a5b7de1e8e271c2317645"), NA)
-})
-
-test_that("overrideEnvs", {
- config <- new.env()
- config[["spark.master"]] <- "foo"
- config[["config_only"]] <- "ok"
- param <- new.env()
- param[["spark.master"]] <- "local"
- param[["param_only"]] <- "blah"
- overrideEnvs(config, param)
- expect_equal(config[["spark.master"]], "local")
- expect_equal(config[["param_only"]], "blah")
- expect_equal(config[["config_only"]], "ok")
-})
-
-test_that("rbindRaws", {
-
- # Mixed Column types
- r <- serialize(1:5, connection = NULL)
- r1 <- serialize(1, connection = NULL)
- r2 <- serialize(letters, connection = NULL)
- r3 <- serialize(1:10, connection = NULL)
- inputData <- list(list(1L, r1, "a", r), list(2L, r2, "b", r),
- list(3L, r3, "c", r))
- expected <- data.frame(V1 = 1:3)
- expected$V2 <- list(r1, r2, r3)
- expected$V3 <- c("a", "b", "c")
- expected$V4 <- list(r, r, r)
- result <- rbindRaws(inputData)
- expect_equal(expected, result)
-
- # Single binary column
- input <- list(list(r1), list(r2), list(r3))
- expected <- subset(expected, select = "V2")
- result <- setNames(rbindRaws(input), "V2")
- expect_equal(expected, result)
-
-})
-
-test_that("varargsToStrEnv", {
- strenv <- varargsToStrEnv(a = 1, b = 1.1, c = TRUE, d = "abcd")
- env <- varargsToEnv(a = "1", b = "1.1", c = "true", d = "abcd")
- expect_equal(strenv, env)
- expect_error(varargsToStrEnv(a = list(1, "a")),
- paste0("Unsupported type for a : list. Supported types are logical, ",
- "numeric, character and NULL."))
- expect_warning(varargsToStrEnv(a = 1, 2, 3, 4), "Unnamed arguments ignored: 2, 3, 4.")
- expect_warning(varargsToStrEnv(1, 2, 3, 4), "Unnamed arguments ignored: 1, 2, 3, 4.")
-})
-
-test_that("basenameSansExtFromUrl", {
- x <- paste0("http://people.apache.org/~pwendell/spark-nightly/spark-branch-2.1-bin/spark-2.1.1-",
- "SNAPSHOT-2016_12_09_11_08-eb2d9bf-bin/spark-2.1.1-SNAPSHOT-bin-hadoop2.7.tgz")
- expect_equal(basenameSansExtFromUrl(x), "spark-2.1.1-SNAPSHOT-bin-hadoop2.7")
- z <- "http://people.apache.org/~pwendell/spark-releases/spark-2.1.0--hive.tar.gz"
- expect_equal(basenameSansExtFromUrl(z), "spark-2.1.0--hive")
-})
-
-sparkR.session.stop()
-
-message("--- End test (utils) ", as.POSIXct(Sys.time(), tz = "GMT"))
-message("elapsed ", (proc.time() - timer_ptm)[3])
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/jarTest.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/jarTest.R b/R/pkg/tests/fulltests/jarTest.R
new file mode 100644
index 0000000..e2241e0
--- /dev/null
+++ b/R/pkg/tests/fulltests/jarTest.R
@@ -0,0 +1,32 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+library(SparkR)
+
+sc <- sparkR.session(master = "local[1]")
+
+helloTest <- SparkR:::callJStatic("sparkrtest.DummyClass",
+ "helloWorld",
+ "Dave")
+stopifnot(identical(helloTest, "Hello Dave"))
+
+basicFunction <- SparkR:::callJStatic("sparkrtest.DummyClass",
+ "addStuff",
+ 2L,
+ 2L)
+stopifnot(basicFunction == 4L)
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/packageInAJarTest.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/packageInAJarTest.R b/R/pkg/tests/fulltests/packageInAJarTest.R
new file mode 100644
index 0000000..ac70626
--- /dev/null
+++ b/R/pkg/tests/fulltests/packageInAJarTest.R
@@ -0,0 +1,30 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+library(SparkR)
+library(sparkPackageTest)
+
+sparkR.session(master = "local[1]")
+
+run1 <- myfunc(5L)
+
+run2 <- myfunc(-4L)
+
+sparkR.session.stop()
+
+if (run1 != 6) quit(save = "no", status = 1)
+
+if (run2 != -3) quit(save = "no", status = 1)
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_Serde.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_Serde.R b/R/pkg/tests/fulltests/test_Serde.R
new file mode 100644
index 0000000..6e160fa
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_Serde.R
@@ -0,0 +1,85 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("SerDe functionality")
+
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+test_that("SerDe of primitive types", {
+ skip_on_cran()
+
+ x <- callJStatic("SparkRHandler", "echo", 1L)
+ expect_equal(x, 1L)
+ expect_equal(class(x), "integer")
+
+ x <- callJStatic("SparkRHandler", "echo", 1)
+ expect_equal(x, 1)
+ expect_equal(class(x), "numeric")
+
+ x <- callJStatic("SparkRHandler", "echo", TRUE)
+ expect_true(x)
+ expect_equal(class(x), "logical")
+
+ x <- callJStatic("SparkRHandler", "echo", "abc")
+ expect_equal(x, "abc")
+ expect_equal(class(x), "character")
+})
+
+test_that("SerDe of list of primitive types", {
+ skip_on_cran()
+
+ x <- list(1L, 2L, 3L)
+ y <- callJStatic("SparkRHandler", "echo", x)
+ expect_equal(x, y)
+ expect_equal(class(y[[1]]), "integer")
+
+ x <- list(1, 2, 3)
+ y <- callJStatic("SparkRHandler", "echo", x)
+ expect_equal(x, y)
+ expect_equal(class(y[[1]]), "numeric")
+
+ x <- list(TRUE, FALSE)
+ y <- callJStatic("SparkRHandler", "echo", x)
+ expect_equal(x, y)
+ expect_equal(class(y[[1]]), "logical")
+
+ x <- list("a", "b", "c")
+ y <- callJStatic("SparkRHandler", "echo", x)
+ expect_equal(x, y)
+ expect_equal(class(y[[1]]), "character")
+
+ # Empty list
+ x <- list()
+ y <- callJStatic("SparkRHandler", "echo", x)
+ expect_equal(x, y)
+})
+
+test_that("SerDe of list of lists", {
+ skip_on_cran()
+
+ x <- list(list(1L, 2L, 3L), list(1, 2, 3),
+ list(TRUE, FALSE), list("a", "b", "c"))
+ y <- callJStatic("SparkRHandler", "echo", x)
+ expect_equal(x, y)
+
+ # List of empty lists
+ x <- list(list(), list())
+ y <- callJStatic("SparkRHandler", "echo", x)
+ expect_equal(x, y)
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_Windows.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_Windows.R b/R/pkg/tests/fulltests/test_Windows.R
new file mode 100644
index 0000000..00d684e
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_Windows.R
@@ -0,0 +1,32 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+context("Windows-specific tests")
+
+test_that("sparkJars tag in SparkContext", {
+ skip_on_cran()
+
+ if (.Platform$OS.type != "windows") {
+ skip("This test is only for Windows, skipped")
+ }
+
+ testOutput <- launchScript("ECHO", "a/b/c", wait = TRUE)
+ abcPath <- testOutput[1]
+ expect_equal(abcPath, "a\\b\\c")
+})
+
+message("--- End test (Windows) ", as.POSIXct(Sys.time(), tz = "GMT"))
+message("elapsed ", (proc.time() - timer_ptm)[3])
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_binaryFile.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_binaryFile.R b/R/pkg/tests/fulltests/test_binaryFile.R
new file mode 100644
index 0000000..00954fa
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_binaryFile.R
@@ -0,0 +1,100 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("functions on binary files")
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+mockFile <- c("Spark is pretty.", "Spark is awesome.")
+
+test_that("saveAsObjectFile()/objectFile() following textFile() works", {
+ skip_on_cran()
+
+ fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName1)
+
+ rdd <- textFile(sc, fileName1, 1)
+ saveAsObjectFile(rdd, fileName2)
+ rdd <- objectFile(sc, fileName2)
+ expect_equal(collectRDD(rdd), as.list(mockFile))
+
+ unlink(fileName1)
+ unlink(fileName2, recursive = TRUE)
+})
+
+test_that("saveAsObjectFile()/objectFile() works on a parallelized list", {
+ skip_on_cran()
+
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+
+ l <- list(1, 2, 3)
+ rdd <- parallelize(sc, l, 1)
+ saveAsObjectFile(rdd, fileName)
+ rdd <- objectFile(sc, fileName)
+ expect_equal(collectRDD(rdd), l)
+
+ unlink(fileName, recursive = TRUE)
+})
+
+test_that("saveAsObjectFile()/objectFile() following RDD transformations works", {
+ skip_on_cran()
+
+ fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName1)
+
+ rdd <- textFile(sc, fileName1)
+
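+ # word count: split each line into words, map each word to (word, 1L), then sum counts per word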
+ words <- flatMap(rdd, function(line) { strsplit(line, " ")[[1]] })
+ wordCount <- lapply(words, function(word) { list(word, 1L) })
+
+ counts <- reduceByKey(wordCount, "+", 2L)
+
+ saveAsObjectFile(counts, fileName2)
+ counts <- objectFile(sc, fileName2)
+
+ output <- collectRDD(counts)
+ expected <- list(list("awesome.", 1), list("Spark", 2), list("pretty.", 1),
+ list("is", 2))
+ expect_equal(sortKeyValueList(output), sortKeyValueList(expected))
+
+ unlink(fileName1)
+ unlink(fileName2, recursive = TRUE)
+})
+
+test_that("saveAsObjectFile()/objectFile() works with multiple paths", {
+ skip_on_cran()
+
+ fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+
+ rdd1 <- parallelize(sc, "Spark is pretty.")
+ saveAsObjectFile(rdd1, fileName1)
+ rdd2 <- parallelize(sc, "Spark is awesome.")
+ saveAsObjectFile(rdd2, fileName2)
+
+ rdd <- objectFile(sc, c(fileName1, fileName2))
+ expect_equal(countRDD(rdd), 2)
+
+ unlink(fileName1, recursive = TRUE)
+ unlink(fileName2, recursive = TRUE)
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_binary_function.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_binary_function.R b/R/pkg/tests/fulltests/test_binary_function.R
new file mode 100644
index 0000000..236cb38
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_binary_function.R
@@ -0,0 +1,110 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("binary functions")
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+# Data
+nums <- 1:10
+rdd <- parallelize(sc, nums, 2L)
+
+# File content
+mockFile <- c("Spark is pretty.", "Spark is awesome.")
+
+test_that("union on two RDDs", {
+ skip_on_cran()
+
+ actual <- collectRDD(unionRDD(rdd, rdd))
+ expect_equal(actual, as.list(rep(nums, 2)))
+
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName)
+
+ text.rdd <- textFile(sc, fileName)
+ union.rdd <- unionRDD(rdd, text.rdd)
+ actual <- collectRDD(union.rdd)
+ expect_equal(actual, c(as.list(nums), mockFile))
+ expect_equal(getSerializedMode(union.rdd), "byte")
+
+ rdd <- map(text.rdd, function(x) {x})
+ union.rdd <- unionRDD(rdd, text.rdd)
+ actual <- collectRDD(union.rdd)
+ expect_equal(actual, as.list(c(mockFile, mockFile)))
+ expect_equal(getSerializedMode(union.rdd), "byte")
+
+ unlink(fileName)
+})
+
+test_that("cogroup on two RDDs", {
+ skip_on_cran()
+
+ rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4)))
+ rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3)))
+ cogroup.rdd <- cogroup(rdd1, rdd2, numPartitions = 2L)
+ actual <- collectRDD(cogroup.rdd)
+ expect_equal(actual,
+ list(list(1, list(list(1), list(2, 3))), list(2, list(list(4), list()))))
+
+ rdd1 <- parallelize(sc, list(list("a", 1), list("a", 4)))
+ rdd2 <- parallelize(sc, list(list("b", 2), list("a", 3)))
+ cogroup.rdd <- cogroup(rdd1, rdd2, numPartitions = 2L)
+ actual <- collectRDD(cogroup.rdd)
+
+ expected <- list(list("b", list(list(), list(2))), list("a", list(list(1, 4), list(3))))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(expected))
+})
+
+test_that("zipPartitions() on RDDs", {
+ skip_on_cran()
+
+ rdd1 <- parallelize(sc, 1:2, 2L) # 1, 2
+ rdd2 <- parallelize(sc, 1:4, 2L) # 1:2, 3:4
+ rdd3 <- parallelize(sc, 1:6, 2L) # 1:3, 4:6
+ actual <- collectRDD(zipPartitions(rdd1, rdd2, rdd3,
+ func = function(x, y, z) { list(list(x, y, z))} ))
+ expect_equal(actual,
+ list(list(1, c(1, 2), c(1, 2, 3)), list(2, c(3, 4), c(4, 5, 6))))
+
+ mockFile <- c("Spark is pretty.", "Spark is awesome.")
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName)
+
+ rdd <- textFile(sc, fileName, 1)
+ actual <- collectRDD(zipPartitions(rdd, rdd,
+ func = function(x, y) { list(paste(x, y, sep = "\n")) }))
+ expected <- list(paste(mockFile, mockFile, sep = "\n"))
+ expect_equal(actual, expected)
+
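+ # both RDDs below have a single partition, so zipPartitions pairs their full contents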
+ rdd1 <- parallelize(sc, 0:1, 1)
+ actual <- collectRDD(zipPartitions(rdd1, rdd,
+ func = function(x, y) { list(x + nchar(y)) }))
+ expected <- list(0:1 + nchar(mockFile))
+ expect_equal(actual, expected)
+
+ rdd <- map(rdd, function(x) { x })
+ actual <- collectRDD(zipPartitions(rdd, rdd1,
+ func = function(x, y) { list(y + nchar(x)) }))
+ expect_equal(actual, expected)
+
+ unlink(fileName)
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_broadcast.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_broadcast.R b/R/pkg/tests/fulltests/test_broadcast.R
new file mode 100644
index 0000000..2c96740
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_broadcast.R
@@ -0,0 +1,55 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("broadcast variables")
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+# Partitioned data
+nums <- 1:2
+rrdd <- parallelize(sc, nums, 2L)
+
+test_that("using broadcast variable", {
+ skip_on_cran()
+
+ randomMat <- matrix(nrow = 10, ncol = 10, data = rnorm(100))
+ randomMatBr <- broadcastRDD(sc, randomMat)
+
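+ # value() reads the broadcast value inside the worker-side closure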
+ useBroadcast <- function(x) {
+ sum(SparkR:::value(randomMatBr) * x)
+ }
+ actual <- collectRDD(lapply(rrdd, useBroadcast))
+ expected <- list(sum(randomMat) * 1, sum(randomMat) * 2)
+ expect_equal(actual, expected)
+})
+
+test_that("without using broadcast variable", {
+ skip_on_cran()
+
+ randomMat <- matrix(nrow = 10, ncol = 10, data = rnorm(100))
+
+ useBroadcast <- function(x) {
+ sum(randomMat * x)
+ }
+ actual <- collectRDD(lapply(rrdd, useBroadcast))
+ expected <- list(sum(randomMat) * 1, sum(randomMat) * 2)
+ expect_equal(actual, expected)
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_client.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_client.R b/R/pkg/tests/fulltests/test_client.R
new file mode 100644
index 0000000..3d53beb
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_client.R
@@ -0,0 +1,51 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("functions in client.R")
+
+test_that("adding spark-testing-base as a package works", {
+ skip_on_cran()
+
+ args <- generateSparkSubmitArgs("", "", "", "",
+ "holdenk:spark-testing-base:1.3.0_0.0.5")
+ expect_equal(gsub("[[:space:]]", "", args),
+ gsub("[[:space:]]", "",
+ "--packages holdenk:spark-testing-base:1.3.0_0.0.5"))
+})
+
+test_that("no package specified doesn't add packages flag", {
+ skip_on_cran()
+
+ args <- generateSparkSubmitArgs("", "", "", "", "")
+ expect_equal(gsub("[[:space:]]", "", args),
+ "")
+})
+
+test_that("multiple packages don't produce a warning", {
+ skip_on_cran()
+
+ expect_warning(generateSparkSubmitArgs("", "", "", "", c("A", "B")), NA)
+})
+
+test_that("sparkJars sparkPackages as character vectors", {
+ skip_on_cran()
+
+ args <- generateSparkSubmitArgs("", "", c("one.jar", "two.jar", "three.jar"), "",
+ c("com.databricks:spark-avro_2.10:2.0.1"))
+ expect_match(args, "--jars one.jar,two.jar,three.jar")
+ expect_match(args, "--packages com.databricks:spark-avro_2.10:2.0.1")
+})
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_context.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_context.R b/R/pkg/tests/fulltests/test_context.R
new file mode 100644
index 0000000..f6d9f54
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_context.R
@@ -0,0 +1,226 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("test functions in sparkR.R")
+
+test_that("Check masked functions", {
+ skip_on_cran()
+
+ # Check that we are not masking any new function from base, stats, testthat unexpectedly
+ # NOTE: We should avoid adding entries to *namesOfMaskedCompletely* as masked functions make it
+ # hard for users to use base R functions. Please check when in doubt.
+ namesOfMaskedCompletely <- c("cov", "filter", "sample", "not")
+ namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var",
+ "colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset",
+ "summary", "transform", "drop", "window", "as.data.frame", "union", "not")
+ if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) {
+ namesOfMasked <- c("endsWith", "startsWith", namesOfMasked)
+ }
+ masked <- conflicts(detail = TRUE)$`package:SparkR`
+ expect_true("describe" %in% masked) # only when with testthat..
+ func <- lapply(masked, function(x) { capture.output(showMethods(x))[[1]] })
+ funcSparkROrEmpty <- grepl("\\(package SparkR\\)$|^$", func)
+ maskedBySparkR <- masked[funcSparkROrEmpty]
+ expect_equal(length(maskedBySparkR), length(namesOfMasked))
+ # make the 2 lists the same length so expect_equal will print their content
+ l <- max(length(maskedBySparkR), length(namesOfMasked))
+ length(maskedBySparkR) <- l
+ length(namesOfMasked) <- l
+ expect_equal(sort(maskedBySparkR, na.last = TRUE), sort(namesOfMasked, na.last = TRUE))
+ # above are those reported as masked when `library(SparkR)`
+ # note that many of these methods are still callable without base:: or stats:: prefix
+ # there should be a test for each of these, except the following, which are currently "broken"
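+ # generics that keep an "ANY" method still dispatch on plain R objects; names without one are masked completely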
+ funcHasAny <- unlist(lapply(masked, function(x) {
+ any(grepl("=\"ANY\"", capture.output(showMethods(x)[-1])))
+ }))
+ maskedCompletely <- masked[!funcHasAny]
+ expect_equal(length(maskedCompletely), length(namesOfMaskedCompletely))
+ l <- max(length(maskedCompletely), length(namesOfMaskedCompletely))
+ length(maskedCompletely) <- l
+ length(namesOfMaskedCompletely) <- l
+ expect_equal(sort(maskedCompletely, na.last = TRUE),
+ sort(namesOfMaskedCompletely, na.last = TRUE))
+})
+
+test_that("repeatedly starting and stopping SparkR", {
+ skip_on_cran()
+
+ for (i in 1:4) {
+ sc <- suppressWarnings(sparkR.init(master = sparkRTestMaster))
+ rdd <- parallelize(sc, 1:20, 2L)
+ expect_equal(countRDD(rdd), 20)
+ suppressWarnings(sparkR.stop())
+ }
+})
+
+test_that("repeatedly starting and stopping SparkSession", {
+ for (i in 1:4) {
+ sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+ df <- createDataFrame(data.frame(dummy = 1:i))
+ expect_equal(count(df), i)
+ sparkR.session.stop()
+ }
+})
+
+test_that("rdd GC across sparkR.stop", {
+ skip_on_cran()
+
+ sc <- sparkR.sparkContext(master = sparkRTestMaster) # sc should get id 0
+ rdd1 <- parallelize(sc, 1:20, 2L) # rdd1 should get id 1
+ rdd2 <- parallelize(sc, 1:10, 2L) # rdd2 should get id 2
+ sparkR.session.stop()
+
+ sc <- sparkR.sparkContext(master = sparkRTestMaster) # sc should get id 0 again
+
+ # GC rdd1 before creating rdd3 and rdd2 after
+ rm(rdd1)
+ gc()
+
+ rdd3 <- parallelize(sc, 1:20, 2L) # rdd3 should get id 1 now
+ rdd4 <- parallelize(sc, 1:10, 2L) # rdd4 should get id 2 now
+
+ rm(rdd2)
+ gc()
+
+ countRDD(rdd3)
+ countRDD(rdd4)
+ sparkR.session.stop()
+})
+
+test_that("job group functions can be called", {
+ skip_on_cran()
+
+ sc <- sparkR.sparkContext(master = sparkRTestMaster)
+ setJobGroup("groupId", "job description", TRUE)
+ cancelJobGroup("groupId")
+ clearJobGroup()
+
+ suppressWarnings(setJobGroup(sc, "groupId", "job description", TRUE))
+ suppressWarnings(cancelJobGroup(sc, "groupId"))
+ suppressWarnings(clearJobGroup(sc))
+ sparkR.session.stop()
+})
+
+test_that("utility function can be called", {
+ skip_on_cran()
+
+ sparkR.sparkContext(master = sparkRTestMaster)
+ setLogLevel("ERROR")
+ sparkR.session.stop()
+})
+
+test_that("getClientModeSparkSubmitOpts() returns spark-submit args from whitelist", {
+ skip_on_cran()
+
+ e <- new.env()
+ e[["spark.driver.memory"]] <- "512m"
+ ops <- getClientModeSparkSubmitOpts("sparkrmain", e)
+ expect_equal("--driver-memory \"512m\" sparkrmain", ops)
+
+ e[["spark.driver.memory"]] <- "5g"
+ e[["spark.driver.extraClassPath"]] <- "/opt/class_path" # nolint
+ e[["spark.driver.extraJavaOptions"]] <- "-XX:+UseCompressedOops -XX:+UseCompressedStrings"
+ e[["spark.driver.extraLibraryPath"]] <- "/usr/local/hadoop/lib" # nolint
+ e[["random"]] <- "skipthis"
+ ops2 <- getClientModeSparkSubmitOpts("sparkr-shell", e)
+ # nolint start
+ expect_equal(ops2, paste0("--driver-class-path \"/opt/class_path\" --driver-java-options \"",
+ "-XX:+UseCompressedOops -XX:+UseCompressedStrings\" --driver-library-path \"",
+ "/usr/local/hadoop/lib\" --driver-memory \"5g\" sparkr-shell"))
+ # nolint end
+
+ e[["spark.driver.extraClassPath"]] <- "/" # too short
+ ops3 <- getClientModeSparkSubmitOpts("--driver-memory 4g sparkr-shell2", e)
+ # nolint start
+ expect_equal(ops3, paste0("--driver-java-options \"-XX:+UseCompressedOops ",
+ "-XX:+UseCompressedStrings\" --driver-library-path \"/usr/local/hadoop/lib\"",
+ " --driver-memory 4g sparkr-shell2"))
+ # nolint end
+})
+
+test_that("sparkJars sparkPackages as comma-separated strings", {
+ skip_on_cran()
+
+ expect_warning(processSparkJars(" a, b "))
+ jars <- suppressWarnings(processSparkJars(" a, b "))
+ expect_equal(lapply(jars, basename), list("a", "b"))
+
+ jars <- suppressWarnings(processSparkJars(" abc ,, def "))
+ expect_equal(lapply(jars, basename), list("abc", "def"))
+
+ jars <- suppressWarnings(processSparkJars(c(" abc ,, def ", "", "xyz", " ", "a,b")))
+ expect_equal(lapply(jars, basename), list("abc", "def", "xyz", "a", "b"))
+
+ p <- processSparkPackages(c("ghi", "lmn"))
+ expect_equal(p, c("ghi", "lmn"))
+
+ # check normalizePath
+ f <- dir()[[1]]
+ expect_warning(processSparkJars(f), NA)
+ expect_match(processSparkJars(f), f)
+})
+
+test_that("spark.lapply should perform simple transforms", {
+ sparkR.sparkContext(master = sparkRTestMaster)
+ doubled <- spark.lapply(1:10, function(x) { 2 * x })
+ expect_equal(doubled, as.list(2 * 1:10))
+ sparkR.session.stop()
+})
+
+test_that("add and get file to be downloaded with Spark job on every node", {
+ skip_on_cran()
+
+ sparkR.sparkContext(master = sparkRTestMaster)
+ # Test add file.
+ path <- tempfile(pattern = "hello", fileext = ".txt")
+ filename <- basename(path)
+ words <- "Hello World!"
+ writeLines(words, path)
+ spark.addFile(path)
+ download_path <- spark.getSparkFiles(filename)
+ expect_equal(readLines(download_path), words)
+
+ # Test spark.getSparkFiles works well on executors.
+ seq <- seq(from = 1, to = 10, length.out = 5)
+ f <- function(seq) { spark.getSparkFiles(filename) }
+ results <- spark.lapply(seq, f)
+ for (i in 1:5) { expect_equal(basename(results[[i]]), filename) }
+
+ unlink(path)
+
+ # Test add directory recursively.
+ path <- paste0(tempdir(), "/", "recursive_dir")
+ dir.create(path)
+ dir_name <- basename(path)
+ path1 <- paste0(path, "/", "hello.txt")
+ file.create(path1)
+ sub_path <- paste0(path, "/", "sub_hello")
+ dir.create(sub_path)
+ path2 <- paste0(sub_path, "/", "sub_hello.txt")
+ file.create(path2)
+ words <- "Hello World!"
+ sub_words <- "Sub Hello World!"
+ writeLines(words, path1)
+ writeLines(sub_words, path2)
+ spark.addFile(path, recursive = TRUE)
+ download_path1 <- spark.getSparkFiles(paste0(dir_name, "/", "hello.txt"))
+ expect_equal(readLines(download_path1), words)
+ download_path2 <- spark.getSparkFiles(paste0(dir_name, "/", "sub_hello/sub_hello.txt"))
+ expect_equal(readLines(download_path2), sub_words)
+ unlink(path, recursive = TRUE)
+ sparkR.session.stop()
+})
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_includePackage.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_includePackage.R b/R/pkg/tests/fulltests/test_includePackage.R
new file mode 100644
index 0000000..d7d9eee
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_includePackage.R
@@ -0,0 +1,64 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("include R packages")
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+# Partitioned data
+nums <- 1:2
+rdd <- parallelize(sc, nums, 2L)
+
+test_that("include inside function", {
+ skip_on_cran()
+
+ # Only run the test if plyr is installed.
+ if ("plyr" %in% rownames(installed.packages())) {
+ suppressPackageStartupMessages(library(plyr))
+ generateData <- function(x) {
+ suppressPackageStartupMessages(library(plyr))
+ attach(airquality)
+ result <- transform(Ozone, logOzone = log(Ozone))
+ result
+ }
+
+ data <- lapplyPartition(rdd, generateData)
+ actual <- collectRDD(data)
+ }
+})
+
+test_that("use include package", {
+ skip_on_cran()
+
+ # Only run the test if plyr is installed.
+ if ("plyr" %in% rownames(installed.packages())) {
+ suppressPackageStartupMessages(library(plyr))
+ generateData <- function(x) {
+ attach(airquality)
+ result <- transform(Ozone, logOzone = log(Ozone))
+ result
+ }
+
+ includePackage(sc, plyr)
+ data <- lapplyPartition(rdd, generateData)
+ actual <- collectRDD(data)
+ }
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_jvm_api.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_jvm_api.R b/R/pkg/tests/fulltests/test_jvm_api.R
new file mode 100644
index 0000000..8b3b4f7
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_jvm_api.R
@@ -0,0 +1,36 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("JVM API")
+
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+test_that("Create and call methods on object", {
+ jarr <- sparkR.newJObject("java.util.ArrayList")
+ # Add an element to the array
+ sparkR.callJMethod(jarr, "add", 1L)
+ # Check if get returns the same element
+ expect_equal(sparkR.callJMethod(jarr, "get", 0L), 1L)
+})
+
+test_that("Call static methods", {
+ # Convert a boolean to a string
+ strTrue <- sparkR.callJStatic("java.lang.String", "valueOf", TRUE)
+ expect_equal(strTrue, "true")
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_mllib_classification.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_mllib_classification.R b/R/pkg/tests/fulltests/test_mllib_classification.R
new file mode 100644
index 0000000..82e588d
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_mllib_classification.R
@@ -0,0 +1,396 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("MLlib classification algorithms, except for tree-based algorithms")
+
+# Tests for MLlib classification algorithms in SparkR
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+absoluteSparkPath <- function(x) {
+ sparkHome <- sparkR.conf("spark.home")
+ file.path(sparkHome, x)
+}
+
+test_that("spark.svmLinear", {
+ skip_on_cran()
+
+ df <- suppressWarnings(createDataFrame(iris))
+ training <- df[df$Species %in% c("versicolor", "virginica"), ]
+ model <- spark.svmLinear(training, Species ~ ., regParam = 0.01, maxIter = 10)
+ summary <- summary(model)
+
+ # test summary coefficients return matrix type
+ expect_true(class(summary$coefficients) == "matrix")
+ expect_true(class(summary$coefficients[, 1]) == "numeric")
+
+ coefs <- summary$coefficients[, "Estimate"]
+ expected_coefs <- c(-0.06004978, -0.1563083, -0.460648, 0.2276626, 1.055085)
+ expect_true(all(abs(coefs - expected_coefs) < 0.1))
+
+ # Test prediction with string label
+ prediction <- predict(model, training)
+ expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character")
+ expected <- c("versicolor", "versicolor", "versicolor", "virginica", "virginica",
+ "virginica", "virginica", "virginica", "virginica", "virginica")
+ expect_equal(sort(as.list(take(select(prediction, "prediction"), 10))[[1]]), expected)
+
+ # Test model save and load
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-svm-linear", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ coefs <- summary(model)$coefficients
+ coefs2 <- summary(model2)$coefficients
+ expect_equal(coefs, coefs2)
+ unlink(modelPath)
+ }
+
+ # Test prediction with numeric label
+ label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
+ feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
+ data <- as.data.frame(cbind(label, feature))
+ df <- createDataFrame(data)
+ model <- spark.svmLinear(df, label ~ feature, regParam = 0.1)
+ prediction <- collect(select(predict(model, df), "prediction"))
+ expect_equal(sort(prediction$prediction), c("0.0", "0.0", "0.0", "1.0", "1.0"))
+
+})
+
+test_that("spark.logit", {
+ # R code to reproduce the result.
+ # nolint start
+ #' library(glmnet)
+ #' iris.x = as.matrix(iris[, 1:4])
+ #' iris.y = as.factor(as.character(iris[, 5]))
+ #' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5)
+ #' coef(logit)
+ #
+ # $setosa
+ # 5 x 1 sparse Matrix of class "dgCMatrix"
+ # s0
+ # 1.0981324
+ # Sepal.Length -0.2909860
+ # Sepal.Width 0.5510907
+ # Petal.Length -0.1915217
+ # Petal.Width -0.4211946
+ #
+ # $versicolor
+ # 5 x 1 sparse Matrix of class "dgCMatrix"
+ # s0
+ # 1.520061e+00
+ # Sepal.Length 2.524501e-02
+ # Sepal.Width -5.310313e-01
+ # Petal.Length 3.656543e-02
+ # Petal.Width -3.144464e-05
+ #
+ # $virginica
+ # 5 x 1 sparse Matrix of class "dgCMatrix"
+ # s0
+ # -2.61819385
+ # Sepal.Length 0.26574097
+ # Sepal.Width -0.02005932
+ # Petal.Length 0.15495629
+ # Petal.Width 0.42122607
+ # nolint end
+
+ # Test multinomial logistic regression against three classes
+ df <- suppressWarnings(createDataFrame(iris))
+ model <- spark.logit(df, Species ~ ., regParam = 0.5)
+ summary <- summary(model)
+
+ # test summary coefficients return matrix type
+ expect_true(class(summary$coefficients) == "matrix")
+ expect_true(class(summary$coefficients[, 1]) == "numeric")
+
+ versicolorCoefsR <- c(1.52, 0.03, -0.53, 0.04, 0.00)
+ virginicaCoefsR <- c(-2.62, 0.27, -0.02, 0.16, 0.42)
+ setosaCoefsR <- c(1.10, -0.29, 0.55, -0.19, -0.42)
+ versicolorCoefs <- summary$coefficients[, "versicolor"]
+ virginicaCoefs <- summary$coefficients[, "virginica"]
+ setosaCoefs <- summary$coefficients[, "setosa"]
+ expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
+ expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
+ expect_true(all(abs(setosaCoefsR - setosaCoefs) < 0.1))
+
+ # Test model save and load
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ coefs <- summary(model)$coefficients
+ coefs2 <- summary(model2)$coefficients
+ expect_equal(coefs, coefs2)
+ unlink(modelPath)
+ }
+
+ # R code to reproduce the result.
+ # nolint start
+ #' library(glmnet)
+ #' iris2 <- iris[iris$Species %in% c("versicolor", "virginica"), ]
+ #' iris.x = as.matrix(iris2[, 1:4])
+ #' iris.y = as.factor(as.character(iris2[, 5]))
+ #' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5)
+ #' coef(logit)
+ #
+ # $versicolor
+ # 5 x 1 sparse Matrix of class "dgCMatrix"
+ # s0
+ # 3.93844796
+ # Sepal.Length -0.13538675
+ # Sepal.Width -0.02386443
+ # Petal.Length -0.35076451
+ # Petal.Width -0.77971954
+ #
+ # $virginica
+ # 5 x 1 sparse Matrix of class "dgCMatrix"
+ # s0
+ # -3.93844796
+ # Sepal.Length 0.13538675
+ # Sepal.Width 0.02386443
+ # Petal.Length 0.35076451
+ # Petal.Width 0.77971954
+ #
+ #' logit = glmnet(iris.x, iris.y, family="binomial", alpha=0, lambda=0.5)
+ #' coef(logit)
+ #
+ # 5 x 1 sparse Matrix of class "dgCMatrix"
+ # s0
+ # (Intercept) -6.0824412
+ # Sepal.Length 0.2458260
+ # Sepal.Width 0.1642093
+ # Petal.Length 0.4759487
+ # Petal.Width 1.0383948
+ #
+ # nolint end
+
+ # Test multinomial logistic regression against two classes
+ df <- suppressWarnings(createDataFrame(iris))
+ training <- df[df$Species %in% c("versicolor", "virginica"), ]
+ model <- spark.logit(training, Species ~ ., regParam = 0.5, family = "multinomial")
+ summary <- summary(model)
+ versicolorCoefsR <- c(3.94, -0.16, -0.02, -0.35, -0.78)
+ virginicaCoefsR <- c(-3.94, 0.16, -0.02, 0.35, 0.78)
+ versicolorCoefs <- summary$coefficients[, "versicolor"]
+ virginicaCoefs <- summary$coefficients[, "virginica"]
+ expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
+ expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
+
+ # Test binomial logistic regression against two classes
+ model <- spark.logit(training, Species ~ ., regParam = 0.5)
+ summary <- summary(model)
+ coefsR <- c(-6.08, 0.25, 0.16, 0.48, 1.04)
+ coefs <- summary$coefficients[, "Estimate"]
+ expect_true(all(abs(coefsR - coefs) < 0.1))
+
+ # Test prediction with string label
+ prediction <- predict(model, training)
+ expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character")
+ expected <- c("versicolor", "versicolor", "virginica", "versicolor", "versicolor",
+ "versicolor", "versicolor", "versicolor", "versicolor", "versicolor")
+ expect_equal(as.list(take(select(prediction, "prediction"), 10))[[1]], expected)
+
+ # Test prediction with numeric label
+ label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
+ feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
+ data <- as.data.frame(cbind(label, feature))
+ df <- createDataFrame(data)
+ model <- spark.logit(df, label ~ feature)
+ prediction <- collect(select(predict(model, df), "prediction"))
+ expect_equal(sort(prediction$prediction), c("0.0", "0.0", "0.0", "1.0", "1.0"))
+
+ # Test prediction with weightCol
+ weight <- c(2.0, 2.0, 2.0, 1.0, 1.0)
+ data2 <- as.data.frame(cbind(label, feature, weight))
+ df2 <- createDataFrame(data2)
+ model2 <- spark.logit(df2, label ~ feature, weightCol = "weight")
+ prediction2 <- collect(select(predict(model2, df2), "prediction"))
+ expect_equal(sort(prediction2$prediction), c("0.0", "0.0", "0.0", "0.0", "0.0"))
+})
+
+test_that("spark.mlp", {
+ skip_on_cran()
+
+ df <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
+ source = "libsvm")
+ model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 5, 4, 3),
+ solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1)
+
+ # Test summary method
+ summary <- summary(model)
+ expect_equal(summary$numOfInputs, 4)
+ expect_equal(summary$numOfOutputs, 3)
+ expect_equal(summary$layers, c(4, 5, 4, 3))
+ expect_equal(length(summary$weights), 64)
+ expect_equal(head(summary$weights, 5), list(-0.878743, 0.2154151, -1.16304, -0.6583214, 1.009825),
+ tolerance = 1e-6)
+
+ # Test predict method
+ mlpTestDF <- df
+ mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
+ expect_equal(head(mlpPredictions$prediction, 6), c("1.0", "0.0", "0.0", "0.0", "0.0", "0.0"))
+
+ # Test model save/load
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-mlp", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ summary2 <- summary(model2)
+
+ expect_equal(summary2$numOfInputs, 4)
+ expect_equal(summary2$numOfOutputs, 3)
+ expect_equal(summary2$layers, c(4, 5, 4, 3))
+ expect_equal(length(summary2$weights), 64)
+
+ unlink(modelPath)
+ }
+
+ # Test default parameter
+ model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3))
+ mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
+ expect_equal(head(mlpPredictions$prediction, 10),
+ c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
+
+ # Test illegal parameter
+ expect_error(spark.mlp(df, label ~ features, layers = NULL),
+ "layers must be a integer vector with length > 1.")
+ expect_error(spark.mlp(df, label ~ features, layers = c()),
+ "layers must be a integer vector with length > 1.")
+ expect_error(spark.mlp(df, label ~ features, layers = c(3)),
+ "layers must be a integer vector with length > 1.")
+
+ # Test random seed
+ # default seed
+ model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10)
+ mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
+ expect_equal(head(mlpPredictions$prediction, 10),
+ c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
+ # seed equals 10
+ model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10, seed = 10)
+ mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
+ expect_equal(head(mlpPredictions$prediction, 10),
+ c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
+
+ # test initialWeights
+ model <- spark.mlp(df, label ~ features, layers = c(4, 3), initialWeights =
+ c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9))
+ mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
+ expect_equal(head(mlpPredictions$prediction, 10),
+ c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
+
+ # Test formula works well
+ df <- suppressWarnings(createDataFrame(iris))
+ model <- spark.mlp(df, Species ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width,
+ layers = c(4, 3))
+ summary <- summary(model)
+ expect_equal(summary$numOfInputs, 4)
+ expect_equal(summary$numOfOutputs, 3)
+ expect_equal(summary$layers, c(4, 3))
+ expect_equal(length(summary$weights), 15)
+})
+
+test_that("spark.naiveBayes", {
+ # R code to reproduce the result.
+ # We do not support instance weights yet. So we ignore the frequencies.
+ #
+ #' library(e1071)
+ #' t <- as.data.frame(Titanic)
+ #' t1 <- t[t$Freq > 0, -5]
+ #' m <- naiveBayes(Survived ~ ., data = t1)
+ #' m
+ #' predict(m, t1)
+ #
+ # -- output of 'm'
+ #
+ # A-priori probabilities:
+ # Y
+ # No Yes
+ # 0.4166667 0.5833333
+ #
+ # Conditional probabilities:
+ # Class
+ # Y 1st 2nd 3rd Crew
+ # No 0.2000000 0.2000000 0.4000000 0.2000000
+ # Yes 0.2857143 0.2857143 0.2857143 0.1428571
+ #
+ # Sex
+ # Y Male Female
+ # No 0.5 0.5
+ # Yes 0.5 0.5
+ #
+ # Age
+ # Y Child Adult
+ # No 0.2000000 0.8000000
+ # Yes 0.4285714 0.5714286
+ #
+ # -- output of 'predict(m, t1)'
+ #
+ # Yes Yes Yes Yes No No Yes Yes No No Yes Yes Yes Yes Yes Yes Yes Yes No No Yes Yes No No
+ #
+
+ t <- as.data.frame(Titanic)
+ t1 <- t[t$Freq > 0, -5]
+ df <- suppressWarnings(createDataFrame(t1))
+ m <- spark.naiveBayes(df, Survived ~ ., smoothing = 0.0)
+ s <- summary(m)
+ expect_equal(as.double(s$apriori[1, "Yes"]), 0.5833333, tolerance = 1e-6)
+ expect_equal(sum(s$apriori), 1)
+ expect_equal(as.double(s$tables["Yes", "Age_Adult"]), 0.5714286, tolerance = 1e-6)
+ p <- collect(select(predict(m, df), "prediction"))
+ expect_equal(p$prediction, c("Yes", "Yes", "Yes", "Yes", "No", "No", "Yes", "Yes", "No", "No",
+ "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "No", "No",
+ "Yes", "Yes", "No", "No"))
+
+ # Test model save/load
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-naiveBayes", fileext = ".tmp")
+ write.ml(m, modelPath)
+ expect_error(write.ml(m, modelPath))
+ write.ml(m, modelPath, overwrite = TRUE)
+ m2 <- read.ml(modelPath)
+ s2 <- summary(m2)
+ expect_equal(s$apriori, s2$apriori)
+ expect_equal(s$tables, s2$tables)
+
+ unlink(modelPath)
+ }
+
+ # Test e1071::naiveBayes
+ if (requireNamespace("e1071", quietly = TRUE)) {
+ expect_error(m <- e1071::naiveBayes(Survived ~ ., data = t1), NA)
+ expect_equal(as.character(predict(m, t1[1, ])), "Yes")
+ }
+
+ # Test numeric response variable
+ t1$NumericSurvived <- ifelse(t1$Survived == "No", 0, 1)
+ t2 <- t1[-4]
+ df <- suppressWarnings(createDataFrame(t2))
+ m <- spark.naiveBayes(df, NumericSurvived ~ ., smoothing = 0.0)
+ s <- summary(m)
+ expect_equal(as.double(s$apriori[1, 1]), 0.5833333, tolerance = 1e-6)
+ expect_equal(sum(s$apriori), 1)
+ expect_equal(as.double(s$tables[1, "Age_Adult"]), 0.5714286, tolerance = 1e-6)
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_mllib_clustering.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_mllib_clustering.R b/R/pkg/tests/fulltests/test_mllib_clustering.R
new file mode 100644
index 0000000..e827e96
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_mllib_clustering.R
@@ -0,0 +1,328 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("MLlib clustering algorithms")
+
+# Tests for MLlib clustering algorithms in SparkR
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+absoluteSparkPath <- function(x) {
+ sparkHome <- sparkR.conf("spark.home")
+ file.path(sparkHome, x)
+}
+
+test_that("spark.bisectingKmeans", {
+ skip_on_cran()
+
+ newIris <- iris
+ newIris$Species <- NULL
+ training <- suppressWarnings(createDataFrame(newIris))
+
+ take(training, 1)
+
+ model <- spark.bisectingKmeans(data = training, ~ .)
+ sample <- take(select(predict(model, training), "prediction"), 1)
+ expect_equal(typeof(sample$prediction), "integer")
+ expect_equal(sample$prediction, 1)
+
+ # Test fitted works on Bisecting KMeans
+ fitted.model <- fitted(model)
+ expect_equal(sort(collect(distinct(select(fitted.model, "prediction")))$prediction),
+ c(0, 1, 2, 3))
+
+ # Test summary works on KMeans
+ summary.model <- summary(model)
+ cluster <- summary.model$cluster
+ k <- summary.model$k
+ expect_equal(k, 4)
+ expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction),
+ c(0, 1, 2, 3))
+
+ # Test model save/load
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-bisectingkmeans", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ summary2 <- summary(model2)
+ expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size)))
+ expect_equal(summary.model$coefficients, summary2$coefficients)
+ expect_true(!summary.model$is.loaded)
+ expect_true(summary2$is.loaded)
+
+ unlink(modelPath)
+ }
+})
+
+test_that("spark.gaussianMixture", {
+ # R code to reproduce the result.
+ # nolint start
+ #' library(mvtnorm)
+ #' set.seed(1)
+ #' a <- rmvnorm(7, c(0, 0))
+ #' b <- rmvnorm(8, c(10, 10))
+ #' data <- rbind(a, b)
+ #' model <- mvnormalmixEM(data, k = 2)
+ #' model$lambda
+ #
+ # [1] 0.4666667 0.5333333
+ #
+ #' model$mu
+ #
+ # [1] 0.11731091 -0.06192351
+ # [1] 10.363673 9.897081
+ #
+ #' model$sigma
+ #
+ # [[1]]
+ # [,1] [,2]
+ # [1,] 0.62049934 0.06880802
+ # [2,] 0.06880802 1.27431874
+ #
+ # [[2]]
+ # [,1] [,2]
+ # [1,] 0.2961543 0.160783
+ # [2,] 0.1607830 1.008878
+ #
+ #' model$loglik
+ #
+ # [1] -46.89499
+ # nolint end
+ data <- list(list(-0.6264538, 0.1836433), list(-0.8356286, 1.5952808),
+ list(0.3295078, -0.8204684), list(0.4874291, 0.7383247),
+ list(0.5757814, -0.3053884), list(1.5117812, 0.3898432),
+ list(-0.6212406, -2.2146999), list(11.1249309, 9.9550664),
+ list(9.9838097, 10.9438362), list(10.8212212, 10.5939013),
+ list(10.9189774, 10.7821363), list(10.0745650, 8.0106483),
+ list(10.6198257, 9.9438713), list(9.8442045, 8.5292476),
+ list(9.5218499, 10.4179416))
+ df <- createDataFrame(data, c("x1", "x2"))
+ model <- spark.gaussianMixture(df, ~ x1 + x2, k = 2)
+ stats <- summary(model)
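+ # Expected values taken from the mixtools reference run documented above.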
+ rLambda <- c(0.4666667, 0.5333333)
+ rMu <- c(0.11731091, -0.06192351, 10.363673, 9.897081)
+ rSigma <- c(0.62049934, 0.06880802, 0.06880802, 1.27431874,
+ 0.2961543, 0.160783, 0.1607830, 1.008878)
+ rLoglik <- -46.89499
+ expect_equal(stats$lambda, rLambda, tolerance = 1e-3)
+ expect_equal(unlist(stats$mu), rMu, tolerance = 1e-3)
+ expect_equal(unlist(stats$sigma), rSigma, tolerance = 1e-3)
+ expect_equal(unlist(stats$loglik), rLoglik, tolerance = 1e-3)
+ p <- collect(select(predict(model, df), "prediction"))
+ expect_equal(p$prediction, c(0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1))
+
+ # Test model save/load
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-gaussianMixture", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ stats2 <- summary(model2)
+ expect_equal(stats$lambda, stats2$lambda)
+ expect_equal(unlist(stats$mu), unlist(stats2$mu))
+ expect_equal(unlist(stats$sigma), unlist(stats2$sigma))
+ expect_equal(unlist(stats$loglik), unlist(stats2$loglik))
+
+ unlink(modelPath)
+ }
+})
+
+test_that("spark.kmeans", {
+ newIris <- iris
+ newIris$Species <- NULL
+ training <- suppressWarnings(createDataFrame(newIris))
+
+ take(training, 1)
+
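+ # initMode = "random" picks random initial centers instead of the default k-means||.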
+ model <- spark.kmeans(data = training, ~ ., k = 2, maxIter = 10, initMode = "random")
+ sample <- take(select(predict(model, training), "prediction"), 1)
+ expect_equal(typeof(sample$prediction), "integer")
+ expect_equal(sample$prediction, 1)
+
+ # Test stats::kmeans is working
+ statsModel <- kmeans(x = newIris, centers = 2)
+ expect_equal(sort(unique(statsModel$cluster)), c(1, 2))
+
+ # Test fitted works on KMeans
+ fitted.model <- fitted(model)
+ expect_equal(sort(collect(distinct(select(fitted.model, "prediction")))$prediction), c(0, 1))
+
+ # Test summary works on KMeans
+ summary.model <- summary(model)
+ cluster <- summary.model$cluster
+ k <- summary.model$k
+ expect_equal(k, 2)
+ expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction), c(0, 1))
+
+ # Test that summary returns the coefficients as a numeric matrix
+ expect_true(is.matrix(summary.model$coefficients))
+ expect_true(is.numeric(summary.model$coefficients[1, ]))
+
+ # Test model save/load
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-kmeans", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ summary2 <- summary(model2)
+ expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size)))
+ expect_equal(summary.model$coefficients, summary2$coefficients)
+ expect_true(!summary.model$is.loaded)
+ expect_true(summary2$is.loaded)
+
+ unlink(modelPath)
+ }
+
+ # Test k-means on a dataset whose clustering is sensitive to the seed value
+ col1 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0)
+ col2 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0)
+ col3 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0)
+ cols <- as.data.frame(cbind(col1, col2, col3))
+ df <- createDataFrame(cols)
+
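+ # Only 5 distinct points (each duplicated), so with k = 5 and random initialization
+ # the number of non-empty clusters depends on the seed.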
+ model1 <- spark.kmeans(data = df, ~ ., k = 5, maxIter = 10,
+ initMode = "random", seed = 1, tol = 1E-5)
+ model2 <- spark.kmeans(data = df, ~ ., k = 5, maxIter = 10,
+ initMode = "random", seed = 22222, tol = 1E-5)
+
+ summary.model1 <- summary(model1)
+ summary.model2 <- summary(model2)
+ cluster1 <- summary.model1$cluster
+ cluster2 <- summary.model2$cluster
+ clusterSize1 <- summary.model1$clusterSize
+ clusterSize2 <- summary.model2$clusterSize
+
+ # Different seeds converge to different assignments: 4 vs. 3 non-empty clusters
+ expect_equal(sort(collect(distinct(select(cluster1, "prediction")))$prediction),
+ c(0, 1, 2, 3))
+ expect_equal(sort(collect(distinct(select(cluster2, "prediction")))$prediction),
+ c(0, 1, 2))
+ expect_equal(clusterSize1, 4)
+ expect_equal(clusterSize2, 3)
+})
+
+test_that("spark.lda with libsvm", {
+ text <- read.df(absoluteSparkPath("data/mllib/sample_lda_libsvm_data.txt"), source = "libsvm")
+ model <- spark.lda(text, optimizer = "em")
+
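+ # The second argument to summary is the number of top terms to return per topic.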
+ stats <- summary(model, 10)
+ isDistributed <- stats$isDistributed
+ logLikelihood <- stats$logLikelihood
+ logPerplexity <- stats$logPerplexity
+ vocabSize <- stats$vocabSize
+ topics <- stats$topicTopTerms
+ weights <- stats$topicTopTermsWeights
+ vocabulary <- stats$vocabulary
+ trainingLogLikelihood <- stats$trainingLogLikelihood
+ logPrior <- stats$logPrior
+
+ expect_true(isDistributed)
+ expect_true(logLikelihood <= 0 & is.finite(logLikelihood))
+ expect_true(logPerplexity >= 0 & is.finite(logPerplexity))
+ expect_equal(vocabSize, 11)
+ expect_true(is.null(vocabulary))
+ expect_true(trainingLogLikelihood <= 0 & !is.na(trainingLogLikelihood))
+ expect_true(logPrior <= 0 & !is.na(logPrior))
+
+ # Test model save/load
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-lda", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ stats2 <- summary(model2)
+
+ expect_true(stats2$isDistributed)
+ expect_equal(logLikelihood, stats2$logLikelihood)
+ expect_equal(logPerplexity, stats2$logPerplexity)
+ expect_equal(vocabSize, stats2$vocabSize)
+ expect_equal(vocabulary, stats2$vocabulary)
+ expect_equal(trainingLogLikelihood, stats2$trainingLogLikelihood)
+ expect_equal(logPrior, stats2$logPrior)
+
+ unlink(modelPath)
+ }
+})
+
+test_that("spark.lda with text input", {
+ skip_on_cran()
+
+ text <- read.text(absoluteSparkPath("data/mllib/sample_lda_data.txt"))
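+ # read.text loads each line into a single string column named "value", which is
+ # passed to spark.lda as the features column.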
+ model <- spark.lda(text, optimizer = "online", features = "value")
+
+ stats <- summary(model)
+ isDistributed <- stats$isDistributed
+ logLikelihood <- stats$logLikelihood
+ logPerplexity <- stats$logPerplexity
+ vocabSize <- stats$vocabSize
+ topics <- stats$topicTopTerms
+ weights <- stats$topicTopTermsWeights
+ vocabulary <- stats$vocabulary
+ trainingLogLikelihood <- stats$trainingLogLikelihood
+ logPrior <- stats$logPrior
+
+ expect_false(isDistributed)
+ expect_true(logLikelihood <= 0 & is.finite(logLikelihood))
+ expect_true(logPerplexity >= 0 & is.finite(logPerplexity))
+ expect_equal(vocabSize, 10)
+ expect_true(setequal(stats$vocabulary, c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")))
+ expect_true(is.na(trainingLogLikelihood))
+ expect_true(is.na(logPrior))
+
+ # Test model save/load
+ modelPath <- tempfile(pattern = "spark-lda-text", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ stats2 <- summary(model2)
+
+ expect_false(stats2$isDistributed)
+ expect_equal(logLikelihood, stats2$logLikelihood)
+ expect_equal(logPerplexity, stats2$logPerplexity)
+ expect_equal(vocabSize, stats2$vocabSize)
+ expect_true(all.equal(vocabulary, stats2$vocabulary))
+ expect_true(is.na(stats2$trainingLogLikelihood))
+ expect_true(is.na(stats2$logPrior))
+
+ unlink(modelPath)
+})
+
+test_that("spark.posterior and spark.perplexity", {
+ skip_on_cran()
+
+ text <- read.text(absoluteSparkPath("data/mllib/sample_lda_data.txt"))
+ model <- spark.lda(text, features = "value", k = 3)
+
+ # spark.perplexity on the training data should match the logPerplexity from summary
+ stats <- summary(model)
+ logPerplexity <- spark.perplexity(model, text)
+ expect_equal(logPerplexity, stats$logPerplexity)
+
+ # Assert each document's topic distribution sums to 1, so the grand total equals the row count
+ posterior <- spark.posterior(model, text)
+ local.posterior <- collect(posterior)$topicDistribution
+ expect_equal(length(local.posterior), sum(unlist(local.posterior)))
+})
+
+sparkR.session.stop()