Posted to commits@spark.apache.org by fe...@apache.org on 2017/06/11 07:00:37 UTC
[1/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN
Repository: spark
Updated Branches:
refs/heads/master 5301a19a0 -> dc4c35183
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_streaming.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_streaming.R b/R/pkg/tests/fulltests/test_streaming.R
new file mode 100644
index 0000000..b20b431
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_streaming.R
@@ -0,0 +1,167 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("Structured Streaming")
+
+# Tests for Structured Streaming functions in SparkR
+
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+jsonSubDir <- file.path("sparkr-test", "json", "")
+if (.Platform$OS.type == "windows") {
+ # file.path removes the empty separator on Windows, adds it back
+ jsonSubDir <- paste0(jsonSubDir, .Platform$file.sep)
+}
+jsonDir <- file.path(tempdir(), jsonSubDir)
+dir.create(jsonDir, recursive = TRUE)
+
+mockLines <- c("{\"name\":\"Michael\"}",
+ "{\"name\":\"Andy\", \"age\":30}",
+ "{\"name\":\"Justin\", \"age\":19}")
+jsonPath <- tempfile(pattern = jsonSubDir, fileext = ".tmp")
+writeLines(mockLines, jsonPath)
+
+mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}",
+ "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}",
+ "{\"name\":\"David\",\"age\":60,\"height\":null}")
+jsonPathNa <- tempfile(pattern = jsonSubDir, fileext = ".tmp")
+
+schema <- structType(structField("name", "string"),
+ structField("age", "integer"),
+ structField("count", "double"))
+
+test_that("read.stream, write.stream, awaitTermination, stopQuery", {
+ skip_on_cran()
+
+ df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1)
+ expect_true(isStreaming(df))
+ counts <- count(group_by(df, "name"))
+ q <- write.stream(counts, "memory", queryName = "people", outputMode = "complete")
+
+ expect_false(awaitTermination(q, 5 * 1000))
+ callJMethod(q@ssq, "processAllAvailable")
+ expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 3)
+
+ writeLines(mockLinesNa, jsonPathNa)
+ awaitTermination(q, 5 * 1000)
+ callJMethod(q@ssq, "processAllAvailable")
+ expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 6)
+
+ stopQuery(q)
+ expect_true(awaitTermination(q, 1))
+ expect_error(awaitTermination(q), NA)
+})
+
+test_that("print from explain, lastProgress, status, isActive", {
+ skip_on_cran()
+
+ df <- read.stream("json", path = jsonDir, schema = schema)
+ expect_true(isStreaming(df))
+ counts <- count(group_by(df, "name"))
+ q <- write.stream(counts, "memory", queryName = "people2", outputMode = "complete")
+
+ awaitTermination(q, 5 * 1000)
+ callJMethod(q@ssq, "processAllAvailable")
+
+ expect_equal(capture.output(explain(q))[[1]], "== Physical Plan ==")
+ expect_true(any(grepl("\"description\" : \"MemorySink\"", capture.output(lastProgress(q)))))
+ expect_true(any(grepl("\"isTriggerActive\" : ", capture.output(status(q)))))
+
+ expect_equal(queryName(q), "people2")
+ expect_true(isActive(q))
+
+ stopQuery(q)
+})
+
+test_that("Stream other format", {
+ skip_on_cran()
+
+ parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet")
+ df <- read.df(jsonPath, "json", schema)
+ write.df(df, parquetPath, "parquet", "overwrite")
+
+ df <- read.stream(path = parquetPath, schema = schema)
+ expect_true(isStreaming(df))
+ counts <- count(group_by(df, "name"))
+ q <- write.stream(counts, "memory", queryName = "people3", outputMode = "complete")
+
+ expect_false(awaitTermination(q, 5 * 1000))
+ callJMethod(q@ssq, "processAllAvailable")
+ expect_equal(head(sql("SELECT count(*) FROM people3"))[[1]], 3)
+
+ expect_equal(queryName(q), "people3")
+ expect_true(any(grepl("\"description\" : \"FileStreamSource[[:print:]]+parquet",
+ capture.output(lastProgress(q)))))
+ expect_true(isActive(q))
+
+ stopQuery(q)
+ expect_true(awaitTermination(q, 1))
+ expect_false(isActive(q))
+
+ unlink(parquetPath)
+})
+
+test_that("Non-streaming DataFrame", {
+ skip_on_cran()
+
+ c <- as.DataFrame(cars)
+ expect_false(isStreaming(c))
+
+ expect_error(write.stream(c, "memory", queryName = "people", outputMode = "complete"),
+ paste0(".*(writeStream : analysis error - 'writeStream' can be called only on ",
+ "streaming Dataset/DataFrame).*"))
+})
+
+test_that("Unsupported operation", {
+ skip_on_cran()
+
+ # memory sink without aggregation
+ df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1)
+ expect_error(write.stream(df, "memory", queryName = "people", outputMode = "complete"),
+ paste0(".*(start : analysis error - Complete output mode not supported when there ",
+ "are no streaming aggregations on streaming DataFrames/Datasets).*"))
+})
+
+test_that("Terminated by error", {
+ skip_on_cran()
+
+ df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = -1)
+ counts <- count(group_by(df, "name"))
+ # write.stream should not fail here and should return a StreamingQuery,
+ # though it may dump an error log at about the same time
+ expect_error(q <- write.stream(counts, "memory", queryName = "people4", outputMode = "complete"),
+ NA)
+
+ expect_error(awaitTermination(q, 5 * 1000),
+ paste0(".*(awaitTermination : streaming query error - Invalid value '-1' for option",
+ " 'maxFilesPerTrigger', must be a positive integer).*"))
+
+ expect_true(any(grepl("\"message\" : \"Terminated with exception: Invalid value",
+ capture.output(status(q)))))
+ expect_true(any(grepl("Streaming query has no progress", capture.output(lastProgress(q)))))
+ expect_equal(queryName(q), "people4")
+ expect_false(isActive(q))
+
+ stopQuery(q)
+})
+
+unlink(jsonPath)
+unlink(jsonPathNa)
+
+sparkR.session.stop()
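The tests above exercise SparkR's basic file-source-to-memory-sink streaming flow. A minimal standalone sketch of the same flow (illustrative only, not part of this diff; the query name "people_demo" is made up, and an active SparkR session plus the jsonDir and schema objects defined above are assumed):

  df <- read.stream("json", path = jsonDir, schema = schema)
  q <- write.stream(count(group_by(df, "name")), "memory",
                    queryName = "people_demo", outputMode = "complete")
  awaitTermination(q, 5 * 1000)
  head(sql("SELECT * FROM people_demo"))
  stopQuery(q)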
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_take.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_take.R b/R/pkg/tests/fulltests/test_take.R
new file mode 100644
index 0000000..c00723b
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_take.R
@@ -0,0 +1,71 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("tests RDD function take()")
+
+# Mock data
+numVector <- c(-10:97)
+numList <- list(sqrt(1), sqrt(2), sqrt(3), 4 ** 10)
+strVector <- c("Dexter Morgan: I suppose I should be upset, even feel",
+ "violated, but I'm not. No, in fact, I think this is a friendly",
+ "message, like \"Hey, wanna play?\" and yes, I want to play. ",
+ "I really, really do.")
+strList <- list("Dexter Morgan: Blood. Sometimes it sets my teeth on edge, ",
+ "other times it helps me control the chaos.",
+ "Dexter Morgan: Harry and Dorris Morgan did a wonderful job ",
+ "raising me. But they're both dead now. I didn't kill them. Honest.")
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+test_that("take() gives back the original elements in correct count and order", {
+ skip_on_cran()
+
+ numVectorRDD <- parallelize(sc, numVector, 10)
+ # case: number of elements to take is less than the size of the first partition
+ expect_equal(takeRDD(numVectorRDD, 1), as.list(head(numVector, n = 1)))
+ # case: number of elements to take is the same as the size of the first partition
+ expect_equal(takeRDD(numVectorRDD, 11), as.list(head(numVector, n = 11)))
+ # case: number of elements to take is greater than all elements
+ expect_equal(takeRDD(numVectorRDD, length(numVector)), as.list(numVector))
+ expect_equal(takeRDD(numVectorRDD, length(numVector) + 1), as.list(numVector))
+
+ numListRDD <- parallelize(sc, numList, 1)
+ numListRDD2 <- parallelize(sc, numList, 4)
+ expect_equal(takeRDD(numListRDD, 3), takeRDD(numListRDD2, 3))
+ expect_equal(takeRDD(numListRDD, 5), takeRDD(numListRDD2, 5))
+ expect_equal(takeRDD(numListRDD, 1), as.list(head(numList, n = 1)))
+ expect_equal(takeRDD(numListRDD2, 999), numList)
+
+ strVectorRDD <- parallelize(sc, strVector, 2)
+ strVectorRDD2 <- parallelize(sc, strVector, 3)
+ expect_equal(takeRDD(strVectorRDD, 4), as.list(strVector))
+ expect_equal(takeRDD(strVectorRDD2, 2), as.list(head(strVector, n = 2)))
+
+ strListRDD <- parallelize(sc, strList, 4)
+ strListRDD2 <- parallelize(sc, strList, 1)
+ expect_equal(takeRDD(strListRDD, 3), as.list(head(strList, n = 3)))
+ expect_equal(takeRDD(strListRDD2, 1), as.list(head(strList, n = 1)))
+
+ expect_equal(length(takeRDD(strListRDD, 0)), 0)
+ expect_equal(length(takeRDD(strVectorRDD, 0)), 0)
+ expect_equal(length(takeRDD(numListRDD, 0)), 0)
+ expect_equal(length(takeRDD(numVectorRDD, 0)), 0)
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_textFile.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_textFile.R b/R/pkg/tests/fulltests/test_textFile.R
new file mode 100644
index 0000000..e8a961c
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_textFile.R
@@ -0,0 +1,182 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("the textFile() function")
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+mockFile <- c("Spark is pretty.", "Spark is awesome.")
+
+test_that("textFile() on a local file returns an RDD", {
+ skip_on_cran()
+
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName)
+
+ rdd <- textFile(sc, fileName)
+ expect_is(rdd, "RDD")
+ expect_true(countRDD(rdd) > 0)
+ expect_equal(countRDD(rdd), 2)
+
+ unlink(fileName)
+})
+
+test_that("textFile() followed by a collect() returns the same content", {
+ skip_on_cran()
+
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName)
+
+ rdd <- textFile(sc, fileName)
+ expect_equal(collectRDD(rdd), as.list(mockFile))
+
+ unlink(fileName)
+})
+
+test_that("textFile() word count works as expected", {
+ skip_on_cran()
+
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName)
+
+ rdd <- textFile(sc, fileName)
+
+ words <- flatMap(rdd, function(line) { strsplit(line, " ")[[1]] })
+ wordCount <- lapply(words, function(word) { list(word, 1L) })
+
+ counts <- reduceByKey(wordCount, "+", 2L)
+ output <- collectRDD(counts)
+ expected <- list(list("pretty.", 1), list("is", 2), list("awesome.", 1),
+ list("Spark", 2))
+ expect_equal(sortKeyValueList(output), sortKeyValueList(expected))
+
+ unlink(fileName)
+})
+
+test_that("several transformations on RDD created by textFile()", {
+ skip_on_cran()
+
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName)
+
+ rdd <- textFile(sc, fileName) # RDD
+ for (i in 1:10) {
+ # PipelinedRDD initially created from RDD
+ rdd <- lapply(rdd, function(x) paste(x, x))
+ }
+ collectRDD(rdd)
+
+ unlink(fileName)
+})
+
+test_that("textFile() followed by a saveAsTextFile() returns the same content", {
+ skip_on_cran()
+
+ fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName1)
+
+ rdd <- textFile(sc, fileName1, 1L)
+ saveAsTextFile(rdd, fileName2)
+ rdd <- textFile(sc, fileName2)
+ expect_equal(collectRDD(rdd), as.list(mockFile))
+
+ unlink(fileName1)
+ unlink(fileName2)
+})
+
+test_that("saveAsTextFile() on a parallelized list works as expected", {
+ skip_on_cran()
+
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ l <- list(1, 2, 3)
+ rdd <- parallelize(sc, l, 1L)
+ saveAsTextFile(rdd, fileName)
+ rdd <- textFile(sc, fileName)
+ expect_equal(collectRDD(rdd), lapply(l, function(x) {toString(x)}))
+
+ unlink(fileName)
+})
+
+test_that("textFile() and saveAsTextFile() word count works as expected", {
+ skip_on_cran()
+
+ fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName1)
+
+ rdd <- textFile(sc, fileName1)
+
+ words <- flatMap(rdd, function(line) { strsplit(line, " ")[[1]] })
+ wordCount <- lapply(words, function(word) { list(word, 1L) })
+
+ counts <- reduceByKey(wordCount, "+", 2L)
+
+ saveAsTextFile(counts, fileName2)
+ rdd <- textFile(sc, fileName2)
+
+ output <- collectRDD(rdd)
+ expected <- list(list("awesome.", 1), list("Spark", 2),
+ list("pretty.", 1), list("is", 2))
+ expectedStr <- lapply(expected, function(x) { toString(x) })
+ expect_equal(sortKeyValueList(output), sortKeyValueList(expectedStr))
+
+ unlink(fileName1)
+ unlink(fileName2)
+})
+
+test_that("textFile() on multiple paths", {
+ skip_on_cran()
+
+ fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines("Spark is pretty.", fileName1)
+ writeLines("Spark is awesome.", fileName2)
+
+ rdd <- textFile(sc, c(fileName1, fileName2))
+ expect_equal(countRDD(rdd), 2)
+
+ unlink(fileName1)
+ unlink(fileName2)
+})
+
+test_that("Pipelined operations on RDDs created using textFile", {
+ skip_on_cran()
+
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName)
+
+ rdd <- textFile(sc, fileName)
+
+ lengths <- lapply(rdd, function(x) { length(x) })
+ expect_equal(collectRDD(lengths), list(1, 1))
+
+ lengthsPipelined <- lapply(lengths, function(x) { x + 10 })
+ expect_equal(collectRDD(lengthsPipelined), list(11, 11))
+
+ lengths30 <- lapply(lengthsPipelined, function(x) { x + 20 })
+ expect_equal(collectRDD(lengths30), list(31, 31))
+
+ lengths20 <- lapply(lengths, function(x) { x + 20 })
+ expect_equal(collectRDD(lengths20), list(21, 21))
+
+ unlink(fileName)
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_utils.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_utils.R b/R/pkg/tests/fulltests/test_utils.R
new file mode 100644
index 0000000..6197ae7
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_utils.R
@@ -0,0 +1,248 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("functions in utils.R")
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+test_that("convertJListToRList() gives back (deserializes) the original JLists
+ of strings and integers", {
+ skip_on_cran()
+ # It's hard to manually create a Java List using rJava, since it does not
+ # support generics well. Instead, we rely on collectRDD() returning a
+ # JList.
+ nums <- as.list(1:10)
+ rdd <- parallelize(sc, nums, 1L)
+ jList <- callJMethod(rdd@jrdd, "collect")
+ rList <- convertJListToRList(jList, flatten = TRUE)
+ expect_equal(rList, nums)
+
+ strs <- as.list(c("hello", "spark"))
+ rdd <- parallelize(sc, strs, 2L)
+ jList <- callJMethod(rdd@jrdd, "collect")
+ rList <- convertJListToRList(jList, flatten = TRUE)
+ expect_equal(rList, strs)
+})
+
+test_that("serializeToBytes on RDD", {
+ skip_on_cran()
+ # File content
+ mockFile <- c("Spark is pretty.", "Spark is awesome.")
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName)
+
+ text.rdd <- textFile(sc, fileName)
+ expect_equal(getSerializedMode(text.rdd), "string")
+ ser.rdd <- serializeToBytes(text.rdd)
+ expect_equal(collectRDD(ser.rdd), as.list(mockFile))
+ expect_equal(getSerializedMode(ser.rdd), "byte")
+
+ unlink(fileName)
+})
+
+test_that("cleanClosure on R functions", {
+ y <- c(1, 2, 3)
+ g <- function(x) { x + 1 }
+ f <- function(x) { g(x) + y }
+ newF <- cleanClosure(f)
+ env <- environment(newF)
+ expect_equal(length(ls(env)), 2) # y, g
+ actual <- get("y", envir = env, inherits = FALSE)
+ expect_equal(actual, y)
+ actual <- get("g", envir = env, inherits = FALSE)
+ expect_equal(actual, g)
+
+ # Test for nested enclosures and package variables.
+ env2 <- new.env()
+ funcEnv <- new.env(parent = env2)
+ f <- function(x) { log(g(x) + y) }
+ environment(f) <- funcEnv # enclosing relationship: f -> funcEnv -> env2 -> .GlobalEnv
+ newF <- cleanClosure(f)
+ env <- environment(newF)
+ expect_equal(length(ls(env)), 2) # base function "log" should not be included
+ actual <- get("y", envir = env, inherits = FALSE)
+ expect_equal(actual, y)
+ actual <- get("g", envir = env, inherits = FALSE)
+ expect_equal(actual, g)
+
+ base <- c(1, 2, 3)
+ l <- list(field = matrix(1))
+ field <- matrix(2)
+ defUse <- 3
+ g <- function(x) { x + y }
+ f <- function(x) {
+ defUse <- base::as.integer(x) + 1 # Test for access operators `::`.
+ lapply(x, g) + 1 # Test for capturing function call "g"'s closure as an argument of lapply.
+ l$field[1, 1] <- 3 # Test for access operators `$`.
+ res <- defUse + l$field[1, ] # Test for def-use chain of "defUse", and "" symbol.
+ f(res) # Test for recursive calls.
+ }
+ newF <- cleanClosure(f)
+ env <- environment(newF)
+ # TODO(shivaram): length(ls(env)) is 4 here for some reason and `lapply` is included in `env`.
+ # Disabling this test till we debug this.
+ #
+ # nolint start
+ # expect_equal(length(ls(env)), 3) # Only "g", "l" and "f". No "base", "field" or "defUse".
+ # nolint end
+ expect_true("g" %in% ls(env))
+ expect_true("l" %in% ls(env))
+ expect_true("f" %in% ls(env))
+ expect_equal(get("l", envir = env, inherits = FALSE), l)
+ # "y" should be in the environment of g.
+ newG <- get("g", envir = env, inherits = FALSE)
+ env <- environment(newG)
+ expect_equal(length(ls(env)), 1)
+ actual <- get("y", envir = env, inherits = FALSE)
+ expect_equal(actual, y)
+
+ # Test for function (and variable) definitions.
+ f <- function(x) {
+ g <- function(y) { y * 2 }
+ g(x)
+ }
+ newF <- cleanClosure(f)
+ env <- environment(newF)
+ expect_equal(length(ls(env)), 0) # "y" and "g" should not be included.
+
+ # Test for overriding variables in base namespace (Issue: SparkR-196).
+ nums <- as.list(1:10)
+ rdd <- parallelize(sc, nums, 2L)
+ t <- 4 # Override base::t in .GlobalEnv.
+ f <- function(x) { x > t }
+ newF <- cleanClosure(f)
+ env <- environment(newF)
+ expect_equal(ls(env), "t")
+ expect_equal(get("t", envir = env, inherits = FALSE), t)
+ actual <- collectRDD(lapply(rdd, f))
+ expected <- as.list(c(rep(FALSE, 4), rep(TRUE, 6)))
+ expect_equal(actual, expected)
+
+ # Test for broadcast variables.
+ a <- matrix(nrow = 10, ncol = 10, data = rnorm(100))
+ aBroadcast <- broadcastRDD(sc, a)
+ normMultiply <- function(x) { norm(aBroadcast$value) * x }
+ newnormMultiply <- SparkR:::cleanClosure(normMultiply)
+ env <- environment(newnormMultiply)
+ expect_equal(ls(env), "aBroadcast")
+ expect_equal(get("aBroadcast", envir = env, inherits = FALSE), aBroadcast)
+})
+
+test_that("varargsToJProperties", {
+ jprops <- newJObject("java.util.Properties")
+ expect_true(class(jprops) == "jobj")
+
+ jprops <- varargsToJProperties(abc = "123")
+ expect_true(class(jprops) == "jobj")
+ expect_equal(callJMethod(jprops, "getProperty", "abc"), "123")
+
+ jprops <- varargsToJProperties(abc = "abc", b = 1)
+ expect_equal(callJMethod(jprops, "getProperty", "abc"), "abc")
+ expect_equal(callJMethod(jprops, "getProperty", "b"), "1")
+
+ jprops <- varargsToJProperties()
+ expect_equal(callJMethod(jprops, "size"), 0L)
+})
+
+test_that("convertToJSaveMode", {
+ s <- convertToJSaveMode("error")
+ expect_true(class(s) == "jobj")
+ expect_match(capture.output(print.jobj(s)), "Java ref type org.apache.spark.sql.SaveMode id ")
+ expect_error(convertToJSaveMode("foo"),
+ 'mode should be one of "append", "overwrite", "error", "ignore"') #nolint
+})
+
+test_that("captureJVMException", {
+ skip_on_cran()
+
+ method <- "createStructField"
+ expect_error(tryCatch(callJStatic("org.apache.spark.sql.api.r.SQLUtils", method,
+ "col", "unknown", TRUE),
+ error = function(e) {
+ captureJVMException(e, method)
+ }),
+ "parse error - .*DataType unknown.*not supported.")
+})
+
+test_that("hashCode", {
+ skip_on_cran()
+
+ expect_error(hashCode("bc53d3605e8a5b7de1e8e271c2317645"), NA)
+})
+
+test_that("overrideEnvs", {
+ config <- new.env()
+ config[["spark.master"]] <- "foo"
+ config[["config_only"]] <- "ok"
+ param <- new.env()
+ param[["spark.master"]] <- "local"
+ param[["param_only"]] <- "blah"
+ overrideEnvs(config, param)
+ expect_equal(config[["spark.master"]], "local")
+ expect_equal(config[["param_only"]], "blah")
+ expect_equal(config[["config_only"]], "ok")
+})
+
+test_that("rbindRaws", {
+
+ # Mixed Column types
+ r <- serialize(1:5, connection = NULL)
+ r1 <- serialize(1, connection = NULL)
+ r2 <- serialize(letters, connection = NULL)
+ r3 <- serialize(1:10, connection = NULL)
+ inputData <- list(list(1L, r1, "a", r), list(2L, r2, "b", r),
+ list(3L, r3, "c", r))
+ expected <- data.frame(V1 = 1:3)
+ expected$V2 <- list(r1, r2, r3)
+ expected$V3 <- c("a", "b", "c")
+ expected$V4 <- list(r, r, r)
+ result <- rbindRaws(inputData)
+ expect_equal(expected, result)
+
+ # Single binary column
+ input <- list(list(r1), list(r2), list(r3))
+ expected <- subset(expected, select = "V2")
+ result <- setNames(rbindRaws(input), "V2")
+ expect_equal(expected, result)
+
+})
+
+test_that("varargsToStrEnv", {
+ strenv <- varargsToStrEnv(a = 1, b = 1.1, c = TRUE, d = "abcd")
+ env <- varargsToEnv(a = "1", b = "1.1", c = "true", d = "abcd")
+ expect_equal(strenv, env)
+ expect_error(varargsToStrEnv(a = list(1, "a")),
+ paste0("Unsupported type for a : list. Supported types are logical, ",
+ "numeric, character and NULL."))
+ expect_warning(varargsToStrEnv(a = 1, 2, 3, 4), "Unnamed arguments ignored: 2, 3, 4.")
+ expect_warning(varargsToStrEnv(1, 2, 3, 4), "Unnamed arguments ignored: 1, 2, 3, 4.")
+})
+
+test_that("basenameSansExtFromUrl", {
+ x <- paste0("http://people.apache.org/~pwendell/spark-nightly/spark-branch-2.1-bin/spark-2.1.1-",
+ "SNAPSHOT-2016_12_09_11_08-eb2d9bf-bin/spark-2.1.1-SNAPSHOT-bin-hadoop2.7.tgz")
+ expect_equal(basenameSansExtFromUrl(x), "spark-2.1.1-SNAPSHOT-bin-hadoop2.7")
+ z <- "http://people.apache.org/~pwendell/spark-releases/spark-2.1.0--hive.tar.gz"
+ expect_equal(basenameSansExtFromUrl(z), "spark-2.1.0--hive")
+})
+
+sparkR.session.stop()
+
+message("--- End test (utils) ", as.POSIXct(Sys.time(), tz = "GMT"))
+message("elapsed ", (proc.time() - timer_ptm)[3])
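The cleanClosure tests above check that only free variables actually referenced by a function are captured into its environment before serialization. A minimal illustrative sketch (it calls the internal SparkR:::cleanClosure directly, as the tests do; the inline result comments are assumptions based on the expectations above):

  y <- c(1, 2, 3)
  f <- function(x) { x + y }
  newF <- SparkR:::cleanClosure(f)
  ls(environment(newF))                 # only "y" is captured
  get("y", envir = environment(newF))   # c(1, 2, 3); base functions and unused globals are not captured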
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/run-all.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/run-all.R b/R/pkg/tests/run-all.R
index f0bef4f..d48e36c 100644
--- a/R/pkg/tests/run-all.R
+++ b/R/pkg/tests/run-all.R
@@ -43,3 +43,11 @@ if (identical(Sys.getenv("NOT_CRAN"), "true")) {
}
test_package("SparkR")
+
+if (identical(Sys.getenv("NOT_CRAN"), "true")) {
+ # for testthat 1.0.2 or later, change the reporter from "summary" to default_reporter()
+ testthat:::run_tests("SparkR",
+ file.path(sparkRDir, "pkg", "tests", "fulltests"),
+ NULL,
+ "summary")
+}
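The fulltests directory added by this change only runs when the NOT_CRAN environment variable is "true". A minimal sketch of invoking the full suite by hand (an assumed workflow, not part of this diff; it presumes the SparkR package is installed and SPARK_HOME points at a Spark checkout):

  Sys.setenv(NOT_CRAN = "true")
  Sys.setenv(SPARK_HOME = "/path/to/spark")  # assumed checkout location
  source(file.path(Sys.getenv("SPARK_HOME"), "R", "pkg", "tests", "run-all.R"))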
[6/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN
Posted by fe...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_mllib_regression.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_mllib_regression.R b/R/pkg/inst/tests/testthat/test_mllib_regression.R
deleted file mode 100644
index b05fdd3..0000000
--- a/R/pkg/inst/tests/testthat/test_mllib_regression.R
+++ /dev/null
@@ -1,480 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("MLlib regression algorithms, except for tree-based algorithms")
-
-# Tests for MLlib regression algorithms in SparkR
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-test_that("formula of spark.glm", {
- skip_on_cran()
-
- training <- suppressWarnings(createDataFrame(iris))
- # directly calling the spark API
- # dot minus and intercept vs native glm
- model <- spark.glm(training, Sepal_Width ~ . - Species + 0)
- vals <- collect(select(predict(model, training), "prediction"))
- rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
- expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
- # feature interaction vs native glm
- model <- spark.glm(training, Sepal_Width ~ Species:Sepal_Length)
- vals <- collect(select(predict(model, training), "prediction"))
- rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
- expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
- # glm should work with long formula
- training <- suppressWarnings(createDataFrame(iris))
- training$LongLongLongLongLongName <- training$Sepal_Width
- training$VeryLongLongLongLonLongName <- training$Sepal_Length
- training$AnotherLongLongLongLongName <- training$Species
- model <- spark.glm(training, LongLongLongLongLongName ~ VeryLongLongLongLonLongName +
- AnotherLongLongLongLongName)
- vals <- collect(select(predict(model, training), "prediction"))
- rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
- expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-})
-
-test_that("spark.glm and predict", {
- training <- suppressWarnings(createDataFrame(iris))
- # gaussian family
- model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species)
- prediction <- predict(model, training)
- expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
- vals <- collect(select(prediction, "prediction"))
- rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
- expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
- # poisson family
- model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
- family = poisson(link = identity))
- prediction <- predict(model, training)
- expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
- vals <- collect(select(prediction, "prediction"))
- rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species,
- data = iris, family = poisson(link = identity)), iris))
- expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
- # Gamma family
- x <- runif(100, -1, 1)
- y <- rgamma(100, rate = 10 / exp(0.5 + 1.2 * x), shape = 10)
- df <- as.DataFrame(as.data.frame(list(x = x, y = y)))
- model <- glm(y ~ x, family = Gamma, df)
- out <- capture.output(print(summary(model)))
- expect_true(any(grepl("Dispersion parameter for gamma family", out)))
-
- # tweedie family
- model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
- family = "tweedie", var.power = 1.2, link.power = 0.0)
- prediction <- predict(model, training)
- expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
- vals <- collect(select(prediction, "prediction"))
-
- # manual calculation of the R predicted values to avoid dependence on statmod
- #' library(statmod)
- #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
- #' family = tweedie(var.power = 1.2, link.power = 0.0))
- #' print(coef(rModel))
-
- rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174)
- rVals <- exp(as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species,
- data = iris) %*% rCoef))
- expect_true(all(abs(rVals - vals) < 1e-5), rVals - vals)
-
- # Test stats::predict is working
- x <- rnorm(15)
- y <- x + rnorm(15)
- expect_equal(length(predict(lm(y ~ x))), 15)
-})
-
-test_that("spark.glm summary", {
- # gaussian family
- training <- suppressWarnings(createDataFrame(iris))
- stats <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + Species))
- rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
-
- # test summary coefficients return matrix type
- expect_true(class(stats$coefficients) == "matrix")
- expect_true(class(stats$coefficients[, 1]) == "numeric")
-
- coefs <- stats$coefficients
- rCoefs <- rStats$coefficients
- expect_true(all(abs(rCoefs - coefs) < 1e-4))
- expect_true(all(
- rownames(stats$coefficients) ==
- c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
- expect_equal(stats$dispersion, rStats$dispersion)
- expect_equal(stats$null.deviance, rStats$null.deviance)
- expect_equal(stats$deviance, rStats$deviance)
- expect_equal(stats$df.null, rStats$df.null)
- expect_equal(stats$df.residual, rStats$df.residual)
- expect_equal(stats$aic, rStats$aic)
-
- out <- capture.output(print(stats))
- expect_match(out[2], "Deviance Residuals:")
- expect_true(any(grepl("AIC: 59.22", out)))
-
- # binomial family
- df <- suppressWarnings(createDataFrame(iris))
- training <- df[df$Species %in% c("versicolor", "virginica"), ]
- stats <- summary(spark.glm(training, Species ~ Sepal_Length + Sepal_Width,
- family = binomial(link = "logit")))
-
- rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
- rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
- family = binomial(link = "logit")))
-
- coefs <- stats$coefficients
- rCoefs <- rStats$coefficients
- expect_true(all(abs(rCoefs - coefs) < 1e-4))
- expect_true(all(
- rownames(stats$coefficients) ==
- c("(Intercept)", "Sepal_Length", "Sepal_Width")))
- expect_equal(stats$dispersion, rStats$dispersion)
- expect_equal(stats$null.deviance, rStats$null.deviance)
- expect_equal(stats$deviance, rStats$deviance)
- expect_equal(stats$df.null, rStats$df.null)
- expect_equal(stats$df.residual, rStats$df.residual)
- expect_equal(stats$aic, rStats$aic)
-
- # Test spark.glm works with weighted dataset
- a1 <- c(0, 1, 2, 3)
- a2 <- c(5, 2, 1, 3)
- w <- c(1, 2, 3, 4)
- b <- c(1, 0, 1, 0)
- data <- as.data.frame(cbind(a1, a2, w, b))
- df <- createDataFrame(data)
-
- stats <- summary(spark.glm(df, b ~ a1 + a2, family = "binomial", weightCol = "w"))
- rStats <- summary(glm(b ~ a1 + a2, family = "binomial", data = data, weights = w))
-
- coefs <- stats$coefficients
- rCoefs <- rStats$coefficients
- expect_true(all(abs(rCoefs - coefs) < 1e-3))
- expect_true(all(rownames(stats$coefficients) == c("(Intercept)", "a1", "a2")))
- expect_equal(stats$dispersion, rStats$dispersion)
- expect_equal(stats$null.deviance, rStats$null.deviance)
- expect_equal(stats$deviance, rStats$deviance)
- expect_equal(stats$df.null, rStats$df.null)
- expect_equal(stats$df.residual, rStats$df.residual)
- expect_equal(stats$aic, rStats$aic)
-
- # Test summary works on base GLM models
- baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
- baseSummary <- summary(baseModel)
- expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
-
- # Test spark.glm works with regularization parameter
- data <- as.data.frame(cbind(a1, a2, b))
- df <- suppressWarnings(createDataFrame(data))
- regStats <- summary(spark.glm(df, b ~ a1 + a2, regParam = 1.0))
- expect_equal(regStats$aic, 13.32836, tolerance = 1e-4) # 13.32836 is from summary() result
-
- # Test spark.glm works on collinear data
- A <- matrix(c(1, 2, 3, 4, 2, 4, 6, 8), 4, 2)
- b <- c(1, 2, 3, 4)
- data <- as.data.frame(cbind(A, b))
- df <- createDataFrame(data)
- stats <- summary(spark.glm(df, b ~ . - 1))
- coefs <- stats$coefficients
- expect_true(all(abs(c(0.5, 0.25) - coefs) < 1e-4))
-})
-
-test_that("spark.glm save/load", {
- skip_on_cran()
-
- training <- suppressWarnings(createDataFrame(iris))
- m <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species)
- s <- summary(m)
-
- modelPath <- tempfile(pattern = "spark-glm", fileext = ".tmp")
- write.ml(m, modelPath)
- expect_error(write.ml(m, modelPath))
- write.ml(m, modelPath, overwrite = TRUE)
- m2 <- read.ml(modelPath)
- s2 <- summary(m2)
-
- expect_equal(s$coefficients, s2$coefficients)
- expect_equal(rownames(s$coefficients), rownames(s2$coefficients))
- expect_equal(s$dispersion, s2$dispersion)
- expect_equal(s$null.deviance, s2$null.deviance)
- expect_equal(s$deviance, s2$deviance)
- expect_equal(s$df.null, s2$df.null)
- expect_equal(s$df.residual, s2$df.residual)
- expect_equal(s$aic, s2$aic)
- expect_equal(s$iter, s2$iter)
- expect_true(!s$is.loaded)
- expect_true(s2$is.loaded)
-
- unlink(modelPath)
-})
-
-test_that("formula of glm", {
- skip_on_cran()
-
- training <- suppressWarnings(createDataFrame(iris))
- # dot minus and intercept vs native glm
- model <- glm(Sepal_Width ~ . - Species + 0, data = training)
- vals <- collect(select(predict(model, training), "prediction"))
- rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
- expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
- # feature interaction vs native glm
- model <- glm(Sepal_Width ~ Species:Sepal_Length, data = training)
- vals <- collect(select(predict(model, training), "prediction"))
- rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
- expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
- # glm should work with long formula
- training <- suppressWarnings(createDataFrame(iris))
- training$LongLongLongLongLongName <- training$Sepal_Width
- training$VeryLongLongLongLonLongName <- training$Sepal_Length
- training$AnotherLongLongLongLongName <- training$Species
- model <- glm(LongLongLongLongLongName ~ VeryLongLongLongLonLongName + AnotherLongLongLongLongName,
- data = training)
- vals <- collect(select(predict(model, training), "prediction"))
- rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
- expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-})
-
-test_that("glm and predict", {
- skip_on_cran()
-
- training <- suppressWarnings(createDataFrame(iris))
- # gaussian family
- model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training)
- prediction <- predict(model, training)
- expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
- vals <- collect(select(prediction, "prediction"))
- rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
- expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
- # poisson family
- model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training,
- family = poisson(link = identity))
- prediction <- predict(model, training)
- expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
- vals <- collect(select(prediction, "prediction"))
- rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species,
- data = iris, family = poisson(link = identity)), iris))
- expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
-
- # tweedie family
- model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training,
- family = "tweedie", var.power = 1.2, link.power = 0.0)
- prediction <- predict(model, training)
- expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
- vals <- collect(select(prediction, "prediction"))
-
- # manual calculation of the R predicted values to avoid dependence on statmod
- #' library(statmod)
- #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
- #' family = tweedie(var.power = 1.2, link.power = 0.0))
- #' print(coef(rModel))
-
- rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174)
- rVals <- exp(as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species,
- data = iris) %*% rCoef))
- expect_true(all(abs(rVals - vals) < 1e-5), rVals - vals)
-
- # Test stats::predict is working
- x <- rnorm(15)
- y <- x + rnorm(15)
- expect_equal(length(predict(lm(y ~ x))), 15)
-})
-
-test_that("glm summary", {
- skip_on_cran()
-
- # gaussian family
- training <- suppressWarnings(createDataFrame(iris))
- stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training))
-
- rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
-
- coefs <- stats$coefficients
- rCoefs <- rStats$coefficients
- expect_true(all(abs(rCoefs - coefs) < 1e-4))
- expect_true(all(
- rownames(stats$coefficients) ==
- c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
- expect_equal(stats$dispersion, rStats$dispersion)
- expect_equal(stats$null.deviance, rStats$null.deviance)
- expect_equal(stats$deviance, rStats$deviance)
- expect_equal(stats$df.null, rStats$df.null)
- expect_equal(stats$df.residual, rStats$df.residual)
- expect_equal(stats$aic, rStats$aic)
-
- # binomial family
- df <- suppressWarnings(createDataFrame(iris))
- training <- df[df$Species %in% c("versicolor", "virginica"), ]
- stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training,
- family = binomial(link = "logit")))
-
- rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
- rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
- family = binomial(link = "logit")))
-
- coefs <- stats$coefficients
- rCoefs <- rStats$coefficients
- expect_true(all(abs(rCoefs - coefs) < 1e-4))
- expect_true(all(
- rownames(stats$coefficients) ==
- c("(Intercept)", "Sepal_Length", "Sepal_Width")))
- expect_equal(stats$dispersion, rStats$dispersion)
- expect_equal(stats$null.deviance, rStats$null.deviance)
- expect_equal(stats$deviance, rStats$deviance)
- expect_equal(stats$df.null, rStats$df.null)
- expect_equal(stats$df.residual, rStats$df.residual)
- expect_equal(stats$aic, rStats$aic)
-
- # Test summary works on base GLM models
- baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
- baseSummary <- summary(baseModel)
- expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
-})
-
-test_that("glm save/load", {
- skip_on_cran()
-
- training <- suppressWarnings(createDataFrame(iris))
- m <- glm(Sepal_Width ~ Sepal_Length + Species, data = training)
- s <- summary(m)
-
- modelPath <- tempfile(pattern = "glm", fileext = ".tmp")
- write.ml(m, modelPath)
- expect_error(write.ml(m, modelPath))
- write.ml(m, modelPath, overwrite = TRUE)
- m2 <- read.ml(modelPath)
- s2 <- summary(m2)
-
- expect_equal(s$coefficients, s2$coefficients)
- expect_equal(rownames(s$coefficients), rownames(s2$coefficients))
- expect_equal(s$dispersion, s2$dispersion)
- expect_equal(s$null.deviance, s2$null.deviance)
- expect_equal(s$deviance, s2$deviance)
- expect_equal(s$df.null, s2$df.null)
- expect_equal(s$df.residual, s2$df.residual)
- expect_equal(s$aic, s2$aic)
- expect_equal(s$iter, s2$iter)
- expect_true(!s$is.loaded)
- expect_true(s2$is.loaded)
-
- unlink(modelPath)
-})
-
-test_that("spark.isoreg", {
- label <- c(7.0, 5.0, 3.0, 5.0, 1.0)
- feature <- c(0.0, 1.0, 2.0, 3.0, 4.0)
- weight <- c(1.0, 1.0, 1.0, 1.0, 1.0)
- data <- as.data.frame(cbind(label, feature, weight))
- df <- createDataFrame(data)
-
- model <- spark.isoreg(df, label ~ feature, isotonic = FALSE,
- weightCol = "weight")
- # only allow one variable on the right hand side of the formula
- expect_error(model2 <- spark.isoreg(df, ~., isotonic = FALSE))
- result <- summary(model)
- expect_equal(result$predictions, list(7, 5, 4, 4, 1))
-
- # Test model prediction
- predict_data <- list(list(-2.0), list(-1.0), list(0.5),
- list(0.75), list(1.0), list(2.0), list(9.0))
- predict_df <- createDataFrame(predict_data, c("feature"))
- predict_result <- collect(select(predict(model, predict_df), "prediction"))
- expect_equal(predict_result$prediction, c(7.0, 7.0, 6.0, 5.5, 5.0, 4.0, 1.0))
-
- # Test model save/load
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-isoreg", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- expect_equal(result, summary(model2))
-
- unlink(modelPath)
- }
-})
-
-test_that("spark.survreg", {
- # R code to reproduce the result.
- #
- #' rData <- list(time = c(4, 3, 1, 1, 2, 2, 3), status = c(1, 1, 1, 0, 1, 1, 0),
- #' x = c(0, 2, 1, 1, 1, 0, 0), sex = c(0, 0, 0, 0, 1, 1, 1))
- #' library(survival)
- #' model <- survreg(Surv(time, status) ~ x + sex, rData)
- #' summary(model)
- #' predict(model, data)
- #
- # -- output of 'summary(model)'
- #
- # Value Std. Error z p
- # (Intercept) 1.315 0.270 4.88 1.07e-06
- # x -0.190 0.173 -1.10 2.72e-01
- # sex -0.253 0.329 -0.77 4.42e-01
- # Log(scale) -1.160 0.396 -2.93 3.41e-03
- #
- # -- output of 'predict(model, data)'
- #
- # 1 2 3 4 5 6 7
- # 3.724591 2.545368 3.079035 3.079035 2.390146 2.891269 2.891269
- #
- data <- list(list(4, 1, 0, 0), list(3, 1, 2, 0), list(1, 1, 1, 0),
- list(1, 0, 1, 0), list(2, 1, 1, 1), list(2, 1, 0, 1), list(3, 0, 0, 1))
- df <- createDataFrame(data, c("time", "status", "x", "sex"))
- model <- spark.survreg(df, Surv(time, status) ~ x + sex)
- stats <- summary(model)
- coefs <- as.vector(stats$coefficients[, 1])
- rCoefs <- c(1.3149571, -0.1903409, -0.2532618, -1.1599800)
- expect_equal(coefs, rCoefs, tolerance = 1e-4)
- expect_true(all(
- rownames(stats$coefficients) ==
- c("(Intercept)", "x", "sex", "Log(scale)")))
- p <- collect(select(predict(model, df), "prediction"))
- expect_equal(p$prediction, c(3.724591, 2.545368, 3.079035, 3.079035,
- 2.390146, 2.891269, 2.891269), tolerance = 1e-4)
-
- # Test model save/load
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-survreg", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- stats2 <- summary(model2)
- coefs2 <- as.vector(stats2$coefficients[, 1])
- expect_equal(coefs, coefs2)
- expect_equal(rownames(stats$coefficients), rownames(stats2$coefficients))
-
- unlink(modelPath)
- }
-
- # Test survival::survreg
- if (requireNamespace("survival", quietly = TRUE)) {
- rData <- list(time = c(4, 3, 1, 1, 2, 2, 3), status = c(1, 1, 1, 0, 1, 1, 0),
- x = c(0, 2, 1, 1, 1, 0, 0), sex = c(0, 0, 0, 0, 1, 1, 1))
- expect_error(
- model <- survival::survreg(formula = survival::Surv(time, status) ~ x + sex, data = rData),
- NA)
- expect_equal(predict(model, rData)[[1]], 3.724591, tolerance = 1e-4)
- }
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_mllib_stat.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_mllib_stat.R b/R/pkg/inst/tests/testthat/test_mllib_stat.R
deleted file mode 100644
index 1600833..0000000
--- a/R/pkg/inst/tests/testthat/test_mllib_stat.R
+++ /dev/null
@@ -1,53 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("MLlib statistics algorithms")
-
-# Tests for MLlib statistics algorithms in SparkR
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-test_that("spark.kstest", {
- data <- data.frame(test = c(0.1, 0.15, 0.2, 0.3, 0.25, -1, -0.5))
- df <- createDataFrame(data)
- testResult <- spark.kstest(df, "test", "norm")
- stats <- summary(testResult)
-
- rStats <- ks.test(data$test, "pnorm", alternative = "two.sided")
-
- expect_equal(stats$p.value, rStats$p.value, tolerance = 1e-4)
- expect_equal(stats$statistic, unname(rStats$statistic), tolerance = 1e-4)
- expect_match(capture.output(stats)[1], "Kolmogorov-Smirnov test summary:")
-
- testResult <- spark.kstest(df, "test", "norm", -0.5)
- stats <- summary(testResult)
-
- rStats <- ks.test(data$test, "pnorm", -0.5, 1, alternative = "two.sided")
-
- expect_equal(stats$p.value, rStats$p.value, tolerance = 1e-4)
- expect_equal(stats$statistic, unname(rStats$statistic), tolerance = 1e-4)
- expect_match(capture.output(stats)[1], "Kolmogorov-Smirnov test summary:")
-
- # Test print.summary.KSTest
- printStats <- capture.output(print.summary.KSTest(stats))
- expect_match(printStats[1], "Kolmogorov-Smirnov test summary:")
- expect_match(printStats[5],
- "Low presumption against null hypothesis: Sample follows theoretical distribution. ")
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_mllib_tree.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_mllib_tree.R b/R/pkg/inst/tests/testthat/test_mllib_tree.R
deleted file mode 100644
index 31427ee..0000000
--- a/R/pkg/inst/tests/testthat/test_mllib_tree.R
+++ /dev/null
@@ -1,320 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("MLlib tree-based algorithms")
-
-# Tests for MLlib tree-based algorithms in SparkR
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-absoluteSparkPath <- function(x) {
- sparkHome <- sparkR.conf("spark.home")
- file.path(sparkHome, x)
-}
-
-test_that("spark.gbt", {
- skip_on_cran()
-
- # regression
- data <- suppressWarnings(createDataFrame(longley))
- model <- spark.gbt(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, seed = 123)
- predictions <- collect(predict(model, data))
- expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
- 63.221, 63.639, 64.989, 63.761,
- 66.019, 67.857, 68.169, 66.513,
- 68.655, 69.564, 69.331, 70.551),
- tolerance = 1e-4)
- stats <- summary(model)
- expect_equal(stats$numTrees, 20)
- expect_equal(stats$maxDepth, 5)
- expect_equal(stats$formula, "Employed ~ .")
- expect_equal(stats$numFeatures, 6)
- expect_equal(length(stats$treeWeights), 20)
-
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-gbtRegression", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- stats2 <- summary(model2)
- expect_equal(stats$formula, stats2$formula)
- expect_equal(stats$numFeatures, stats2$numFeatures)
- expect_equal(stats$features, stats2$features)
- expect_equal(stats$featureImportances, stats2$featureImportances)
- expect_equal(stats$maxDepth, stats2$maxDepth)
- expect_equal(stats$numTrees, stats2$numTrees)
- expect_equal(stats$treeWeights, stats2$treeWeights)
-
- unlink(modelPath)
- }
-
- # classification
- # label must be binary - GBTClassifier currently only supports binary classification.
- iris2 <- iris[iris$Species != "virginica", ]
- data <- suppressWarnings(createDataFrame(iris2))
- model <- spark.gbt(data, Species ~ Petal_Length + Petal_Width, "classification")
- stats <- summary(model)
- expect_equal(stats$numFeatures, 2)
- expect_equal(stats$numTrees, 20)
- expect_equal(stats$maxDepth, 5)
- expect_error(capture.output(stats), NA)
- expect_true(length(capture.output(stats)) > 6)
- predictions <- collect(predict(model, data))$prediction
- # test string prediction values
- expect_equal(length(grep("setosa", predictions)), 50)
- expect_equal(length(grep("versicolor", predictions)), 50)
-
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-gbtClassification", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- stats2 <- summary(model2)
- expect_equal(stats$depth, stats2$depth)
- expect_equal(stats$numNodes, stats2$numNodes)
- expect_equal(stats$numClasses, stats2$numClasses)
-
- unlink(modelPath)
- }
-
- iris2$NumericSpecies <- ifelse(iris2$Species == "setosa", 0, 1)
- df <- suppressWarnings(createDataFrame(iris2))
- m <- spark.gbt(df, NumericSpecies ~ ., type = "classification")
- s <- summary(m)
- # test numeric prediction values
- expect_equal(iris2$NumericSpecies, as.double(collect(predict(m, df))$prediction))
- expect_equal(s$numFeatures, 5)
- expect_equal(s$numTrees, 20)
- expect_equal(stats$maxDepth, 5)
-
- # spark.gbt classification can work on libsvm data
- if (not_cran_or_windows_with_hadoop()) {
- data <- read.df(absoluteSparkPath("data/mllib/sample_binary_classification_data.txt"),
- source = "libsvm")
- model <- spark.gbt(data, label ~ features, "classification")
- expect_equal(summary(model)$numFeatures, 692)
- }
-})
-
-test_that("spark.randomForest", {
- # regression
- data <- suppressWarnings(createDataFrame(longley))
- model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
- numTrees = 1)
-
- predictions <- collect(predict(model, data))
- expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
- 63.221, 63.639, 64.989, 63.761,
- 66.019, 67.857, 68.169, 66.513,
- 68.655, 69.564, 69.331, 70.551),
- tolerance = 1e-4)
-
- stats <- summary(model)
- expect_equal(stats$numTrees, 1)
- expect_equal(stats$maxDepth, 5)
- expect_error(capture.output(stats), NA)
- expect_true(length(capture.output(stats)) > 6)
-
- model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
- numTrees = 20, seed = 123)
- predictions <- collect(predict(model, data))
- expect_equal(predictions$prediction, c(60.32820, 61.22315, 60.69025, 62.11070,
- 63.53160, 64.05470, 65.12710, 64.30450,
- 66.70910, 67.86125, 68.08700, 67.21865,
- 68.89275, 69.53180, 69.39640, 69.68250),
- tolerance = 1e-4)
- stats <- summary(model)
- expect_equal(stats$numTrees, 20)
- expect_equal(stats$maxDepth, 5)
-
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-randomForestRegression", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- stats2 <- summary(model2)
- expect_equal(stats$formula, stats2$formula)
- expect_equal(stats$numFeatures, stats2$numFeatures)
- expect_equal(stats$features, stats2$features)
- expect_equal(stats$featureImportances, stats2$featureImportances)
- expect_equal(stats$numTrees, stats2$numTrees)
- expect_equal(stats$maxDepth, stats2$maxDepth)
- expect_equal(stats$treeWeights, stats2$treeWeights)
-
- unlink(modelPath)
- }
-
- # classification
- data <- suppressWarnings(createDataFrame(iris))
- model <- spark.randomForest(data, Species ~ Petal_Length + Petal_Width, "classification",
- maxDepth = 5, maxBins = 16)
-
- stats <- summary(model)
- expect_equal(stats$numFeatures, 2)
- expect_equal(stats$numTrees, 20)
- expect_equal(stats$maxDepth, 5)
- expect_error(capture.output(stats), NA)
- expect_true(length(capture.output(stats)) > 6)
- # Test string prediction values
- predictions <- collect(predict(model, data))$prediction
- expect_equal(length(grep("setosa", predictions)), 50)
- expect_equal(length(grep("versicolor", predictions)), 50)
-
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-randomForestClassification", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- stats2 <- summary(model2)
- expect_equal(stats$depth, stats2$depth)
- expect_equal(stats$numNodes, stats2$numNodes)
- expect_equal(stats$numClasses, stats2$numClasses)
-
- unlink(modelPath)
- }
-
- # Test numeric response variable
- labelToIndex <- function(species) {
- switch(as.character(species),
- setosa = 0.0,
- versicolor = 1.0,
- virginica = 2.0
- )
- }
- iris$NumericSpecies <- lapply(iris$Species, labelToIndex)
- data <- suppressWarnings(createDataFrame(iris[-5]))
- model <- spark.randomForest(data, NumericSpecies ~ Petal_Length + Petal_Width, "classification",
- maxDepth = 5, maxBins = 16)
- stats <- summary(model)
- expect_equal(stats$numFeatures, 2)
- expect_equal(stats$numTrees, 20)
- expect_equal(stats$maxDepth, 5)
-
- # Test numeric prediction values
- predictions <- collect(predict(model, data))$prediction
- expect_equal(length(grep("1.0", predictions)), 50)
- expect_equal(length(grep("2.0", predictions)), 50)
-
- # spark.randomForest classification can work on libsvm data
- if (not_cran_or_windows_with_hadoop()) {
- data <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
- source = "libsvm")
- model <- spark.randomForest(data, label ~ features, "classification")
- expect_equal(summary(model)$numFeatures, 4)
- }
-})
-
-test_that("spark.decisionTree", {
- skip_on_cran()
-
- # regression
- data <- suppressWarnings(createDataFrame(longley))
- model <- spark.decisionTree(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16)
-
- predictions <- collect(predict(model, data))
- expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
- 63.221, 63.639, 64.989, 63.761,
- 66.019, 67.857, 68.169, 66.513,
- 68.655, 69.564, 69.331, 70.551),
- tolerance = 1e-4)
-
- stats <- summary(model)
- expect_equal(stats$maxDepth, 5)
- expect_error(capture.output(stats), NA)
- expect_true(length(capture.output(stats)) > 6)
-
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-decisionTreeRegression", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- stats2 <- summary(model2)
- expect_equal(stats$formula, stats2$formula)
- expect_equal(stats$numFeatures, stats2$numFeatures)
- expect_equal(stats$features, stats2$features)
- expect_equal(stats$featureImportances, stats2$featureImportances)
- expect_equal(stats$maxDepth, stats2$maxDepth)
-
- unlink(modelPath)
- }
-
- # classification
- data <- suppressWarnings(createDataFrame(iris))
- model <- spark.decisionTree(data, Species ~ Petal_Length + Petal_Width, "classification",
- maxDepth = 5, maxBins = 16)
-
- stats <- summary(model)
- expect_equal(stats$numFeatures, 2)
- expect_equal(stats$maxDepth, 5)
- expect_error(capture.output(stats), NA)
- expect_true(length(capture.output(stats)) > 6)
- # Test string prediction values
- predictions <- collect(predict(model, data))$prediction
- expect_equal(length(grep("setosa", predictions)), 50)
- expect_equal(length(grep("versicolor", predictions)), 50)
-
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-decisionTreeClassification", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- stats2 <- summary(model2)
- expect_equal(stats$depth, stats2$depth)
- expect_equal(stats$numNodes, stats2$numNodes)
- expect_equal(stats$numClasses, stats2$numClasses)
-
- unlink(modelPath)
- }
-
- # Test numeric response variable
- labelToIndex <- function(species) {
- switch(as.character(species),
- setosa = 0.0,
- versicolor = 1.0,
- virginica = 2.0
- )
- }
- iris$NumericSpecies <- lapply(iris$Species, labelToIndex)
- data <- suppressWarnings(createDataFrame(iris[-5]))
- model <- spark.decisionTree(data, NumericSpecies ~ Petal_Length + Petal_Width, "classification",
- maxDepth = 5, maxBins = 16)
- stats <- summary(model)
- expect_equal(stats$numFeatures, 2)
- expect_equal(stats$maxDepth, 5)
-
- # Test numeric prediction values
- predictions <- collect(predict(model, data))$prediction
- expect_equal(length(grep("1.0", predictions)), 50)
- expect_equal(length(grep("2.0", predictions)), 50)
-
- # spark.decisionTree classification can work on libsvm data
- if (not_cran_or_windows_with_hadoop()) {
- data <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
- source = "libsvm")
- model <- spark.decisionTree(data, label ~ features, "classification")
- expect_equal(summary(model)$numFeatures, 4)
- }
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_parallelize_collect.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_parallelize_collect.R b/R/pkg/inst/tests/testthat/test_parallelize_collect.R
deleted file mode 100644
index 52d4c93..0000000
--- a/R/pkg/inst/tests/testthat/test_parallelize_collect.R
+++ /dev/null
@@ -1,120 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("parallelize() and collect()")
-
-# Mock data
-numVector <- c(-10:97)
-numList <- list(sqrt(1), sqrt(2), sqrt(3), 4 ** 10)
-strVector <- c("Dexter Morgan: I suppose I should be upset, even feel",
- "violated, but I'm not. No, in fact, I think this is a friendly",
- "message, like \"Hey, wanna play?\" and yes, I want to play. ",
- "I really, really do.")
-strList <- list("Dexter Morgan: Blood. Sometimes it sets my teeth on edge, ",
- "other times it helps me control the chaos.",
- "Dexter Morgan: Harry and Dorris Morgan did a wonderful job ",
- "raising me. But they're both dead now. I didn't kill them. Honest.")
-
-numPairs <- list(list(1, 1), list(1, 2), list(2, 2), list(2, 3))
-strPairs <- list(list(strList, strList), list(strList, strList))
-
-# JavaSparkContext handle
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-jsc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-# Tests
-
-test_that("parallelize() on simple vectors and lists returns an RDD", {
- skip_on_cran()
-
- numVectorRDD <- parallelize(jsc, numVector, 1)
- numVectorRDD2 <- parallelize(jsc, numVector, 10)
- numListRDD <- parallelize(jsc, numList, 1)
- numListRDD2 <- parallelize(jsc, numList, 4)
- strVectorRDD <- parallelize(jsc, strVector, 2)
- strVectorRDD2 <- parallelize(jsc, strVector, 3)
- strListRDD <- parallelize(jsc, strList, 4)
- strListRDD2 <- parallelize(jsc, strList, 1)
-
- rdds <- c(numVectorRDD,
- numVectorRDD2,
- numListRDD,
- numListRDD2,
- strVectorRDD,
- strVectorRDD2,
- strListRDD,
- strListRDD2)
-
- for (rdd in rdds) {
- expect_is(rdd, "RDD")
- expect_true(.hasSlot(rdd, "jrdd")
- && inherits(rdd@jrdd, "jobj")
- && isInstanceOf(rdd@jrdd, "org.apache.spark.api.java.JavaRDD"))
- }
-})
-
-test_that("collect(), following a parallelize(), gives back the original collections", {
- skip_on_cran()
-
- numVectorRDD <- parallelize(jsc, numVector, 10)
- expect_equal(collectRDD(numVectorRDD), as.list(numVector))
-
- numListRDD <- parallelize(jsc, numList, 1)
- numListRDD2 <- parallelize(jsc, numList, 4)
- expect_equal(collectRDD(numListRDD), as.list(numList))
- expect_equal(collectRDD(numListRDD2), as.list(numList))
-
- strVectorRDD <- parallelize(jsc, strVector, 2)
- strVectorRDD2 <- parallelize(jsc, strVector, 3)
- expect_equal(collectRDD(strVectorRDD), as.list(strVector))
- expect_equal(collectRDD(strVectorRDD2), as.list(strVector))
-
- strListRDD <- parallelize(jsc, strList, 4)
- strListRDD2 <- parallelize(jsc, strList, 1)
- expect_equal(collectRDD(strListRDD), as.list(strList))
- expect_equal(collectRDD(strListRDD2), as.list(strList))
-})
-
-test_that("regression: collect() following a parallelize() does not drop elements", {
- skip_on_cran()
-
- # 10 %/% 6 = 1, ceiling(10 / 6) = 2
- collLen <- 10
- numPart <- 6
- expected <- runif(collLen)
- actual <- collectRDD(parallelize(jsc, expected, numPart))
- expect_equal(actual, as.list(expected))
-})
-
-test_that("parallelize() and collect() work for lists of pairs (pairwise data)", {
- skip_on_cran()
-
- # use the pairwise logical to indicate pairwise data
- numPairsRDDD1 <- parallelize(jsc, numPairs, 1)
- numPairsRDDD2 <- parallelize(jsc, numPairs, 2)
- numPairsRDDD3 <- parallelize(jsc, numPairs, 3)
- expect_equal(collectRDD(numPairsRDDD1), numPairs)
- expect_equal(collectRDD(numPairsRDDD2), numPairs)
- expect_equal(collectRDD(numPairsRDDD3), numPairs)
- # can also leave out the parameter name, if the params are supplied in order
- strPairsRDDD1 <- parallelize(jsc, strPairs, 1)
- strPairsRDDD2 <- parallelize(jsc, strPairs, 2)
- expect_equal(collectRDD(strPairsRDDD1), strPairs)
- expect_equal(collectRDD(strPairsRDDD2), strPairs)
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_rdd.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_rdd.R b/R/pkg/inst/tests/testthat/test_rdd.R
deleted file mode 100644
index fb244e1..0000000
--- a/R/pkg/inst/tests/testthat/test_rdd.R
+++ /dev/null
@@ -1,906 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("basic RDD functions")
-
-# JavaSparkContext handle
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-# Data
-nums <- 1:10
-rdd <- parallelize(sc, nums, 2L)
-
-intPairs <- list(list(1L, -1), list(2L, 100), list(2L, 1), list(1L, 200))
-intRdd <- parallelize(sc, intPairs, 2L)
-
-test_that("get number of partitions in RDD", {
- skip_on_cran()
-
- expect_equal(getNumPartitionsRDD(rdd), 2)
- expect_equal(getNumPartitionsRDD(intRdd), 2)
-})
-
-test_that("first on RDD", {
- skip_on_cran()
-
- expect_equal(firstRDD(rdd), 1)
- newrdd <- lapply(rdd, function(x) x + 1)
- expect_equal(firstRDD(newrdd), 2)
-})
-
-test_that("count and length on RDD", {
- skip_on_cran()
-
- expect_equal(countRDD(rdd), 10)
- expect_equal(lengthRDD(rdd), 10)
-})
-
-test_that("count by values and keys", {
- skip_on_cran()
-
- mods <- lapply(rdd, function(x) { x %% 3 })
- actual <- countByValue(mods)
- expected <- list(list(0, 3L), list(1, 4L), list(2, 3L))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-
- actual <- countByKey(intRdd)
- expected <- list(list(2L, 2L), list(1L, 2L))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-})
-
-test_that("lapply on RDD", {
- skip_on_cran()
-
- multiples <- lapply(rdd, function(x) { 2 * x })
- actual <- collectRDD(multiples)
- expect_equal(actual, as.list(nums * 2))
-})
-
-test_that("lapplyPartition on RDD", {
- skip_on_cran()
-
- sums <- lapplyPartition(rdd, function(part) { sum(unlist(part)) })
- actual <- collectRDD(sums)
- expect_equal(actual, list(15, 40))
-})
-
-test_that("mapPartitions on RDD", {
- skip_on_cran()
-
- sums <- mapPartitions(rdd, function(part) { sum(unlist(part)) })
- actual <- collectRDD(sums)
- expect_equal(actual, list(15, 40))
-})
-
-test_that("flatMap() on RDDs", {
- skip_on_cran()
-
- flat <- flatMap(intRdd, function(x) { list(x, x) })
- actual <- collectRDD(flat)
- expect_equal(actual, rep(intPairs, each = 2))
-})
-
-test_that("filterRDD on RDD", {
- skip_on_cran()
-
- filtered.rdd <- filterRDD(rdd, function(x) { x %% 2 == 0 })
- actual <- collectRDD(filtered.rdd)
- expect_equal(actual, list(2, 4, 6, 8, 10))
-
- filtered.rdd <- Filter(function(x) { x[[2]] < 0 }, intRdd)
- actual <- collectRDD(filtered.rdd)
- expect_equal(actual, list(list(1L, -1)))
-
- # Filter out all elements.
- filtered.rdd <- filterRDD(rdd, function(x) { x > 10 })
- actual <- collectRDD(filtered.rdd)
- expect_equal(actual, list())
-})
-
-test_that("lookup on RDD", {
- skip_on_cran()
-
- vals <- lookup(intRdd, 1L)
- expect_equal(vals, list(-1, 200))
-
- vals <- lookup(intRdd, 3L)
- expect_equal(vals, list())
-})
-
-test_that("several transformations on RDD (a benchmark on PipelinedRDD)", {
- skip_on_cran()
-
- rdd2 <- rdd
- for (i in 1:12)
- rdd2 <- lapplyPartitionsWithIndex(
- rdd2, function(partIndex, part) {
- part <- as.list(unlist(part) * partIndex + i)
- })
- rdd2 <- lapply(rdd2, function(x) x + x)
- actual <- collectRDD(rdd2)
- expected <- list(24, 24, 24, 24, 24,
- 168, 170, 172, 174, 176)
- expect_equal(actual, expected)
-})
-
-test_that("PipelinedRDD support actions: cache(), persist(), unpersist(), checkpoint()", {
- skip_on_cran()
-
- # RDD
- rdd2 <- rdd
- # PipelinedRDD
- rdd2 <- lapplyPartitionsWithIndex(
- rdd2,
- function(partIndex, part) {
- part <- as.list(unlist(part) * partIndex)
- })
-
- cacheRDD(rdd2)
- expect_true(rdd2@env$isCached)
- rdd2 <- lapply(rdd2, function(x) x)
- expect_false(rdd2@env$isCached)
-
- unpersistRDD(rdd2)
- expect_false(rdd2@env$isCached)
-
- persistRDD(rdd2, "MEMORY_AND_DISK")
- expect_true(rdd2@env$isCached)
- rdd2 <- lapply(rdd2, function(x) x)
- expect_false(rdd2@env$isCached)
-
- unpersistRDD(rdd2)
- expect_false(rdd2@env$isCached)
-
- tempDir <- tempfile(pattern = "checkpoint")
- setCheckpointDirSC(sc, tempDir)
- checkpointRDD(rdd2)
- expect_true(rdd2@env$isCheckpointed)
-
- rdd2 <- lapply(rdd2, function(x) x)
- expect_false(rdd2@env$isCached)
- expect_false(rdd2@env$isCheckpointed)
-
- # make sure the data is collectable
- collectRDD(rdd2)
-
- unlink(tempDir)
-})
-
-test_that("reduce on RDD", {
- skip_on_cran()
-
- sum <- reduce(rdd, "+")
- expect_equal(sum, 55)
-
- # Also test with an inline function
- sumInline <- reduce(rdd, function(x, y) { x + y })
- expect_equal(sumInline, 55)
-})
-
-test_that("lapply with dependency", {
- skip_on_cran()
-
- fa <- 5
- multiples <- lapply(rdd, function(x) { fa * x })
- actual <- collectRDD(multiples)
-
- expect_equal(actual, as.list(nums * 5))
-})
-
-test_that("lapplyPartitionsWithIndex on RDDs", {
- skip_on_cran()
-
- func <- function(partIndex, part) { list(partIndex, Reduce("+", part)) }
- actual <- collectRDD(lapplyPartitionsWithIndex(rdd, func), flatten = FALSE)
- expect_equal(actual, list(list(0, 15), list(1, 40)))
-
- pairsRDD <- parallelize(sc, list(list(1, 2), list(3, 4), list(4, 8)), 1L)
- partitionByParity <- function(key) { if (key %% 2 == 1) 0 else 1 }
- mkTup <- function(partIndex, part) { list(partIndex, part) }
- actual <- collectRDD(lapplyPartitionsWithIndex(
- partitionByRDD(pairsRDD, 2L, partitionByParity),
- mkTup),
- FALSE)
- expect_equal(actual, list(list(0, list(list(1, 2), list(3, 4))),
- list(1, list(list(4, 8)))))
-})
-
-test_that("sampleRDD() on RDDs", {
- skip_on_cran()
-
- expect_equal(unlist(collectRDD(sampleRDD(rdd, FALSE, 1.0, 2014L))), nums)
-})
-
-test_that("takeSample() on RDDs", {
- skip_on_cran()
-
- # ported from RDDSuite.scala, modified seeds
- data <- parallelize(sc, 1:100, 2L)
- for (seed in 4:5) {
- s <- takeSample(data, FALSE, 20L, seed)
- expect_equal(length(s), 20L)
- expect_equal(length(unique(s)), 20L)
- for (elem in s) {
- expect_true(elem >= 1 && elem <= 100)
- }
- }
- for (seed in 4:5) {
- s <- takeSample(data, FALSE, 200L, seed)
- expect_equal(length(s), 100L)
- expect_equal(length(unique(s)), 100L)
- for (elem in s) {
- expect_true(elem >= 1 && elem <= 100)
- }
- }
- for (seed in 4:5) {
- s <- takeSample(data, TRUE, 20L, seed)
- expect_equal(length(s), 20L)
- for (elem in s) {
- expect_true(elem >= 1 && elem <= 100)
- }
- }
- for (seed in 4:5) {
- s <- takeSample(data, TRUE, 100L, seed)
- expect_equal(length(s), 100L)
- # Chance of getting all distinct elements is astronomically low, so test we
- # got less than 100
- expect_true(length(unique(s)) < 100L)
- }
- for (seed in 4:5) {
- s <- takeSample(data, TRUE, 200L, seed)
- expect_equal(length(s), 200L)
- # Chance of getting all distinct elements is still quite low, so test we
- # got less than 100
- expect_true(length(unique(s)) < 100L)
- }
-})
-
-test_that("mapValues() on pairwise RDDs", {
- skip_on_cran()
-
- multiples <- mapValues(intRdd, function(x) { x * 2 })
- actual <- collectRDD(multiples)
- expected <- lapply(intPairs, function(x) {
- list(x[[1]], x[[2]] * 2)
- })
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-})
-
-test_that("flatMapValues() on pairwise RDDs", {
- skip_on_cran()
-
- l <- parallelize(sc, list(list(1, c(1, 2)), list(2, c(3, 4))))
- actual <- collectRDD(flatMapValues(l, function(x) { x }))
- expect_equal(actual, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
-
- # Generate x to x+1 for every value
- actual <- collectRDD(flatMapValues(intRdd, function(x) { x: (x + 1) }))
- expect_equal(actual,
- list(list(1L, -1), list(1L, 0), list(2L, 100), list(2L, 101),
- list(2L, 1), list(2L, 2), list(1L, 200), list(1L, 201)))
-})
-
-test_that("reduceByKeyLocally() on PairwiseRDDs", {
- skip_on_cran()
-
- pairs <- parallelize(sc, list(list(1, 2), list(1.1, 3), list(1, 4)), 2L)
- actual <- reduceByKeyLocally(pairs, "+")
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(list(list(1, 6), list(1.1, 3))))
-
- pairs <- parallelize(sc, list(list("abc", 1.2), list(1.1, 0), list("abc", 1.3),
- list("bb", 5)), 4L)
- actual <- reduceByKeyLocally(pairs, "+")
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(list(list("abc", 2.5), list(1.1, 0), list("bb", 5))))
-})
-
-test_that("distinct() on RDDs", {
- skip_on_cran()
-
- nums.rep2 <- rep(1:10, 2)
- rdd.rep2 <- parallelize(sc, nums.rep2, 2L)
- uniques <- distinctRDD(rdd.rep2)
- actual <- sort(unlist(collectRDD(uniques)))
- expect_equal(actual, nums)
-})
-
-test_that("maximum() on RDDs", {
- skip_on_cran()
-
- max <- maximum(rdd)
- expect_equal(max, 10)
-})
-
-test_that("minimum() on RDDs", {
- skip_on_cran()
-
- min <- minimum(rdd)
- expect_equal(min, 1)
-})
-
-test_that("sumRDD() on RDDs", {
- skip_on_cran()
-
- sum <- sumRDD(rdd)
- expect_equal(sum, 55)
-})
-
-test_that("keyBy on RDDs", {
- skip_on_cran()
-
- func <- function(x) { x * x }
- keys <- keyBy(rdd, func)
- actual <- collectRDD(keys)
- expect_equal(actual, lapply(nums, function(x) { list(func(x), x) }))
-})
-
-test_that("repartition/coalesce on RDDs", {
- skip_on_cran()
-
- rdd <- parallelize(sc, 1:20, 4L) # each partition contains 5 elements
-
- # repartition
- r1 <- repartitionRDD(rdd, 2)
- expect_equal(getNumPartitionsRDD(r1), 2L)
- count <- length(collectPartition(r1, 0L))
- expect_true(count >= 8 && count <= 12)
-
- r2 <- repartitionRDD(rdd, 6)
- expect_equal(getNumPartitionsRDD(r2), 6L)
- count <- length(collectPartition(r2, 0L))
- expect_true(count >= 0 && count <= 4)
-
- # coalesce
- r3 <- coalesceRDD(rdd, 1)
- expect_equal(getNumPartitionsRDD(r3), 1L)
- count <- length(collectPartition(r3, 0L))
- expect_equal(count, 20)
-})
-
-test_that("sortBy() on RDDs", {
- skip_on_cran()
-
- sortedRdd <- sortBy(rdd, function(x) { x * x }, ascending = FALSE)
- actual <- collectRDD(sortedRdd)
- expect_equal(actual, as.list(sort(nums, decreasing = TRUE)))
-
- rdd2 <- parallelize(sc, sort(nums, decreasing = TRUE), 2L)
- sortedRdd2 <- sortBy(rdd2, function(x) { x * x })
- actual <- collectRDD(sortedRdd2)
- expect_equal(actual, as.list(nums))
-})
-
-test_that("takeOrdered() on RDDs", {
- skip_on_cran()
-
- l <- list(10, 1, 2, 9, 3, 4, 5, 6, 7)
- rdd <- parallelize(sc, l)
- actual <- takeOrdered(rdd, 6L)
- expect_equal(actual, as.list(sort(unlist(l)))[1:6])
-
- l <- list("e", "d", "c", "d", "a")
- rdd <- parallelize(sc, l)
- actual <- takeOrdered(rdd, 3L)
- expect_equal(actual, as.list(sort(unlist(l)))[1:3])
-})
-
-test_that("top() on RDDs", {
- skip_on_cran()
-
- l <- list(10, 1, 2, 9, 3, 4, 5, 6, 7)
- rdd <- parallelize(sc, l)
- actual <- top(rdd, 6L)
- expect_equal(actual, as.list(sort(unlist(l), decreasing = TRUE))[1:6])
-
- l <- list("e", "d", "c", "d", "a")
- rdd <- parallelize(sc, l)
- actual <- top(rdd, 3L)
- expect_equal(actual, as.list(sort(unlist(l), decreasing = TRUE))[1:3])
-})
-
-test_that("fold() on RDDs", {
- skip_on_cran()
-
- actual <- fold(rdd, 0, "+")
- expect_equal(actual, Reduce("+", nums, 0))
-
- rdd <- parallelize(sc, list())
- actual <- fold(rdd, 0, "+")
- expect_equal(actual, 0)
-})
-
-test_that("aggregateRDD() on RDDs", {
- skip_on_cran()
-
- rdd <- parallelize(sc, list(1, 2, 3, 4))
- zeroValue <- list(0, 0)
- seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
- combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
- actual <- aggregateRDD(rdd, zeroValue, seqOp, combOp)
- expect_equal(actual, list(10, 4))
-
- rdd <- parallelize(sc, list())
- actual <- aggregateRDD(rdd, zeroValue, seqOp, combOp)
- expect_equal(actual, list(0, 0))
-})
-
-test_that("zipWithUniqueId() on RDDs", {
- skip_on_cran()
-
- rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 3L)
- actual <- collectRDD(zipWithUniqueId(rdd))
- expected <- list(list("a", 0), list("b", 1), list("c", 4),
- list("d", 2), list("e", 5))
- expect_equal(actual, expected)
-
- rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 1L)
- actual <- collectRDD(zipWithUniqueId(rdd))
- expected <- list(list("a", 0), list("b", 1), list("c", 2),
- list("d", 3), list("e", 4))
- expect_equal(actual, expected)
-})
-
-test_that("zipWithIndex() on RDDs", {
- skip_on_cran()
-
- rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 3L)
- actual <- collectRDD(zipWithIndex(rdd))
- expected <- list(list("a", 0), list("b", 1), list("c", 2),
- list("d", 3), list("e", 4))
- expect_equal(actual, expected)
-
- rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 1L)
- actual <- collectRDD(zipWithIndex(rdd))
- expected <- list(list("a", 0), list("b", 1), list("c", 2),
- list("d", 3), list("e", 4))
- expect_equal(actual, expected)
-})
-
-test_that("glom() on RDD", {
- skip_on_cran()
-
- rdd <- parallelize(sc, as.list(1:4), 2L)
- actual <- collectRDD(glom(rdd))
- expect_equal(actual, list(list(1, 2), list(3, 4)))
-})
-
-test_that("keys() on RDDs", {
- skip_on_cran()
-
- keys <- keys(intRdd)
- actual <- collectRDD(keys)
- expect_equal(actual, lapply(intPairs, function(x) { x[[1]] }))
-})
-
-test_that("values() on RDDs", {
- skip_on_cran()
-
- values <- values(intRdd)
- actual <- collectRDD(values)
- expect_equal(actual, lapply(intPairs, function(x) { x[[2]] }))
-})
-
-test_that("pipeRDD() on RDDs", {
- skip_on_cran()
-
- actual <- collectRDD(pipeRDD(rdd, "more"))
- expected <- as.list(as.character(1:10))
- expect_equal(actual, expected)
-
- trailed.rdd <- parallelize(sc, c("1", "", "2\n", "3\n\r\n"))
- actual <- collectRDD(pipeRDD(trailed.rdd, "sort"))
- expected <- list("", "1", "2", "3")
- expect_equal(actual, expected)
-
- rev.nums <- 9:0
- rev.rdd <- parallelize(sc, rev.nums, 2L)
- actual <- collectRDD(pipeRDD(rev.rdd, "sort"))
- expected <- as.list(as.character(c(5:9, 0:4)))
- expect_equal(actual, expected)
-})
-
-test_that("zipRDD() on RDDs", {
- skip_on_cran()
-
- rdd1 <- parallelize(sc, 0:4, 2)
- rdd2 <- parallelize(sc, 1000:1004, 2)
- actual <- collectRDD(zipRDD(rdd1, rdd2))
- expect_equal(actual,
- list(list(0, 1000), list(1, 1001), list(2, 1002), list(3, 1003), list(4, 1004)))
-
- mockFile <- c("Spark is pretty.", "Spark is awesome.")
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName)
-
- rdd <- textFile(sc, fileName, 1)
- actual <- collectRDD(zipRDD(rdd, rdd))
- expected <- lapply(mockFile, function(x) { list(x, x) })
- expect_equal(actual, expected)
-
- rdd1 <- parallelize(sc, 0:1, 1)
- actual <- collectRDD(zipRDD(rdd1, rdd))
- expected <- lapply(0:1, function(x) { list(x, mockFile[x + 1]) })
- expect_equal(actual, expected)
-
- rdd1 <- map(rdd, function(x) { x })
- actual <- collectRDD(zipRDD(rdd, rdd1))
- expected <- lapply(mockFile, function(x) { list(x, x) })
- expect_equal(actual, expected)
-
- unlink(fileName)
-})
-
-test_that("cartesian() on RDDs", {
- skip_on_cran()
-
- rdd <- parallelize(sc, 1:3)
- actual <- collectRDD(cartesian(rdd, rdd))
- expect_equal(sortKeyValueList(actual),
- list(
- list(1, 1), list(1, 2), list(1, 3),
- list(2, 1), list(2, 2), list(2, 3),
- list(3, 1), list(3, 2), list(3, 3)))
-
- # test case where one RDD is empty
- emptyRdd <- parallelize(sc, list())
- actual <- collectRDD(cartesian(rdd, emptyRdd))
- expect_equal(actual, list())
-
- mockFile <- c("Spark is pretty.", "Spark is awesome.")
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName)
-
- rdd <- textFile(sc, fileName)
- actual <- collectRDD(cartesian(rdd, rdd))
- expected <- list(
- list("Spark is awesome.", "Spark is pretty."),
- list("Spark is awesome.", "Spark is awesome."),
- list("Spark is pretty.", "Spark is pretty."),
- list("Spark is pretty.", "Spark is awesome."))
- expect_equal(sortKeyValueList(actual), expected)
-
- rdd1 <- parallelize(sc, 0:1)
- actual <- collectRDD(cartesian(rdd1, rdd))
- expect_equal(sortKeyValueList(actual),
- list(
- list(0, "Spark is pretty."),
- list(0, "Spark is awesome."),
- list(1, "Spark is pretty."),
- list(1, "Spark is awesome.")))
-
- rdd1 <- map(rdd, function(x) { x })
- actual <- collectRDD(cartesian(rdd, rdd1))
- expect_equal(sortKeyValueList(actual), expected)
-
- unlink(fileName)
-})
-
-test_that("subtract() on RDDs", {
- skip_on_cran()
-
- l <- list(1, 1, 2, 2, 3, 4)
- rdd1 <- parallelize(sc, l)
-
- # subtract by itself
- actual <- collectRDD(subtract(rdd1, rdd1))
- expect_equal(actual, list())
-
- # subtract by an empty RDD
- rdd2 <- parallelize(sc, list())
- actual <- collectRDD(subtract(rdd1, rdd2))
- expect_equal(as.list(sort(as.vector(actual, mode = "integer"))),
- l)
-
- rdd2 <- parallelize(sc, list(2, 4))
- actual <- collectRDD(subtract(rdd1, rdd2))
- expect_equal(as.list(sort(as.vector(actual, mode = "integer"))),
- list(1, 1, 3))
-
- l <- list("a", "a", "b", "b", "c", "d")
- rdd1 <- parallelize(sc, l)
- rdd2 <- parallelize(sc, list("b", "d"))
- actual <- collectRDD(subtract(rdd1, rdd2))
- expect_equal(as.list(sort(as.vector(actual, mode = "character"))),
- list("a", "a", "c"))
-})
-
-test_that("subtractByKey() on pairwise RDDs", {
- skip_on_cran()
-
- l <- list(list("a", 1), list("b", 4),
- list("b", 5), list("a", 2))
- rdd1 <- parallelize(sc, l)
-
- # subtractByKey by itself
- actual <- collectRDD(subtractByKey(rdd1, rdd1))
- expect_equal(actual, list())
-
- # subtractByKey by an empty RDD
- rdd2 <- parallelize(sc, list())
- actual <- collectRDD(subtractByKey(rdd1, rdd2))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(l))
-
- rdd2 <- parallelize(sc, list(list("a", 3), list("c", 1)))
- actual <- collectRDD(subtractByKey(rdd1, rdd2))
- expect_equal(actual,
- list(list("b", 4), list("b", 5)))
-
- l <- list(list(1, 1), list(2, 4),
- list(2, 5), list(1, 2))
- rdd1 <- parallelize(sc, l)
- rdd2 <- parallelize(sc, list(list(1, 3), list(3, 1)))
- actual <- collectRDD(subtractByKey(rdd1, rdd2))
- expect_equal(actual,
- list(list(2, 4), list(2, 5)))
-})
-
-test_that("intersection() on RDDs", {
- skip_on_cran()
-
- # intersection with self
- actual <- collectRDD(intersection(rdd, rdd))
- expect_equal(sort(as.integer(actual)), nums)
-
- # intersection with an empty RDD
- emptyRdd <- parallelize(sc, list())
- actual <- collectRDD(intersection(rdd, emptyRdd))
- expect_equal(actual, list())
-
- rdd1 <- parallelize(sc, list(1, 10, 2, 3, 4, 5))
- rdd2 <- parallelize(sc, list(1, 6, 2, 3, 7, 8))
- actual <- collectRDD(intersection(rdd1, rdd2))
- expect_equal(sort(as.integer(actual)), 1:3)
-})
-
-test_that("join() on pairwise RDDs", {
- skip_on_cran()
-
- rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4)))
- rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3)))
- actual <- collectRDD(joinRDD(rdd1, rdd2, 2L))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(list(list(1, list(1, 2)), list(1, list(1, 3)))))
-
- rdd1 <- parallelize(sc, list(list("a", 1), list("b", 4)))
- rdd2 <- parallelize(sc, list(list("a", 2), list("a", 3)))
- actual <- collectRDD(joinRDD(rdd1, rdd2, 2L))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(list(list("a", list(1, 2)), list("a", list(1, 3)))))
-
- rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2)))
- rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4)))
- actual <- collectRDD(joinRDD(rdd1, rdd2, 2L))
- expect_equal(actual, list())
-
- rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2)))
- rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4)))
- actual <- collectRDD(joinRDD(rdd1, rdd2, 2L))
- expect_equal(actual, list())
-})
-
-test_that("leftOuterJoin() on pairwise RDDs", {
- skip_on_cran()
-
- rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4)))
- rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3)))
- actual <- collectRDD(leftOuterJoin(rdd1, rdd2, 2L))
- expected <- list(list(1, list(1, 2)), list(1, list(1, 3)), list(2, list(4, NULL)))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(expected))
-
- rdd1 <- parallelize(sc, list(list("a", 1), list("b", 4)))
- rdd2 <- parallelize(sc, list(list("a", 2), list("a", 3)))
- actual <- collectRDD(leftOuterJoin(rdd1, rdd2, 2L))
- expected <- list(list("b", list(4, NULL)), list("a", list(1, 2)), list("a", list(1, 3)))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(expected))
-
- rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2)))
- rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4)))
- actual <- collectRDD(leftOuterJoin(rdd1, rdd2, 2L))
- expected <- list(list(1, list(1, NULL)), list(2, list(2, NULL)))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(expected))
-
- rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2)))
- rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4)))
- actual <- collectRDD(leftOuterJoin(rdd1, rdd2, 2L))
- expected <- list(list("b", list(2, NULL)), list("a", list(1, NULL)))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(expected))
-})
-
-test_that("rightOuterJoin() on pairwise RDDs", {
- skip_on_cran()
-
- rdd1 <- parallelize(sc, list(list(1, 2), list(1, 3)))
- rdd2 <- parallelize(sc, list(list(1, 1), list(2, 4)))
- actual <- collectRDD(rightOuterJoin(rdd1, rdd2, 2L))
- expected <- list(list(1, list(2, 1)), list(1, list(3, 1)), list(2, list(NULL, 4)))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-
- rdd1 <- parallelize(sc, list(list("a", 2), list("a", 3)))
- rdd2 <- parallelize(sc, list(list("a", 1), list("b", 4)))
- actual <- collectRDD(rightOuterJoin(rdd1, rdd2, 2L))
- expected <- list(list("b", list(NULL, 4)), list("a", list(2, 1)), list("a", list(3, 1)))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(expected))
-
- rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2)))
- rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4)))
- actual <- collectRDD(rightOuterJoin(rdd1, rdd2, 2L))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(list(list(3, list(NULL, 3)), list(4, list(NULL, 4)))))
-
- rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2)))
- rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4)))
- actual <- collectRDD(rightOuterJoin(rdd1, rdd2, 2L))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(list(list("d", list(NULL, 4)), list("c", list(NULL, 3)))))
-})
-
-test_that("fullOuterJoin() on pairwise RDDs", {
- skip_on_cran()
-
- rdd1 <- parallelize(sc, list(list(1, 2), list(1, 3), list(3, 3)))
- rdd2 <- parallelize(sc, list(list(1, 1), list(2, 4)))
- actual <- collectRDD(fullOuterJoin(rdd1, rdd2, 2L))
- expected <- list(list(1, list(2, 1)), list(1, list(3, 1)),
- list(2, list(NULL, 4)), list(3, list(3, NULL)))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-
- rdd1 <- parallelize(sc, list(list("a", 2), list("a", 3), list("c", 1)))
- rdd2 <- parallelize(sc, list(list("a", 1), list("b", 4)))
- actual <- collectRDD(fullOuterJoin(rdd1, rdd2, 2L))
- expected <- list(list("b", list(NULL, 4)), list("a", list(2, 1)),
- list("a", list(3, 1)), list("c", list(1, NULL)))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(expected))
-
- rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2)))
- rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4)))
- actual <- collectRDD(fullOuterJoin(rdd1, rdd2, 2L))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(list(list(1, list(1, NULL)), list(2, list(2, NULL)),
- list(3, list(NULL, 3)), list(4, list(NULL, 4)))))
-
- rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2)))
- rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4)))
- actual <- collectRDD(fullOuterJoin(rdd1, rdd2, 2L))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(list(list("a", list(1, NULL)), list("b", list(2, NULL)),
- list("d", list(NULL, 4)), list("c", list(NULL, 3)))))
-})
-
-test_that("sortByKey() on pairwise RDDs", {
- skip_on_cran()
-
- numPairsRdd <- map(rdd, function(x) { list (x, x) })
- sortedRdd <- sortByKey(numPairsRdd, ascending = FALSE)
- actual <- collectRDD(sortedRdd)
- numPairs <- lapply(nums, function(x) { list (x, x) })
- expect_equal(actual, sortKeyValueList(numPairs, decreasing = TRUE))
-
- rdd2 <- parallelize(sc, sort(nums, decreasing = TRUE), 2L)
- numPairsRdd2 <- map(rdd2, function(x) { list (x, x) })
- sortedRdd2 <- sortByKey(numPairsRdd2)
- actual <- collectRDD(sortedRdd2)
- expect_equal(actual, numPairs)
-
- # sort by string keys
- l <- list(list("a", 1), list("b", 2), list("1", 3), list("d", 4), list("2", 5))
- rdd3 <- parallelize(sc, l, 2L)
- sortedRdd3 <- sortByKey(rdd3)
- actual <- collectRDD(sortedRdd3)
- expect_equal(actual, list(list("1", 3), list("2", 5), list("a", 1), list("b", 2), list("d", 4)))
-
- # test on the boundary cases
-
- # boundary case 1: the RDD to be sorted has only 1 partition
- rdd4 <- parallelize(sc, l, 1L)
- sortedRdd4 <- sortByKey(rdd4)
- actual <- collectRDD(sortedRdd4)
- expect_equal(actual, list(list("1", 3), list("2", 5), list("a", 1), list("b", 2), list("d", 4)))
-
- # boundary case 2: the sorted RDD has only 1 partition
- rdd5 <- parallelize(sc, l, 2L)
- sortedRdd5 <- sortByKey(rdd5, numPartitions = 1L)
- actual <- collectRDD(sortedRdd5)
- expect_equal(actual, list(list("1", 3), list("2", 5), list("a", 1), list("b", 2), list("d", 4)))
-
- # boundary case 3: the RDD to be sorted has only 1 element
- l2 <- list(list("a", 1))
- rdd6 <- parallelize(sc, l2, 2L)
- sortedRdd6 <- sortByKey(rdd6)
- actual <- collectRDD(sortedRdd6)
- expect_equal(actual, l2)
-
- # boundary case 4: the RDD to be sorted has 0 element
- l3 <- list()
- rdd7 <- parallelize(sc, l3, 2L)
- sortedRdd7 <- sortByKey(rdd7)
- actual <- collectRDD(sortedRdd7)
- expect_equal(actual, l3)
-})
-
-test_that("collectAsMap() on a pairwise RDD", {
- skip_on_cran()
-
- rdd <- parallelize(sc, list(list(1, 2), list(3, 4)))
- vals <- collectAsMap(rdd)
- expect_equal(vals, list(`1` = 2, `3` = 4))
-
- rdd <- parallelize(sc, list(list("a", 1), list("b", 2)))
- vals <- collectAsMap(rdd)
- expect_equal(vals, list(a = 1, b = 2))
-
- rdd <- parallelize(sc, list(list(1.1, 2.2), list(1.2, 2.4)))
- vals <- collectAsMap(rdd)
- expect_equal(vals, list(`1.1` = 2.2, `1.2` = 2.4))
-
- rdd <- parallelize(sc, list(list(1, "a"), list(2, "b")))
- vals <- collectAsMap(rdd)
- expect_equal(vals, list(`1` = "a", `2` = "b"))
-})
-
-test_that("show()", {
- skip_on_cran()
-
- rdd <- parallelize(sc, list(1:10))
- expect_output(showRDD(rdd), "ParallelCollectionRDD\\[\\d+\\] at parallelize at RRDD\\.scala:\\d+")
-})
-
-test_that("sampleByKey() on pairwise RDDs", {
- skip_on_cran()
-
- rdd <- parallelize(sc, 1:2000)
- pairsRDD <- lapply(rdd, function(x) { if (x %% 2 == 0) list("a", x) else list("b", x) })
- fractions <- list(a = 0.2, b = 0.1)
- sample <- sampleByKey(pairsRDD, FALSE, fractions, 1618L)
- expect_equal(100 < length(lookup(sample, "a")) && 300 > length(lookup(sample, "a")), TRUE)
- expect_equal(50 < length(lookup(sample, "b")) && 150 > length(lookup(sample, "b")), TRUE)
- expect_equal(lookup(sample, "a")[which.min(lookup(sample, "a"))] >= 0, TRUE)
- expect_equal(lookup(sample, "a")[which.max(lookup(sample, "a"))] <= 2000, TRUE)
- expect_equal(lookup(sample, "b")[which.min(lookup(sample, "b"))] >= 0, TRUE)
- expect_equal(lookup(sample, "b")[which.max(lookup(sample, "b"))] <= 2000, TRUE)
-
- rdd <- parallelize(sc, 1:2000)
- pairsRDD <- lapply(rdd, function(x) { if (x %% 2 == 0) list(2, x) else list(3, x) })
- fractions <- list(`2` = 0.2, `3` = 0.1)
- sample <- sampleByKey(pairsRDD, TRUE, fractions, 1618L)
- expect_equal(100 < length(lookup(sample, 2)) && 300 > length(lookup(sample, 2)), TRUE)
- expect_equal(50 < length(lookup(sample, 3)) && 150 > length(lookup(sample, 3)), TRUE)
- expect_equal(lookup(sample, 2)[which.min(lookup(sample, 2))] >= 0, TRUE)
- expect_equal(lookup(sample, 2)[which.max(lookup(sample, 2))] <= 2000, TRUE)
- expect_equal(lookup(sample, 3)[which.min(lookup(sample, 3))] >= 0, TRUE)
- expect_equal(lookup(sample, 3)[which.max(lookup(sample, 3))] <= 2000, TRUE)
-})
-
-test_that("Test correct concurrency of RRDD.compute()", {
- skip_on_cran()
-
- rdd <- parallelize(sc, 1:1000, 100)
- jrdd <- getJRDD(lapply(rdd, function(x) { x }), "row")
- zrdd <- callJMethod(jrdd, "zip", jrdd)
- count <- callJMethod(zrdd, "count")
- expect_equal(count, 1000)
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_shuffle.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_shuffle.R b/R/pkg/inst/tests/testthat/test_shuffle.R
deleted file mode 100644
index 18320ea..0000000
--- a/R/pkg/inst/tests/testthat/test_shuffle.R
+++ /dev/null
@@ -1,248 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("partitionBy, groupByKey, reduceByKey etc.")
-
-# JavaSparkContext handle
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-# Data
-intPairs <- list(list(1L, -1), list(2L, 100), list(2L, 1), list(1L, 200))
-intRdd <- parallelize(sc, intPairs, 2L)
-
-doublePairs <- list(list(1.5, -1), list(2.5, 100), list(2.5, 1), list(1.5, 200))
-doubleRdd <- parallelize(sc, doublePairs, 2L)
-
-numPairs <- list(list(1L, 100), list(2L, 200), list(4L, -1), list(3L, 1),
- list(3L, 0))
-numPairsRdd <- parallelize(sc, numPairs, length(numPairs))
-
-strList <- list("Dexter Morgan: Blood. Sometimes it sets my teeth on edge and ",
- "Dexter Morgan: Harry and Dorris Morgan did a wonderful job ")
-strListRDD <- parallelize(sc, strList, 4)
-
-test_that("groupByKey for integers", {
- skip_on_cran()
-
- grouped <- groupByKey(intRdd, 2L)
-
- actual <- collectRDD(grouped)
-
- expected <- list(list(2L, list(100, 1)), list(1L, list(-1, 200)))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-})
-
-test_that("groupByKey for doubles", {
- skip_on_cran()
-
- grouped <- groupByKey(doubleRdd, 2L)
-
- actual <- collectRDD(grouped)
-
- expected <- list(list(1.5, list(-1, 200)), list(2.5, list(100, 1)))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-})
-
-test_that("reduceByKey for ints", {
- skip_on_cran()
-
- reduced <- reduceByKey(intRdd, "+", 2L)
-
- actual <- collectRDD(reduced)
-
- expected <- list(list(2L, 101), list(1L, 199))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-})
-
-test_that("reduceByKey for doubles", {
- skip_on_cran()
-
- reduced <- reduceByKey(doubleRdd, "+", 2L)
- actual <- collectRDD(reduced)
-
- expected <- list(list(1.5, 199), list(2.5, 101))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-})
-
-test_that("combineByKey for ints", {
- skip_on_cran()
-
- reduced <- combineByKey(intRdd, function(x) { x }, "+", "+", 2L)
-
- actual <- collectRDD(reduced)
-
- expected <- list(list(2L, 101), list(1L, 199))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-})
-
-test_that("combineByKey for doubles", {
- skip_on_cran()
-
- reduced <- combineByKey(doubleRdd, function(x) { x }, "+", "+", 2L)
- actual <- collectRDD(reduced)
-
- expected <- list(list(1.5, 199), list(2.5, 101))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-})
-
-test_that("combineByKey for characters", {
- skip_on_cran()
-
- stringKeyRDD <- parallelize(sc,
- list(list("max", 1L), list("min", 2L),
- list("other", 3L), list("max", 4L)), 2L)
- reduced <- combineByKey(stringKeyRDD,
- function(x) { x }, "+", "+", 2L)
- actual <- collectRDD(reduced)
-
- expected <- list(list("max", 5L), list("min", 2L), list("other", 3L))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-})
-
-test_that("aggregateByKey", {
- skip_on_cran()
-
- # test aggregateByKey for int keys
- rdd <- parallelize(sc, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
-
- zeroValue <- list(0, 0)
- seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
- combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
- aggregatedRDD <- aggregateByKey(rdd, zeroValue, seqOp, combOp, 2L)
-
- actual <- collectRDD(aggregatedRDD)
-
- expected <- list(list(1, list(3, 2)), list(2, list(7, 2)))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-
- # test aggregateByKey for string keys
- rdd <- parallelize(sc, list(list("a", 1), list("a", 2), list("b", 3), list("b", 4)))
-
- zeroValue <- list(0, 0)
- seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
- combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
- aggregatedRDD <- aggregateByKey(rdd, zeroValue, seqOp, combOp, 2L)
-
- actual <- collectRDD(aggregatedRDD)
-
- expected <- list(list("a", list(3, 2)), list("b", list(7, 2)))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-})
-
-test_that("foldByKey", {
- skip_on_cran()
-
- # test foldByKey for int keys
- folded <- foldByKey(intRdd, 0, "+", 2L)
-
- actual <- collectRDD(folded)
-
- expected <- list(list(2L, 101), list(1L, 199))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-
- # test foldByKey for double keys
- folded <- foldByKey(doubleRdd, 0, "+", 2L)
-
- actual <- collectRDD(folded)
-
- expected <- list(list(1.5, 199), list(2.5, 101))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-
- # test foldByKey for string keys
- stringKeyPairs <- list(list("a", -1), list("b", 100), list("b", 1), list("a", 200))
-
- stringKeyRDD <- parallelize(sc, stringKeyPairs)
- folded <- foldByKey(stringKeyRDD, 0, "+", 2L)
-
- actual <- collectRDD(folded)
-
- expected <- list(list("b", 101), list("a", 199))
- expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
-
- # test foldByKey for empty pair RDD
- rdd <- parallelize(sc, list())
- folded <- foldByKey(rdd, 0, "+", 2L)
- actual <- collectRDD(folded)
- expected <- list()
- expect_equal(actual, expected)
-
- # test foldByKey for RDD with only 1 pair
- rdd <- parallelize(sc, list(list(1, 1)))
- folded <- foldByKey(rdd, 0, "+", 2L)
- actual <- collectRDD(folded)
- expected <- list(list(1, 1))
- expect_equal(actual, expected)
-})
-
-test_that("partitionBy() partitions data correctly", {
- skip_on_cran()
-
- # Partition by magnitude
- partitionByMagnitude <- function(key) { if (key >= 3) 1 else 0 }
-
- resultRDD <- partitionByRDD(numPairsRdd, 2L, partitionByMagnitude)
-
- expected_first <- list(list(1, 100), list(2, 200)) # key less than 3
- expected_second <- list(list(4, -1), list(3, 1), list(3, 0)) # key greater than or equal 3
- actual_first <- collectPartition(resultRDD, 0L)
- actual_second <- collectPartition(resultRDD, 1L)
-
- expect_equal(sortKeyValueList(actual_first), sortKeyValueList(expected_first))
- expect_equal(sortKeyValueList(actual_second), sortKeyValueList(expected_second))
-})
-
-test_that("partitionBy works with dependencies", {
- skip_on_cran()
-
- kOne <- 1
- partitionByParity <- function(key) { if (key %% 2 == kOne) 7 else 4 }
-
- # Partition by parity
- resultRDD <- partitionByRDD(numPairsRdd, numPartitions = 2L, partitionByParity)
-
- # keys even; 100 %% 2 == 0
- expected_first <- list(list(2, 200), list(4, -1))
- # keys odd; 3 %% 2 == 1
- expected_second <- list(list(1, 100), list(3, 1), list(3, 0))
- actual_first <- collectPartition(resultRDD, 0L)
- actual_second <- collectPartition(resultRDD, 1L)
-
- expect_equal(sortKeyValueList(actual_first), sortKeyValueList(expected_first))
- expect_equal(sortKeyValueList(actual_second), sortKeyValueList(expected_second))
-})
-
-test_that("test partitionBy with string keys", {
- skip_on_cran()
-
- words <- flatMap(strListRDD, function(line) { strsplit(line, " ")[[1]] })
- wordCount <- lapply(words, function(word) { list(word, 1L) })
-
- resultRDD <- partitionByRDD(wordCount, 2L)
- expected_first <- list(list("Dexter", 1), list("Dexter", 1))
- expected_second <- list(list("and", 1), list("and", 1))
-
- actual_first <- Filter(function(item) { item[[1]] == "Dexter" },
- collectPartition(resultRDD, 0L))
- actual_second <- Filter(function(item) { item[[1]] == "and" },
- collectPartition(resultRDD, 1L))
-
- expect_equal(sortKeyValueList(actual_first), sortKeyValueList(expected_first))
- expect_equal(sortKeyValueList(actual_second), sortKeyValueList(expected_second))
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_sparkR.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_sparkR.R b/R/pkg/inst/tests/testthat/test_sparkR.R
deleted file mode 100644
index a40981c..0000000
--- a/R/pkg/inst/tests/testthat/test_sparkR.R
+++ /dev/null
@@ -1,48 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("functions in sparkR.R")
-
-test_that("sparkCheckInstall", {
- skip_on_cran()
-
- # "local, yarn-client, mesos-client" mode, SPARK_HOME was set correctly,
- # and the SparkR job was submitted by "spark-submit"
- sparkHome <- paste0(tempdir(), "/", "sparkHome")
- dir.create(sparkHome)
- master <- ""
- deployMode <- ""
- expect_true(is.null(sparkCheckInstall(sparkHome, master, deployMode)))
- unlink(sparkHome, recursive = TRUE)
-
- # "yarn-cluster, mesos-cluster" mode, SPARK_HOME was not set,
- # and the SparkR job was submitted by "spark-submit"
- sparkHome <- ""
- master <- ""
- deployMode <- ""
- expect_true(is.null(sparkCheckInstall(sparkHome, master, deployMode)))
-
- # "yarn-client, mesos-client" mode, SPARK_HOME was not set
- sparkHome <- ""
- master <- "yarn-client"
- deployMode <- ""
- expect_error(sparkCheckInstall(sparkHome, master, deployMode))
- sparkHome <- ""
- master <- ""
- deployMode <- "client"
- expect_error(sparkCheckInstall(sparkHome, master, deployMode))
-})
[7/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic
tests only for CRAN
Posted by fe...@apache.org.
[SPARK-20877][SPARKR] refactor tests to basic tests only for CRAN
## What changes were proposed in this pull request?
Move all existing tests to a non-installed directory so that they will never run when the SparkR package is installed
For a follow-up PR:
- remove all skip_on_cran() calls in tests
- clean up test timer
- improve or change basic tests that do run on CRAN (if anyone has suggestions)
It looks like `R CMD build pkg` will still put pkg/tests (i.e. the full tests) into the source package, but `R CMD INSTALL` on such a source package does not install these tests (and so `R CMD check` does not run them)
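For illustration, a minimal sketch of the kind of installed-test entry point this layout relies on. This is an assumption for readability only: the real R/pkg/tests/run-all.R change is listed in the diffstat below as an 8-line addition, but its contents are not shown in this part of the diff.

```r
library(testthat)
library(SparkR)

# Only the lightweight suites that ship with the installed package
# (tests/testthat) are discovered here; the heavy suites moved to
# R/pkg/tests/fulltests remain in the source tarball but are never
# installed, so `R CMD check` on CRAN does not run them.

# Global master used by the individual test files (assumed default);
# a single local core keeps CRAN check machines lightly loaded.
sparkRTestMaster <- "local[1]"

test_package("SparkR")
```

The directory move does the work: `test_package()` can only discover tests that were installed with the package, so anything left under fulltests stays invisible to CRAN checks.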
## How was this patch tested?
- [x] unit tests, Jenkins
- [x] AppVeyor
- [x] make a source package, install it, `R CMD check` it; verify the full tests are not installed or run
Author: Felix Cheung <fe...@hotmail.com>
Closes #18264 from felixcheung/rtestset.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dc4c3518
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dc4c3518
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dc4c3518
Branch: refs/heads/master
Commit: dc4c351837879dab26ad8fb471dc51c06832a9e4
Parents: 5301a19
Author: Felix Cheung <fe...@hotmail.com>
Authored: Sun Jun 11 00:00:33 2017 -0700
Committer: Felix Cheung <fe...@apache.org>
Committed: Sun Jun 11 00:00:33 2017 -0700
----------------------------------------------------------------------
R/pkg/inst/tests/testthat/jarTest.R | 32 -
R/pkg/inst/tests/testthat/packageInAJarTest.R | 30 -
R/pkg/inst/tests/testthat/test_Serde.R | 85 -
R/pkg/inst/tests/testthat/test_Windows.R | 32 -
R/pkg/inst/tests/testthat/test_basic.R | 90 +
R/pkg/inst/tests/testthat/test_binaryFile.R | 100 -
.../inst/tests/testthat/test_binary_function.R | 110 -
R/pkg/inst/tests/testthat/test_broadcast.R | 55 -
R/pkg/inst/tests/testthat/test_client.R | 51 -
R/pkg/inst/tests/testthat/test_context.R | 226 --
R/pkg/inst/tests/testthat/test_includePackage.R | 64 -
R/pkg/inst/tests/testthat/test_jvm_api.R | 36 -
.../tests/testthat/test_mllib_classification.R | 396 --
.../inst/tests/testthat/test_mllib_clustering.R | 328 --
R/pkg/inst/tests/testthat/test_mllib_fpm.R | 85 -
.../tests/testthat/test_mllib_recommendation.R | 67 -
.../inst/tests/testthat/test_mllib_regression.R | 480 ---
R/pkg/inst/tests/testthat/test_mllib_stat.R | 53 -
R/pkg/inst/tests/testthat/test_mllib_tree.R | 320 --
.../tests/testthat/test_parallelize_collect.R | 120 -
R/pkg/inst/tests/testthat/test_rdd.R | 906 -----
R/pkg/inst/tests/testthat/test_shuffle.R | 248 --
R/pkg/inst/tests/testthat/test_sparkR.R | 48 -
R/pkg/inst/tests/testthat/test_sparkSQL.R | 3474 ------------------
R/pkg/inst/tests/testthat/test_streaming.R | 167 -
R/pkg/inst/tests/testthat/test_take.R | 71 -
R/pkg/inst/tests/testthat/test_textFile.R | 182 -
R/pkg/inst/tests/testthat/test_utils.R | 248 --
R/pkg/tests/fulltests/jarTest.R | 32 +
R/pkg/tests/fulltests/packageInAJarTest.R | 30 +
R/pkg/tests/fulltests/test_Serde.R | 85 +
R/pkg/tests/fulltests/test_Windows.R | 32 +
R/pkg/tests/fulltests/test_binaryFile.R | 100 +
R/pkg/tests/fulltests/test_binary_function.R | 110 +
R/pkg/tests/fulltests/test_broadcast.R | 55 +
R/pkg/tests/fulltests/test_client.R | 51 +
R/pkg/tests/fulltests/test_context.R | 226 ++
R/pkg/tests/fulltests/test_includePackage.R | 64 +
R/pkg/tests/fulltests/test_jvm_api.R | 36 +
.../tests/fulltests/test_mllib_classification.R | 396 ++
R/pkg/tests/fulltests/test_mllib_clustering.R | 328 ++
R/pkg/tests/fulltests/test_mllib_fpm.R | 85 +
.../tests/fulltests/test_mllib_recommendation.R | 67 +
R/pkg/tests/fulltests/test_mllib_regression.R | 480 +++
R/pkg/tests/fulltests/test_mllib_stat.R | 53 +
R/pkg/tests/fulltests/test_mllib_tree.R | 320 ++
.../tests/fulltests/test_parallelize_collect.R | 120 +
R/pkg/tests/fulltests/test_rdd.R | 906 +++++
R/pkg/tests/fulltests/test_shuffle.R | 248 ++
R/pkg/tests/fulltests/test_sparkR.R | 48 +
R/pkg/tests/fulltests/test_sparkSQL.R | 3474 ++++++++++++++++++
R/pkg/tests/fulltests/test_streaming.R | 167 +
R/pkg/tests/fulltests/test_take.R | 71 +
R/pkg/tests/fulltests/test_textFile.R | 182 +
R/pkg/tests/fulltests/test_utils.R | 248 ++
R/pkg/tests/run-all.R | 8 +
56 files changed, 8112 insertions(+), 8014 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/jarTest.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/jarTest.R b/R/pkg/inst/tests/testthat/jarTest.R
deleted file mode 100644
index e2241e0..0000000
--- a/R/pkg/inst/tests/testthat/jarTest.R
+++ /dev/null
@@ -1,32 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-library(SparkR)
-
-sc <- sparkR.session(master = "local[1]")
-
-helloTest <- SparkR:::callJStatic("sparkrtest.DummyClass",
- "helloWorld",
- "Dave")
-stopifnot(identical(helloTest, "Hello Dave"))
-
-basicFunction <- SparkR:::callJStatic("sparkrtest.DummyClass",
- "addStuff",
- 2L,
- 2L)
-stopifnot(basicFunction == 4L)
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/packageInAJarTest.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/packageInAJarTest.R b/R/pkg/inst/tests/testthat/packageInAJarTest.R
deleted file mode 100644
index ac70626..0000000
--- a/R/pkg/inst/tests/testthat/packageInAJarTest.R
+++ /dev/null
@@ -1,30 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-library(SparkR)
-library(sparkPackageTest)
-
-sparkR.session(master = "local[1]")
-
-run1 <- myfunc(5L)
-
-run2 <- myfunc(-4L)
-
-sparkR.session.stop()
-
-if (run1 != 6) quit(save = "no", status = 1)
-
-if (run2 != -3) quit(save = "no", status = 1)
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_Serde.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_Serde.R b/R/pkg/inst/tests/testthat/test_Serde.R
deleted file mode 100644
index 6e160fa..0000000
--- a/R/pkg/inst/tests/testthat/test_Serde.R
+++ /dev/null
@@ -1,85 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("SerDe functionality")
-
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-test_that("SerDe of primitive types", {
- skip_on_cran()
-
- x <- callJStatic("SparkRHandler", "echo", 1L)
- expect_equal(x, 1L)
- expect_equal(class(x), "integer")
-
- x <- callJStatic("SparkRHandler", "echo", 1)
- expect_equal(x, 1)
- expect_equal(class(x), "numeric")
-
- x <- callJStatic("SparkRHandler", "echo", TRUE)
- expect_true(x)
- expect_equal(class(x), "logical")
-
- x <- callJStatic("SparkRHandler", "echo", "abc")
- expect_equal(x, "abc")
- expect_equal(class(x), "character")
-})
-
-test_that("SerDe of list of primitive types", {
- skip_on_cran()
-
- x <- list(1L, 2L, 3L)
- y <- callJStatic("SparkRHandler", "echo", x)
- expect_equal(x, y)
- expect_equal(class(y[[1]]), "integer")
-
- x <- list(1, 2, 3)
- y <- callJStatic("SparkRHandler", "echo", x)
- expect_equal(x, y)
- expect_equal(class(y[[1]]), "numeric")
-
- x <- list(TRUE, FALSE)
- y <- callJStatic("SparkRHandler", "echo", x)
- expect_equal(x, y)
- expect_equal(class(y[[1]]), "logical")
-
- x <- list("a", "b", "c")
- y <- callJStatic("SparkRHandler", "echo", x)
- expect_equal(x, y)
- expect_equal(class(y[[1]]), "character")
-
- # Empty list
- x <- list()
- y <- callJStatic("SparkRHandler", "echo", x)
- expect_equal(x, y)
-})
-
-test_that("SerDe of list of lists", {
- skip_on_cran()
-
- x <- list(list(1L, 2L, 3L), list(1, 2, 3),
- list(TRUE, FALSE), list("a", "b", "c"))
- y <- callJStatic("SparkRHandler", "echo", x)
- expect_equal(x, y)
-
- # List of empty lists
- x <- list(list(), list())
- y <- callJStatic("SparkRHandler", "echo", x)
- expect_equal(x, y)
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_Windows.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_Windows.R b/R/pkg/inst/tests/testthat/test_Windows.R
deleted file mode 100644
index 00d684e..0000000
--- a/R/pkg/inst/tests/testthat/test_Windows.R
+++ /dev/null
@@ -1,32 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-context("Windows-specific tests")
-
-test_that("sparkJars tag in SparkContext", {
- skip_on_cran()
-
- if (.Platform$OS.type != "windows") {
- skip("This test is only for Windows, skipped")
- }
-
- testOutput <- launchScript("ECHO", "a/b/c", wait = TRUE)
- abcPath <- testOutput[1]
- expect_equal(abcPath, "a\\b\\c")
-})
-
-message("--- End test (Windows) ", as.POSIXct(Sys.time(), tz = "GMT"))
-message("elapsed ", (proc.time() - timer_ptm)[3])
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_basic.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_basic.R b/R/pkg/inst/tests/testthat/test_basic.R
new file mode 100644
index 0000000..de47162
--- /dev/null
+++ b/R/pkg/inst/tests/testthat/test_basic.R
@@ -0,0 +1,90 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("basic tests for CRAN")
+
+test_that("create DataFrame from list or data.frame", {
+ sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+ i <- 4
+ df <- createDataFrame(data.frame(dummy = 1:i))
+ expect_equal(count(df), i)
+
+ l <- list(list(a = 1, b = 2), list(a = 3, b = 4))
+ df <- createDataFrame(l)
+ expect_equal(columns(df), c("a", "b"))
+
+ a <- 1:3
+ b <- c("a", "b", "c")
+ ldf <- data.frame(a, b)
+ df <- createDataFrame(ldf)
+ expect_equal(columns(df), c("a", "b"))
+ expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
+ expect_equal(count(df), 3)
+ ldf2 <- collect(df)
+ expect_equal(ldf$a, ldf2$a)
+
+ mtcarsdf <- createDataFrame(mtcars)
+ expect_equivalent(collect(mtcarsdf), mtcars)
+
+ bytes <- as.raw(c(1, 2, 3))
+ df <- createDataFrame(list(list(bytes)))
+ expect_equal(collect(df)[[1]][[1]], bytes)
+
+ sparkR.session.stop()
+})
+
+test_that("spark.glm and predict", {
+ sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+ training <- suppressWarnings(createDataFrame(iris))
+ # gaussian family
+ model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species)
+ prediction <- predict(model, training)
+ expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+ vals <- collect(select(prediction, "prediction"))
+ rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
+ expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+
+ # Gamma family
+ x <- runif(100, -1, 1)
+ y <- rgamma(100, rate = 10 / exp(0.5 + 1.2 * x), shape = 10)
+ df <- as.DataFrame(as.data.frame(list(x = x, y = y)))
+ model <- glm(y ~ x, family = Gamma, df)
+ out <- capture.output(print(summary(model)))
+ expect_true(any(grepl("Dispersion parameter for gamma family", out)))
+
+ # tweedie family
+ model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
+ family = "tweedie", var.power = 1.2, link.power = 0.0)
+ prediction <- predict(model, training)
+ expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+ vals <- collect(select(prediction, "prediction"))
+
+ # manual calculation of the R predicted values to avoid dependence on statmod
+ #' library(statmod)
+ #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
+ #' family = tweedie(var.power = 1.2, link.power = 0.0))
+ #' print(coef(rModel))
+
+ rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174)
+ rVals <- exp(as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species,
+ data = iris) %*% rCoef))
+ expect_true(all(abs(rVals - vals) < 1e-5), rVals - vals)
+
+ sparkR.session.stop()
+})
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_binaryFile.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_binaryFile.R b/R/pkg/inst/tests/testthat/test_binaryFile.R
deleted file mode 100644
index 00954fa..0000000
--- a/R/pkg/inst/tests/testthat/test_binaryFile.R
+++ /dev/null
@@ -1,100 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("functions on binary files")
-
-# JavaSparkContext handle
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-mockFile <- c("Spark is pretty.", "Spark is awesome.")
-
-test_that("saveAsObjectFile()/objectFile() following textFile() works", {
- skip_on_cran()
-
- fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
- fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName1)
-
- rdd <- textFile(sc, fileName1, 1)
- saveAsObjectFile(rdd, fileName2)
- rdd <- objectFile(sc, fileName2)
- expect_equal(collectRDD(rdd), as.list(mockFile))
-
- unlink(fileName1)
- unlink(fileName2, recursive = TRUE)
-})
-
-test_that("saveAsObjectFile()/objectFile() works on a parallelized list", {
- skip_on_cran()
-
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
-
- l <- list(1, 2, 3)
- rdd <- parallelize(sc, l, 1)
- saveAsObjectFile(rdd, fileName)
- rdd <- objectFile(sc, fileName)
- expect_equal(collectRDD(rdd), l)
-
- unlink(fileName, recursive = TRUE)
-})
-
-test_that("saveAsObjectFile()/objectFile() following RDD transformations works", {
- skip_on_cran()
-
- fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
- fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName1)
-
- rdd <- textFile(sc, fileName1)
-
- words <- flatMap(rdd, function(line) { strsplit(line, " ")[[1]] })
- wordCount <- lapply(words, function(word) { list(word, 1L) })
-
- counts <- reduceByKey(wordCount, "+", 2L)
-
- saveAsObjectFile(counts, fileName2)
- counts <- objectFile(sc, fileName2)
-
- output <- collectRDD(counts)
- expected <- list(list("awesome.", 1), list("Spark", 2), list("pretty.", 1),
- list("is", 2))
- expect_equal(sortKeyValueList(output), sortKeyValueList(expected))
-
- unlink(fileName1)
- unlink(fileName2, recursive = TRUE)
-})
-
-test_that("saveAsObjectFile()/objectFile() works with multiple paths", {
- skip_on_cran()
-
- fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
- fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
-
- rdd1 <- parallelize(sc, "Spark is pretty.")
- saveAsObjectFile(rdd1, fileName1)
- rdd2 <- parallelize(sc, "Spark is awesome.")
- saveAsObjectFile(rdd2, fileName2)
-
- rdd <- objectFile(sc, c(fileName1, fileName2))
- expect_equal(countRDD(rdd), 2)
-
- unlink(fileName1, recursive = TRUE)
- unlink(fileName2, recursive = TRUE)
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_binary_function.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_binary_function.R b/R/pkg/inst/tests/testthat/test_binary_function.R
deleted file mode 100644
index 236cb38..0000000
--- a/R/pkg/inst/tests/testthat/test_binary_function.R
+++ /dev/null
@@ -1,110 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("binary functions")
-
-# JavaSparkContext handle
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-# Data
-nums <- 1:10
-rdd <- parallelize(sc, nums, 2L)
-
-# File content
-mockFile <- c("Spark is pretty.", "Spark is awesome.")
-
-test_that("union on two RDDs", {
- skip_on_cran()
-
- actual <- collectRDD(unionRDD(rdd, rdd))
- expect_equal(actual, as.list(rep(nums, 2)))
-
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName)
-
- text.rdd <- textFile(sc, fileName)
- union.rdd <- unionRDD(rdd, text.rdd)
- actual <- collectRDD(union.rdd)
- expect_equal(actual, c(as.list(nums), mockFile))
- expect_equal(getSerializedMode(union.rdd), "byte")
-
- rdd <- map(text.rdd, function(x) {x})
- union.rdd <- unionRDD(rdd, text.rdd)
- actual <- collectRDD(union.rdd)
- expect_equal(actual, as.list(c(mockFile, mockFile)))
- expect_equal(getSerializedMode(union.rdd), "byte")
-
- unlink(fileName)
-})
-
-test_that("cogroup on two RDDs", {
- skip_on_cran()
-
- rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4)))
- rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3)))
- cogroup.rdd <- cogroup(rdd1, rdd2, numPartitions = 2L)
- actual <- collectRDD(cogroup.rdd)
- expect_equal(actual,
- list(list(1, list(list(1), list(2, 3))), list(2, list(list(4), list()))))
-
- rdd1 <- parallelize(sc, list(list("a", 1), list("a", 4)))
- rdd2 <- parallelize(sc, list(list("b", 2), list("a", 3)))
- cogroup.rdd <- cogroup(rdd1, rdd2, numPartitions = 2L)
- actual <- collectRDD(cogroup.rdd)
-
- expected <- list(list("b", list(list(), list(2))), list("a", list(list(1, 4), list(3))))
- expect_equal(sortKeyValueList(actual),
- sortKeyValueList(expected))
-})
-
-test_that("zipPartitions() on RDDs", {
- skip_on_cran()
-
- rdd1 <- parallelize(sc, 1:2, 2L) # 1, 2
- rdd2 <- parallelize(sc, 1:4, 2L) # 1:2, 3:4
- rdd3 <- parallelize(sc, 1:6, 2L) # 1:3, 4:6
- actual <- collectRDD(zipPartitions(rdd1, rdd2, rdd3,
- func = function(x, y, z) { list(list(x, y, z))} ))
- expect_equal(actual,
- list(list(1, c(1, 2), c(1, 2, 3)), list(2, c(3, 4), c(4, 5, 6))))
-
- mockFile <- c("Spark is pretty.", "Spark is awesome.")
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName)
-
- rdd <- textFile(sc, fileName, 1)
- actual <- collectRDD(zipPartitions(rdd, rdd,
- func = function(x, y) { list(paste(x, y, sep = "\n")) }))
- expected <- list(paste(mockFile, mockFile, sep = "\n"))
- expect_equal(actual, expected)
-
- rdd1 <- parallelize(sc, 0:1, 1)
- actual <- collectRDD(zipPartitions(rdd1, rdd,
- func = function(x, y) { list(x + nchar(y)) }))
- expected <- list(0:1 + nchar(mockFile))
- expect_equal(actual, expected)
-
- rdd <- map(rdd, function(x) { x })
- actual <- collectRDD(zipPartitions(rdd, rdd1,
- func = function(x, y) { list(y + nchar(x)) }))
- expect_equal(actual, expected)
-
- unlink(fileName)
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_broadcast.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_broadcast.R b/R/pkg/inst/tests/testthat/test_broadcast.R
deleted file mode 100644
index 2c96740..0000000
--- a/R/pkg/inst/tests/testthat/test_broadcast.R
+++ /dev/null
@@ -1,55 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("broadcast variables")
-
-# JavaSparkContext handle
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-# Partitioned data
-nums <- 1:2
-rrdd <- parallelize(sc, nums, 2L)
-
-test_that("using broadcast variable", {
- skip_on_cran()
-
- randomMat <- matrix(nrow = 10, ncol = 10, data = rnorm(100))
- randomMatBr <- broadcastRDD(sc, randomMat)
-
- useBroadcast <- function(x) {
- sum(SparkR:::value(randomMatBr) * x)
- }
- actual <- collectRDD(lapply(rrdd, useBroadcast))
- expected <- list(sum(randomMat) * 1, sum(randomMat) * 2)
- expect_equal(actual, expected)
-})
-
-test_that("without using broadcast variable", {
- skip_on_cran()
-
- randomMat <- matrix(nrow = 10, ncol = 10, data = rnorm(100))
-
- useBroadcast <- function(x) {
- sum(randomMat * x)
- }
- actual <- collectRDD(lapply(rrdd, useBroadcast))
- expected <- list(sum(randomMat) * 1, sum(randomMat) * 2)
- expect_equal(actual, expected)
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_client.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_client.R b/R/pkg/inst/tests/testthat/test_client.R
deleted file mode 100644
index 3d53beb..0000000
--- a/R/pkg/inst/tests/testthat/test_client.R
+++ /dev/null
@@ -1,51 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("functions in client.R")
-
-test_that("adding spark-testing-base as a package works", {
- skip_on_cran()
-
- args <- generateSparkSubmitArgs("", "", "", "",
- "holdenk:spark-testing-base:1.3.0_0.0.5")
- expect_equal(gsub("[[:space:]]", "", args),
- gsub("[[:space:]]", "",
- "--packages holdenk:spark-testing-base:1.3.0_0.0.5"))
-})
-
-test_that("no package specified doesn't add packages flag", {
- skip_on_cran()
-
- args <- generateSparkSubmitArgs("", "", "", "", "")
- expect_equal(gsub("[[:space:]]", "", args),
- "")
-})
-
-test_that("multiple packages don't produce a warning", {
- skip_on_cran()
-
- expect_warning(generateSparkSubmitArgs("", "", "", "", c("A", "B")), NA)
-})
-
-test_that("sparkJars sparkPackages as character vectors", {
- skip_on_cran()
-
- args <- generateSparkSubmitArgs("", "", c("one.jar", "two.jar", "three.jar"), "",
- c("com.databricks:spark-avro_2.10:2.0.1"))
- expect_match(args, "--jars one.jar,two.jar,three.jar")
- expect_match(args, "--packages com.databricks:spark-avro_2.10:2.0.1")
-})
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_context.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_context.R b/R/pkg/inst/tests/testthat/test_context.R
deleted file mode 100644
index f6d9f54..0000000
--- a/R/pkg/inst/tests/testthat/test_context.R
+++ /dev/null
@@ -1,226 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("test functions in sparkR.R")
-
-test_that("Check masked functions", {
- skip_on_cran()
-
- # Check that we are not masking any new function from base, stats, testthat unexpectedly
- # NOTE: We should avoid adding entries to *namesOfMaskedCompletely* as masked functions make it
- # hard for users to use base R functions. Please check when in doubt.
- namesOfMaskedCompletely <- c("cov", "filter", "sample", "not")
- namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var",
- "colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset",
- "summary", "transform", "drop", "window", "as.data.frame", "union", "not")
- if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) {
- namesOfMasked <- c("endsWith", "startsWith", namesOfMasked)
- }
- masked <- conflicts(detail = TRUE)$`package:SparkR`
- expect_true("describe" %in% masked) # only when with testthat..
- func <- lapply(masked, function(x) { capture.output(showMethods(x))[[1]] })
- funcSparkROrEmpty <- grepl("\\(package SparkR\\)$|^$", func)
- maskedBySparkR <- masked[funcSparkROrEmpty]
- expect_equal(length(maskedBySparkR), length(namesOfMasked))
- # make the 2 lists the same length so expect_equal will print their content
- l <- max(length(maskedBySparkR), length(namesOfMasked))
- length(maskedBySparkR) <- l
- length(namesOfMasked) <- l
- expect_equal(sort(maskedBySparkR, na.last = TRUE), sort(namesOfMasked, na.last = TRUE))
- # above are those reported as masked when `library(SparkR)`
- # note that many of these methods are still callable without base:: or stats:: prefix
-# there should be a test for each of these, except the following, which are currently "broken"
- funcHasAny <- unlist(lapply(masked, function(x) {
- any(grepl("=\"ANY\"", capture.output(showMethods(x)[-1])))
- }))
- maskedCompletely <- masked[!funcHasAny]
- expect_equal(length(maskedCompletely), length(namesOfMaskedCompletely))
- l <- max(length(maskedCompletely), length(namesOfMaskedCompletely))
- length(maskedCompletely) <- l
- length(namesOfMaskedCompletely) <- l
- expect_equal(sort(maskedCompletely, na.last = TRUE),
- sort(namesOfMaskedCompletely, na.last = TRUE))
-})
-
-test_that("repeatedly starting and stopping SparkR", {
- skip_on_cran()
-
- for (i in 1:4) {
- sc <- suppressWarnings(sparkR.init(master = sparkRTestMaster))
- rdd <- parallelize(sc, 1:20, 2L)
- expect_equal(countRDD(rdd), 20)
- suppressWarnings(sparkR.stop())
- }
-})
-
-test_that("repeatedly starting and stopping SparkSession", {
- for (i in 1:4) {
- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
- df <- createDataFrame(data.frame(dummy = 1:i))
- expect_equal(count(df), i)
- sparkR.session.stop()
- }
-})
-
-test_that("rdd GC across sparkR.stop", {
- skip_on_cran()
-
- sc <- sparkR.sparkContext(master = sparkRTestMaster) # sc should get id 0
- rdd1 <- parallelize(sc, 1:20, 2L) # rdd1 should get id 1
- rdd2 <- parallelize(sc, 1:10, 2L) # rdd2 should get id 2
- sparkR.session.stop()
-
- sc <- sparkR.sparkContext(master = sparkRTestMaster) # sc should get id 0 again
-
- # GC rdd1 before creating rdd3 and rdd2 after
- rm(rdd1)
- gc()
-
- rdd3 <- parallelize(sc, 1:20, 2L) # rdd3 should get id 1 now
- rdd4 <- parallelize(sc, 1:10, 2L) # rdd4 should get id 2 now
-
- rm(rdd2)
- gc()
-
- countRDD(rdd3)
- countRDD(rdd4)
- sparkR.session.stop()
-})
-
-test_that("job group functions can be called", {
- skip_on_cran()
-
- sc <- sparkR.sparkContext(master = sparkRTestMaster)
- setJobGroup("groupId", "job description", TRUE)
- cancelJobGroup("groupId")
- clearJobGroup()
-
- suppressWarnings(setJobGroup(sc, "groupId", "job description", TRUE))
- suppressWarnings(cancelJobGroup(sc, "groupId"))
- suppressWarnings(clearJobGroup(sc))
- sparkR.session.stop()
-})
-
-test_that("utility function can be called", {
- skip_on_cran()
-
- sparkR.sparkContext(master = sparkRTestMaster)
- setLogLevel("ERROR")
- sparkR.session.stop()
-})
-
-test_that("getClientModeSparkSubmitOpts() returns spark-submit args from whitelist", {
- skip_on_cran()
-
- e <- new.env()
- e[["spark.driver.memory"]] <- "512m"
- ops <- getClientModeSparkSubmitOpts("sparkrmain", e)
- expect_equal("--driver-memory \"512m\" sparkrmain", ops)
-
- e[["spark.driver.memory"]] <- "5g"
- e[["spark.driver.extraClassPath"]] <- "/opt/class_path" # nolint
- e[["spark.driver.extraJavaOptions"]] <- "-XX:+UseCompressedOops -XX:+UseCompressedStrings"
- e[["spark.driver.extraLibraryPath"]] <- "/usr/local/hadoop/lib" # nolint
- e[["random"]] <- "skipthis"
- ops2 <- getClientModeSparkSubmitOpts("sparkr-shell", e)
- # nolint start
- expect_equal(ops2, paste0("--driver-class-path \"/opt/class_path\" --driver-java-options \"",
- "-XX:+UseCompressedOops -XX:+UseCompressedStrings\" --driver-library-path \"",
- "/usr/local/hadoop/lib\" --driver-memory \"5g\" sparkr-shell"))
- # nolint end
-
- e[["spark.driver.extraClassPath"]] <- "/" # too short
- ops3 <- getClientModeSparkSubmitOpts("--driver-memory 4g sparkr-shell2", e)
- # nolint start
- expect_equal(ops3, paste0("--driver-java-options \"-XX:+UseCompressedOops ",
- "-XX:+UseCompressedStrings\" --driver-library-path \"/usr/local/hadoop/lib\"",
- " --driver-memory 4g sparkr-shell2"))
- # nolint end
-})
-
-test_that("sparkJars sparkPackages as comma-separated strings", {
- skip_on_cran()
-
- expect_warning(processSparkJars(" a, b "))
- jars <- suppressWarnings(processSparkJars(" a, b "))
- expect_equal(lapply(jars, basename), list("a", "b"))
-
- jars <- suppressWarnings(processSparkJars(" abc ,, def "))
- expect_equal(lapply(jars, basename), list("abc", "def"))
-
- jars <- suppressWarnings(processSparkJars(c(" abc ,, def ", "", "xyz", " ", "a,b")))
- expect_equal(lapply(jars, basename), list("abc", "def", "xyz", "a", "b"))
-
- p <- processSparkPackages(c("ghi", "lmn"))
- expect_equal(p, c("ghi", "lmn"))
-
- # check normalizePath
- f <- dir()[[1]]
- expect_warning(processSparkJars(f), NA)
- expect_match(processSparkJars(f), f)
-})
-
-test_that("spark.lapply should perform simple transforms", {
- sparkR.sparkContext(master = sparkRTestMaster)
- doubled <- spark.lapply(1:10, function(x) { 2 * x })
- expect_equal(doubled, as.list(2 * 1:10))
- sparkR.session.stop()
-})
-
-test_that("add and get file to be downloaded with Spark job on every node", {
- skip_on_cran()
-
- sparkR.sparkContext(master = sparkRTestMaster)
- # Test add file.
- path <- tempfile(pattern = "hello", fileext = ".txt")
- filename <- basename(path)
- words <- "Hello World!"
- writeLines(words, path)
- spark.addFile(path)
- download_path <- spark.getSparkFiles(filename)
- expect_equal(readLines(download_path), words)
-
- # Test spark.getSparkFiles works well on executors.
- seq <- seq(from = 1, to = 10, length.out = 5)
- f <- function(seq) { spark.getSparkFiles(filename) }
- results <- spark.lapply(seq, f)
- for (i in 1:5) { expect_equal(basename(results[[i]]), filename) }
-
- unlink(path)
-
- # Test add directory recursively.
- path <- paste0(tempdir(), "/", "recursive_dir")
- dir.create(path)
- dir_name <- basename(path)
- path1 <- paste0(path, "/", "hello.txt")
- file.create(path1)
- sub_path <- paste0(path, "/", "sub_hello")
- dir.create(sub_path)
- path2 <- paste0(sub_path, "/", "sub_hello.txt")
- file.create(path2)
- words <- "Hello World!"
- sub_words <- "Sub Hello World!"
- writeLines(words, path1)
- writeLines(sub_words, path2)
- spark.addFile(path, recursive = TRUE)
- download_path1 <- spark.getSparkFiles(paste0(dir_name, "/", "hello.txt"))
- expect_equal(readLines(download_path1), words)
- download_path2 <- spark.getSparkFiles(paste0(dir_name, "/", "sub_hello/sub_hello.txt"))
- expect_equal(readLines(download_path2), sub_words)
- unlink(path, recursive = TRUE)
- sparkR.session.stop()
-})
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_includePackage.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_includePackage.R b/R/pkg/inst/tests/testthat/test_includePackage.R
deleted file mode 100644
index d7d9eee..0000000
--- a/R/pkg/inst/tests/testthat/test_includePackage.R
+++ /dev/null
@@ -1,64 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("include R packages")
-
-# JavaSparkContext handle
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-# Partitioned data
-nums <- 1:2
-rdd <- parallelize(sc, nums, 2L)
-
-test_that("include inside function", {
- skip_on_cran()
-
- # Only run the test if plyr is installed.
- if ("plyr" %in% rownames(installed.packages())) {
- suppressPackageStartupMessages(library(plyr))
- generateData <- function(x) {
- suppressPackageStartupMessages(library(plyr))
- attach(airquality)
- result <- transform(Ozone, logOzone = log(Ozone))
- result
- }
-
- data <- lapplyPartition(rdd, generateData)
- actual <- collectRDD(data)
- }
-})
-
-test_that("use include package", {
- skip_on_cran()
-
- # Only run the test if plyr is installed.
- if ("plyr" %in% rownames(installed.packages())) {
- suppressPackageStartupMessages(library(plyr))
- generateData <- function(x) {
- attach(airquality)
- result <- transform(Ozone, logOzone = log(Ozone))
- result
- }
-
- includePackage(sc, plyr)
- data <- lapplyPartition(rdd, generateData)
- actual <- collectRDD(data)
- }
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_jvm_api.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_jvm_api.R b/R/pkg/inst/tests/testthat/test_jvm_api.R
deleted file mode 100644
index 8b3b4f7..0000000
--- a/R/pkg/inst/tests/testthat/test_jvm_api.R
+++ /dev/null
@@ -1,36 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("JVM API")
-
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-test_that("Create and call methods on object", {
- jarr <- sparkR.newJObject("java.util.ArrayList")
- # Add an element to the array
- sparkR.callJMethod(jarr, "add", 1L)
- # Check if get returns the same element
- expect_equal(sparkR.callJMethod(jarr, "get", 0L), 1L)
-})
-
-test_that("Call static methods", {
- # Convert a boolean to a string
- strTrue <- sparkR.callJStatic("java.lang.String", "valueOf", TRUE)
- expect_equal(strTrue, "true")
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_mllib_classification.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_mllib_classification.R b/R/pkg/inst/tests/testthat/test_mllib_classification.R
deleted file mode 100644
index 82e588d..0000000
--- a/R/pkg/inst/tests/testthat/test_mllib_classification.R
+++ /dev/null
@@ -1,396 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("MLlib classification algorithms, except for tree-based algorithms")
-
-# Tests for MLlib classification algorithms in SparkR
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-absoluteSparkPath <- function(x) {
- sparkHome <- sparkR.conf("spark.home")
- file.path(sparkHome, x)
-}
-
-test_that("spark.svmLinear", {
- skip_on_cran()
-
- df <- suppressWarnings(createDataFrame(iris))
- training <- df[df$Species %in% c("versicolor", "virginica"), ]
- model <- spark.svmLinear(training, Species ~ ., regParam = 0.01, maxIter = 10)
- summary <- summary(model)
-
- # test summary coefficients return matrix type
- expect_true(class(summary$coefficients) == "matrix")
- expect_true(class(summary$coefficients[, 1]) == "numeric")
-
- coefs <- summary$coefficients[, "Estimate"]
- expected_coefs <- c(-0.06004978, -0.1563083, -0.460648, 0.2276626, 1.055085)
- expect_true(all(abs(coefs - expected_coefs) < 0.1))
-
- # Test prediction with string label
- prediction <- predict(model, training)
- expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character")
- expected <- c("versicolor", "versicolor", "versicolor", "virginica", "virginica",
- "virginica", "virginica", "virginica", "virginica", "virginica")
- expect_equal(sort(as.list(take(select(prediction, "prediction"), 10))[[1]]), expected)
-
- # Test model save and load
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-svm-linear", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- coefs <- summary(model)$coefficients
- coefs2 <- summary(model2)$coefficients
- expect_equal(coefs, coefs2)
- unlink(modelPath)
- }
-
- # Test prediction with numeric label
- label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
- feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
- data <- as.data.frame(cbind(label, feature))
- df <- createDataFrame(data)
- model <- spark.svmLinear(df, label ~ feature, regParam = 0.1)
- prediction <- collect(select(predict(model, df), "prediction"))
- expect_equal(sort(prediction$prediction), c("0.0", "0.0", "0.0", "1.0", "1.0"))
-
-})
-
-test_that("spark.logit", {
- # R code to reproduce the result.
- # nolint start
- #' library(glmnet)
- #' iris.x = as.matrix(iris[, 1:4])
- #' iris.y = as.factor(as.character(iris[, 5]))
- #' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5)
- #' coef(logit)
- #
- # $setosa
- # 5 x 1 sparse Matrix of class "dgCMatrix"
- # s0
- # 1.0981324
- # Sepal.Length -0.2909860
- # Sepal.Width 0.5510907
- # Petal.Length -0.1915217
- # Petal.Width -0.4211946
- #
- # $versicolor
- # 5 x 1 sparse Matrix of class "dgCMatrix"
- # s0
- # 1.520061e+00
- # Sepal.Length 2.524501e-02
- # Sepal.Width -5.310313e-01
- # Petal.Length 3.656543e-02
- # Petal.Width -3.144464e-05
- #
- # $virginica
- # 5 x 1 sparse Matrix of class "dgCMatrix"
- # s0
- # -2.61819385
- # Sepal.Length 0.26574097
- # Sepal.Width -0.02005932
- # Petal.Length 0.15495629
- # Petal.Width 0.42122607
- # nolint end
-
- # Test multinomial logistic regression against three classes
- df <- suppressWarnings(createDataFrame(iris))
- model <- spark.logit(df, Species ~ ., regParam = 0.5)
- summary <- summary(model)
-
- # test summary coefficients return matrix type
- expect_true(class(summary$coefficients) == "matrix")
- expect_true(class(summary$coefficients[, 1]) == "numeric")
-
- versicolorCoefsR <- c(1.52, 0.03, -0.53, 0.04, 0.00)
- virginicaCoefsR <- c(-2.62, 0.27, -0.02, 0.16, 0.42)
- setosaCoefsR <- c(1.10, -0.29, 0.55, -0.19, -0.42)
- versicolorCoefs <- summary$coefficients[, "versicolor"]
- virginicaCoefs <- summary$coefficients[, "virginica"]
- setosaCoefs <- summary$coefficients[, "setosa"]
- expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
- expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
- expect_true(all(abs(setosaCoefsR - setosaCoefs) < 0.1))
-
- # Test model save and load
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- coefs <- summary(model)$coefficients
- coefs2 <- summary(model2)$coefficients
- expect_equal(coefs, coefs2)
- unlink(modelPath)
- }
-
- # R code to reproduce the result.
- # nolint start
- #' library(glmnet)
- #' iris2 <- iris[iris$Species %in% c("versicolor", "virginica"), ]
- #' iris.x = as.matrix(iris2[, 1:4])
- #' iris.y = as.factor(as.character(iris2[, 5]))
- #' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5)
- #' coef(logit)
- #
- # $versicolor
- # 5 x 1 sparse Matrix of class "dgCMatrix"
- # s0
- # 3.93844796
- # Sepal.Length -0.13538675
- # Sepal.Width -0.02386443
- # Petal.Length -0.35076451
- # Petal.Width -0.77971954
- #
- # $virginica
- # 5 x 1 sparse Matrix of class "dgCMatrix"
- # s0
- # -3.93844796
- # Sepal.Length 0.13538675
- # Sepal.Width 0.02386443
- # Petal.Length 0.35076451
- # Petal.Width 0.77971954
- #
- #' logit = glmnet(iris.x, iris.y, family="binomial", alpha=0, lambda=0.5)
- #' coef(logit)
- #
- # 5 x 1 sparse Matrix of class "dgCMatrix"
- # s0
- # (Intercept) -6.0824412
- # Sepal.Length 0.2458260
- # Sepal.Width 0.1642093
- # Petal.Length 0.4759487
- # Petal.Width 1.0383948
- #
- # nolint end
-
- # Test multinomial logistic regression against two classes
- df <- suppressWarnings(createDataFrame(iris))
- training <- df[df$Species %in% c("versicolor", "virginica"), ]
- model <- spark.logit(training, Species ~ ., regParam = 0.5, family = "multinomial")
- summary <- summary(model)
- versicolorCoefsR <- c(3.94, -0.16, -0.02, -0.35, -0.78)
- virginicaCoefsR <- c(-3.94, 0.16, -0.02, 0.35, 0.78)
- versicolorCoefs <- summary$coefficients[, "versicolor"]
- virginicaCoefs <- summary$coefficients[, "virginica"]
- expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
- expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
-
- # Test binomial logistic regression against two classes
- model <- spark.logit(training, Species ~ ., regParam = 0.5)
- summary <- summary(model)
- coefsR <- c(-6.08, 0.25, 0.16, 0.48, 1.04)
- coefs <- summary$coefficients[, "Estimate"]
- expect_true(all(abs(coefsR - coefs) < 0.1))
-
- # Test prediction with string label
- prediction <- predict(model, training)
- expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character")
- expected <- c("versicolor", "versicolor", "virginica", "versicolor", "versicolor",
- "versicolor", "versicolor", "versicolor", "versicolor", "versicolor")
- expect_equal(as.list(take(select(prediction, "prediction"), 10))[[1]], expected)
-
- # Test prediction with numeric label
- label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
- feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
- data <- as.data.frame(cbind(label, feature))
- df <- createDataFrame(data)
- model <- spark.logit(df, label ~ feature)
- prediction <- collect(select(predict(model, df), "prediction"))
- expect_equal(sort(prediction$prediction), c("0.0", "0.0", "0.0", "1.0", "1.0"))
-
- # Test prediction with weightCol
- weight <- c(2.0, 2.0, 2.0, 1.0, 1.0)
- data2 <- as.data.frame(cbind(label, feature, weight))
- df2 <- createDataFrame(data2)
- model2 <- spark.logit(df2, label ~ feature, weightCol = "weight")
- prediction2 <- collect(select(predict(model2, df2), "prediction"))
- expect_equal(sort(prediction2$prediction), c("0.0", "0.0", "0.0", "0.0", "0.0"))
-})
-
-test_that("spark.mlp", {
- skip_on_cran()
-
- df <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
- source = "libsvm")
- model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 5, 4, 3),
- solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1)
-
- # Test summary method
- summary <- summary(model)
- expect_equal(summary$numOfInputs, 4)
- expect_equal(summary$numOfOutputs, 3)
- expect_equal(summary$layers, c(4, 5, 4, 3))
- expect_equal(length(summary$weights), 64)
- expect_equal(head(summary$weights, 5), list(-0.878743, 0.2154151, -1.16304, -0.6583214, 1.009825),
- tolerance = 1e-6)
-
- # Test predict method
- mlpTestDF <- df
- mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
- expect_equal(head(mlpPredictions$prediction, 6), c("1.0", "0.0", "0.0", "0.0", "0.0", "0.0"))
-
- # Test model save/load
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-mlp", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- summary2 <- summary(model2)
-
- expect_equal(summary2$numOfInputs, 4)
- expect_equal(summary2$numOfOutputs, 3)
- expect_equal(summary2$layers, c(4, 5, 4, 3))
- expect_equal(length(summary2$weights), 64)
-
- unlink(modelPath)
- }
-
- # Test default parameter
- model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3))
- mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
- expect_equal(head(mlpPredictions$prediction, 10),
- c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
-
- # Test illegal parameter
- expect_error(spark.mlp(df, label ~ features, layers = NULL),
- "layers must be a integer vector with length > 1.")
- expect_error(spark.mlp(df, label ~ features, layers = c()),
- "layers must be a integer vector with length > 1.")
- expect_error(spark.mlp(df, label ~ features, layers = c(3)),
- "layers must be a integer vector with length > 1.")
-
- # Test random seed
- # default seed
- model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10)
- mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
- expect_equal(head(mlpPredictions$prediction, 10),
- c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
- # seed equals 10
- model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10, seed = 10)
- mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
- expect_equal(head(mlpPredictions$prediction, 10),
- c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
-
- # test initialWeights
- model <- spark.mlp(df, label ~ features, layers = c(4, 3), initialWeights =
- c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9))
- mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
- expect_equal(head(mlpPredictions$prediction, 10),
- c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
-
- # Test formula works well
- df <- suppressWarnings(createDataFrame(iris))
- model <- spark.mlp(df, Species ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width,
- layers = c(4, 3))
- summary <- summary(model)
- expect_equal(summary$numOfInputs, 4)
- expect_equal(summary$numOfOutputs, 3)
- expect_equal(summary$layers, c(4, 3))
- expect_equal(length(summary$weights), 15)
-})
-
-test_that("spark.naiveBayes", {
- # R code to reproduce the result.
- # We do not support instance weights yet. So we ignore the frequencies.
- #
- #' library(e1071)
- #' t <- as.data.frame(Titanic)
- #' t1 <- t[t$Freq > 0, -5]
- #' m <- naiveBayes(Survived ~ ., data = t1)
- #' m
- #' predict(m, t1)
- #
- # -- output of 'm'
- #
- # A-priori probabilities:
- # Y
- # No Yes
- # 0.4166667 0.5833333
- #
- # Conditional probabilities:
- # Class
- # Y 1st 2nd 3rd Crew
- # No 0.2000000 0.2000000 0.4000000 0.2000000
- # Yes 0.2857143 0.2857143 0.2857143 0.1428571
- #
- # Sex
- # Y Male Female
- # No 0.5 0.5
- # Yes 0.5 0.5
- #
- # Age
- # Y Child Adult
- # No 0.2000000 0.8000000
- # Yes 0.4285714 0.5714286
- #
- # -- output of 'predict(m, t1)'
- #
- # Yes Yes Yes Yes No No Yes Yes No No Yes Yes Yes Yes Yes Yes Yes Yes No No Yes Yes No No
- #
-
- t <- as.data.frame(Titanic)
- t1 <- t[t$Freq > 0, -5]
- df <- suppressWarnings(createDataFrame(t1))
- m <- spark.naiveBayes(df, Survived ~ ., smoothing = 0.0)
- s <- summary(m)
- expect_equal(as.double(s$apriori[1, "Yes"]), 0.5833333, tolerance = 1e-6)
- expect_equal(sum(s$apriori), 1)
- expect_equal(as.double(s$tables["Yes", "Age_Adult"]), 0.5714286, tolerance = 1e-6)
- p <- collect(select(predict(m, df), "prediction"))
- expect_equal(p$prediction, c("Yes", "Yes", "Yes", "Yes", "No", "No", "Yes", "Yes", "No", "No",
- "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "No", "No",
- "Yes", "Yes", "No", "No"))
-
- # Test model save/load
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-naiveBayes", fileext = ".tmp")
- write.ml(m, modelPath)
- expect_error(write.ml(m, modelPath))
- write.ml(m, modelPath, overwrite = TRUE)
- m2 <- read.ml(modelPath)
- s2 <- summary(m2)
- expect_equal(s$apriori, s2$apriori)
- expect_equal(s$tables, s2$tables)
-
- unlink(modelPath)
- }
-
- # Test e1071::naiveBayes
- if (requireNamespace("e1071", quietly = TRUE)) {
- expect_error(m <- e1071::naiveBayes(Survived ~ ., data = t1), NA)
- expect_equal(as.character(predict(m, t1[1, ])), "Yes")
- }
-
- # Test numeric response variable
- t1$NumericSurvived <- ifelse(t1$Survived == "No", 0, 1)
- t2 <- t1[-4]
- df <- suppressWarnings(createDataFrame(t2))
- m <- spark.naiveBayes(df, NumericSurvived ~ ., smoothing = 0.0)
- s <- summary(m)
- expect_equal(as.double(s$apriori[1, 1]), 0.5833333, tolerance = 1e-6)
- expect_equal(sum(s$apriori), 1)
- expect_equal(as.double(s$tables[1, "Age_Adult"]), 0.5714286, tolerance = 1e-6)
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_mllib_clustering.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_mllib_clustering.R b/R/pkg/inst/tests/testthat/test_mllib_clustering.R
deleted file mode 100644
index e827e96..0000000
--- a/R/pkg/inst/tests/testthat/test_mllib_clustering.R
+++ /dev/null
@@ -1,328 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("MLlib clustering algorithms")
-
-# Tests for MLlib clustering algorithms in SparkR
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-absoluteSparkPath <- function(x) {
- sparkHome <- sparkR.conf("spark.home")
- file.path(sparkHome, x)
-}
-
-test_that("spark.bisectingKmeans", {
- skip_on_cran()
-
- newIris <- iris
- newIris$Species <- NULL
- training <- suppressWarnings(createDataFrame(newIris))
-
- take(training, 1)
-
- model <- spark.bisectingKmeans(data = training, ~ .)
- sample <- take(select(predict(model, training), "prediction"), 1)
- expect_equal(typeof(sample$prediction), "integer")
- expect_equal(sample$prediction, 1)
-
- # Test fitted works on Bisecting KMeans
- fitted.model <- fitted(model)
- expect_equal(sort(collect(distinct(select(fitted.model, "prediction")))$prediction),
- c(0, 1, 2, 3))
-
- # Test summary works on KMeans
- summary.model <- summary(model)
- cluster <- summary.model$cluster
- k <- summary.model$k
- expect_equal(k, 4)
- expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction),
- c(0, 1, 2, 3))
-
- # Test model save/load
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-bisectingkmeans", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- summary2 <- summary(model2)
- expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size)))
- expect_equal(summary.model$coefficients, summary2$coefficients)
- expect_true(!summary.model$is.loaded)
- expect_true(summary2$is.loaded)
-
- unlink(modelPath)
- }
-})
-
-test_that("spark.gaussianMixture", {
- # R code to reproduce the result.
- # nolint start
- #' library(mvtnorm)
- #' set.seed(1)
- #' a <- rmvnorm(7, c(0, 0))
- #' b <- rmvnorm(8, c(10, 10))
- #' data <- rbind(a, b)
- #' model <- mvnormalmixEM(data, k = 2)
- #' model$lambda
- #
- # [1] 0.4666667 0.5333333
- #
- #' model$mu
- #
- # [1] 0.11731091 -0.06192351
- # [1] 10.363673 9.897081
- #
- #' model$sigma
- #
- # [[1]]
- # [,1] [,2]
- # [1,] 0.62049934 0.06880802
- # [2,] 0.06880802 1.27431874
- #
- # [[2]]
- # [,1] [,2]
- # [1,] 0.2961543 0.160783
- # [2,] 0.1607830 1.008878
- #
- #' model$loglik
- #
- # [1] -46.89499
- # nolint end
- data <- list(list(-0.6264538, 0.1836433), list(-0.8356286, 1.5952808),
- list(0.3295078, -0.8204684), list(0.4874291, 0.7383247),
- list(0.5757814, -0.3053884), list(1.5117812, 0.3898432),
- list(-0.6212406, -2.2146999), list(11.1249309, 9.9550664),
- list(9.9838097, 10.9438362), list(10.8212212, 10.5939013),
- list(10.9189774, 10.7821363), list(10.0745650, 8.0106483),
- list(10.6198257, 9.9438713), list(9.8442045, 8.5292476),
- list(9.5218499, 10.4179416))
- df <- createDataFrame(data, c("x1", "x2"))
- model <- spark.gaussianMixture(df, ~ x1 + x2, k = 2)
- stats <- summary(model)
- rLambda <- c(0.4666667, 0.5333333)
- rMu <- c(0.11731091, -0.06192351, 10.363673, 9.897081)
- rSigma <- c(0.62049934, 0.06880802, 0.06880802, 1.27431874,
- 0.2961543, 0.160783, 0.1607830, 1.008878)
- rLoglik <- -46.89499
- expect_equal(stats$lambda, rLambda, tolerance = 1e-3)
- expect_equal(unlist(stats$mu), rMu, tolerance = 1e-3)
- expect_equal(unlist(stats$sigma), rSigma, tolerance = 1e-3)
- expect_equal(unlist(stats$loglik), rLoglik, tolerance = 1e-3)
- p <- collect(select(predict(model, df), "prediction"))
- expect_equal(p$prediction, c(0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1))
-
- # Test model save/load
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-gaussianMixture", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- stats2 <- summary(model2)
- expect_equal(stats$lambda, stats2$lambda)
- expect_equal(unlist(stats$mu), unlist(stats2$mu))
- expect_equal(unlist(stats$sigma), unlist(stats2$sigma))
- expect_equal(unlist(stats$loglik), unlist(stats2$loglik))
-
- unlink(modelPath)
- }
-})
-
-test_that("spark.kmeans", {
- newIris <- iris
- newIris$Species <- NULL
- training <- suppressWarnings(createDataFrame(newIris))
-
- take(training, 1)
-
- model <- spark.kmeans(data = training, ~ ., k = 2, maxIter = 10, initMode = "random")
- sample <- take(select(predict(model, training), "prediction"), 1)
- expect_equal(typeof(sample$prediction), "integer")
- expect_equal(sample$prediction, 1)
-
- # Test stats::kmeans is working
- statsModel <- kmeans(x = newIris, centers = 2)
- expect_equal(sort(unique(statsModel$cluster)), c(1, 2))
-
- # Test fitted works on KMeans
- fitted.model <- fitted(model)
- expect_equal(sort(collect(distinct(select(fitted.model, "prediction")))$prediction), c(0, 1))
-
- # Test summary works on KMeans
- summary.model <- summary(model)
- cluster <- summary.model$cluster
- k <- summary.model$k
- expect_equal(k, 2)
- expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction), c(0, 1))
-
- # test summary coefficients return matrix type
- expect_true(class(summary.model$coefficients) == "matrix")
- expect_true(class(summary.model$coefficients[1, ]) == "numeric")
-
- # Test model save/load
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-kmeans", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- summary2 <- summary(model2)
- expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size)))
- expect_equal(summary.model$coefficients, summary2$coefficients)
- expect_true(!summary.model$is.loaded)
- expect_true(summary2$is.loaded)
-
- unlink(modelPath)
- }
-
- # Test Kmeans on dataset that is sensitive to seed value
- col1 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0)
- col2 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0)
- col3 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0)
- cols <- as.data.frame(cbind(col1, col2, col3))
- df <- createDataFrame(cols)
-
- model1 <- spark.kmeans(data = df, ~ ., k = 5, maxIter = 10,
- initMode = "random", seed = 1, tol = 1E-5)
- model2 <- spark.kmeans(data = df, ~ ., k = 5, maxIter = 10,
- initMode = "random", seed = 22222, tol = 1E-5)
-
- summary.model1 <- summary(model1)
- summary.model2 <- summary(model2)
- cluster1 <- summary.model1$cluster
- cluster2 <- summary.model2$cluster
- clusterSize1 <- summary.model1$clusterSize
- clusterSize2 <- summary.model2$clusterSize
-
- # The predicted clusters are different
- expect_equal(sort(collect(distinct(select(cluster1, "prediction")))$prediction),
- c(0, 1, 2, 3))
- expect_equal(sort(collect(distinct(select(cluster2, "prediction")))$prediction),
- c(0, 1, 2))
- expect_equal(clusterSize1, 4)
- expect_equal(clusterSize2, 3)
-})
-
-test_that("spark.lda with libsvm", {
- text <- read.df(absoluteSparkPath("data/mllib/sample_lda_libsvm_data.txt"), source = "libsvm")
- model <- spark.lda(text, optimizer = "em")
-
- stats <- summary(model, 10)
- isDistributed <- stats$isDistributed
- logLikelihood <- stats$logLikelihood
- logPerplexity <- stats$logPerplexity
- vocabSize <- stats$vocabSize
- topics <- stats$topicTopTerms
- weights <- stats$topicTopTermsWeights
- vocabulary <- stats$vocabulary
- trainingLogLikelihood <- stats$trainingLogLikelihood
- logPrior <- stats$logPrior
-
- expect_true(isDistributed)
- expect_true(logLikelihood <= 0 & is.finite(logLikelihood))
- expect_true(logPerplexity >= 0 & is.finite(logPerplexity))
- expect_equal(vocabSize, 11)
- expect_true(is.null(vocabulary))
- expect_true(trainingLogLikelihood <= 0 & !is.na(trainingLogLikelihood))
- expect_true(logPrior <= 0 & !is.na(logPrior))
-
- # Test model save/load
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-lda", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- stats2 <- summary(model2)
-
- expect_true(stats2$isDistributed)
- expect_equal(logLikelihood, stats2$logLikelihood)
- expect_equal(logPerplexity, stats2$logPerplexity)
- expect_equal(vocabSize, stats2$vocabSize)
- expect_equal(vocabulary, stats2$vocabulary)
- expect_equal(trainingLogLikelihood, stats2$trainingLogLikelihood)
- expect_equal(logPrior, stats2$logPrior)
-
- unlink(modelPath)
- }
-})
-
-test_that("spark.lda with text input", {
- skip_on_cran()
-
- text <- read.text(absoluteSparkPath("data/mllib/sample_lda_data.txt"))
- model <- spark.lda(text, optimizer = "online", features = "value")
-
- stats <- summary(model)
- isDistributed <- stats$isDistributed
- logLikelihood <- stats$logLikelihood
- logPerplexity <- stats$logPerplexity
- vocabSize <- stats$vocabSize
- topics <- stats$topicTopTerms
- weights <- stats$topicTopTermsWeights
- vocabulary <- stats$vocabulary
- trainingLogLikelihood <- stats$trainingLogLikelihood
- logPrior <- stats$logPrior
-
- expect_false(isDistributed)
- expect_true(logLikelihood <= 0 & is.finite(logLikelihood))
- expect_true(logPerplexity >= 0 & is.finite(logPerplexity))
- expect_equal(vocabSize, 10)
- expect_true(setequal(stats$vocabulary, c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")))
- expect_true(is.na(trainingLogLikelihood))
- expect_true(is.na(logPrior))
-
- # Test model save/load
- modelPath <- tempfile(pattern = "spark-lda-text", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- stats2 <- summary(model2)
-
- expect_false(stats2$isDistributed)
- expect_equal(logLikelihood, stats2$logLikelihood)
- expect_equal(logPerplexity, stats2$logPerplexity)
- expect_equal(vocabSize, stats2$vocabSize)
- expect_true(all.equal(vocabulary, stats2$vocabulary))
- expect_true(is.na(stats2$trainingLogLikelihood))
- expect_true(is.na(stats2$logPrior))
-
- unlink(modelPath)
-})
-
-test_that("spark.posterior and spark.perplexity", {
- skip_on_cran()
-
- text <- read.text(absoluteSparkPath("data/mllib/sample_lda_data.txt"))
- model <- spark.lda(text, features = "value", k = 3)
-
- # Assert perplexities are equal
- stats <- summary(model)
- logPerplexity <- spark.perplexity(model, text)
- expect_equal(logPerplexity, stats$logPerplexity)
-
- # Assert every topic distribution sums to 1, so the total equals the number of rows
- posterior <- spark.posterior(model, text)
- local.posterior <- collect(posterior)$topicDistribution
- expect_equal(length(local.posterior), sum(unlist(local.posterior)))
-})
-
-sparkR.session.stop()
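
For quick reference, here is a minimal standalone sketch of the spark.kmeans workflow that the
removed clustering tests above exercise. It is not part of this commit; it assumes only an active
SparkR session, and the data and temporary path are illustrative.

    library(SparkR)
    sparkR.session()                                    # local session for illustration
    training <- suppressWarnings(createDataFrame(iris[, 1:4]))
    model <- spark.kmeans(data = training, ~ ., k = 2, maxIter = 10, initMode = "random")
    summary(model)$k                                    # number of clusters, 2 here
    head(select(predict(model, training), "prediction"))
    modelPath <- tempfile(pattern = "spark-kmeans", fileext = ".tmp")
    write.ml(model, modelPath, overwrite = TRUE)        # persist the fitted model
    model2 <- read.ml(modelPath)                        # reload it for later reuse
    unlink(modelPath)
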
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_mllib_fpm.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_mllib_fpm.R b/R/pkg/inst/tests/testthat/test_mllib_fpm.R
deleted file mode 100644
index 4e10ca1..0000000
--- a/R/pkg/inst/tests/testthat/test_mllib_fpm.R
+++ /dev/null
@@ -1,85 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("MLlib frequent pattern mining")
-
-# Tests for MLlib frequent pattern mining algorithms in SparkR
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-test_that("spark.fpGrowth", {
- data <- selectExpr(createDataFrame(data.frame(items = c(
- "1,2",
- "1,2",
- "1,2,3",
- "1,3"
- ))), "split(items, ',') as items")
-
- model <- spark.fpGrowth(data, minSupport = 0.3, minConfidence = 0.8, numPartitions = 1)
-
- itemsets <- collect(spark.freqItemsets(model))
-
- expected_itemsets <- data.frame(
- items = I(list(list("3"), list("3", "1"), list("2"), list("2", "1"), list("1"))),
- freq = c(2, 2, 3, 3, 4)
- )
-
- expect_equivalent(expected_itemsets, itemsets)
-
- expected_association_rules <- data.frame(
- antecedent = I(list(list("2"), list("3"))),
- consequent = I(list(list("1"), list("1"))),
- confidence = c(1, 1)
- )
-
- expect_equivalent(expected_association_rules, collect(spark.associationRules(model)))
-
- new_data <- selectExpr(createDataFrame(data.frame(items = c(
- "1,2",
- "1,3",
- "2,3"
- ))), "split(items, ',') as items")
-
- expected_predictions <- data.frame(
- items = I(list(list("1", "2"), list("1", "3"), list("2", "3"))),
- prediction = I(list(list(), list(), list("1")))
- )
-
- expect_equivalent(expected_predictions, collect(predict(model, new_data)))
-
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-fpm", fileext = ".tmp")
- write.ml(model, modelPath, overwrite = TRUE)
- loaded_model <- read.ml(modelPath)
-
- expect_equivalent(
- itemsets,
- collect(spark.freqItemsets(loaded_model)))
-
- unlink(modelPath)
- }
-
- model_without_numpartitions <- spark.fpGrowth(data, minSupport = 0.3, minConfidence = 0.8)
- expect_equal(
- count(spark.freqItemsets(model_without_numpartitions)),
- count(spark.freqItemsets(model))
- )
-
-})
-
-sparkR.session.stop()
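
Similarly, a minimal sketch of the spark.fpGrowth calls covered by the removed frequent pattern
mining test; again not part of this commit, assuming an active SparkR session, with illustrative
baskets.

    library(SparkR)
    sparkR.session()
    baskets <- selectExpr(createDataFrame(data.frame(items = c("1,2", "1,2,3", "1,3"))),
                          "split(items, ',') as items")
    model <- spark.fpGrowth(baskets, minSupport = 0.3, minConfidence = 0.8)
    head(spark.freqItemsets(model))       # frequent itemsets and their counts
    head(spark.associationRules(model))   # antecedent, consequent, confidence
    head(predict(model, baskets))         # consequents suggested for each basket
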
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_mllib_recommendation.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_mllib_recommendation.R b/R/pkg/inst/tests/testthat/test_mllib_recommendation.R
deleted file mode 100644
index cc8064f..0000000
--- a/R/pkg/inst/tests/testthat/test_mllib_recommendation.R
+++ /dev/null
@@ -1,67 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("MLlib recommendation algorithms")
-
-# Tests for MLlib recommendation algorithms in SparkR
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-test_that("spark.als", {
- data <- list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0), list(1, 2, 4.0),
- list(2, 1, 1.0), list(2, 2, 5.0))
- df <- createDataFrame(data, c("user", "item", "score"))
- model <- spark.als(df, ratingCol = "score", userCol = "user", itemCol = "item",
- rank = 10, maxIter = 5, seed = 0, regParam = 0.1)
- stats <- summary(model)
- expect_equal(stats$rank, 10)
- test <- createDataFrame(list(list(0, 2), list(1, 0), list(2, 0)), c("user", "item"))
- predictions <- collect(predict(model, test))
-
- expect_equal(predictions$prediction, c(-0.1380762, 2.6258414, -1.5018409),
- tolerance = 1e-4)
-
- # Test model save/load
- if (not_cran_or_windows_with_hadoop()) {
- modelPath <- tempfile(pattern = "spark-als", fileext = ".tmp")
- write.ml(model, modelPath)
- expect_error(write.ml(model, modelPath))
- write.ml(model, modelPath, overwrite = TRUE)
- model2 <- read.ml(modelPath)
- stats2 <- summary(model2)
- expect_equal(stats2$rating, "score")
- userFactors <- collect(stats$userFactors)
- itemFactors <- collect(stats$itemFactors)
- userFactors2 <- collect(stats2$userFactors)
- itemFactors2 <- collect(stats2$itemFactors)
-
- orderUser <- order(userFactors$id)
- orderUser2 <- order(userFactors2$id)
- expect_equal(userFactors$id[orderUser], userFactors2$id[orderUser2])
- expect_equal(userFactors$features[orderUser], userFactors2$features[orderUser2])
-
- orderItem <- order(itemFactors$id)
- orderItem2 <- order(itemFactors2$id)
- expect_equal(itemFactors$id[orderItem], itemFactors2$id[orderItem2])
- expect_equal(itemFactors$features[orderItem], itemFactors2$features[orderItem2])
-
- unlink(modelPath)
- }
-})
-
-sparkR.session.stop()
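
And a minimal sketch of the spark.als usage exercised by the removed recommendation test; not part
of this commit, assuming an active SparkR session, and using the same toy ratings as the test.

    library(SparkR)
    sparkR.session()
    ratings <- createDataFrame(list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0),
                                    list(1, 2, 4.0), list(2, 1, 1.0), list(2, 2, 5.0)),
                               c("user", "item", "score"))
    model <- spark.als(ratings, ratingCol = "score", userCol = "user", itemCol = "item",
                       rank = 10, maxIter = 5, seed = 0, regParam = 0.1)
    summary(model)$rank                   # latent factor rank used in the fit
    unseen <- createDataFrame(list(list(0, 2), list(1, 0), list(2, 0)), c("user", "item"))
    head(predict(model, unseen))          # predicted scores for unseen user/item pairs
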
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R
deleted file mode 100644
index c790d02..0000000
--- a/R/pkg/inst/tests/testthat/test_sparkSQL.R
+++ /dev/null
@@ -1,3474 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("SparkSQL functions")
-
-# Utility function for easily checking the values of a StructField
-checkStructField <- function(actual, expectedName, expectedType, expectedNullable) {
- expect_equal(class(actual), "structField")
- expect_equal(actual$name(), expectedName)
- expect_equal(actual$dataType.toString(), expectedType)
- expect_equal(actual$nullable(), expectedNullable)
-}
-
-markUtf8 <- function(s) {
- Encoding(s) <- "UTF-8"
- s
-}
-
-setHiveContext <- function(sc) {
- if (exists(".testHiveSession", envir = .sparkREnv)) {
- hiveSession <- get(".testHiveSession", envir = .sparkREnv)
- } else {
- # initialize once and reuse
- ssc <- callJMethod(sc, "sc")
- hiveCtx <- tryCatch({
- newJObject("org.apache.spark.sql.hive.test.TestHiveContext", ssc, FALSE)
- },
- error = function(err) {
- skip("Hive is not built with SparkSQL, skipped")
- })
- hiveSession <- callJMethod(hiveCtx, "sparkSession")
- }
- previousSession <- get(".sparkRsession", envir = .sparkREnv)
- assign(".sparkRsession", hiveSession, envir = .sparkREnv)
- assign(".prevSparkRsession", previousSession, envir = .sparkREnv)
- hiveSession
-}
-
-unsetHiveContext <- function() {
- previousSession <- get(".prevSparkRsession", envir = .sparkREnv)
- assign(".sparkRsession", previousSession, envir = .sparkREnv)
- remove(".prevSparkRsession", envir = .sparkREnv)
-}
-
-# Tests for SparkSQL functions in SparkR
-
-filesBefore <- list.files(path = sparkRDir, all.files = TRUE)
-sparkSession <- if (not_cran_or_windows_with_hadoop()) {
- sparkR.session(master = sparkRTestMaster)
- } else {
- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
- }
-sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-mockLines <- c("{\"name\":\"Michael\"}",
- "{\"name\":\"Andy\", \"age\":30}",
- "{\"name\":\"Justin\", \"age\":19}")
-jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
-parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet")
-orcPath <- tempfile(pattern = "sparkr-test", fileext = ".orc")
-writeLines(mockLines, jsonPath)
-
- # For testing NA functions, like dropna(), fillna(), ...
-mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}",
- "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}",
- "{\"name\":\"David\",\"age\":60,\"height\":null}",
- "{\"name\":\"Amy\",\"age\":null,\"height\":null}",
- "{\"name\":null,\"age\":null,\"height\":null}")
-jsonPathNa <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
-writeLines(mockLinesNa, jsonPathNa)
-
- # For testing complex types in DataFrame
-mockLinesComplexType <-
- c("{\"c1\":[1, 2, 3], \"c2\":[\"a\", \"b\", \"c\"], \"c3\":[1.0, 2.0, 3.0]}",
- "{\"c1\":[4, 5, 6], \"c2\":[\"d\", \"e\", \"f\"], \"c3\":[4.0, 5.0, 6.0]}",
- "{\"c1\":[7, 8, 9], \"c2\":[\"g\", \"h\", \"i\"], \"c3\":[7.0, 8.0, 9.0]}")
-complexTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
-writeLines(mockLinesComplexType, complexTypeJsonPath)
-
- # For testing map type and struct type in DataFrame
-mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}",
- "{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}",
- "{\"name\":\"David\",\"info\":{\"age\":60,\"height\":180}}")
-mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
-writeLines(mockLinesMapType, mapTypeJsonPath)
-
-if (.Platform$OS.type == "windows") {
- Sys.setenv(TZ = "GMT")
-}
-
-test_that("calling sparkRSQL.init returns existing SQL context", {
- skip_on_cran()
-
- sqlContext <- suppressWarnings(sparkRSQL.init(sc))
- expect_equal(suppressWarnings(sparkRSQL.init(sc)), sqlContext)
-})
-
-test_that("calling sparkRSQL.init returns existing SparkSession", {
- skip_on_cran()
-
- expect_equal(suppressWarnings(sparkRSQL.init(sc)), sparkSession)
-})
-
-test_that("calling sparkR.session returns existing SparkSession", {
- skip_on_cran()
-
- expect_equal(sparkR.session(), sparkSession)
-})
-
-test_that("infer types and check types", {
- expect_equal(infer_type(1L), "integer")
- expect_equal(infer_type(1.0), "double")
- expect_equal(infer_type("abc"), "string")
- expect_equal(infer_type(TRUE), "boolean")
- expect_equal(infer_type(as.Date("2015-03-11")), "date")
- expect_equal(infer_type(as.POSIXlt("2015-03-11 12:13:04.043")), "timestamp")
- expect_equal(infer_type(c(1L, 2L)), "array<integer>")
- expect_equal(infer_type(list(1L, 2L)), "array<integer>")
- expect_equal(infer_type(listToStruct(list(a = 1L, b = "2"))), "struct<a:integer,b:string>")
- e <- new.env()
- assign("a", 1L, envir = e)
- expect_equal(infer_type(e), "map<string,integer>")
-
- expect_error(checkType("map<integer,integer>"), "Key type in a map must be string or character")
-
- expect_equal(infer_type(as.raw(c(1, 2, 3))), "binary")
-})
-
-test_that("structType and structField", {
- testField <- structField("a", "string")
- expect_is(testField, "structField")
- expect_equal(testField$name(), "a")
- expect_true(testField$nullable())
-
- testSchema <- structType(testField, structField("b", "integer"))
- expect_is(testSchema, "structType")
- expect_is(testSchema$fields()[[2]], "structField")
- expect_equal(testSchema$fields()[[1]]$dataType.toString(), "StringType")
-})
-
-test_that("structField type strings", {
- # positive cases
- primitiveTypes <- list(byte = "ByteType",
- integer = "IntegerType",
- float = "FloatType",
- double = "DoubleType",
- string = "StringType",
- binary = "BinaryType",
- boolean = "BooleanType",
- timestamp = "TimestampType",
- date = "DateType",
- tinyint = "ByteType",
- smallint = "ShortType",
- int = "IntegerType",
- bigint = "LongType",
- decimal = "DecimalType(10,0)")
-
- complexTypes <- list("map<string,integer>" = "MapType(StringType,IntegerType,true)",
- "array<string>" = "ArrayType(StringType,true)",
- "struct<a:string>" = "StructType(StructField(a,StringType,true))")
-
- typeList <- c(primitiveTypes, complexTypes)
- typeStrings <- names(typeList)
-
- for (i in seq_along(typeStrings)){
- typeString <- typeStrings[i]
- expected <- typeList[[i]]
- testField <- structField("_col", typeString)
- expect_is(testField, "structField")
- expect_true(testField$nullable())
- expect_equal(testField$dataType.toString(), expected)
- }
-
- # negative cases
- primitiveErrors <- list(Byte = "Byte",
- INTEGER = "INTEGER",
- numeric = "numeric",
- character = "character",
- raw = "raw",
- logical = "logical",
- short = "short",
- varchar = "varchar",
- long = "long",
- char = "char")
-
- complexErrors <- list("map<string, integer>" = " integer",
- "array<String>" = "String",
- "struct<a:string >" = "string ",
- "map <string,integer>" = "map <string,integer>",
- "array< string>" = " string",
- "struct<a: string>" = " string")
-
- errorList <- c(primitiveErrors, complexErrors)
- typeStrings <- names(errorList)
-
- for (i in seq_along(typeStrings)){
- typeString <- typeStrings[i]
- expected <- paste0("Unsupported type for SparkDataframe: ", errorList[[i]])
- expect_error(structField("_col", typeString), expected)
- }
-})
-
-test_that("create DataFrame from RDD", {
- skip_on_cran()
-
- rdd <- lapply(parallelize(sc, 1:10), function(x) { list(x, as.character(x)) })
- df <- createDataFrame(rdd, list("a", "b"))
- dfAsDF <- as.DataFrame(rdd, list("a", "b"))
- expect_is(df, "SparkDataFrame")
- expect_is(dfAsDF, "SparkDataFrame")
- expect_equal(count(df), 10)
- expect_equal(count(dfAsDF), 10)
- expect_equal(nrow(df), 10)
- expect_equal(nrow(dfAsDF), 10)
- expect_equal(ncol(df), 2)
- expect_equal(ncol(dfAsDF), 2)
- expect_equal(dim(df), c(10, 2))
- expect_equal(dim(dfAsDF), c(10, 2))
- expect_equal(columns(df), c("a", "b"))
- expect_equal(columns(dfAsDF), c("a", "b"))
- expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
- expect_equal(dtypes(dfAsDF), list(c("a", "int"), c("b", "string")))
-
- df <- createDataFrame(rdd)
- dfAsDF <- as.DataFrame(rdd)
- expect_is(df, "SparkDataFrame")
- expect_is(dfAsDF, "SparkDataFrame")
- expect_equal(columns(df), c("_1", "_2"))
- expect_equal(columns(dfAsDF), c("_1", "_2"))
-
- schema <- structType(structField(x = "a", type = "integer", nullable = TRUE),
- structField(x = "b", type = "string", nullable = TRUE))
- df <- createDataFrame(rdd, schema)
- expect_is(df, "SparkDataFrame")
- expect_equal(columns(df), c("a", "b"))
- expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
-
- rdd <- lapply(parallelize(sc, 1:10), function(x) { list(a = x, b = as.character(x)) })
- df <- createDataFrame(rdd)
- expect_is(df, "SparkDataFrame")
- expect_equal(count(df), 10)
- expect_equal(columns(df), c("a", "b"))
- expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
-
- schema <- structType(structField("name", "string"), structField("age", "integer"),
- structField("height", "float"))
- df <- read.df(jsonPathNa, "json", schema)
- df2 <- createDataFrame(toRDD(df), schema)
- df2AsDF <- as.DataFrame(toRDD(df), schema)
- expect_equal(columns(df2), c("name", "age", "height"))
- expect_equal(columns(df2AsDF), c("name", "age", "height"))
- expect_equal(dtypes(df2), list(c("name", "string"), c("age", "int"), c("height", "float")))
- expect_equal(dtypes(df2AsDF), list(c("name", "string"), c("age", "int"), c("height", "float")))
- expect_equal(as.list(collect(where(df2, df2$name == "Bob"))),
- list(name = "Bob", age = 16, height = 176.5))
- expect_equal(as.list(collect(where(df2AsDF, df2AsDF$name == "Bob"))),
- list(name = "Bob", age = 16, height = 176.5))
-
- localDF <- data.frame(name = c("John", "Smith", "Sarah"),
- age = c(19L, 23L, 18L),
- height = c(176.5, 181.4, 173.7))
- df <- createDataFrame(localDF, schema)
- expect_is(df, "SparkDataFrame")
- expect_equal(count(df), 3)
- expect_equal(columns(df), c("name", "age", "height"))
- expect_equal(dtypes(df), list(c("name", "string"), c("age", "int"), c("height", "float")))
- expect_equal(as.list(collect(where(df, df$name == "John"))),
- list(name = "John", age = 19L, height = 176.5))
- expect_equal(getNumPartitions(df), 1)
-
- df <- as.DataFrame(cars, numPartitions = 2)
- expect_equal(getNumPartitions(df), 2)
- df <- createDataFrame(cars, numPartitions = 3)
- expect_equal(getNumPartitions(df), 3)
- # validate limit by num of rows
- df <- createDataFrame(cars, numPartitions = 60)
- expect_equal(getNumPartitions(df), 50)
- # validate when 1 < (length(coll) / numSlices) << length(coll)
- df <- createDataFrame(cars, numPartitions = 20)
- expect_equal(getNumPartitions(df), 20)
-
- df <- as.DataFrame(data.frame(0))
- expect_is(df, "SparkDataFrame")
- df <- createDataFrame(list(list(1)))
- expect_is(df, "SparkDataFrame")
- df <- as.DataFrame(data.frame(0), numPartitions = 2)
- # no data to partition, so it falls back to 1 partition
- expect_equal(getNumPartitions(df), 1)
-
- setHiveContext(sc)
- sql("CREATE TABLE people (name string, age double, height float)")
- df <- read.df(jsonPathNa, "json", schema)
- insertInto(df, "people")
- expect_equal(collect(sql("SELECT age from people WHERE name = 'Bob'"))$age,
- c(16))
- expect_equal(collect(sql("SELECT height from people WHERE name ='Bob'"))$height,
- c(176.5))
- sql("DROP TABLE people")
- unsetHiveContext()
-})
-
-test_that("createDataFrame uses files for large objects", {
- skip_on_cran()
-
- # To simulate a large file scenario, we set spark.r.maxAllocationLimit to a smaller value
- conf <- callJMethod(sparkSession, "conf")
- callJMethod(conf, "set", "spark.r.maxAllocationLimit", "100")
- df <- suppressWarnings(createDataFrame(iris, numPartitions = 3))
- expect_equal(getNumPartitions(df), 3)
-
- # Resetting the conf back to default value
- callJMethod(conf, "set", "spark.r.maxAllocationLimit", toString(.Machine$integer.max / 10))
- expect_equal(dim(df), dim(iris))
-})
-
-test_that("read/write csv as DataFrame", {
- if (not_cran_or_windows_with_hadoop()) {
- csvPath <- tempfile(pattern = "sparkr-test", fileext = ".csv")
- mockLinesCsv <- c("year,make,model,comment,blank",
- "\"2012\",\"Tesla\",\"S\",\"No comment\",",
- "1997,Ford,E350,\"Go get one now they are going fast\",",
- "2015,Chevy,Volt",
- "NA,Dummy,Placeholder")
- writeLines(mockLinesCsv, csvPath)
-
- # default "header" is false; use inferSchema so "year" is read as "int"
- df <- read.df(csvPath, "csv", header = "true", inferSchema = "true")
- expect_equal(count(df), 4)
- expect_equal(columns(df), c("year", "make", "model", "comment", "blank"))
- expect_equal(sort(unlist(collect(where(df, df$year == 2015)))),
- sort(unlist(list(year = 2015, make = "Chevy", model = "Volt"))))
-
- # since "year" is "int", let's skip the NA values
- withoutna <- na.omit(df, how = "any", cols = "year")
- expect_equal(count(withoutna), 3)
-
- unlink(csvPath)
- csvPath <- tempfile(pattern = "sparkr-test", fileext = ".csv")
- mockLinesCsv <- c("year,make,model,comment,blank",
- "\"2012\",\"Tesla\",\"S\",\"No comment\",",
- "1997,Ford,E350,\"Go get one now they are going fast\",",
- "2015,Chevy,Volt",
- "Empty,Dummy,Placeholder")
- writeLines(mockLinesCsv, csvPath)
-
- df2 <- read.df(csvPath, "csv", header = "true", inferSchema = "true", na.strings = "Empty")
- expect_equal(count(df2), 4)
- withoutna2 <- na.omit(df2, how = "any", cols = "year")
- expect_equal(count(withoutna2), 3)
- expect_equal(count(where(withoutna2, withoutna2$make == "Dummy")), 0)
-
- # writing csv file
- csvPath2 <- tempfile(pattern = "csvtest2", fileext = ".csv")
- write.df(df2, path = csvPath2, "csv", header = "true")
- df3 <- read.df(csvPath2, "csv", header = "true")
- expect_equal(nrow(df3), nrow(df2))
- expect_equal(colnames(df3), colnames(df2))
- csv <- read.csv(file = list.files(csvPath2, pattern = "^part", full.names = T)[[1]])
- expect_equal(colnames(df3), colnames(csv))
-
- unlink(csvPath)
- unlink(csvPath2)
- }
-})
-
-test_that("Support other types for options", {
- skip_on_cran()
-
- csvPath <- tempfile(pattern = "sparkr-test", fileext = ".csv")
- mockLinesCsv <- c("year,make,model,comment,blank",
- "\"2012\",\"Tesla\",\"S\",\"No comment\",",
- "1997,Ford,E350,\"Go get one now they are going fast\",",
- "2015,Chevy,Volt",
- "NA,Dummy,Placeholder")
- writeLines(mockLinesCsv, csvPath)
-
- csvDf <- read.df(csvPath, "csv", header = "true", inferSchema = "true")
- expected <- read.df(csvPath, "csv", header = TRUE, inferSchema = TRUE)
- expect_equal(collect(csvDf), collect(expected))
-
- expect_error(read.df(csvPath, "csv", header = TRUE, maxColumns = 3))
- unlink(csvPath)
-})
-
-test_that("convert NAs to null type in DataFrames", {
- rdd <- parallelize(sc, list(list(1L, 2L), list(NA, 4L)))
- df <- createDataFrame(rdd, list("a", "b"))
- expect_true(is.na(collect(df)[2, "a"]))
- expect_equal(collect(df)[2, "b"], 4L)
-
- l <- data.frame(x = 1L, y = c(1L, NA_integer_, 3L))
- df <- createDataFrame(l)
- expect_equal(collect(df)[2, "x"], 1L)
- expect_true(is.na(collect(df)[2, "y"]))
-
- rdd <- parallelize(sc, list(list(1, 2), list(NA, 4)))
- df <- createDataFrame(rdd, list("a", "b"))
- expect_true(is.na(collect(df)[2, "a"]))
- expect_equal(collect(df)[2, "b"], 4)
-
- l <- data.frame(x = 1, y = c(1, NA_real_, 3))
- df <- createDataFrame(l)
- expect_equal(collect(df)[2, "x"], 1)
- expect_true(is.na(collect(df)[2, "y"]))
-
- l <- list("a", "b", NA, "d")
- df <- createDataFrame(l)
- expect_true(is.na(collect(df)[3, "_1"]))
- expect_equal(collect(df)[4, "_1"], "d")
-
- l <- list("a", "b", NA_character_, "d")
- df <- createDataFrame(l)
- expect_true(is.na(collect(df)[3, "_1"]))
- expect_equal(collect(df)[4, "_1"], "d")
-
- l <- list(TRUE, FALSE, NA, TRUE)
- df <- createDataFrame(l)
- expect_true(is.na(collect(df)[3, "_1"]))
- expect_equal(collect(df)[4, "_1"], TRUE)
-})
-
-test_that("toDF", {
- skip_on_cran()
-
- rdd <- lapply(parallelize(sc, 1:10), function(x) { list(x, as.character(x)) })
- df <- toDF(rdd, list("a", "b"))
- expect_is(df, "SparkDataFrame")
- expect_equal(count(df), 10)
- expect_equal(columns(df), c("a", "b"))
- expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
-
- df <- toDF(rdd)
- expect_is(df, "SparkDataFrame")
- expect_equal(columns(df), c("_1", "_2"))
-
- schema <- structType(structField(x = "a", type = "integer", nullable = TRUE),
- structField(x = "b", type = "string", nullable = TRUE))
- df <- toDF(rdd, schema)
- expect_is(df, "SparkDataFrame")
- expect_equal(columns(df), c("a", "b"))
- expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
-
- rdd <- lapply(parallelize(sc, 1:10), function(x) { list(a = x, b = as.character(x)) })
- df <- toDF(rdd)
- expect_is(df, "SparkDataFrame")
- expect_equal(count(df), 10)
- expect_equal(columns(df), c("a", "b"))
- expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
-})
-
-test_that("create DataFrame from list or data.frame", {
- l <- list(list(1, 2), list(3, 4))
- df <- createDataFrame(l, c("a", "b"))
- expect_equal(columns(df), c("a", "b"))
-
- l <- list(list(a = 1, b = 2), list(a = 3, b = 4))
- df <- createDataFrame(l)
- expect_equal(columns(df), c("a", "b"))
-
- a <- 1:3
- b <- c("a", "b", "c")
- ldf <- data.frame(a, b)
- df <- createDataFrame(ldf)
- expect_equal(columns(df), c("a", "b"))
- expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
- expect_equal(count(df), 3)
- ldf2 <- collect(df)
- expect_equal(ldf$a, ldf2$a)
-
- irisdf <- suppressWarnings(createDataFrame(iris))
- iris_collected <- collect(irisdf)
- expect_equivalent(iris_collected[, -5], iris[, -5])
- expect_equal(iris_collected$Species, as.character(iris$Species))
-
- mtcarsdf <- createDataFrame(mtcars)
- expect_equivalent(collect(mtcarsdf), mtcars)
-
- bytes <- as.raw(c(1, 2, 3))
- df <- createDataFrame(list(list(bytes)))
- expect_equal(collect(df)[[1]][[1]], bytes)
-})
-
-test_that("create DataFrame with different data types", {
- l <- list(a = 1L, b = 2, c = TRUE, d = "ss", e = as.Date("2012-12-13"),
- f = as.POSIXct("2015-03-15 12:13:14.056"))
- df <- createDataFrame(list(l))
- expect_equal(dtypes(df), list(c("a", "int"), c("b", "double"), c("c", "boolean"),
- c("d", "string"), c("e", "date"), c("f", "timestamp")))
- expect_equal(count(df), 1)
- expect_equal(collect(df), data.frame(l, stringsAsFactors = FALSE))
-})
-
-test_that("SPARK-17811: can create DataFrame containing NA as date and time", {
- df <- data.frame(
- id = 1:2,
- time = c(as.POSIXlt("2016-01-10"), NA),
- date = c(as.Date("2016-10-01"), NA))
-
- DF <- collect(createDataFrame(df))
- expect_true(is.na(DF$date[2]))
- expect_equal(DF$date[1], as.Date("2016-10-01"))
- expect_true(is.na(DF$time[2]))
- expect_equal(DF$time[1], as.POSIXlt("2016-01-10"))
-})
-
-test_that("create DataFrame with complex types", {
- e <- new.env()
- assign("n", 3L, envir = e)
-
- s <- listToStruct(list(a = "aa", b = 3L))
-
- l <- list(as.list(1:10), list("a", "b"), e, s)
- df <- createDataFrame(list(l), c("a", "b", "c", "d"))
- expect_equal(dtypes(df), list(c("a", "array<int>"),
- c("b", "array<string>"),
- c("c", "map<string,int>"),
- c("d", "struct<a:string,b:int>")))
- expect_equal(count(df), 1)
- ldf <- collect(df)
- expect_equal(names(ldf), c("a", "b", "c", "d"))
- expect_equal(ldf[1, 1][[1]], l[[1]])
- expect_equal(ldf[1, 2][[1]], l[[2]])
-
- e <- ldf$c[[1]]
- expect_equal(class(e), "environment")
- expect_equal(ls(e), "n")
- expect_equal(e$n, 3L)
-
- s <- ldf$d[[1]]
- expect_equal(class(s), "struct")
- expect_equal(s$a, "aa")
- expect_equal(s$b, 3L)
-})
-
-test_that("create DataFrame from a data.frame with complex types", {
- skip_on_cran()
-
- ldf <- data.frame(row.names = 1:2)
- ldf$a_list <- list(list(1, 2), list(3, 4))
- ldf$an_envir <- c(as.environment(list(a = 1, b = 2)), as.environment(list(c = 3)))
-
- sdf <- createDataFrame(ldf)
- collected <- collect(sdf)
-
- expect_identical(ldf[, 1, FALSE], collected[, 1, FALSE])
- expect_equal(ldf$an_envir, collected$an_envir)
-})
-
-test_that("Collect DataFrame with complex types", {
- skip_on_cran()
-
- # ArrayType
- df <- read.json(complexTypeJsonPath)
- ldf <- collect(df)
- expect_equal(nrow(ldf), 3)
- expect_equal(ncol(ldf), 3)
- expect_equal(names(ldf), c("c1", "c2", "c3"))
- expect_equal(ldf$c1, list(list(1, 2, 3), list(4, 5, 6), list(7, 8, 9)))
- expect_equal(ldf$c2, list(list("a", "b", "c"), list("d", "e", "f"), list("g", "h", "i")))
- expect_equal(ldf$c3, list(list(1.0, 2.0, 3.0), list(4.0, 5.0, 6.0), list(7.0, 8.0, 9.0)))
-
- # MapType
- schema <- structType(structField("name", "string"),
- structField("info", "map<string,double>"))
- df <- read.df(mapTypeJsonPath, "json", schema)
- expect_equal(dtypes(df), list(c("name", "string"),
- c("info", "map<string,double>")))
- ldf <- collect(df)
- expect_equal(nrow(ldf), 3)
- expect_equal(ncol(ldf), 2)
- expect_equal(names(ldf), c("name", "info"))
- expect_equal(ldf$name, c("Bob", "Alice", "David"))
- bob <- ldf$info[[1]]
- expect_equal(class(bob), "environment")
- expect_equal(bob$age, 16)
- expect_equal(bob$height, 176.5)
-
- # StructType
- df <- read.json(mapTypeJsonPath)
- expect_equal(dtypes(df), list(c("info", "struct<age:bigint,height:double>"),
- c("name", "string")))
- ldf <- collect(df)
- expect_equal(nrow(ldf), 3)
- expect_equal(ncol(ldf), 2)
- expect_equal(names(ldf), c("info", "name"))
- expect_equal(ldf$name, c("Bob", "Alice", "David"))
- bob <- ldf$info[[1]]
- expect_equal(class(bob), "struct")
- expect_equal(bob$age, 16)
- expect_equal(bob$height, 176.5)
-})
-
-test_that("read/write json files", {
- if (not_cran_or_windows_with_hadoop()) {
- # Test read.df
- df <- read.df(jsonPath, "json")
- expect_is(df, "SparkDataFrame")
- expect_equal(count(df), 3)
-
- # Test read.df with a user defined schema
- schema <- structType(structField("name", type = "string"),
- structField("age", type = "double"))
-
- df1 <- read.df(jsonPath, "json", schema)
- expect_is(df1, "SparkDataFrame")
- expect_equal(dtypes(df1), list(c("name", "string"), c("age", "double")))
-
- # Test loadDF
- df2 <- loadDF(jsonPath, "json", schema)
- expect_is(df2, "SparkDataFrame")
- expect_equal(dtypes(df2), list(c("name", "string"), c("age", "double")))
-
- # Test read.json
- df <- read.json(jsonPath)
- expect_is(df, "SparkDataFrame")
- expect_equal(count(df), 3)
-
- # Test write.df
- jsonPath2 <- tempfile(pattern = "jsonPath2", fileext = ".json")
- write.df(df, jsonPath2, "json", mode = "overwrite")
-
- # Test write.json
- jsonPath3 <- tempfile(pattern = "jsonPath3", fileext = ".json")
- write.json(df, jsonPath3)
-
- # Test read.json()/jsonFile() works with multiple input paths
- jsonDF1 <- read.json(c(jsonPath2, jsonPath3))
- expect_is(jsonDF1, "SparkDataFrame")
- expect_equal(count(jsonDF1), 6)
- # Suppress warnings because jsonFile is deprecated
- jsonDF2 <- suppressWarnings(jsonFile(c(jsonPath2, jsonPath3)))
- expect_is(jsonDF2, "SparkDataFrame")
- expect_equal(count(jsonDF2), 6)
-
- unlink(jsonPath2)
- unlink(jsonPath3)
- }
-})
-
-test_that("read/write json files - compression option", {
- skip_on_cran()
-
- df <- read.df(jsonPath, "json")
-
- jsonPath <- tempfile(pattern = "jsonPath", fileext = ".json")
- write.json(df, jsonPath, compression = "gzip")
- jsonDF <- read.json(jsonPath)
- expect_is(jsonDF, "SparkDataFrame")
- expect_equal(count(jsonDF), count(df))
- expect_true(length(list.files(jsonPath, pattern = ".gz")) > 0)
-
- unlink(jsonPath)
-})
-
-test_that("jsonRDD() on a RDD with json string", {
- skip_on_cran()
-
- sqlContext <- suppressWarnings(sparkRSQL.init(sc))
- rdd <- parallelize(sc, mockLines)
- expect_equal(countRDD(rdd), 3)
- df <- suppressWarnings(jsonRDD(sqlContext, rdd))
- expect_is(df, "SparkDataFrame")
- expect_equal(count(df), 3)
-
- rdd2 <- flatMap(rdd, function(x) c(x, x))
- df <- suppressWarnings(jsonRDD(sqlContext, rdd2))
- expect_is(df, "SparkDataFrame")
- expect_equal(count(df), 6)
-})
-
-test_that("test tableNames and tables", {
- count <- count(listTables())
-
- df <- read.json(jsonPath)
- createOrReplaceTempView(df, "table1")
- expect_equal(length(tableNames()), count + 1)
- expect_equal(length(tableNames("default")), count + 1)
-
- tables <- listTables()
- expect_equal(count(tables), count + 1)
- expect_equal(count(tables()), count(tables))
- expect_true("tableName" %in% colnames(tables()))
- expect_true(all(c("tableName", "database", "isTemporary") %in% colnames(tables())))
-
- suppressWarnings(registerTempTable(df, "table2"))
- tables <- listTables()
- expect_equal(count(tables), count + 2)
- suppressWarnings(dropTempTable("table1"))
- expect_true(dropTempView("table2"))
-
- tables <- listTables()
- expect_equal(count(tables), count + 0)
-})
-
-test_that(
- "createOrReplaceTempView() results in a queryable table and sql() results in a new DataFrame", {
- df <- read.json(jsonPath)
- createOrReplaceTempView(df, "table1")
- newdf <- sql("SELECT * FROM table1 where name = 'Michael'")
- expect_is(newdf, "SparkDataFrame")
- expect_equal(count(newdf), 1)
- expect_true(dropTempView("table1"))
-
- createOrReplaceTempView(df, "dfView")
- sqlCast <- collect(sql("select cast('2' as decimal) as x from dfView limit 1"))
- out <- capture.output(sqlCast)
- expect_true(is.data.frame(sqlCast))
- expect_equal(names(sqlCast)[1], "x")
- expect_equal(nrow(sqlCast), 1)
- expect_equal(ncol(sqlCast), 1)
- expect_equal(out[1], " x")
- expect_equal(out[2], "1 2")
- expect_true(dropTempView("dfView"))
-})
-
-test_that("test cache, uncache and clearCache", {
- skip_on_cran()
-
- df <- read.json(jsonPath)
- createOrReplaceTempView(df, "table1")
- cacheTable("table1")
- uncacheTable("table1")
- clearCache()
- expect_true(dropTempView("table1"))
-
- expect_error(uncacheTable("foo"),
- "Error in uncacheTable : no such table - Table or view 'foo' not found in database 'default'")
-})
-
-test_that("insertInto() on a registered table", {
- if (not_cran_or_windows_with_hadoop()) {
- df <- read.df(jsonPath, "json")
- write.df(df, parquetPath, "parquet", "overwrite")
- dfParquet <- read.df(parquetPath, "parquet")
-
- lines <- c("{\"name\":\"Bob\", \"age\":24}",
- "{\"name\":\"James\", \"age\":35}")
- jsonPath2 <- tempfile(pattern = "jsonPath2", fileext = ".tmp")
- parquetPath2 <- tempfile(pattern = "parquetPath2", fileext = ".parquet")
- writeLines(lines, jsonPath2)
- df2 <- read.df(jsonPath2, "json")
- write.df(df2, parquetPath2, "parquet", "overwrite")
- dfParquet2 <- read.df(parquetPath2, "parquet")
-
- createOrReplaceTempView(dfParquet, "table1")
- insertInto(dfParquet2, "table1")
- expect_equal(count(sql("select * from table1")), 5)
- expect_equal(first(sql("select * from table1 order by age"))$name, "Michael")
- expect_true(dropTempView("table1"))
-
- createOrReplaceTempView(dfParquet, "table1")
- insertInto(dfParquet2, "table1", overwrite = TRUE)
- expect_equal(count(sql("select * from table1")), 2)
- expect_equal(first(sql("select * from table1 order by age"))$name, "Bob")
- expect_true(dropTempView("table1"))
-
- unlink(jsonPath2)
- unlink(parquetPath2)
- }
-})
-
-test_that("tableToDF() returns a new DataFrame", {
- df <- read.json(jsonPath)
- createOrReplaceTempView(df, "table1")
- tabledf <- tableToDF("table1")
- expect_is(tabledf, "SparkDataFrame")
- expect_equal(count(tabledf), 3)
- tabledf2 <- tableToDF("table1")
- expect_equal(count(tabledf2), 3)
- expect_true(dropTempView("table1"))
-})
-
-test_that("toRDD() returns an RRDD", {
- skip_on_cran()
-
- df <- read.json(jsonPath)
- testRDD <- toRDD(df)
- expect_is(testRDD, "RDD")
- expect_equal(countRDD(testRDD), 3)
-})
-
-test_that("union on two RDDs created from DataFrames returns an RRDD", {
- skip_on_cran()
-
- df <- read.json(jsonPath)
- RDD1 <- toRDD(df)
- RDD2 <- toRDD(df)
- unioned <- unionRDD(RDD1, RDD2)
- expect_is(unioned, "RDD")
- expect_equal(getSerializedMode(unioned), "byte")
- expect_equal(collectRDD(unioned)[[2]]$name, "Andy")
-})
-
-test_that("union on mixed serialization types correctly returns a byte RRDD", {
- skip_on_cran()
-
- # Byte RDD
- nums <- 1:10
- rdd <- parallelize(sc, nums, 2L)
-
- # String RDD
- textLines <- c("Michael",
- "Andy, 30",
- "Justin, 19")
- textPath <- tempfile(pattern = "sparkr-textLines", fileext = ".tmp")
- writeLines(textLines, textPath)
- textRDD <- textFile(sc, textPath)
-
- df <- read.json(jsonPath)
- dfRDD <- toRDD(df)
-
- unionByte <- unionRDD(rdd, dfRDD)
- expect_is(unionByte, "RDD")
- expect_equal(getSerializedMode(unionByte), "byte")
- expect_equal(collectRDD(unionByte)[[1]], 1)
- expect_equal(collectRDD(unionByte)[[12]]$name, "Andy")
-
- unionString <- unionRDD(textRDD, dfRDD)
- expect_is(unionString, "RDD")
- expect_equal(getSerializedMode(unionString), "byte")
- expect_equal(collectRDD(unionString)[[1]], "Michael")
- expect_equal(collectRDD(unionString)[[5]]$name, "Andy")
-})
-
-test_that("objectFile() works with row serialization", {
- skip_on_cran()
-
- objectPath <- tempfile(pattern = "spark-test", fileext = ".tmp")
- df <- read.json(jsonPath)
- dfRDD <- toRDD(df)
- saveAsObjectFile(coalesceRDD(dfRDD, 1L), objectPath)
- objectIn <- objectFile(sc, objectPath)
-
- expect_is(objectIn, "RDD")
- expect_equal(getSerializedMode(objectIn), "byte")
- expect_equal(collectRDD(objectIn)[[2]]$age, 30)
-})
-
-test_that("lapply() on a DataFrame returns an RDD with the correct columns", {
- skip_on_cran()
-
- df <- read.json(jsonPath)
- testRDD <- lapply(df, function(row) {
- row$newCol <- row$age + 5
- row
- })
- expect_is(testRDD, "RDD")
- collected <- collectRDD(testRDD)
- expect_equal(collected[[1]]$name, "Michael")
- expect_equal(collected[[2]]$newCol, 35)
-})
-
-test_that("collect() returns a data.frame", {
- df <- read.json(jsonPath)
- rdf <- collect(df)
- expect_true(is.data.frame(rdf))
- expect_equal(names(rdf)[1], "age")
- expect_equal(nrow(rdf), 3)
- expect_equal(ncol(rdf), 2)
-
- # collect() returns data correctly from a DataFrame with 0 rows
- df0 <- limit(df, 0)
- rdf <- collect(df0)
- expect_true(is.data.frame(rdf))
- expect_equal(names(rdf)[1], "age")
- expect_equal(nrow(rdf), 0)
- expect_equal(ncol(rdf), 2)
-
- # collect() correctly handles multiple columns with same name
- df <- createDataFrame(list(list(1, 2)), schema = c("name", "name"))
- ldf <- collect(df)
- expect_equal(names(ldf), c("name", "name"))
-})
-
-test_that("limit() returns DataFrame with the correct number of rows", {
- df <- read.json(jsonPath)
- dfLimited <- limit(df, 2)
- expect_is(dfLimited, "SparkDataFrame")
- expect_equal(count(dfLimited), 2)
-})
-
-test_that("collect() and take() on a DataFrame return the same number of rows and columns", {
- df <- read.json(jsonPath)
- expect_equal(nrow(collect(df)), nrow(take(df, 10)))
- expect_equal(ncol(collect(df)), ncol(take(df, 10)))
-})
-
-test_that("collect() support Unicode characters", {
- lines <- c("{\"name\":\"안녕하세요\"}",
- "{\"name\":\"您好\", \"age\":30}",
- "{\"name\":\"こんにちは\", \"age\":19}",
- "{\"name\":\"Xin chào\"}")
-
- jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- writeLines(lines, jsonPath)
-
- df <- read.df(jsonPath, "json")
- rdf <- collect(df)
- expect_true(is.data.frame(rdf))
- expect_equal(rdf$name[1], markUtf8("안녕하세요"))
- expect_equal(rdf$name[2], markUtf8("您好"))
- expect_equal(rdf$name[3], markUtf8("こんにちは"))
- expect_equal(rdf$name[4], markUtf8("Xin chào"))
-
- df1 <- createDataFrame(rdf)
- expect_equal(collect(where(df1, df1$name == markUtf8("您好")))$name, markUtf8("您好"))
-})
-
-test_that("multiple pipeline transformations result in an RDD with the correct values", {
- skip_on_cran()
-
- df <- read.json(jsonPath)
- first <- lapply(df, function(row) {
- row$age <- row$age + 5
- row
- })
- second <- lapply(first, function(row) {
- row$testCol <- if (row$age == 35 && !is.na(row$age)) TRUE else FALSE
- row
- })
- expect_is(second, "RDD")
- expect_equal(countRDD(second), 3)
- expect_equal(collectRDD(second)[[2]]$age, 35)
- expect_true(collectRDD(second)[[2]]$testCol)
- expect_false(collectRDD(second)[[3]]$testCol)
-})
-
-test_that("cache(), storageLevel(), persist(), and unpersist() on a DataFrame", {
- df <- read.json(jsonPath)
- expect_false(df@env$isCached)
- cache(df)
- expect_true(df@env$isCached)
-
- unpersist(df)
- expect_false(df@env$isCached)
-
- persist(df, "MEMORY_AND_DISK")
- expect_true(df@env$isCached)
-
- expect_equal(storageLevel(df),
- "MEMORY_AND_DISK - StorageLevel(disk, memory, deserialized, 1 replicas)")
-
- unpersist(df)
- expect_false(df@env$isCached)
-
- # make sure the data is collectable
- expect_true(is.data.frame(collect(df)))
-})
-
-test_that("setCheckpointDir(), checkpoint() on a DataFrame", {
- if (not_cran_or_windows_with_hadoop()) {
- checkpointDir <- file.path(tempdir(), "cproot")
- expect_true(length(list.files(path = checkpointDir, all.files = TRUE)) == 0)
-
- setCheckpointDir(checkpointDir)
- df <- read.json(jsonPath)
- df <- checkpoint(df)
- expect_is(df, "SparkDataFrame")
- expect_false(length(list.files(path = checkpointDir, all.files = TRUE)) == 0)
- }
-})
-
-test_that("schema(), dtypes(), columns(), names() return the correct values/format", {
- df <- read.json(jsonPath)
- testSchema <- schema(df)
- expect_equal(length(testSchema$fields()), 2)
- expect_equal(testSchema$fields()[[1]]$dataType.toString(), "LongType")
- expect_equal(testSchema$fields()[[2]]$dataType.simpleString(), "string")
- expect_equal(testSchema$fields()[[1]]$name(), "age")
-
- testTypes <- dtypes(df)
- expect_equal(length(testTypes[[1]]), 2)
- expect_equal(testTypes[[1]][1], "age")
-
- testCols <- columns(df)
- expect_equal(length(testCols), 2)
- expect_equal(testCols[2], "name")
-
- testNames <- names(df)
- expect_equal(length(testNames), 2)
- expect_equal(testNames[2], "name")
-})
-
-test_that("names() colnames() set the column names", {
- df <- read.json(jsonPath)
- names(df) <- c("col1", "col2")
- expect_equal(colnames(df)[2], "col2")
-
- colnames(df) <- c("col3", "col4")
- expect_equal(names(df)[1], "col3")
-
- expect_error(names(df) <- NULL, "Invalid column names.")
- expect_error(names(df) <- c("sepal.length", "sepal_width"),
- "Column names cannot contain the '.' symbol.")
- expect_error(names(df) <- c(1, 2), "Invalid column names.")
- expect_error(names(df) <- c("a"),
- "Column names must have the same length as the number of columns in the dataset.")
- expect_error(names(df) <- c("1", NA), "Column names cannot be NA.")
-
- expect_error(colnames(df) <- c("sepal.length", "sepal_width"),
- "Column names cannot contain the '.' symbol.")
- expect_error(colnames(df) <- c(1, 2), "Invalid column names.")
- expect_error(colnames(df) <- c("a"),
- "Column names must have the same length as the number of columns in the dataset.")
- expect_error(colnames(df) <- c("1", NA), "Column names cannot be NA.")
-
- # Note: if this test is broken, remove check for "." character on colnames<- method
- irisDF <- suppressWarnings(createDataFrame(iris))
- expect_equal(names(irisDF)[1], "Sepal_Length")
-
- # Test base::colnames base::names
- m2 <- cbind(1, 1:4)
- expect_equal(colnames(m2, do.NULL = FALSE), c("col1", "col2"))
- colnames(m2) <- c("x", "Y")
- expect_equal(colnames(m2), c("x", "Y"))
-
- z <- list(a = 1, b = "c", c = 1:3)
- expect_equal(names(z)[3], "c")
- names(z)[3] <- "c2"
- expect_equal(names(z)[3], "c2")
-
- # Test subset assignment
- colnames(df)[1] <- "col5"
- expect_equal(colnames(df)[1], "col5")
- names(df)[2] <- "col6"
- expect_equal(names(df)[2], "col6")
-})
-
-test_that("head() and first() return the correct data", {
- df <- read.json(jsonPath)
- testHead <- head(df)
- expect_equal(nrow(testHead), 3)
- expect_equal(ncol(testHead), 2)
-
- testHead2 <- head(df, 2)
- expect_equal(nrow(testHead2), 2)
- expect_equal(ncol(testHead2), 2)
-
- testFirst <- first(df)
- expect_equal(nrow(testFirst), 1)
-
- # head() and first() return the correct data on
- # a DataFrame with 0 rows
- df0 <- limit(df, 0)
-
- testHead <- head(df0)
- expect_equal(nrow(testHead), 0)
- expect_equal(ncol(testHead), 2)
-
- testFirst <- first(df0)
- expect_equal(nrow(testFirst), 0)
- expect_equal(ncol(testFirst), 2)
-})
-
-test_that("distinct(), unique() and dropDuplicates() on DataFrames", {
- lines <- c("{\"name\":\"Michael\"}",
- "{\"name\":\"Andy\", \"age\":30}",
- "{\"name\":\"Justin\", \"age\":19}",
- "{\"name\":\"Justin\", \"age\":19}")
- jsonPathWithDup <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- writeLines(lines, jsonPathWithDup)
-
- df <- read.json(jsonPathWithDup)
- uniques <- distinct(df)
- expect_is(uniques, "SparkDataFrame")
- expect_equal(count(uniques), 3)
-
- uniques2 <- unique(df)
- expect_is(uniques2, "SparkDataFrame")
- expect_equal(count(uniques2), 3)
-
- # Test dropDuplicates()
- df <- createDataFrame(
- list(
- list(2, 1, 2), list(1, 1, 1),
- list(1, 2, 1), list(2, 1, 2),
- list(2, 2, 2), list(2, 2, 1),
- list(2, 1, 1), list(1, 1, 2),
- list(1, 2, 2), list(1, 2, 1)),
- schema = c("key", "value1", "value2"))
- result <- collect(dropDuplicates(df))
- expected <- rbind.data.frame(
- c(1, 1, 1), c(1, 1, 2), c(1, 2, 1),
- c(1, 2, 2), c(2, 1, 1), c(2, 1, 2),
- c(2, 2, 1), c(2, 2, 2))
- names(expected) <- c("key", "value1", "value2")
- expect_equivalent(
- result[order(result$key, result$value1, result$value2), ],
- expected)
-
- result <- collect(dropDuplicates(df, c("key", "value1")))
- expected <- rbind.data.frame(
- c(1, 1, 1), c(1, 2, 1), c(2, 1, 2), c(2, 2, 2))
- names(expected) <- c("key", "value1", "value2")
- expect_equivalent(
- result[order(result$key, result$value1, result$value2), ],
- expected)
-
- result <- collect(dropDuplicates(df, "key", "value1"))
- expected <- rbind.data.frame(
- c(1, 1, 1), c(1, 2, 1), c(2, 1, 2), c(2, 2, 2))
- names(expected) <- c("key", "value1", "value2")
- expect_equivalent(
- result[order(result$key, result$value1, result$value2), ],
- expected)
-
- result <- collect(dropDuplicates(df, "key"))
- expected <- rbind.data.frame(
- c(1, 1, 1), c(2, 1, 2))
- names(expected) <- c("key", "value1", "value2")
- expect_equivalent(
- result[order(result$key, result$value1, result$value2), ],
- expected)
-})
-
-test_that("sample on a DataFrame", {
- df <- read.json(jsonPath)
- sampled <- sample(df, FALSE, 1.0)
- expect_equal(nrow(collect(sampled)), count(df))
- expect_is(sampled, "SparkDataFrame")
- sampled2 <- sample(df, FALSE, 0.1, 0) # set seed for predictable result
- expect_true(count(sampled2) < 3)
-
- count1 <- count(sample(df, FALSE, 0.1, 0))
- count2 <- count(sample(df, FALSE, 0.1, 0))
- expect_equal(count1, count2)
-
- # Also test sample_frac
- sampled3 <- sample_frac(df, FALSE, 0.1, 0) # set seed for predictable result
- expect_true(count(sampled3) < 3)
-
- # nolint start
- # Test base::sample is working
- #expect_equal(length(sample(1:12)), 12)
- # nolint end
-})
-
-test_that("select operators", {
- df <- select(read.json(jsonPath), "name", "age")
- expect_is(df$name, "Column")
- expect_is(df[[2]], "Column")
- expect_is(df[["age"]], "Column")
-
- expect_warning(df[[1:2]],
- "Subset index has length > 1. Only the first index is used.")
- expect_is(suppressWarnings(df[[1:2]]), "Column")
- expect_warning(df[[c("name", "age")]],
- "Subset index has length > 1. Only the first index is used.")
- expect_is(suppressWarnings(df[[c("name", "age")]]), "Column")
-
- expect_warning(df[[1:2]] <- df[[1]],
- "Subset index has length > 1. Only the first index is used.")
- expect_warning(df[[c("name", "age")]] <- df[[1]],
- "Subset index has length > 1. Only the first index is used.")
-
- expect_is(df[, 1, drop = F], "SparkDataFrame")
- expect_equal(columns(df[, 1, drop = F]), c("name"))
- expect_equal(columns(df[, "age", drop = F]), c("age"))
-
- df2 <- df[, c("age", "name")]
- expect_is(df2, "SparkDataFrame")
- expect_equal(columns(df2), c("age", "name"))
-
- df$age2 <- df$age
- expect_equal(columns(df), c("name", "age", "age2"))
- expect_equal(count(where(df, df$age2 == df$age)), 2)
- df$age2 <- df$age * 2
- expect_equal(columns(df), c("name", "age", "age2"))
- expect_equal(count(where(df, df$age2 == df$age * 2)), 2)
- df$age2 <- df[["age"]] * 3
- expect_equal(columns(df), c("name", "age", "age2"))
- expect_equal(count(where(df, df$age2 == df$age * 3)), 2)
-
- df$age2 <- 21
- expect_equal(columns(df), c("name", "age", "age2"))
- expect_equal(count(where(df, df$age2 == 21)), 3)
-
- df$age2 <- c(22)
- expect_equal(columns(df), c("name", "age", "age2"))
- expect_equal(count(where(df, df$age2 == 22)), 3)
-
- expect_error(df$age3 <- c(22, NA),
- "value must be a Column, literal value as atomic in length of 1, or NULL")
-
- df[["age2"]] <- 23
- expect_equal(columns(df), c("name", "age", "age2"))
- expect_equal(count(where(df, df$age2 == 23)), 3)
-
- df[[3]] <- 24
- expect_equal(columns(df), c("name", "age", "age2"))
- expect_equal(count(where(df, df$age2 == 24)), 3)
-
- df[[3]] <- df$age
- expect_equal(count(where(df, df$age2 == df$age)), 2)
-
- df[["age2"]] <- df[["name"]]
- expect_equal(count(where(df, df$age2 == df$name)), 3)
-
- expect_error(df[["age3"]] <- c(22, 23),
- "value must be a Column, literal value as atomic in length of 1, or NULL")
-
- # Test parameter drop
- expect_equal(class(df[, 1]) == "SparkDataFrame", T)
- expect_equal(class(df[, 1, drop = T]) == "Column", T)
- expect_equal(class(df[, 1, drop = F]) == "SparkDataFrame", T)
- expect_equal(class(df[df$age > 4, 2, drop = T]) == "Column", T)
- expect_equal(class(df[df$age > 4, 2, drop = F]) == "SparkDataFrame", T)
-})
-
-test_that("select with column", {
- df <- read.json(jsonPath)
- df1 <- select(df, "name")
- expect_equal(columns(df1), c("name"))
- expect_equal(count(df1), 3)
-
- df2 <- select(df, df$age)
- expect_equal(columns(df2), c("age"))
- expect_equal(count(df2), 3)
-
- df3 <- select(df, lit("x"))
- expect_equal(columns(df3), c("x"))
- expect_equal(count(df3), 3)
- expect_equal(collect(select(df3, "x"))[[1, 1]], "x")
-
- df4 <- select(df, c("name", "age"))
- expect_equal(columns(df4), c("name", "age"))
- expect_equal(count(df4), 3)
-
- # Test select with alias
- df5 <- alias(df, "table")
-
- expect_equal(columns(select(df5, column("table.name"))), "name")
- expect_equal(columns(select(df5, "table.name")), "name")
-
- # Test that stats::alias is not masked
- expect_is(alias(aov(yield ~ block + N * P * K, npk)), "listof")
-
-
- expect_error(select(df, c("name", "age"), "name"),
- "To select multiple columns, use a character vector or list for col")
-})
-
-test_that("drop column", {
- df <- select(read.json(jsonPath), "name", "age")
- df1 <- drop(df, "name")
- expect_equal(columns(df1), c("age"))
-
- df$age2 <- df$age
- df1 <- drop(df, c("name", "age"))
- expect_equal(columns(df1), c("age2"))
-
- df1 <- drop(df, df$age)
- expect_equal(columns(df1), c("name", "age2"))
-
- df$age2 <- NULL
- expect_equal(columns(df), c("name", "age"))
- df$age3 <- NULL
- expect_equal(columns(df), c("name", "age"))
-
- # Test to make sure base::drop is not masked
- expect_equal(drop(1:3 %*% 2:4), 20)
-})
-
-test_that("subsetting", {
- # read.json returns columns in random order
- df <- select(read.json(jsonPath), "name", "age")
- filtered <- df[df$age > 20, ]
- expect_equal(count(filtered), 1)
- expect_equal(columns(filtered), c("name", "age"))
- expect_equal(collect(filtered)$name, "Andy")
-
- df2 <- df[df$age == 19, 1, drop = F]
- expect_is(df2, "SparkDataFrame")
- expect_equal(count(df2), 1)
- expect_equal(columns(df2), c("name"))
- expect_equal(collect(df2)$name, "Justin")
-
- df3 <- df[df$age > 20, 2, drop = F]
- expect_equal(count(df3), 1)
- expect_equal(columns(df3), c("age"))
-
- df4 <- df[df$age %in% c(19, 30), 1:2]
- expect_equal(count(df4), 2)
- expect_equal(columns(df4), c("name", "age"))
-
- df5 <- df[df$age %in% c(19), c(1, 2)]
- expect_equal(count(df5), 1)
- expect_equal(columns(df5), c("name", "age"))
-
- df6 <- subset(df, df$age %in% c(30), c(1, 2))
- expect_equal(count(df6), 1)
- expect_equal(columns(df6), c("name", "age"))
-
- df7 <- subset(df, select = "name", drop = F)
- expect_equal(count(df7), 3)
- expect_equal(columns(df7), c("name"))
-
- # Test base::subset is working
- expect_equal(nrow(subset(airquality, Temp > 80, select = c(Ozone, Temp))), 68)
-})
-
-test_that("selectExpr() on a DataFrame", {
- df <- read.json(jsonPath)
- selected <- selectExpr(df, "age * 2")
- expect_equal(names(selected), "(age * 2)")
- expect_equal(collect(selected), collect(select(df, df$age * 2L)))
-
- selected2 <- selectExpr(df, "name as newName", "abs(age) as age")
- expect_equal(names(selected2), c("newName", "age"))
- expect_equal(count(selected2), 3)
-})
-
-test_that("expr() on a DataFrame", {
- df <- read.json(jsonPath)
- expect_equal(collect(select(df, expr("abs(-123)")))[1, 1], 123)
-})
-
-test_that("column calculation", {
- df <- read.json(jsonPath)
- d <- collect(select(df, alias(df$age + 1, "age2")))
- expect_equal(names(d), c("age2"))
- df2 <- select(df, lower(df$name), abs(df$age))
- expect_is(df2, "SparkDataFrame")
- expect_equal(count(df2), 3)
-})
-
-test_that("test HiveContext", {
- if (not_cran_or_windows_with_hadoop()) {
- setHiveContext(sc)
-
- schema <- structType(structField("name", "string"), structField("age", "integer"),
- structField("height", "float"))
- createTable("people", source = "json", schema = schema)
- df <- read.df(jsonPathNa, "json", schema)
- insertInto(df, "people")
- expect_equal(collect(sql("SELECT age from people WHERE name = 'Bob'"))$age, c(16))
- sql("DROP TABLE people")
-
- df <- createTable("json", jsonPath, "json")
- expect_is(df, "SparkDataFrame")
- expect_equal(count(df), 3)
- df2 <- sql("select * from json")
- expect_is(df2, "SparkDataFrame")
- expect_equal(count(df2), 3)
-
- jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- saveAsTable(df, "json2", "json", "append", path = jsonPath2)
- df3 <- sql("select * from json2")
- expect_is(df3, "SparkDataFrame")
- expect_equal(count(df3), 3)
- unlink(jsonPath2)
-
- hivetestDataPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- saveAsTable(df, "hivetestbl", path = hivetestDataPath)
- df4 <- sql("select * from hivetestbl")
- expect_is(df4, "SparkDataFrame")
- expect_equal(count(df4), 3)
- unlink(hivetestDataPath)
-
- parquetDataPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- saveAsTable(df, "parquetest", "parquet", mode = "overwrite", path = parquetDataPath)
- df5 <- sql("select * from parquetest")
- expect_is(df5, "SparkDataFrame")
- expect_equal(count(df5), 3)
- unlink(parquetDataPath)
-
- unsetHiveContext()
- }
-})
-
-test_that("column operators", {
- c <- column("a")
- c2 <- (- c + 1 - 2) * 3 / 4.0
- c3 <- (c + c2 - c2) * c2 %% c2
- c4 <- (c > c2) & (c2 <= c3) | (c == c2) & (c2 != c3)
- c5 <- c2 ^ c3 ^ c4
- c6 <- c2 %<=>% c3
- c7 <- !c6
-})
-
-test_that("column functions", {
- skip_on_cran()
-
- c <- column("a")
- c1 <- abs(c) + acos(c) + approxCountDistinct(c) + ascii(c) + asin(c) + atan(c)
- c2 <- avg(c) + base64(c) + bin(c) + bitwiseNOT(c) + cbrt(c) + ceil(c) + cos(c)
- c3 <- cosh(c) + count(c) + crc32(c) + hash(c) + exp(c)
- c4 <- explode(c) + expm1(c) + factorial(c) + first(c) + floor(c) + hex(c)
- c5 <- hour(c) + initcap(c) + last(c) + last_day(c) + length(c)
- c6 <- log(c) + (c) + log1p(c) + log2(c) + lower(c) + ltrim(c) + max(c) + md5(c)
- c7 <- mean(c) + min(c) + month(c) + negate(c) + posexplode(c) + quarter(c)
- c8 <- reverse(c) + rint(c) + round(c) + rtrim(c) + sha1(c) + monotonically_increasing_id()
- c9 <- signum(c) + sin(c) + sinh(c) + size(c) + stddev(c) + soundex(c) + sqrt(c) + sum(c)
- c10 <- sumDistinct(c) + tan(c) + tanh(c) + toDegrees(c) + toRadians(c)
- c11 <- to_date(c) + trim(c) + unbase64(c) + unhex(c) + upper(c)
- c12 <- variance(c)
- c13 <- lead("col", 1) + lead(c, 1) + lag("col", 1) + lag(c, 1)
- c14 <- cume_dist() + ntile(1) + corr(c, c1)
- c15 <- dense_rank() + percent_rank() + rank() + row_number()
- c16 <- is.nan(c) + isnan(c) + isNaN(c)
- c17 <- cov(c, c1) + cov("c", "c1") + covar_samp(c, c1) + covar_samp("c", "c1")
- c18 <- covar_pop(c, c1) + covar_pop("c", "c1")
- c19 <- spark_partition_id() + coalesce(c) + coalesce(c1, c2, c3)
- c20 <- to_timestamp(c) + to_timestamp(c, "yyyy") + to_date(c, "yyyy")
- c21 <- posexplode_outer(c) + explode_outer(c)
- c22 <- not(c)
-
- # Test if base::is.nan() is exposed
- expect_equal(is.nan(c("a", "b")), c(FALSE, FALSE))
-
- # Test if base::rank() is exposed
- expect_equal(class(rank())[[1]], "Column")
- expect_equal(rank(1:3), as.numeric(c(1:3)))
-
- df <- read.json(jsonPath)
- df2 <- select(df, between(df$age, c(20, 30)), between(df$age, c(10, 20)))
- expect_equal(collect(df2)[[2, 1]], TRUE)
- expect_equal(collect(df2)[[2, 2]], FALSE)
- expect_equal(collect(df2)[[3, 1]], FALSE)
- expect_equal(collect(df2)[[3, 2]], TRUE)
-
- # Test that input_file_name() returns the name of the source file
- actual_names <- sort(collect(distinct(select(df, input_file_name()))))
- expect_equal(length(actual_names), 1)
- expect_equal(basename(actual_names[1, 1]), basename(jsonPath))
-
- df3 <- select(df, between(df$name, c("Apache", "Spark")))
- expect_equal(collect(df3)[[1, 1]], TRUE)
- expect_equal(collect(df3)[[2, 1]], FALSE)
- expect_equal(collect(df3)[[3, 1]], TRUE)
-
- df4 <- select(df, countDistinct(df$age, df$name))
- expect_equal(collect(df4)[[1, 1]], 2)
-
- expect_equal(collect(select(df, sum(df$age)))[1, 1], 49)
- expect_true(abs(collect(select(df, stddev(df$age)))[1, 1] - 7.778175) < 1e-6)
- expect_equal(collect(select(df, var_pop(df$age)))[1, 1], 30.25)
-
- df5 <- createDataFrame(list(list(a = "010101")))
- expect_equal(collect(select(df5, conv(df5$a, 2, 16)))[1, 1], "15")
-
- # Test array_contains() and sort_array()
- df <- createDataFrame(list(list(list(1L, 2L, 3L)), list(list(6L, 5L, 4L))))
- result <- collect(select(df, array_contains(df[[1]], 1L)))[[1]]
- expect_equal(result, c(TRUE, FALSE))
-
- result <- collect(select(df, sort_array(df[[1]], FALSE)))[[1]]
- expect_equal(result, list(list(3L, 2L, 1L), list(6L, 5L, 4L)))
- result <- collect(select(df, sort_array(df[[1]])))[[1]]
- expect_equal(result, list(list(1L, 2L, 3L), list(4L, 5L, 6L)))
-
- # Test that stats::lag is working
- expect_equal(length(lag(ldeaths, 12)), 72)
-
- # Test struct()
- df <- createDataFrame(list(list(1L, 2L, 3L), list(4L, 5L, 6L)),
- schema = c("a", "b", "c"))
- result <- collect(select(df, alias(struct("a", "c"), "d")))
- expected <- data.frame(row.names = 1:2)
- expected$"d" <- list(listToStruct(list(a = 1L, c = 3L)),
- listToStruct(list(a = 4L, c = 6L)))
- expect_equal(result, expected)
-
- result <- collect(select(df, alias(struct(df$a, df$b), "d")))
- expected <- data.frame(row.names = 1:2)
- expected$"d" <- list(listToStruct(list(a = 1L, b = 2L)),
- listToStruct(list(a = 4L, b = 5L)))
- expect_equal(result, expected)
-
- # Test encode(), decode()
- bytes <- as.raw(c(0xe5, 0xa4, 0xa7, 0xe5, 0x8d, 0x83, 0xe4, 0xb8, 0x96, 0xe7, 0x95, 0x8c))
- df <- createDataFrame(list(list(markUtf8("大千世界"), "utf-8", bytes)),
- schema = c("a", "b", "c"))
- result <- collect(select(df, encode(df$a, "utf-8"), decode(df$c, "utf-8")))
- expect_equal(result[[1]][[1]], bytes)
- expect_equal(result[[2]], markUtf8("大千世界"))
-
- # Test first(), last()
- df <- read.json(jsonPath)
- expect_equal(collect(select(df, first(df$age)))[[1]], NA_real_)
- expect_equal(collect(select(df, first(df$age, TRUE)))[[1]], 30)
- expect_equal(collect(select(df, first("age")))[[1]], NA_real_)
- expect_equal(collect(select(df, first("age", TRUE)))[[1]], 30)
- expect_equal(collect(select(df, last(df$age)))[[1]], 19)
- expect_equal(collect(select(df, last(df$age, TRUE)))[[1]], 19)
- expect_equal(collect(select(df, last("age")))[[1]], 19)
- expect_equal(collect(select(df, last("age", TRUE)))[[1]], 19)
-
- # Test bround()
- df <- createDataFrame(data.frame(x = c(2.5, 3.5)))
- expect_equal(collect(select(df, bround(df$x, 0)))[[1]][1], 2)
- expect_equal(collect(select(df, bround(df$x, 0)))[[1]][2], 4)
-
- # Test to_json(), from_json()
- df <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people")
- j <- collect(select(df, alias(to_json(df$people), "json")))
- expect_equal(j[order(j$json), ][1], "[{\"name\":\"Bob\"},{\"name\":\"Alice\"}]")
-
- df <- read.json(mapTypeJsonPath)
- j <- collect(select(df, alias(to_json(df$info), "json")))
- expect_equal(j[order(j$json), ][1], "{\"age\":16,\"height\":176.5}")
- df <- as.DataFrame(j)
- schema <- structType(structField("age", "integer"),
- structField("height", "double"))
- s <- collect(select(df, alias(from_json(df$json, schema), "structcol")))
- expect_equal(ncol(s), 1)
- expect_equal(nrow(s), 3)
- expect_is(s[[1]][[1]], "struct")
- expect_true(any(apply(s, 1, function(x) { x[[1]]$age == 16 } )))
-
- # passing option
- df <- as.DataFrame(list(list("col" = "{\"date\":\"21/10/2014\"}")))
- schema2 <- structType(structField("date", "date"))
- s <- collect(select(df, from_json(df$col, schema2)))
- expect_equal(s[[1]][[1]], NA)
- s <- collect(select(df, from_json(df$col, schema2, dateFormat = "dd/MM/yyyy")))
- expect_is(s[[1]][[1]]$date, "Date")
- expect_equal(as.character(s[[1]][[1]]$date), "2014-10-21")
-
- # check that unparseable input yields NA
- df <- as.DataFrame(list(list("a" = "")))
- expect_equal(collect(select(df, from_json(df$a, schema)))[[1]][[1]], NA)
-
- # check that a JSON array encoded in a string column is correctly supported.
- jsonArr <- "[{\"name\":\"Bob\"}, {\"name\":\"Alice\"}]"
- df <- as.DataFrame(list(list("people" = jsonArr)))
- schema <- structType(structField("name", "string"))
- arr <- collect(select(df, alias(from_json(df$people, schema, as.json.array = TRUE), "arrcol")))
- expect_equal(ncol(arr), 1)
- expect_equal(nrow(arr), 1)
- expect_is(arr[[1]][[1]], "list")
- expect_equal(length(arr$arrcol[[1]]), 2)
- expect_equal(arr$arrcol[[1]][[1]]$name, "Bob")
- expect_equal(arr$arrcol[[1]][[2]]$name, "Alice")
-
- # Test create_array() and create_map()
- df <- as.DataFrame(data.frame(
- x = c(1.0, 2.0), y = c(-1.0, 3.0), z = c(-2.0, 5.0)
- ))
-
- arrs <- collect(select(df, create_array(df$x, df$y, df$z)))
- expect_equal(arrs[, 1], list(list(1, -1, -2), list(2, 3, 5)))
-
- maps <- collect(select(
- df, create_map(lit("x"), df$x, lit("y"), df$y, lit("z"), df$z)))
-
- expect_equal(
- maps[, 1],
- lapply(
- list(list(x = 1, y = -1, z = -2), list(x = 2, y = 3, z = 5)),
- as.environment))
-
- df <- as.DataFrame(data.frame(is_true = c(TRUE, FALSE, NA)))
- expect_equal(
- collect(select(df, alias(not(df$is_true), "is_false"))),
- data.frame(is_false = c(FALSE, TRUE, NA))
- )
-})
-
-test_that("column binary mathfunctions", {
- lines <- c("{\"a\":1, \"b\":5}",
- "{\"a\":2, \"b\":6}",
- "{\"a\":3, \"b\":7}",
- "{\"a\":4, \"b\":8}")
- jsonPathWithDup <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- writeLines(lines, jsonPathWithDup)
- df <- read.json(jsonPathWithDup)
- expect_equal(collect(select(df, atan2(df$a, df$b)))[1, "ATAN2(a, b)"], atan2(1, 5))
- expect_equal(collect(select(df, atan2(df$a, df$b)))[2, "ATAN2(a, b)"], atan2(2, 6))
- expect_equal(collect(select(df, atan2(df$a, df$b)))[3, "ATAN2(a, b)"], atan2(3, 7))
- expect_equal(collect(select(df, atan2(df$a, df$b)))[4, "ATAN2(a, b)"], atan2(4, 8))
- ## nolint start
- expect_equal(collect(select(df, hypot(df$a, df$b)))[1, "HYPOT(a, b)"], sqrt(1^2 + 5^2))
- expect_equal(collect(select(df, hypot(df$a, df$b)))[2, "HYPOT(a, b)"], sqrt(2^2 + 6^2))
- expect_equal(collect(select(df, hypot(df$a, df$b)))[3, "HYPOT(a, b)"], sqrt(3^2 + 7^2))
- expect_equal(collect(select(df, hypot(df$a, df$b)))[4, "HYPOT(a, b)"], sqrt(4^2 + 8^2))
- ## nolint end
- expect_equal(collect(select(df, shiftLeft(df$b, 1)))[4, 1], 16)
- expect_equal(collect(select(df, shiftRight(df$b, 1)))[4, 1], 4)
- expect_equal(collect(select(df, shiftRightUnsigned(df$b, 1)))[4, 1], 4)
- expect_equal(class(collect(select(df, rand()))[2, 1]), "numeric")
- expect_equal(collect(select(df, rand(1)))[1, 1], 0.134, tolerance = 0.01)
- expect_equal(class(collect(select(df, randn()))[2, 1]), "numeric")
- expect_equal(collect(select(df, randn(1)))[1, 1], -1.03, tolerance = 0.01)
-})
-
-test_that("string operators", {
- df <- read.json(jsonPath)
- expect_equal(count(where(df, like(df$name, "A%"))), 1)
- expect_equal(count(where(df, startsWith(df$name, "A"))), 1)
- expect_true(first(select(df, startsWith(df$name, "M")))[[1]])
- expect_false(first(select(df, startsWith(df$name, "m")))[[1]])
- expect_true(first(select(df, endsWith(df$name, "el")))[[1]])
- expect_equal(first(select(df, substr(df$name, 1, 2)))[[1]], "Mi")
- if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) {
- expect_true(startsWith("Hello World", "Hello"))
- expect_false(endsWith("Hello World", "a"))
- }
- expect_equal(collect(select(df, cast(df$age, "string")))[[2, 1]], "30")
- expect_equal(collect(select(df, concat(df$name, lit(":"), df$age)))[[2, 1]], "Andy:30")
- expect_equal(collect(select(df, concat_ws(":", df$name)))[[2, 1]], "Andy")
- expect_equal(collect(select(df, concat_ws(":", df$name, df$age)))[[2, 1]], "Andy:30")
- expect_equal(collect(select(df, instr(df$name, "i")))[, 1], c(2, 0, 5))
- expect_equal(collect(select(df, format_number(df$age, 2)))[2, 1], "30.00")
- expect_equal(collect(select(df, sha1(df$name)))[2, 1],
- "ab5a000e88b5d9d0fa2575f5c6263eb93452405d")
- expect_equal(collect(select(df, sha2(df$name, 256)))[2, 1],
- "80f2aed3c618c423ddf05a2891229fba44942d907173152442cf6591441ed6dc")
- expect_equal(collect(select(df, format_string("Name:%s", df$name)))[2, 1], "Name:Andy")
- expect_equal(collect(select(df, format_string("%s, %d", df$name, df$age)))[2, 1], "Andy, 30")
- expect_equal(collect(select(df, regexp_extract(df$name, "(n.y)", 1)))[2, 1], "ndy")
- expect_equal(collect(select(df, regexp_replace(df$name, "(n.y)", "ydn")))[2, 1], "Aydn")
-
- l2 <- list(list(a = "aaads"))
- df2 <- createDataFrame(l2)
- expect_equal(collect(select(df2, locate("aa", df2$a)))[1, 1], 1)
- expect_equal(collect(select(df2, locate("aa", df2$a, 2)))[1, 1], 2)
- expect_equal(collect(select(df2, lpad(df2$a, 8, "#")))[1, 1], "###aaads") # nolint
- expect_equal(collect(select(df2, rpad(df2$a, 8, "#")))[1, 1], "aaads###") # nolint
-
- l3 <- list(list(a = "a.b.c.d"))
- df3 <- createDataFrame(l3)
- expect_equal(collect(select(df3, substring_index(df3$a, ".", 2)))[1, 1], "a.b")
- expect_equal(collect(select(df3, substring_index(df3$a, ".", -3)))[1, 1], "b.c.d")
- expect_equal(collect(select(df3, translate(df3$a, "bc", "12")))[1, 1], "a.1.2.d")
-
- l4 <- list(list(a = "a.b@c.d 1\\b"))
- df4 <- createDataFrame(l4)
- expect_equal(
- collect(select(df4, split_string(df4$a, "\\s+")))[1, 1],
- list(list("a.b@c.d", "1\\b"))
- )
- expect_equal(
- collect(select(df4, split_string(df4$a, "\\.")))[1, 1],
- list(list("a", "b@c", "d 1\\b"))
- )
- expect_equal(
- collect(select(df4, split_string(df4$a, "@")))[1, 1],
- list(list("a.b", "c.d 1\\b"))
- )
- expect_equal(
- collect(select(df4, split_string(df4$a, "\\\\")))[1, 1],
- list(list("a.b@c.d 1", "b"))
- )
-
- l5 <- list(list(a = "abc"))
- df5 <- createDataFrame(l5)
- expect_equal(
- collect(select(df5, repeat_string(df5$a, 1L)))[1, 1],
- "abc"
- )
- expect_equal(
- collect(select(df5, repeat_string(df5$a, 3)))[1, 1],
- "abcabcabc"
- )
- expect_equal(
- collect(select(df5, repeat_string(df5$a, -1)))[1, 1],
- ""
- )
-})
-
-test_that("date functions on a DataFrame", {
- .originalTimeZone <- Sys.getenv("TZ")
- Sys.setenv(TZ = "UTC")
- l <- list(list(a = 1L, b = as.Date("2012-12-13")),
- list(a = 2L, b = as.Date("2013-12-14")),
- list(a = 3L, b = as.Date("2014-12-15")))
- df <- createDataFrame(l)
- expect_equal(collect(select(df, dayofmonth(df$b)))[, 1], c(13, 14, 15))
- expect_equal(collect(select(df, dayofyear(df$b)))[, 1], c(348, 348, 349))
- expect_equal(collect(select(df, weekofyear(df$b)))[, 1], c(50, 50, 51))
- expect_equal(collect(select(df, year(df$b)))[, 1], c(2012, 2013, 2014))
- expect_equal(collect(select(df, month(df$b)))[, 1], c(12, 12, 12))
- expect_equal(collect(select(df, last_day(df$b)))[, 1],
- c(as.Date("2012-12-31"), as.Date("2013-12-31"), as.Date("2014-12-31")))
- expect_equal(collect(select(df, next_day(df$b, "MONDAY")))[, 1],
- c(as.Date("2012-12-17"), as.Date("2013-12-16"), as.Date("2014-12-22")))
- expect_equal(collect(select(df, date_format(df$b, "y")))[, 1], c("2012", "2013", "2014"))
- expect_equal(collect(select(df, add_months(df$b, 3)))[, 1],
- c(as.Date("2013-03-13"), as.Date("2014-03-14"), as.Date("2015-03-15")))
- expect_equal(collect(select(df, date_add(df$b, 1)))[, 1],
- c(as.Date("2012-12-14"), as.Date("2013-12-15"), as.Date("2014-12-16")))
- expect_equal(collect(select(df, date_sub(df$b, 1)))[, 1],
- c(as.Date("2012-12-12"), as.Date("2013-12-13"), as.Date("2014-12-14")))
-
- l2 <- list(list(a = 1L, b = as.POSIXlt("2012-12-13 12:34:00", tz = "UTC")),
- list(a = 2L, b = as.POSIXlt("2014-12-15 01:24:34", tz = "UTC")))
- df2 <- createDataFrame(l2)
- expect_equal(collect(select(df2, minute(df2$b)))[, 1], c(34, 24))
- expect_equal(collect(select(df2, second(df2$b)))[, 1], c(0, 34))
- expect_equal(collect(select(df2, from_utc_timestamp(df2$b, "JST")))[, 1],
- c(as.POSIXlt("2012-12-13 21:34:00 UTC"), as.POSIXlt("2014-12-15 10:24:34 UTC")))
- expect_equal(collect(select(df2, to_utc_timestamp(df2$b, "JST")))[, 1],
- c(as.POSIXlt("2012-12-13 03:34:00 UTC"), as.POSIXlt("2014-12-14 16:24:34 UTC")))
- expect_gt(collect(select(df2, unix_timestamp()))[1, 1], 0)
- expect_gt(collect(select(df2, unix_timestamp(df2$b)))[1, 1], 0)
- expect_gt(collect(select(df2, unix_timestamp(lit("2015-01-01"), "yyyy-MM-dd")))[1, 1], 0)
-
- l3 <- list(list(a = 1000), list(a = -1000))
- df3 <- createDataFrame(l3)
- result31 <- collect(select(df3, from_unixtime(df3$a)))
- expect_equal(grep("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", result31[, 1], perl = TRUE),
- c(1, 2))
- result32 <- collect(select(df3, from_unixtime(df3$a, "yyyy")))
- expect_equal(grep("\\d{4}", result32[, 1]), c(1, 2))
- Sys.setenv(TZ = .originalTimeZone)
-})
-
-test_that("greatest() and least() on a DataFrame", {
- l <- list(list(a = 1, b = 2), list(a = 3, b = 4))
- df <- createDataFrame(l)
- expect_equal(collect(select(df, greatest(df$a, df$b)))[, 1], c(2, 4))
- expect_equal(collect(select(df, least(df$a, df$b)))[, 1], c(1, 3))
-})
-
-test_that("time windowing (window()) with all inputs", {
- df <- createDataFrame(data.frame(t = c("2016-03-11 09:00:07"), v = c(1)))
- df$window <- window(df$t, "5 seconds", "5 seconds", "0 seconds")
- local <- collect(df)$v
- # Not checking time windows because of possible time zone issues. Just checking that the function
- # works
- expect_equal(local, c(1))
-})
-
-test_that("time windowing (window()) with slide duration", {
- df <- createDataFrame(data.frame(t = c("2016-03-11 09:00:07"), v = c(1)))
- df$window <- window(df$t, "5 seconds", "2 seconds")
- local <- collect(df)$v
- # Not checking time windows because of possible time zone issues. Just checking that the function
- # works
- expect_equal(local, c(1, 1))
-})
-
-test_that("time windowing (window()) with start time", {
- df <- createDataFrame(data.frame(t = c("2016-03-11 09:00:07"), v = c(1)))
- df$window <- window(df$t, "5 seconds", startTime = "2 seconds")
- local <- collect(df)$v
- # Not checking time windows because of possible time zone issues. Just checking that the function
- # works
- expect_equal(local, c(1))
-})
-
-test_that("time windowing (window()) with just window duration", {
- df <- createDataFrame(data.frame(t = c("2016-03-11 09:00:07"), v = c(1)))
- df$window <- window(df$t, "5 seconds")
- local <- collect(df)$v
- # Not checking time windows because of possible time zone issues. Just checking that the function
- # works
- expect_equal(local, c(1))
-})
-
-test_that("when(), otherwise() and ifelse() on a DataFrame", {
- l <- list(list(a = 1, b = 2), list(a = 3, b = 4))
- df <- createDataFrame(l)
- expect_equal(collect(select(df, when(df$a > 1 & df$b > 2, 1)))[, 1], c(NA, 1))
- expect_equal(collect(select(df, otherwise(when(df$a > 1, 1), 0)))[, 1], c(0, 1))
- expect_equal(collect(select(df, ifelse(df$a > 1 & df$b > 2, 0, 1)))[, 1], c(1, 0))
-})
-
-test_that("when(), otherwise() and ifelse() with column on a DataFrame", {
- l <- list(list(a = 1, b = 2), list(a = 3, b = 4))
- df <- createDataFrame(l)
- expect_equal(collect(select(df, when(df$a > 1 & df$b > 2, lit(1))))[, 1], c(NA, 1))
- expect_equal(collect(select(df, otherwise(when(df$a > 1, lit(1)), lit(0))))[, 1], c(0, 1))
- expect_equal(collect(select(df, ifelse(df$a > 1 & df$b > 2, lit(0), lit(1))))[, 1], c(1, 0))
-})
-
-test_that("group by, agg functions", {
- skip_on_cran()
-
- df <- read.json(jsonPath)
- df1 <- agg(df, name = "max", age = "sum")
- expect_equal(1, count(df1))
- df1 <- agg(df, age2 = max(df$age))
- expect_equal(1, count(df1))
- expect_equal(columns(df1), c("age2"))
-
- gd <- groupBy(df, "name")
- expect_is(gd, "GroupedData")
- df2 <- count(gd)
- expect_is(df2, "SparkDataFrame")
- expect_equal(3, count(df2))
-
- # Also test group_by, summarize, mean
- gd1 <- group_by(df, "name")
- expect_is(gd1, "GroupedData")
- df_summarized <- summarize(gd, mean_age = mean(df$age))
- expect_is(df_summarized, "SparkDataFrame")
- expect_equal(3, count(df_summarized))
-
- df3 <- agg(gd, age = "stddev")
- expect_is(df3, "SparkDataFrame")
- df3_local <- collect(df3)
- expect_true(is.nan(df3_local[df3_local$name == "Andy", ][1, 2]))
-
- df4 <- agg(gd, sumAge = sum(df$age))
- expect_is(df4, "SparkDataFrame")
- expect_equal(3, count(df4))
- expect_equal(columns(df4), c("name", "sumAge"))
-
- df5 <- sum(gd, "age")
- expect_is(df5, "SparkDataFrame")
- expect_equal(3, count(df5))
-
- expect_equal(3, count(mean(gd)))
- expect_equal(3, count(max(gd)))
- expect_equal(30, collect(max(gd))[2, 2])
- expect_equal(1, collect(count(gd))[1, 2])
-
- mockLines2 <- c("{\"name\":\"ID1\", \"value\": \"10\"}",
- "{\"name\":\"ID1\", \"value\": \"10\"}",
- "{\"name\":\"ID1\", \"value\": \"22\"}",
- "{\"name\":\"ID2\", \"value\": \"-3\"}")
- jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- writeLines(mockLines2, jsonPath2)
- gd2 <- groupBy(read.json(jsonPath2), "name")
- df6 <- agg(gd2, value = "sum")
- df6_local <- collect(df6)
- expect_equal(42, df6_local[df6_local$name == "ID1", ][1, 2])
- expect_equal(-3, df6_local[df6_local$name == "ID2", ][1, 2])
-
- df7 <- agg(gd2, value = "stddev")
- df7_local <- collect(df7)
- expect_true(abs(df7_local[df7_local$name == "ID1", ][1, 2] - 6.928203) < 1e-6)
- expect_true(is.nan(df7_local[df7_local$name == "ID2", ][1, 2]))
-
- mockLines3 <- c("{\"name\":\"Andy\", \"age\":30}",
- "{\"name\":\"Andy\", \"age\":30}",
- "{\"name\":\"Justin\", \"age\":19}",
- "{\"name\":\"Justin\", \"age\":1}")
- jsonPath3 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- writeLines(mockLines3, jsonPath3)
- df8 <- read.json(jsonPath3)
- gd3 <- groupBy(df8, "name")
- gd3_local <- collect(sum(gd3))
- expect_equal(60, gd3_local[gd3_local$name == "Andy", ][1, 2])
- expect_equal(20, gd3_local[gd3_local$name == "Justin", ][1, 2])
-
- expect_true(abs(collect(agg(df, sd(df$age)))[1, 1] - 7.778175) < 1e-6)
- gd3_local <- collect(agg(gd3, var(df8$age)))
- expect_equal(162, gd3_local[gd3_local$name == "Justin", ][1, 2])
-
- # Test stats::sd, stats::var are working
- expect_true(abs(sd(1:2) - 0.7071068) < 1e-6)
- expect_true(abs(var(1:5, 1:5) - 2.5) < 1e-6)
-
- # Test collect_list and collect_set
- gd3_collections_local <- collect(
- agg(gd3, collect_set(df8$age), collect_list(df8$age))
- )
-
- expect_equal(
- unlist(gd3_collections_local[gd3_collections_local$name == "Andy", 2]),
- c(30)
- )
-
- expect_equal(
- unlist(gd3_collections_local[gd3_collections_local$name == "Andy", 3]),
- c(30, 30)
- )
-
- expect_equal(
- sort(unlist(
- gd3_collections_local[gd3_collections_local$name == "Justin", 3]
- )),
- c(1, 19)
- )
-
- unlink(jsonPath2)
- unlink(jsonPath3)
-})
-
-test_that("pivot GroupedData column", {
- df <- createDataFrame(data.frame(
- earnings = c(10000, 10000, 11000, 15000, 12000, 20000, 21000, 22000),
- course = c("R", "Python", "R", "Python", "R", "Python", "R", "Python"),
- year = c(2013, 2013, 2014, 2014, 2015, 2015, 2016, 2016)
- ))
- sum1 <- collect(sum(pivot(groupBy(df, "year"), "course"), "earnings"))
- sum2 <- collect(sum(pivot(groupBy(df, "year"), "course", c("Python", "R")), "earnings"))
- sum3 <- collect(sum(pivot(groupBy(df, "year"), "course", list("Python", "R")), "earnings"))
- sum4 <- collect(sum(pivot(groupBy(df, "year"), "course", "R"), "earnings"))
-
- correct_answer <- data.frame(
- year = c(2013, 2014, 2015, 2016),
- Python = c(10000, 15000, 20000, 22000),
- R = c(10000, 11000, 12000, 21000)
- )
- expect_equal(sum1, correct_answer)
- expect_equal(sum2, correct_answer)
- expect_equal(sum3, correct_answer)
- expect_equal(sum4, correct_answer[, c("year", "R")])
-
- expect_error(collect(sum(pivot(groupBy(df, "year"), "course", c("R", "R")), "earnings")))
- expect_error(collect(sum(pivot(groupBy(df, "year"), "course", list("R", "R")), "earnings")))
-})
-
-test_that("test multi-dimensional aggregations with cube and rollup", {
- df <- createDataFrame(data.frame(
- id = 1:6,
- year = c(2016, 2016, 2016, 2017, 2017, 2017),
- salary = c(10000, 15000, 20000, 22000, 32000, 21000),
- department = c("management", "rnd", "sales", "management", "rnd", "sales")
- ))
-
- actual_cube <- collect(
- orderBy(
- agg(
- cube(df, "year", "department"),
- expr("sum(salary) AS total_salary"),
- expr("avg(salary) AS average_salary"),
- alias(grouping_bit(df$year), "grouping_year"),
- alias(grouping_bit(df$department), "grouping_department"),
- alias(grouping_id(df$year, df$department), "grouping_id")
- ),
- "year", "department"
- )
- )
-
- expected_cube <- data.frame(
- year = c(rep(NA, 4), rep(2016, 4), rep(2017, 4)),
- department = rep(c(NA, "management", "rnd", "sales"), times = 3),
- total_salary = c(
- 120000, # Total
- 10000 + 22000, 15000 + 32000, 20000 + 21000, # Department only
- 20000 + 15000 + 10000, # 2016
- 10000, 15000, 20000, # 2016 each department
- 21000 + 32000 + 22000, # 2017
- 22000, 32000, 21000 # 2017 each department
- ),
- average_salary = c(
- # Total
- mean(c(20000, 15000, 10000, 21000, 32000, 22000)),
- # Mean by department
- mean(c(10000, 22000)), mean(c(15000, 32000)), mean(c(20000, 21000)),
- mean(c(10000, 15000, 20000)), # 2016
- 10000, 15000, 20000, # 2016 each department
- mean(c(21000, 32000, 22000)), # 2017
- 22000, 32000, 21000 # 2017 each department
- ),
- grouping_year = c(
- 1, # global
- 1, 1, 1, # by department
- 0, # 2016
- 0, 0, 0, # 2016 by department
- 0, # 2017
- 0, 0, 0 # 2017 by department
- ),
- grouping_department = c(
- 1, # global
- 0, 0, 0, # by department
- 1, # 2016
- 0, 0, 0, # 2016 by department
- 1, # 2017
- 0, 0, 0 # 2017 by department
- ),
- grouping_id = c(
- 3, # 11
- 2, 2, 2, # 10
- 1, # 01
- 0, 0, 0, # 00
- 1, # 01
- 0, 0, 0 # 00
- ),
- stringsAsFactors = FALSE
- )
-
- expect_equal(actual_cube, expected_cube)
-
- # cube should accept column objects
- expect_equal(
- count(sum(cube(df, df$year, df$department), "salary")),
- 12
- )
-
- # cube without columns should result in a single aggregate
- expect_equal(
- collect(agg(cube(df), expr("sum(salary) as total_salary"))),
- data.frame(total_salary = 120000)
- )
-
- actual_rollup <- collect(
- orderBy(
- agg(
- rollup(df, "year", "department"),
- expr("sum(salary) AS total_salary"), expr("avg(salary) AS average_salary"),
- alias(grouping_bit(df$year), "grouping_year"),
- alias(grouping_bit(df$department), "grouping_department"),
- alias(grouping_id(df$year, df$department), "grouping_id")
- ),
- "year", "department"
- )
- )
-
- expected_rollup <- data.frame(
- year = c(NA, rep(2016, 4), rep(2017, 4)),
- department = c(NA, rep(c(NA, "management", "rnd", "sales"), times = 2)),
- total_salary = c(
- 120000, # Total
- 20000 + 15000 + 10000, # 2016
- 10000, 15000, 20000, # 2016 each department
- 21000 + 32000 + 22000, # 2017
- 22000, 32000, 21000 # 2017 each department
- ),
- average_salary = c(
- # Total
- mean(c(20000, 15000, 10000, 21000, 32000, 22000)),
- mean(c(10000, 15000, 20000)), # 2016
- 10000, 15000, 20000, # 2016 each department
- mean(c(21000, 32000, 22000)), # 2017
- 22000, 32000, 21000 # 2017 each department
- ),
- grouping_year = c(
- 1, # global
- 0, # 2016
- 0, 0, 0, # 2016 each department
- 0, # 2017
- 0, 0, 0 # 2017 each department
- ),
- grouping_department = c(
- 1, # global
- 1, # 2016
- 0, 0, 0, # 2016 each department
- 1, # 2017
- 0, 0, 0 # 2017 each department
- ),
- grouping_id = c(
- 3, # 11
- 1, # 01
- 0, 0, 0, # 00
- 1, # 01
- 0, 0, 0 # 00
- ),
- stringsAsFactors = FALSE
- )
-
- expect_equal(actual_rollup, expected_rollup)
-
- # cube should accept column objects
- expect_equal(
- count(sum(rollup(df, df$year, df$department), "salary")),
- 9
- )
-
- # rollup without columns should result in a single aggregate
- expect_equal(
- collect(agg(rollup(df), expr("sum(salary) as total_salary"))),
- data.frame(total_salary = 120000)
- )
-})
-
-test_that("arrange() and orderBy() on a DataFrame", {
- df <- read.json(jsonPath)
- sorted <- arrange(df, df$age)
- expect_equal(collect(sorted)[1, 2], "Michael")
-
- sorted2 <- arrange(df, "name", decreasing = FALSE)
- expect_equal(collect(sorted2)[2, "age"], 19)
-
- sorted3 <- orderBy(df, asc(df$age))
- expect_true(is.na(first(sorted3)$age))
- expect_equal(collect(sorted3)[2, "age"], 19)
-
- sorted4 <- orderBy(df, desc(df$name))
- expect_equal(first(sorted4)$name, "Michael")
- expect_equal(collect(sorted4)[3, "name"], "Andy")
-
- sorted5 <- arrange(df, "age", "name", decreasing = TRUE)
- expect_equal(collect(sorted5)[1, 2], "Andy")
-
- sorted6 <- arrange(df, "age", "name", decreasing = c(T, F))
- expect_equal(collect(sorted6)[1, 2], "Andy")
-
- sorted7 <- arrange(df, "name", decreasing = FALSE)
- expect_equal(collect(sorted7)[2, "age"], 19)
-})
-
-test_that("filter() on a DataFrame", {
- df <- read.json(jsonPath)
- filtered <- filter(df, "age > 20")
- expect_equal(count(filtered), 1)
- expect_equal(collect(filtered)$name, "Andy")
- filtered2 <- where(df, df$name != "Michael")
- expect_equal(count(filtered2), 2)
- expect_equal(collect(filtered2)$age[2], 19)
-
- # test suites for %in%
- filtered3 <- filter(df, "age in (19)")
- expect_equal(count(filtered3), 1)
- filtered4 <- filter(df, "age in (19, 30)")
- expect_equal(count(filtered4), 2)
- filtered5 <- where(df, df$age %in% c(19))
- expect_equal(count(filtered5), 1)
- filtered6 <- where(df, df$age %in% c(19, 30))
- expect_equal(count(filtered6), 2)
-
- # test suites for %<=>%
- dfNa <- read.json(jsonPathNa)
- expect_equal(count(filter(dfNa, dfNa$age %<=>% 60)), 1)
- expect_equal(count(filter(dfNa, !(dfNa$age %<=>% 60))), 5 - 1)
- expect_equal(count(filter(dfNa, dfNa$age %<=>% NULL)), 3)
- expect_equal(count(filter(dfNa, !(dfNa$age %<=>% NULL))), 5 - 3)
- # match NA from two columns
- expect_equal(count(filter(dfNa, dfNa$age %<=>% dfNa$height)), 2)
- expect_equal(count(filter(dfNa, !(dfNa$age %<=>% dfNa$height))), 5 - 2)
-
- # Test stats::filter is working
- #expect_true(is.ts(filter(1:100, rep(1, 3)))) # nolint
-})
-
-test_that("join(), crossJoin() and merge() on a DataFrame", {
- skip_on_cran()
-
- df <- read.json(jsonPath)
-
- mockLines2 <- c("{\"name\":\"Michael\", \"test\": \"yes\"}",
- "{\"name\":\"Andy\", \"test\": \"no\"}",
- "{\"name\":\"Justin\", \"test\": \"yes\"}",
- "{\"name\":\"Bob\", \"test\": \"yes\"}")
- jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- writeLines(mockLines2, jsonPath2)
- df2 <- read.json(jsonPath2)
-
- # inner join, not cartesian join
- expect_equal(count(where(join(df, df2), df$name == df2$name)), 3)
- # cartesian join
- expect_error(tryCatch(count(join(df, df2)), error = function(e) { stop(e) }),
- paste0(".*(org.apache.spark.sql.AnalysisException: Detected cartesian product for",
- " INNER join between logical plans).*"))
-
- joined <- crossJoin(df, df2)
- expect_equal(names(joined), c("age", "name", "name", "test"))
- expect_equal(count(joined), 12)
- expect_equal(names(collect(joined)), c("age", "name", "name", "test"))
-
- joined2 <- join(df, df2, df$name == df2$name)
- expect_equal(names(joined2), c("age", "name", "name", "test"))
- expect_equal(count(joined2), 3)
-
- joined3 <- join(df, df2, df$name == df2$name, "rightouter")
- expect_equal(names(joined3), c("age", "name", "name", "test"))
- expect_equal(count(joined3), 4)
- expect_true(is.na(collect(orderBy(joined3, joined3$age))$age[2]))
-
- joined4 <- select(join(df, df2, df$name == df2$name, "outer"),
- alias(df$age + 5, "newAge"), df$name, df2$test)
- expect_equal(names(joined4), c("newAge", "name", "test"))
- expect_equal(count(joined4), 4)
- expect_equal(collect(orderBy(joined4, joined4$name))$newAge[3], 24)
-
- joined5 <- join(df, df2, df$name == df2$name, "leftouter")
- expect_equal(names(joined5), c("age", "name", "name", "test"))
- expect_equal(count(joined5), 3)
- expect_true(is.na(collect(orderBy(joined5, joined5$age))$age[1]))
-
- joined6 <- join(df, df2, df$name == df2$name, "inner")
- expect_equal(names(joined6), c("age", "name", "name", "test"))
- expect_equal(count(joined6), 3)
-
- joined7 <- join(df, df2, df$name == df2$name, "leftsemi")
- expect_equal(names(joined7), c("age", "name"))
- expect_equal(count(joined7), 3)
-
- joined8 <- join(df, df2, df$name == df2$name, "left_outer")
- expect_equal(names(joined8), c("age", "name", "name", "test"))
- expect_equal(count(joined8), 3)
- expect_true(is.na(collect(orderBy(joined8, joined8$age))$age[1]))
-
- joined9 <- join(df, df2, df$name == df2$name, "right_outer")
- expect_equal(names(joined9), c("age", "name", "name", "test"))
- expect_equal(count(joined9), 4)
- expect_true(is.na(collect(orderBy(joined9, joined9$age))$age[2]))
-
- merged <- merge(df, df2, by.x = "name", by.y = "name", all.x = TRUE, all.y = TRUE)
- expect_equal(count(merged), 4)
- expect_equal(names(merged), c("age", "name_x", "name_y", "test"))
- expect_equal(collect(orderBy(merged, merged$name_x))$age[3], 19)
-
- merged <- merge(df, df2, suffixes = c("-X", "-Y"))
- expect_equal(count(merged), 3)
- expect_equal(names(merged), c("age", "name-X", "name-Y", "test"))
- expect_equal(collect(orderBy(merged, merged$"name-X"))$age[1], 30)
-
- merged <- merge(df, df2, by = "name", suffixes = c("-X", "-Y"), sort = FALSE)
- expect_equal(count(merged), 3)
- expect_equal(names(merged), c("age", "name-X", "name-Y", "test"))
- expect_equal(collect(orderBy(merged, merged$"name-Y"))$"name-X"[3], "Michael")
-
- merged <- merge(df, df2, by = "name", all = T, sort = T)
- expect_equal(count(merged), 4)
- expect_equal(names(merged), c("age", "name_x", "name_y", "test"))
- expect_equal(collect(orderBy(merged, merged$"name_y"))$"name_x"[1], "Andy")
-
- merged <- merge(df, df2, by = NULL)
- expect_equal(count(merged), 12)
- expect_equal(names(merged), c("age", "name", "name", "test"))
-
- mockLines3 <- c("{\"name\":\"Michael\", \"name_y\":\"Michael\", \"test\": \"yes\"}",
- "{\"name\":\"Andy\", \"name_y\":\"Andy\", \"test\": \"no\"}",
- "{\"name\":\"Justin\", \"name_y\":\"Justin\", \"test\": \"yes\"}",
- "{\"name\":\"Bob\", \"name_y\":\"Bob\", \"test\": \"yes\"}")
- jsonPath3 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- writeLines(mockLines3, jsonPath3)
- df3 <- read.json(jsonPath3)
- expect_error(merge(df, df3),
- paste("The following column name: name_y occurs more than once in the 'DataFrame'.",
- "Please use different suffixes for the intersected columns.", sep = ""))
-
- unlink(jsonPath2)
- unlink(jsonPath3)
-
- # Join with broadcast hint
- df1 <- sql("SELECT * FROM range(10e10)")
- df2 <- sql("SELECT * FROM range(10e10)")
-
- execution_plan <- capture.output(explain(join(df1, df2, df1$id == df2$id)))
- expect_false(any(grepl("BroadcastHashJoin", execution_plan)))
-
- execution_plan_hint <- capture.output(
- explain(join(df1, hint(df2, "broadcast"), df1$id == df2$id))
- )
- expect_true(any(grepl("BroadcastHashJoin", execution_plan_hint)))
-
- execution_plan_broadcast <- capture.output(
- explain(join(df1, broadcast(df2), df1$id == df2$id))
- )
- expect_true(any(grepl("BroadcastHashJoin", execution_plan_broadcast)))
-})
-
-test_that("toJSON() on DataFrame", {
- df <- as.DataFrame(cars)
- df_json <- toJSON(df)
- expect_is(df_json, "SparkDataFrame")
- expect_equal(colnames(df_json), c("value"))
- expect_equal(head(df_json, 1),
- data.frame(value = "{\"speed\":4.0,\"dist\":2.0}", stringsAsFactors = FALSE))
-})
-
-test_that("showDF()", {
- df <- read.json(jsonPath)
- expected <- paste("+----+-------+\n",
- "| age| name|\n",
- "+----+-------+\n",
- "|null|Michael|\n",
- "| 30| Andy|\n",
- "| 19| Justin|\n",
- "+----+-------+\n", sep = "")
- expected2 <- paste("+---+----+\n",
- "|age|name|\n",
- "+---+----+\n",
- "|nul| Mic|\n",
- "| 30| And|\n",
- "| 19| Jus|\n",
- "+---+----+\n", sep = "")
- expect_output(showDF(df), expected)
- expect_output(showDF(df, truncate = 3), expected2)
-})
-
-test_that("isLocal()", {
- df <- read.json(jsonPath)
- expect_false(isLocal(df))
-})
-
-test_that("union(), rbind(), except(), and intersect() on a DataFrame", {
- df <- read.json(jsonPath)
-
- lines <- c("{\"name\":\"Bob\", \"age\":24}",
- "{\"name\":\"Andy\", \"age\":30}",
- "{\"name\":\"James\", \"age\":35}")
- jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
- writeLines(lines, jsonPath2)
- df2 <- read.df(jsonPath2, "json")
-
- unioned <- arrange(union(df, df2), df$age)
- expect_is(unioned, "SparkDataFrame")
- expect_equal(count(unioned), 6)
- expect_equal(first(unioned)$name, "Michael")
- expect_equal(count(arrange(suppressWarnings(unionAll(df, df2)), df$age)), 6)
-
- unioned2 <- arrange(rbind(unioned, df, df2), df$age)
- expect_is(unioned2, "SparkDataFrame")
- expect_equal(count(unioned2), 12)
- expect_equal(first(unioned2)$name, "Michael")
-
- df3 <- df2
- names(df3)[1] <- "newName"
- expect_error(rbind(df, df3),
- "Names of input data frames are different.")
- expect_error(rbind(df, df2, df3),
- "Names of input data frames are different.")
-
- excepted <- arrange(except(df, df2), desc(df$age))
- expect_is(unioned, "SparkDataFrame")
- expect_equal(count(excepted), 2)
- expect_equal(first(excepted)$name, "Justin")
-
- intersected <- arrange(intersect(df, df2), df$age)
- expect_is(unioned, "SparkDataFrame")
- expect_equal(count(intersected), 1)
- expect_equal(first(intersected)$name, "Andy")
-
- # Test base::union is working
- expect_equal(union(c(1:3), c(3:5)), c(1:5))
-
- # Test base::rbind is working
- expect_equal(length(rbind(1:4, c = 2, a = 10, 10, deparse.level = 0)), 16)
-
- # Test base::intersect is working
- expect_equal(length(intersect(1:20, 3:23)), 18)
-
- unlink(jsonPath2)
-})
-
-test_that("withColumn() and withColumnRenamed()", {
- df <- read.json(jsonPath)
- newDF <- withColumn(df, "newAge", df$age + 2)
- expect_equal(length(columns(newDF)), 3)
- expect_equal(columns(newDF)[3], "newAge")
- expect_equal(first(filter(newDF, df$name != "Michael"))$newAge, 32)
-
- # Replace existing column
- newDF <- withColumn(df, "age", df$age + 2)
- expect_equal(length(columns(newDF)), 2)
- expect_equal(first(filter(newDF, df$name != "Michael"))$age, 32)
-
- newDF <- withColumn(df, "age", 18)
- expect_equal(length(columns(newDF)), 2)
- expect_equal(first(newDF)$age, 18)
-
- expect_error(withColumn(df, "age", list("a")),
- "Literal value must be atomic in length of 1")
-
- newDF2 <- withColumnRenamed(df, "age", "newerAge")
- expect_equal(length(columns(newDF2)), 2)
- expect_equal(columns(newDF2)[1], "newerAge")
-})
-
-test_that("mutate(), transform(), rename() and names()", {
- df <- read.json(jsonPath)
- newDF <- mutate(df, newAge = df$age + 2)
- expect_equal(length(columns(newDF)), 3)
- expect_equal(columns(newDF)[3], "newAge")
- expect_equal(first(filter(newDF, df$name != "Michael"))$newAge, 32)
-
- newDF <- mut
<TRUNCATED>
[2/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic
tests only for CRAN
Posted by fe...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R
new file mode 100644
index 0000000..c790d02
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_sparkSQL.R
@@ -0,0 +1,3474 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("SparkSQL functions")
+
+# Utility function for easily checking the values of a StructField
+checkStructField <- function(actual, expectedName, expectedType, expectedNullable) {
+ expect_equal(class(actual), "structField")
+ expect_equal(actual$name(), expectedName)
+ expect_equal(actual$dataType.toString(), expectedType)
+ expect_equal(actual$nullable(), expectedNullable)
+}
+
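+# Mark a string as UTF-8 encoded so comparisons with values collected from Spark are encoding-consistent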
+markUtf8 <- function(s) {
+ Encoding(s) <- "UTF-8"
+ s
+}
+
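+# Swap the active SparkSession for a Hive-enabled test session so Hive-dependent tests can run;
+# the previous session is saved and can be restored with unsetHiveContext()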
+setHiveContext <- function(sc) {
+ if (exists(".testHiveSession", envir = .sparkREnv)) {
+ hiveSession <- get(".testHiveSession", envir = .sparkREnv)
+ } else {
+ # initialize once and reuse
+ ssc <- callJMethod(sc, "sc")
+ hiveCtx <- tryCatch({
+ newJObject("org.apache.spark.sql.hive.test.TestHiveContext", ssc, FALSE)
+ },
+ error = function(err) {
+ skip("Hive is not build with SparkSQL, skipped")
+ })
+ hiveSession <- callJMethod(hiveCtx, "sparkSession")
+ }
+ previousSession <- get(".sparkRsession", envir = .sparkREnv)
+ assign(".sparkRsession", hiveSession, envir = .sparkREnv)
+ assign(".prevSparkRsession", previousSession, envir = .sparkREnv)
+ hiveSession
+}
+
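+# Restore the SparkSession that was active before setHiveContext() was called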
+unsetHiveContext <- function() {
+ previousSession <- get(".prevSparkRsession", envir = .sparkREnv)
+ assign(".sparkRsession", previousSession, envir = .sparkREnv)
+ remove(".prevSparkRsession", envir = .sparkREnv)
+}
+
+# Tests for SparkSQL functions in SparkR
+
+filesBefore <- list.files(path = sparkRDir, all.files = TRUE)
+sparkSession <- if (not_cran_or_windows_with_hadoop()) {
+ sparkR.session(master = sparkRTestMaster)
+ } else {
+ sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+ }
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+mockLines <- c("{\"name\":\"Michael\"}",
+ "{\"name\":\"Andy\", \"age\":30}",
+ "{\"name\":\"Justin\", \"age\":19}")
+jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet")
+orcPath <- tempfile(pattern = "sparkr-test", fileext = ".orc")
+writeLines(mockLines, jsonPath)
+
+# For testing NA functions like dropna(), fillna(), ...
+mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}",
+ "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}",
+ "{\"name\":\"David\",\"age\":60,\"height\":null}",
+ "{\"name\":\"Amy\",\"age\":null,\"height\":null}",
+ "{\"name\":null,\"age\":null,\"height\":null}")
+jsonPathNa <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+writeLines(mockLinesNa, jsonPathNa)
+
+# For testing complex types in DataFrame
+mockLinesComplexType <-
+ c("{\"c1\":[1, 2, 3], \"c2\":[\"a\", \"b\", \"c\"], \"c3\":[1.0, 2.0, 3.0]}",
+ "{\"c1\":[4, 5, 6], \"c2\":[\"d\", \"e\", \"f\"], \"c3\":[4.0, 5.0, 6.0]}",
+ "{\"c1\":[7, 8, 9], \"c2\":[\"g\", \"h\", \"i\"], \"c3\":[7.0, 8.0, 9.0]}")
+complexTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+writeLines(mockLinesComplexType, complexTypeJsonPath)
+
+# For testing map type and struct type in DataFrame
+mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}",
+ "{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}",
+ "{\"name\":\"David\",\"info\":{\"age\":60,\"height\":180}}")
+mapTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+writeLines(mockLinesMapType, mapTypeJsonPath)
+
+if (.Platform$OS.type == "windows") {
+ Sys.setenv(TZ = "GMT")
+}
+
+test_that("calling sparkRSQL.init returns existing SQL context", {
+ skip_on_cran()
+
+ sqlContext <- suppressWarnings(sparkRSQL.init(sc))
+ expect_equal(suppressWarnings(sparkRSQL.init(sc)), sqlContext)
+})
+
+test_that("calling sparkRSQL.init returns existing SparkSession", {
+ skip_on_cran()
+
+ expect_equal(suppressWarnings(sparkRSQL.init(sc)), sparkSession)
+})
+
+test_that("calling sparkR.session returns existing SparkSession", {
+ skip_on_cran()
+
+ expect_equal(sparkR.session(), sparkSession)
+})
+
+test_that("infer types and check types", {
+ expect_equal(infer_type(1L), "integer")
+ expect_equal(infer_type(1.0), "double")
+ expect_equal(infer_type("abc"), "string")
+ expect_equal(infer_type(TRUE), "boolean")
+ expect_equal(infer_type(as.Date("2015-03-11")), "date")
+ expect_equal(infer_type(as.POSIXlt("2015-03-11 12:13:04.043")), "timestamp")
+ expect_equal(infer_type(c(1L, 2L)), "array<integer>")
+ expect_equal(infer_type(list(1L, 2L)), "array<integer>")
+ expect_equal(infer_type(listToStruct(list(a = 1L, b = "2"))), "struct<a:integer,b:string>")
+ e <- new.env()
+ assign("a", 1L, envir = e)
+ expect_equal(infer_type(e), "map<string,integer>")
+
+ expect_error(checkType("map<integer,integer>"), "Key type in a map must be string or character")
+
+ expect_equal(infer_type(as.raw(c(1, 2, 3))), "binary")
+})
+
+test_that("structType and structField", {
+ testField <- structField("a", "string")
+ expect_is(testField, "structField")
+ expect_equal(testField$name(), "a")
+ expect_true(testField$nullable())
+
+ testSchema <- structType(testField, structField("b", "integer"))
+ expect_is(testSchema, "structType")
+ expect_is(testSchema$fields()[[2]], "structField")
+ expect_equal(testSchema$fields()[[1]]$dataType.toString(), "StringType")
+})
+
+test_that("structField type strings", {
+ # positive cases
+ primitiveTypes <- list(byte = "ByteType",
+ integer = "IntegerType",
+ float = "FloatType",
+ double = "DoubleType",
+ string = "StringType",
+ binary = "BinaryType",
+ boolean = "BooleanType",
+ timestamp = "TimestampType",
+ date = "DateType",
+ tinyint = "ByteType",
+ smallint = "ShortType",
+ int = "IntegerType",
+ bigint = "LongType",
+ decimal = "DecimalType(10,0)")
+
+ complexTypes <- list("map<string,integer>" = "MapType(StringType,IntegerType,true)",
+ "array<string>" = "ArrayType(StringType,true)",
+ "struct<a:string>" = "StructType(StructField(a,StringType,true))")
+
+ typeList <- c(primitiveTypes, complexTypes)
+ typeStrings <- names(typeList)
+
+ for (i in seq_along(typeStrings)){
+ typeString <- typeStrings[i]
+ expected <- typeList[[i]]
+ testField <- structField("_col", typeString)
+ expect_is(testField, "structField")
+ expect_true(testField$nullable())
+ expect_equal(testField$dataType.toString(), expected)
+ }
+
+ # negative cases
+ primitiveErrors <- list(Byte = "Byte",
+ INTEGER = "INTEGER",
+ numeric = "numeric",
+ character = "character",
+ raw = "raw",
+ logical = "logical",
+ short = "short",
+ varchar = "varchar",
+ long = "long",
+ char = "char")
+
+ complexErrors <- list("map<string, integer>" = " integer",
+ "array<String>" = "String",
+ "struct<a:string >" = "string ",
+ "map <string,integer>" = "map <string,integer>",
+ "array< string>" = " string",
+ "struct<a: string>" = " string")
+
+ errorList <- c(primitiveErrors, complexErrors)
+ typeStrings <- names(errorList)
+
+ for (i in seq_along(typeStrings)){
+ typeString <- typeStrings[i]
+ expected <- paste0("Unsupported type for SparkDataframe: ", errorList[[i]])
+ expect_error(structField("_col", typeString), expected)
+ }
+})
+
+test_that("create DataFrame from RDD", {
+ skip_on_cran()
+
+ rdd <- lapply(parallelize(sc, 1:10), function(x) { list(x, as.character(x)) })
+ df <- createDataFrame(rdd, list("a", "b"))
+ dfAsDF <- as.DataFrame(rdd, list("a", "b"))
+ expect_is(df, "SparkDataFrame")
+ expect_is(dfAsDF, "SparkDataFrame")
+ expect_equal(count(df), 10)
+ expect_equal(count(dfAsDF), 10)
+ expect_equal(nrow(df), 10)
+ expect_equal(nrow(dfAsDF), 10)
+ expect_equal(ncol(df), 2)
+ expect_equal(ncol(dfAsDF), 2)
+ expect_equal(dim(df), c(10, 2))
+ expect_equal(dim(dfAsDF), c(10, 2))
+ expect_equal(columns(df), c("a", "b"))
+ expect_equal(columns(dfAsDF), c("a", "b"))
+ expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
+ expect_equal(dtypes(dfAsDF), list(c("a", "int"), c("b", "string")))
+
+ df <- createDataFrame(rdd)
+ dfAsDF <- as.DataFrame(rdd)
+ expect_is(df, "SparkDataFrame")
+ expect_is(dfAsDF, "SparkDataFrame")
+ expect_equal(columns(df), c("_1", "_2"))
+ expect_equal(columns(dfAsDF), c("_1", "_2"))
+
+ schema <- structType(structField(x = "a", type = "integer", nullable = TRUE),
+ structField(x = "b", type = "string", nullable = TRUE))
+ df <- createDataFrame(rdd, schema)
+ expect_is(df, "SparkDataFrame")
+ expect_equal(columns(df), c("a", "b"))
+ expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
+
+ rdd <- lapply(parallelize(sc, 1:10), function(x) { list(a = x, b = as.character(x)) })
+ df <- createDataFrame(rdd)
+ expect_is(df, "SparkDataFrame")
+ expect_equal(count(df), 10)
+ expect_equal(columns(df), c("a", "b"))
+ expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
+
+ schema <- structType(structField("name", "string"), structField("age", "integer"),
+ structField("height", "float"))
+ df <- read.df(jsonPathNa, "json", schema)
+ df2 <- createDataFrame(toRDD(df), schema)
+ df2AsDF <- as.DataFrame(toRDD(df), schema)
+ expect_equal(columns(df2), c("name", "age", "height"))
+ expect_equal(columns(df2AsDF), c("name", "age", "height"))
+ expect_equal(dtypes(df2), list(c("name", "string"), c("age", "int"), c("height", "float")))
+ expect_equal(dtypes(df2AsDF), list(c("name", "string"), c("age", "int"), c("height", "float")))
+ expect_equal(as.list(collect(where(df2, df2$name == "Bob"))),
+ list(name = "Bob", age = 16, height = 176.5))
+ expect_equal(as.list(collect(where(df2AsDF, df2AsDF$name == "Bob"))),
+ list(name = "Bob", age = 16, height = 176.5))
+
+ localDF <- data.frame(name = c("John", "Smith", "Sarah"),
+ age = c(19L, 23L, 18L),
+ height = c(176.5, 181.4, 173.7))
+ df <- createDataFrame(localDF, schema)
+ expect_is(df, "SparkDataFrame")
+ expect_equal(count(df), 3)
+ expect_equal(columns(df), c("name", "age", "height"))
+ expect_equal(dtypes(df), list(c("name", "string"), c("age", "int"), c("height", "float")))
+ expect_equal(as.list(collect(where(df, df$name == "John"))),
+ list(name = "John", age = 19L, height = 176.5))
+ expect_equal(getNumPartitions(df), 1)
+
+ df <- as.DataFrame(cars, numPartitions = 2)
+ expect_equal(getNumPartitions(df), 2)
+ df <- createDataFrame(cars, numPartitions = 3)
+ expect_equal(getNumPartitions(df), 3)
+ # validate that numPartitions is capped at the number of rows
+ df <- createDataFrame(cars, numPartitions = 60)
+ expect_equal(getNumPartitions(df), 50)
+ # validate when 1 < (length(coll) / numSlices) << length(coll)
+ df <- createDataFrame(cars, numPartitions = 20)
+ expect_equal(getNumPartitions(df), 20)
+
+ df <- as.DataFrame(data.frame(0))
+ expect_is(df, "SparkDataFrame")
+ df <- createDataFrame(list(list(1)))
+ expect_is(df, "SparkDataFrame")
+ df <- as.DataFrame(data.frame(0), numPartitions = 2)
+ # no data to partition, so the partition count falls back to 1
+ expect_equal(getNumPartitions(df), 1)
+
+ setHiveContext(sc)
+ sql("CREATE TABLE people (name string, age double, height float)")
+ df <- read.df(jsonPathNa, "json", schema)
+ insertInto(df, "people")
+ expect_equal(collect(sql("SELECT age from people WHERE name = 'Bob'"))$age,
+ c(16))
+ expect_equal(collect(sql("SELECT height from people WHERE name ='Bob'"))$height,
+ c(176.5))
+ sql("DROP TABLE people")
+ unsetHiveContext()
+})
+
+test_that("createDataFrame uses files for large objects", {
+ skip_on_cran()
+
+ # To simulate a large file scenario, we set spark.r.maxAllocationLimit to a smaller value
+ conf <- callJMethod(sparkSession, "conf")
+ callJMethod(conf, "set", "spark.r.maxAllocationLimit", "100")
+ df <- suppressWarnings(createDataFrame(iris, numPartitions = 3))
+ expect_equal(getNumPartitions(df), 3)
+
+ # Resetting the conf back to default value
+ callJMethod(conf, "set", "spark.r.maxAllocationLimit", toString(.Machine$integer.max / 10))
+ expect_equal(dim(df), dim(iris))
+})
+
+test_that("read/write csv as DataFrame", {
+ if (not_cran_or_windows_with_hadoop()) {
+ csvPath <- tempfile(pattern = "sparkr-test", fileext = ".csv")
+ mockLinesCsv <- c("year,make,model,comment,blank",
+ "\"2012\",\"Tesla\",\"S\",\"No comment\",",
+ "1997,Ford,E350,\"Go get one now they are going fast\",",
+ "2015,Chevy,Volt",
+ "NA,Dummy,Placeholder")
+ writeLines(mockLinesCsv, csvPath)
+
+ # default "header" is false, inferSchema to handle "year" as "int"
+ df <- read.df(csvPath, "csv", header = "true", inferSchema = "true")
+ expect_equal(count(df), 4)
+ expect_equal(columns(df), c("year", "make", "model", "comment", "blank"))
+ expect_equal(sort(unlist(collect(where(df, df$year == 2015)))),
+ sort(unlist(list(year = 2015, make = "Chevy", model = "Volt"))))
+
+ # since "year" is "int", let's skip the NA values
+ withoutna <- na.omit(df, how = "any", cols = "year")
+ expect_equal(count(withoutna), 3)
+
+ unlink(csvPath)
+ csvPath <- tempfile(pattern = "sparkr-test", fileext = ".csv")
+ mockLinesCsv <- c("year,make,model,comment,blank",
+ "\"2012\",\"Tesla\",\"S\",\"No comment\",",
+ "1997,Ford,E350,\"Go get one now they are going fast\",",
+ "2015,Chevy,Volt",
+ "Empty,Dummy,Placeholder")
+ writeLines(mockLinesCsv, csvPath)
+
+ df2 <- read.df(csvPath, "csv", header = "true", inferSchema = "true", na.strings = "Empty")
+ expect_equal(count(df2), 4)
+ withoutna2 <- na.omit(df2, how = "any", cols = "year")
+ expect_equal(count(withoutna2), 3)
+ expect_equal(count(where(withoutna2, withoutna2$make == "Dummy")), 0)
+
+ # writing csv file
+ csvPath2 <- tempfile(pattern = "csvtest2", fileext = ".csv")
+ write.df(df2, path = csvPath2, "csv", header = "true")
+ df3 <- read.df(csvPath2, "csv", header = "true")
+ expect_equal(nrow(df3), nrow(df2))
+ expect_equal(colnames(df3), colnames(df2))
+ csv <- read.csv(file = list.files(csvPath2, pattern = "^part", full.names = T)[[1]])
+ expect_equal(colnames(df3), colnames(csv))
+
+ unlink(csvPath)
+ unlink(csvPath2)
+ }
+})
+
+test_that("Support other types for options", {
+ skip_on_cran()
+
+ csvPath <- tempfile(pattern = "sparkr-test", fileext = ".csv")
+ mockLinesCsv <- c("year,make,model,comment,blank",
+ "\"2012\",\"Tesla\",\"S\",\"No comment\",",
+ "1997,Ford,E350,\"Go get one now they are going fast\",",
+ "2015,Chevy,Volt",
+ "NA,Dummy,Placeholder")
+ writeLines(mockLinesCsv, csvPath)
+
+ csvDf <- read.df(csvPath, "csv", header = "true", inferSchema = "true")
+ expected <- read.df(csvPath, "csv", header = TRUE, inferSchema = TRUE)
+ expect_equal(collect(csvDf), collect(expected))
+
+ expect_error(read.df(csvPath, "csv", header = TRUE, maxColumns = 3))
+ unlink(csvPath)
+})
+
+test_that("convert NAs to null type in DataFrames", {
+ rdd <- parallelize(sc, list(list(1L, 2L), list(NA, 4L)))
+ df <- createDataFrame(rdd, list("a", "b"))
+ expect_true(is.na(collect(df)[2, "a"]))
+ expect_equal(collect(df)[2, "b"], 4L)
+
+ l <- data.frame(x = 1L, y = c(1L, NA_integer_, 3L))
+ df <- createDataFrame(l)
+ expect_equal(collect(df)[2, "x"], 1L)
+ expect_true(is.na(collect(df)[2, "y"]))
+
+ rdd <- parallelize(sc, list(list(1, 2), list(NA, 4)))
+ df <- createDataFrame(rdd, list("a", "b"))
+ expect_true(is.na(collect(df)[2, "a"]))
+ expect_equal(collect(df)[2, "b"], 4)
+
+ l <- data.frame(x = 1, y = c(1, NA_real_, 3))
+ df <- createDataFrame(l)
+ expect_equal(collect(df)[2, "x"], 1)
+ expect_true(is.na(collect(df)[2, "y"]))
+
+ l <- list("a", "b", NA, "d")
+ df <- createDataFrame(l)
+ expect_true(is.na(collect(df)[3, "_1"]))
+ expect_equal(collect(df)[4, "_1"], "d")
+
+ l <- list("a", "b", NA_character_, "d")
+ df <- createDataFrame(l)
+ expect_true(is.na(collect(df)[3, "_1"]))
+ expect_equal(collect(df)[4, "_1"], "d")
+
+ l <- list(TRUE, FALSE, NA, TRUE)
+ df <- createDataFrame(l)
+ expect_true(is.na(collect(df)[3, "_1"]))
+ expect_equal(collect(df)[4, "_1"], TRUE)
+})
+
+test_that("toDF", {
+ skip_on_cran()
+
+ rdd <- lapply(parallelize(sc, 1:10), function(x) { list(x, as.character(x)) })
+ df <- toDF(rdd, list("a", "b"))
+ expect_is(df, "SparkDataFrame")
+ expect_equal(count(df), 10)
+ expect_equal(columns(df), c("a", "b"))
+ expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
+
+ df <- toDF(rdd)
+ expect_is(df, "SparkDataFrame")
+ expect_equal(columns(df), c("_1", "_2"))
+
+ schema <- structType(structField(x = "a", type = "integer", nullable = TRUE),
+ structField(x = "b", type = "string", nullable = TRUE))
+ df <- toDF(rdd, schema)
+ expect_is(df, "SparkDataFrame")
+ expect_equal(columns(df), c("a", "b"))
+ expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
+
+ rdd <- lapply(parallelize(sc, 1:10), function(x) { list(a = x, b = as.character(x)) })
+ df <- toDF(rdd)
+ expect_is(df, "SparkDataFrame")
+ expect_equal(count(df), 10)
+ expect_equal(columns(df), c("a", "b"))
+ expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
+})
+
+test_that("create DataFrame from list or data.frame", {
+ l <- list(list(1, 2), list(3, 4))
+ df <- createDataFrame(l, c("a", "b"))
+ expect_equal(columns(df), c("a", "b"))
+
+ l <- list(list(a = 1, b = 2), list(a = 3, b = 4))
+ df <- createDataFrame(l)
+ expect_equal(columns(df), c("a", "b"))
+
+ a <- 1:3
+ b <- c("a", "b", "c")
+ ldf <- data.frame(a, b)
+ df <- createDataFrame(ldf)
+ expect_equal(columns(df), c("a", "b"))
+ expect_equal(dtypes(df), list(c("a", "int"), c("b", "string")))
+ expect_equal(count(df), 3)
+ ldf2 <- collect(df)
+ expect_equal(ldf$a, ldf2$a)
+
+ irisdf <- suppressWarnings(createDataFrame(iris))
+ iris_collected <- collect(irisdf)
+ expect_equivalent(iris_collected[, -5], iris[, -5])
+ expect_equal(iris_collected$Species, as.character(iris$Species))
+
+ mtcarsdf <- createDataFrame(mtcars)
+ expect_equivalent(collect(mtcarsdf), mtcars)
+
+ bytes <- as.raw(c(1, 2, 3))
+ df <- createDataFrame(list(list(bytes)))
+ expect_equal(collect(df)[[1]][[1]], bytes)
+})
+
+test_that("create DataFrame with different data types", {
+ l <- list(a = 1L, b = 2, c = TRUE, d = "ss", e = as.Date("2012-12-13"),
+ f = as.POSIXct("2015-03-15 12:13:14.056"))
+ df <- createDataFrame(list(l))
+ expect_equal(dtypes(df), list(c("a", "int"), c("b", "double"), c("c", "boolean"),
+ c("d", "string"), c("e", "date"), c("f", "timestamp")))
+ expect_equal(count(df), 1)
+ expect_equal(collect(df), data.frame(l, stringsAsFactors = FALSE))
+})
+
+test_that("SPARK-17811: can create DataFrame containing NA as date and time", {
+ df <- data.frame(
+ id = 1:2,
+ time = c(as.POSIXlt("2016-01-10"), NA),
+ date = c(as.Date("2016-10-01"), NA))
+
+ DF <- collect(createDataFrame(df))
+ expect_true(is.na(DF$date[2]))
+ expect_equal(DF$date[1], as.Date("2016-10-01"))
+ expect_true(is.na(DF$time[2]))
+ expect_equal(DF$time[1], as.POSIXlt("2016-01-10"))
+})
+
+test_that("create DataFrame with complex types", {
+ e <- new.env()
+ assign("n", 3L, envir = e)
+
+ s <- listToStruct(list(a = "aa", b = 3L))
+
+ l <- list(as.list(1:10), list("a", "b"), e, s)
+ df <- createDataFrame(list(l), c("a", "b", "c", "d"))
+ expect_equal(dtypes(df), list(c("a", "array<int>"),
+ c("b", "array<string>"),
+ c("c", "map<string,int>"),
+ c("d", "struct<a:string,b:int>")))
+ expect_equal(count(df), 1)
+ ldf <- collect(df)
+ expect_equal(names(ldf), c("a", "b", "c", "d"))
+ expect_equal(ldf[1, 1][[1]], l[[1]])
+ expect_equal(ldf[1, 2][[1]], l[[2]])
+
+ e <- ldf$c[[1]]
+ expect_equal(class(e), "environment")
+ expect_equal(ls(e), "n")
+ expect_equal(e$n, 3L)
+
+ s <- ldf$d[[1]]
+ expect_equal(class(s), "struct")
+ expect_equal(s$a, "aa")
+ expect_equal(s$b, 3L)
+})
+
+test_that("create DataFrame from a data.frame with complex types", {
+ skip_on_cran()
+
+ ldf <- data.frame(row.names = 1:2)
+ ldf$a_list <- list(list(1, 2), list(3, 4))
+ ldf$an_envir <- c(as.environment(list(a = 1, b = 2)), as.environment(list(c = 3)))
+
+ sdf <- createDataFrame(ldf)
+ collected <- collect(sdf)
+
+ expect_identical(ldf[, 1, FALSE], collected[, 1, FALSE])
+ expect_equal(ldf$an_envir, collected$an_envir)
+})
+
+test_that("Collect DataFrame with complex types", {
+ skip_on_cran()
+
+ # ArrayType
+ df <- read.json(complexTypeJsonPath)
+ ldf <- collect(df)
+ expect_equal(nrow(ldf), 3)
+ expect_equal(ncol(ldf), 3)
+ expect_equal(names(ldf), c("c1", "c2", "c3"))
+ expect_equal(ldf$c1, list(list(1, 2, 3), list(4, 5, 6), list(7, 8, 9)))
+ expect_equal(ldf$c2, list(list("a", "b", "c"), list("d", "e", "f"), list("g", "h", "i")))
+ expect_equal(ldf$c3, list(list(1.0, 2.0, 3.0), list(4.0, 5.0, 6.0), list(7.0, 8.0, 9.0)))
+
+ # MapType
+ schema <- structType(structField("name", "string"),
+ structField("info", "map<string,double>"))
+ df <- read.df(mapTypeJsonPath, "json", schema)
+ expect_equal(dtypes(df), list(c("name", "string"),
+ c("info", "map<string,double>")))
+ ldf <- collect(df)
+ expect_equal(nrow(ldf), 3)
+ expect_equal(ncol(ldf), 2)
+ expect_equal(names(ldf), c("name", "info"))
+ expect_equal(ldf$name, c("Bob", "Alice", "David"))
+ bob <- ldf$info[[1]]
+ expect_equal(class(bob), "environment")
+ expect_equal(bob$age, 16)
+ expect_equal(bob$height, 176.5)
+
+ # StructType
+ df <- read.json(mapTypeJsonPath)
+ expect_equal(dtypes(df), list(c("info", "struct<age:bigint,height:double>"),
+ c("name", "string")))
+ ldf <- collect(df)
+ expect_equal(nrow(ldf), 3)
+ expect_equal(ncol(ldf), 2)
+ expect_equal(names(ldf), c("info", "name"))
+ expect_equal(ldf$name, c("Bob", "Alice", "David"))
+ bob <- ldf$info[[1]]
+ expect_equal(class(bob), "struct")
+ expect_equal(bob$age, 16)
+ expect_equal(bob$height, 176.5)
+})
+
+test_that("read/write json files", {
+ if (not_cran_or_windows_with_hadoop()) {
+ # Test read.df
+ df <- read.df(jsonPath, "json")
+ expect_is(df, "SparkDataFrame")
+ expect_equal(count(df), 3)
+
+ # Test read.df with a user defined schema
+ schema <- structType(structField("name", type = "string"),
+ structField("age", type = "double"))
+
+ df1 <- read.df(jsonPath, "json", schema)
+ expect_is(df1, "SparkDataFrame")
+ expect_equal(dtypes(df1), list(c("name", "string"), c("age", "double")))
+
+ # Test loadDF
+ df2 <- loadDF(jsonPath, "json", schema)
+ expect_is(df2, "SparkDataFrame")
+ expect_equal(dtypes(df2), list(c("name", "string"), c("age", "double")))
+
+ # Test read.json
+ df <- read.json(jsonPath)
+ expect_is(df, "SparkDataFrame")
+ expect_equal(count(df), 3)
+
+ # Test write.df
+ jsonPath2 <- tempfile(pattern = "jsonPath2", fileext = ".json")
+ write.df(df, jsonPath2, "json", mode = "overwrite")
+
+ # Test write.json
+ jsonPath3 <- tempfile(pattern = "jsonPath3", fileext = ".json")
+ write.json(df, jsonPath3)
+
+ # Test read.json()/jsonFile() works with multiple input paths
+ jsonDF1 <- read.json(c(jsonPath2, jsonPath3))
+ expect_is(jsonDF1, "SparkDataFrame")
+ expect_equal(count(jsonDF1), 6)
+ # Suppress warnings because jsonFile is deprecated
+ jsonDF2 <- suppressWarnings(jsonFile(c(jsonPath2, jsonPath3)))
+ expect_is(jsonDF2, "SparkDataFrame")
+ expect_equal(count(jsonDF2), 6)
+
+ unlink(jsonPath2)
+ unlink(jsonPath3)
+ }
+})
+
+test_that("read/write json files - compression option", {
+ skip_on_cran()
+
+ df <- read.df(jsonPath, "json")
+
+ jsonPath <- tempfile(pattern = "jsonPath", fileext = ".json")
+ write.json(df, jsonPath, compression = "gzip")
+ jsonDF <- read.json(jsonPath)
+ expect_is(jsonDF, "SparkDataFrame")
+ expect_equal(count(jsonDF), count(df))
+ expect_true(length(list.files(jsonPath, pattern = ".gz")) > 0)
+
+ unlink(jsonPath)
+})
+
+test_that("jsonRDD() on a RDD with json string", {
+ skip_on_cran()
+
+ sqlContext <- suppressWarnings(sparkRSQL.init(sc))
+ rdd <- parallelize(sc, mockLines)
+ expect_equal(countRDD(rdd), 3)
+ df <- suppressWarnings(jsonRDD(sqlContext, rdd))
+ expect_is(df, "SparkDataFrame")
+ expect_equal(count(df), 3)
+
+ rdd2 <- flatMap(rdd, function(x) c(x, x))
+ df <- suppressWarnings(jsonRDD(sqlContext, rdd2))
+ expect_is(df, "SparkDataFrame")
+ expect_equal(count(df), 6)
+})
+
+test_that("test tableNames and tables", {
+ count <- count(listTables())
+
+ df <- read.json(jsonPath)
+ createOrReplaceTempView(df, "table1")
+ expect_equal(length(tableNames()), count + 1)
+ expect_equal(length(tableNames("default")), count + 1)
+
+ tables <- listTables()
+ expect_equal(count(tables), count + 1)
+ expect_equal(count(tables()), count(tables))
+ expect_true("tableName" %in% colnames(tables()))
+ expect_true(all(c("tableName", "database", "isTemporary") %in% colnames(tables())))
+
+ suppressWarnings(registerTempTable(df, "table2"))
+ tables <- listTables()
+ expect_equal(count(tables), count + 2)
+ suppressWarnings(dropTempTable("table1"))
+ expect_true(dropTempView("table2"))
+
+ tables <- listTables()
+ expect_equal(count(tables), count + 0)
+})
+
+test_that(
+ "createOrReplaceTempView() results in a queryable table and sql() results in a new DataFrame", {
+ df <- read.json(jsonPath)
+ createOrReplaceTempView(df, "table1")
+ newdf <- sql("SELECT * FROM table1 where name = 'Michael'")
+ expect_is(newdf, "SparkDataFrame")
+ expect_equal(count(newdf), 1)
+ expect_true(dropTempView("table1"))
+
+ createOrReplaceTempView(df, "dfView")
+ sqlCast <- collect(sql("select cast('2' as decimal) as x from dfView limit 1"))
+ out <- capture.output(sqlCast)
+ expect_true(is.data.frame(sqlCast))
+ expect_equal(names(sqlCast)[1], "x")
+ expect_equal(nrow(sqlCast), 1)
+ expect_equal(ncol(sqlCast), 1)
+ expect_equal(out[1], " x")
+ expect_equal(out[2], "1 2")
+ expect_true(dropTempView("dfView"))
+})
+
+test_that("test cache, uncache and clearCache", {
+ skip_on_cran()
+
+ df <- read.json(jsonPath)
+ createOrReplaceTempView(df, "table1")
+ cacheTable("table1")
+ uncacheTable("table1")
+ clearCache()
+ expect_true(dropTempView("table1"))
+
+ expect_error(uncacheTable("foo"),
+ "Error in uncacheTable : no such table - Table or view 'foo' not found in database 'default'")
+})
+
+test_that("insertInto() on a registered table", {
+ if (not_cran_or_windows_with_hadoop()) {
+ df <- read.df(jsonPath, "json")
+ write.df(df, parquetPath, "parquet", "overwrite")
+ dfParquet <- read.df(parquetPath, "parquet")
+
+ lines <- c("{\"name\":\"Bob\", \"age\":24}",
+ "{\"name\":\"James\", \"age\":35}")
+ jsonPath2 <- tempfile(pattern = "jsonPath2", fileext = ".tmp")
+ parquetPath2 <- tempfile(pattern = "parquetPath2", fileext = ".parquet")
+ writeLines(lines, jsonPath2)
+ df2 <- read.df(jsonPath2, "json")
+ write.df(df2, parquetPath2, "parquet", "overwrite")
+ dfParquet2 <- read.df(parquetPath2, "parquet")
+
+ createOrReplaceTempView(dfParquet, "table1")
+ insertInto(dfParquet2, "table1")
+ expect_equal(count(sql("select * from table1")), 5)
+ expect_equal(first(sql("select * from table1 order by age"))$name, "Michael")
+ expect_true(dropTempView("table1"))
+
+ createOrReplaceTempView(dfParquet, "table1")
+ insertInto(dfParquet2, "table1", overwrite = TRUE)
+ expect_equal(count(sql("select * from table1")), 2)
+ expect_equal(first(sql("select * from table1 order by age"))$name, "Bob")
+ expect_true(dropTempView("table1"))
+
+ unlink(jsonPath2)
+ unlink(parquetPath2)
+ }
+})
+
+test_that("tableToDF() returns a new DataFrame", {
+ df <- read.json(jsonPath)
+ createOrReplaceTempView(df, "table1")
+ tabledf <- tableToDF("table1")
+ expect_is(tabledf, "SparkDataFrame")
+ expect_equal(count(tabledf), 3)
+ tabledf2 <- tableToDF("table1")
+ expect_equal(count(tabledf2), 3)
+ expect_true(dropTempView("table1"))
+})
+
+test_that("toRDD() returns an RRDD", {
+ skip_on_cran()
+
+ df <- read.json(jsonPath)
+ testRDD <- toRDD(df)
+ expect_is(testRDD, "RDD")
+ expect_equal(countRDD(testRDD), 3)
+})
+
+test_that("union on two RDDs created from DataFrames returns an RRDD", {
+ skip_on_cran()
+
+ df <- read.json(jsonPath)
+ RDD1 <- toRDD(df)
+ RDD2 <- toRDD(df)
+ unioned <- unionRDD(RDD1, RDD2)
+ expect_is(unioned, "RDD")
+ expect_equal(getSerializedMode(unioned), "byte")
+ expect_equal(collectRDD(unioned)[[2]]$name, "Andy")
+})
+
+test_that("union on mixed serialization types correctly returns a byte RRDD", {
+ skip_on_cran()
+
+ # Byte RDD
+ nums <- 1:10
+ rdd <- parallelize(sc, nums, 2L)
+
+ # String RDD
+ textLines <- c("Michael",
+ "Andy, 30",
+ "Justin, 19")
+ textPath <- tempfile(pattern = "sparkr-textLines", fileext = ".tmp")
+ writeLines(textLines, textPath)
+ textRDD <- textFile(sc, textPath)
+
+ df <- read.json(jsonPath)
+ dfRDD <- toRDD(df)
+
+ unionByte <- unionRDD(rdd, dfRDD)
+ expect_is(unionByte, "RDD")
+ expect_equal(getSerializedMode(unionByte), "byte")
+ expect_equal(collectRDD(unionByte)[[1]], 1)
+ expect_equal(collectRDD(unionByte)[[12]]$name, "Andy")
+
+ unionString <- unionRDD(textRDD, dfRDD)
+ expect_is(unionString, "RDD")
+ expect_equal(getSerializedMode(unionString), "byte")
+ expect_equal(collectRDD(unionString)[[1]], "Michael")
+ expect_equal(collectRDD(unionString)[[5]]$name, "Andy")
+})
+
+test_that("objectFile() works with row serialization", {
+ skip_on_cran()
+
+ objectPath <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ df <- read.json(jsonPath)
+ dfRDD <- toRDD(df)
+ saveAsObjectFile(coalesceRDD(dfRDD, 1L), objectPath)
+ objectIn <- objectFile(sc, objectPath)
+
+ expect_is(objectIn, "RDD")
+ expect_equal(getSerializedMode(objectIn), "byte")
+ expect_equal(collectRDD(objectIn)[[2]]$age, 30)
+})
+
+test_that("lapply() on a DataFrame returns an RDD with the correct columns", {
+ skip_on_cran()
+
+ df <- read.json(jsonPath)
+ testRDD <- lapply(df, function(row) {
+ row$newCol <- row$age + 5
+ row
+ })
+ expect_is(testRDD, "RDD")
+ collected <- collectRDD(testRDD)
+ expect_equal(collected[[1]]$name, "Michael")
+ expect_equal(collected[[2]]$newCol, 35)
+})
+
+test_that("collect() returns a data.frame", {
+ df <- read.json(jsonPath)
+ rdf <- collect(df)
+ expect_true(is.data.frame(rdf))
+ expect_equal(names(rdf)[1], "age")
+ expect_equal(nrow(rdf), 3)
+ expect_equal(ncol(rdf), 2)
+
+ # collect() returns data correctly from a DataFrame with 0 rows
+ df0 <- limit(df, 0)
+ rdf <- collect(df0)
+ expect_true(is.data.frame(rdf))
+ expect_equal(names(rdf)[1], "age")
+ expect_equal(nrow(rdf), 0)
+ expect_equal(ncol(rdf), 2)
+
+ # collect() correctly handles multiple columns with same name
+ df <- createDataFrame(list(list(1, 2)), schema = c("name", "name"))
+ ldf <- collect(df)
+ expect_equal(names(ldf), c("name", "name"))
+})
+
+test_that("limit() returns DataFrame with the correct number of rows", {
+ df <- read.json(jsonPath)
+ dfLimited <- limit(df, 2)
+ expect_is(dfLimited, "SparkDataFrame")
+ expect_equal(count(dfLimited), 2)
+})
+
+test_that("collect() and take() on a DataFrame return the same number of rows and columns", {
+ df <- read.json(jsonPath)
+ expect_equal(nrow(collect(df)), nrow(take(df, 10)))
+ expect_equal(ncol(collect(df)), ncol(take(df, 10)))
+})
+
+test_that("collect() support Unicode characters", {
+ lines <- c("{\"name\":\"안녕하세요\"}",
+ "{\"name\":\"您好\", \"age\":30}",
+ "{\"name\":\"こんにちは\", \"age\":19}",
+ "{\"name\":\"Xin chào\"}")
+
+ jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ writeLines(lines, jsonPath)
+
+ df <- read.df(jsonPath, "json")
+ rdf <- collect(df)
+ expect_true(is.data.frame(rdf))
+ expect_equal(rdf$name[1], markUtf8("안녕하세요"))
+ expect_equal(rdf$name[2], markUtf8("您好"))
+ expect_equal(rdf$name[3], markUtf8("こんにちは"))
+ expect_equal(rdf$name[4], markUtf8("Xin chào"))
+
+ df1 <- createDataFrame(rdf)
+ expect_equal(collect(where(df1, df1$name == markUtf8("您好")))$name, markUtf8("您好"))
+})
+
+test_that("multiple pipeline transformations result in an RDD with the correct values", {
+ skip_on_cran()
+
+ df <- read.json(jsonPath)
+ first <- lapply(df, function(row) {
+ row$age <- row$age + 5
+ row
+ })
+ second <- lapply(first, function(row) {
+ row$testCol <- if (row$age == 35 && !is.na(row$age)) TRUE else FALSE
+ row
+ })
+ expect_is(second, "RDD")
+ expect_equal(countRDD(second), 3)
+ expect_equal(collectRDD(second)[[2]]$age, 35)
+ expect_true(collectRDD(second)[[2]]$testCol)
+ expect_false(collectRDD(second)[[3]]$testCol)
+})
+
+test_that("cache(), storageLevel(), persist(), and unpersist() on a DataFrame", {
+ df <- read.json(jsonPath)
+ expect_false(df@env$isCached)
+ cache(df)
+ expect_true(df@env$isCached)
+
+ unpersist(df)
+ expect_false(df@env$isCached)
+
+ persist(df, "MEMORY_AND_DISK")
+ expect_true(df@env$isCached)
+
+ expect_equal(storageLevel(df),
+ "MEMORY_AND_DISK - StorageLevel(disk, memory, deserialized, 1 replicas)")
+
+ unpersist(df)
+ expect_false(df@env$isCached)
+
+ # make sure the data is collectable
+ expect_true(is.data.frame(collect(df)))
+})
+
+test_that("setCheckpointDir(), checkpoint() on a DataFrame", {
+ if (not_cran_or_windows_with_hadoop()) {
+ checkpointDir <- file.path(tempdir(), "cproot")
+ expect_true(length(list.files(path = checkpointDir, all.files = TRUE)) == 0)
+
+ setCheckpointDir(checkpointDir)
+ df <- read.json(jsonPath)
+ df <- checkpoint(df)
+ expect_is(df, "SparkDataFrame")
+ expect_false(length(list.files(path = checkpointDir, all.files = TRUE)) == 0)
+ }
+})
+
+test_that("schema(), dtypes(), columns(), names() return the correct values/format", {
+ df <- read.json(jsonPath)
+ testSchema <- schema(df)
+ expect_equal(length(testSchema$fields()), 2)
+ expect_equal(testSchema$fields()[[1]]$dataType.toString(), "LongType")
+ expect_equal(testSchema$fields()[[2]]$dataType.simpleString(), "string")
+ expect_equal(testSchema$fields()[[1]]$name(), "age")
+
+ testTypes <- dtypes(df)
+ expect_equal(length(testTypes[[1]]), 2)
+ expect_equal(testTypes[[1]][1], "age")
+
+ testCols <- columns(df)
+ expect_equal(length(testCols), 2)
+ expect_equal(testCols[2], "name")
+
+ testNames <- names(df)
+ expect_equal(length(testNames), 2)
+ expect_equal(testNames[2], "name")
+})
+
+test_that("names() colnames() set the column names", {
+ df <- read.json(jsonPath)
+ names(df) <- c("col1", "col2")
+ expect_equal(colnames(df)[2], "col2")
+
+ colnames(df) <- c("col3", "col4")
+ expect_equal(names(df)[1], "col3")
+
+ expect_error(names(df) <- NULL, "Invalid column names.")
+ expect_error(names(df) <- c("sepal.length", "sepal_width"),
+ "Column names cannot contain the '.' symbol.")
+ expect_error(names(df) <- c(1, 2), "Invalid column names.")
+ expect_error(names(df) <- c("a"),
+ "Column names must have the same length as the number of columns in the dataset.")
+ expect_error(names(df) <- c("1", NA), "Column names cannot be NA.")
+
+ expect_error(colnames(df) <- c("sepal.length", "sepal_width"),
+ "Column names cannot contain the '.' symbol.")
+ expect_error(colnames(df) <- c(1, 2), "Invalid column names.")
+ expect_error(colnames(df) <- c("a"),
+ "Column names must have the same length as the number of columns in the dataset.")
+ expect_error(colnames(df) <- c("1", NA), "Column names cannot be NA.")
+
+ # Note: if this test breaks, remove the check for the "." character in the colnames<- method
+ irisDF <- suppressWarnings(createDataFrame(iris))
+ expect_equal(names(irisDF)[1], "Sepal_Length")
+
+ # Test base::colnames base::names
+ m2 <- cbind(1, 1:4)
+ expect_equal(colnames(m2, do.NULL = FALSE), c("col1", "col2"))
+ colnames(m2) <- c("x", "Y")
+ expect_equal(colnames(m2), c("x", "Y"))
+
+ z <- list(a = 1, b = "c", c = 1:3)
+ expect_equal(names(z)[3], "c")
+ names(z)[3] <- "c2"
+ expect_equal(names(z)[3], "c2")
+
+ # Test subset assignment
+ colnames(df)[1] <- "col5"
+ expect_equal(colnames(df)[1], "col5")
+ names(df)[2] <- "col6"
+ expect_equal(names(df)[2], "col6")
+})
+
+test_that("head() and first() return the correct data", {
+ df <- read.json(jsonPath)
+ testHead <- head(df)
+ expect_equal(nrow(testHead), 3)
+ expect_equal(ncol(testHead), 2)
+
+ testHead2 <- head(df, 2)
+ expect_equal(nrow(testHead2), 2)
+ expect_equal(ncol(testHead2), 2)
+
+ testFirst <- first(df)
+ expect_equal(nrow(testFirst), 1)
+
+ # head() and first() return the correct data on
+ # a DataFrame with 0 rows
+ df0 <- limit(df, 0)
+
+ testHead <- head(df0)
+ expect_equal(nrow(testHead), 0)
+ expect_equal(ncol(testHead), 2)
+
+ testFirst <- first(df0)
+ expect_equal(nrow(testFirst), 0)
+ expect_equal(ncol(testFirst), 2)
+})
+
+test_that("distinct(), unique() and dropDuplicates() on DataFrames", {
+ lines <- c("{\"name\":\"Michael\"}",
+ "{\"name\":\"Andy\", \"age\":30}",
+ "{\"name\":\"Justin\", \"age\":19}",
+ "{\"name\":\"Justin\", \"age\":19}")
+ jsonPathWithDup <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ writeLines(lines, jsonPathWithDup)
+
+ df <- read.json(jsonPathWithDup)
+ uniques <- distinct(df)
+ expect_is(uniques, "SparkDataFrame")
+ expect_equal(count(uniques), 3)
+
+ uniques2 <- unique(df)
+ expect_is(uniques2, "SparkDataFrame")
+ expect_equal(count(uniques2), 3)
+
+ # Test dropDuplicates()
+ df <- createDataFrame(
+ list(
+ list(2, 1, 2), list(1, 1, 1),
+ list(1, 2, 1), list(2, 1, 2),
+ list(2, 2, 2), list(2, 2, 1),
+ list(2, 1, 1), list(1, 1, 2),
+ list(1, 2, 2), list(1, 2, 1)),
+ schema = c("key", "value1", "value2"))
+ result <- collect(dropDuplicates(df))
+ expected <- rbind.data.frame(
+ c(1, 1, 1), c(1, 1, 2), c(1, 2, 1),
+ c(1, 2, 2), c(2, 1, 1), c(2, 1, 2),
+ c(2, 2, 1), c(2, 2, 2))
+ names(expected) <- c("key", "value1", "value2")
+ expect_equivalent(
+ result[order(result$key, result$value1, result$value2), ],
+ expected)
+
+ result <- collect(dropDuplicates(df, c("key", "value1")))
+ expected <- rbind.data.frame(
+ c(1, 1, 1), c(1, 2, 1), c(2, 1, 2), c(2, 2, 2))
+ names(expected) <- c("key", "value1", "value2")
+ expect_equivalent(
+ result[order(result$key, result$value1, result$value2), ],
+ expected)
+
+ result <- collect(dropDuplicates(df, "key", "value1"))
+ expected <- rbind.data.frame(
+ c(1, 1, 1), c(1, 2, 1), c(2, 1, 2), c(2, 2, 2))
+ names(expected) <- c("key", "value1", "value2")
+ expect_equivalent(
+ result[order(result$key, result$value1, result$value2), ],
+ expected)
+
+ result <- collect(dropDuplicates(df, "key"))
+ expected <- rbind.data.frame(
+ c(1, 1, 1), c(2, 1, 2))
+ names(expected) <- c("key", "value1", "value2")
+ expect_equivalent(
+ result[order(result$key, result$value1, result$value2), ],
+ expected)
+})
+
+test_that("sample on a DataFrame", {
+ df <- read.json(jsonPath)
+ sampled <- sample(df, FALSE, 1.0)
+ expect_equal(nrow(collect(sampled)), count(df))
+ expect_is(sampled, "SparkDataFrame")
+ sampled2 <- sample(df, FALSE, 0.1, 0) # set seed for predictable result
+ expect_true(count(sampled2) < 3)
+
+ count1 <- count(sample(df, FALSE, 0.1, 0))
+ count2 <- count(sample(df, FALSE, 0.1, 0))
+ expect_equal(count1, count2)
+
+ # Also test sample_frac
+ sampled3 <- sample_frac(df, FALSE, 0.1, 0) # set seed for predictable result
+ expect_true(count(sampled3) < 3)
+
+ # nolint start
+ # Test base::sample is working
+ #expect_equal(length(sample(1:12)), 12)
+ # nolint end
+})
+
+test_that("select operators", {
+ df <- select(read.json(jsonPath), "name", "age")
+ expect_is(df$name, "Column")
+ expect_is(df[[2]], "Column")
+ expect_is(df[["age"]], "Column")
+
+ expect_warning(df[[1:2]],
+ "Subset index has length > 1. Only the first index is used.")
+ expect_is(suppressWarnings(df[[1:2]]), "Column")
+ expect_warning(df[[c("name", "age")]],
+ "Subset index has length > 1. Only the first index is used.")
+ expect_is(suppressWarnings(df[[c("name", "age")]]), "Column")
+
+ expect_warning(df[[1:2]] <- df[[1]],
+ "Subset index has length > 1. Only the first index is used.")
+ expect_warning(df[[c("name", "age")]] <- df[[1]],
+ "Subset index has length > 1. Only the first index is used.")
+
+ expect_is(df[, 1, drop = F], "SparkDataFrame")
+ expect_equal(columns(df[, 1, drop = F]), c("name"))
+ expect_equal(columns(df[, "age", drop = F]), c("age"))
+
+ df2 <- df[, c("age", "name")]
+ expect_is(df2, "SparkDataFrame")
+ expect_equal(columns(df2), c("age", "name"))
+
+ df$age2 <- df$age
+ expect_equal(columns(df), c("name", "age", "age2"))
+ expect_equal(count(where(df, df$age2 == df$age)), 2)
+ df$age2 <- df$age * 2
+ expect_equal(columns(df), c("name", "age", "age2"))
+ expect_equal(count(where(df, df$age2 == df$age * 2)), 2)
+ df$age2 <- df[["age"]] * 3
+ expect_equal(columns(df), c("name", "age", "age2"))
+ expect_equal(count(where(df, df$age2 == df$age * 3)), 2)
+
+ df$age2 <- 21
+ expect_equal(columns(df), c("name", "age", "age2"))
+ expect_equal(count(where(df, df$age2 == 21)), 3)
+
+ df$age2 <- c(22)
+ expect_equal(columns(df), c("name", "age", "age2"))
+ expect_equal(count(where(df, df$age2 == 22)), 3)
+
+ expect_error(df$age3 <- c(22, NA),
+ "value must be a Column, literal value as atomic in length of 1, or NULL")
+
+ df[["age2"]] <- 23
+ expect_equal(columns(df), c("name", "age", "age2"))
+ expect_equal(count(where(df, df$age2 == 23)), 3)
+
+ df[[3]] <- 24
+ expect_equal(columns(df), c("name", "age", "age2"))
+ expect_equal(count(where(df, df$age2 == 24)), 3)
+
+ df[[3]] <- df$age
+ expect_equal(count(where(df, df$age2 == df$age)), 2)
+
+ df[["age2"]] <- df[["name"]]
+ expect_equal(count(where(df, df$age2 == df$name)), 3)
+
+ expect_error(df[["age3"]] <- c(22, 23),
+ "value must be a Column, literal value as atomic in length of 1, or NULL")
+
+ # Test parameter drop
+ expect_true(class(df[, 1]) == "SparkDataFrame")
+ expect_true(class(df[, 1, drop = T]) == "Column")
+ expect_true(class(df[, 1, drop = F]) == "SparkDataFrame")
+ expect_true(class(df[df$age > 4, 2, drop = T]) == "Column")
+ expect_true(class(df[df$age > 4, 2, drop = F]) == "SparkDataFrame")
+})
+
+test_that("select with column", {
+ df <- read.json(jsonPath)
+ df1 <- select(df, "name")
+ expect_equal(columns(df1), c("name"))
+ expect_equal(count(df1), 3)
+
+ df2 <- select(df, df$age)
+ expect_equal(columns(df2), c("age"))
+ expect_equal(count(df2), 3)
+
+ df3 <- select(df, lit("x"))
+ expect_equal(columns(df3), c("x"))
+ expect_equal(count(df3), 3)
+ expect_equal(collect(select(df3, "x"))[[1, 1]], "x")
+
+ df4 <- select(df, c("name", "age"))
+ expect_equal(columns(df4), c("name", "age"))
+ expect_equal(count(df4), 3)
+
+ # Test select with alias
+ df5 <- alias(df, "table")
+
+ expect_equal(columns(select(df5, column("table.name"))), "name")
+ expect_equal(columns(select(df5, "table.name")), "name")
+
+ # Test that stats::alias is not masked
+ expect_is(alias(aov(yield ~ block + N * P * K, npk)), "listof")
+
+ expect_error(select(df, c("name", "age"), "name"),
+ "To select multiple columns, use a character vector or list for col")
+})
+
+test_that("drop column", {
+ df <- select(read.json(jsonPath), "name", "age")
+ df1 <- drop(df, "name")
+ expect_equal(columns(df1), c("age"))
+
+ df$age2 <- df$age
+ df1 <- drop(df, c("name", "age"))
+ expect_equal(columns(df1), c("age2"))
+
+ df1 <- drop(df, df$age)
+ expect_equal(columns(df1), c("name", "age2"))
+
+ df$age2 <- NULL
+ expect_equal(columns(df), c("name", "age"))
+ df$age3 <- NULL
+ expect_equal(columns(df), c("name", "age"))
+
+ # Test to make sure base::drop is not masked
+ expect_equal(drop(1:3 %*% 2:4), 20)
+})
+
+test_that("subsetting", {
+ # read.json does not guarantee column order, so select the columns explicitly
+ df <- select(read.json(jsonPath), "name", "age")
+ filtered <- df[df$age > 20, ]
+ expect_equal(count(filtered), 1)
+ expect_equal(columns(filtered), c("name", "age"))
+ expect_equal(collect(filtered)$name, "Andy")
+
+ df2 <- df[df$age == 19, 1, drop = F]
+ expect_is(df2, "SparkDataFrame")
+ expect_equal(count(df2), 1)
+ expect_equal(columns(df2), c("name"))
+ expect_equal(collect(df2)$name, "Justin")
+
+ df3 <- df[df$age > 20, 2, drop = F]
+ expect_equal(count(df3), 1)
+ expect_equal(columns(df3), c("age"))
+
+ df4 <- df[df$age %in% c(19, 30), 1:2]
+ expect_equal(count(df4), 2)
+ expect_equal(columns(df4), c("name", "age"))
+
+ df5 <- df[df$age %in% c(19), c(1, 2)]
+ expect_equal(count(df5), 1)
+ expect_equal(columns(df5), c("name", "age"))
+
+ df6 <- subset(df, df$age %in% c(30), c(1, 2))
+ expect_equal(count(df6), 1)
+ expect_equal(columns(df6), c("name", "age"))
+
+ df7 <- subset(df, select = "name", drop = F)
+ expect_equal(count(df7), 3)
+ expect_equal(columns(df7), c("name"))
+
+ # Test base::subset is working
+ expect_equal(nrow(subset(airquality, Temp > 80, select = c(Ozone, Temp))), 68)
+})
+
+test_that("selectExpr() on a DataFrame", {
+ df <- read.json(jsonPath)
+ selected <- selectExpr(df, "age * 2")
+ expect_equal(names(selected), "(age * 2)")
+ expect_equal(collect(selected), collect(select(df, df$age * 2L)))
+
+ selected2 <- selectExpr(df, "name as newName", "abs(age) as age")
+ expect_equal(names(selected2), c("newName", "age"))
+ expect_equal(count(selected2), 3)
+})
+
+test_that("expr() on a DataFrame", {
+ df <- read.json(jsonPath)
+ expect_equal(collect(select(df, expr("abs(-123)")))[1, 1], 123)
+})
+
+test_that("column calculation", {
+ df <- read.json(jsonPath)
+ d <- collect(select(df, alias(df$age + 1, "age2")))
+ expect_equal(names(d), c("age2"))
+ df2 <- select(df, lower(df$name), abs(df$age))
+ expect_is(df2, "SparkDataFrame")
+ expect_equal(count(df2), 3)
+})
+
+test_that("test HiveContext", {
+ if (not_cran_or_windows_with_hadoop()) {
+ setHiveContext(sc)
+
+ schema <- structType(structField("name", "string"), structField("age", "integer"),
+ structField("height", "float"))
+ createTable("people", source = "json", schema = schema)
+ df <- read.df(jsonPathNa, "json", schema)
+ insertInto(df, "people")
+ expect_equal(collect(sql("SELECT age from people WHERE name = 'Bob'"))$age, c(16))
+ sql("DROP TABLE people")
+
+ df <- createTable("json", jsonPath, "json")
+ expect_is(df, "SparkDataFrame")
+ expect_equal(count(df), 3)
+ df2 <- sql("select * from json")
+ expect_is(df2, "SparkDataFrame")
+ expect_equal(count(df2), 3)
+
+ jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ saveAsTable(df, "json2", "json", "append", path = jsonPath2)
+ df3 <- sql("select * from json2")
+ expect_is(df3, "SparkDataFrame")
+ expect_equal(count(df3), 3)
+ unlink(jsonPath2)
+
+ hivetestDataPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ saveAsTable(df, "hivetestbl", path = hivetestDataPath)
+ df4 <- sql("select * from hivetestbl")
+ expect_is(df4, "SparkDataFrame")
+ expect_equal(count(df4), 3)
+ unlink(hivetestDataPath)
+
+ parquetDataPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ saveAsTable(df, "parquetest", "parquet", mode = "overwrite", path = parquetDataPath)
+ df5 <- sql("select * from parquetest")
+ expect_is(df5, "SparkDataFrame")
+ expect_equal(count(df5), 3)
+ unlink(parquetDataPath)
+
+ unsetHiveContext()
+ }
+})
+
+test_that("column operators", {
+ c <- column("a")
+ c2 <- (- c + 1 - 2) * 3 / 4.0
+ c3 <- (c + c2 - c2) * c2 %% c2
+ c4 <- (c > c2) & (c2 <= c3) | (c == c2) & (c2 != c3)
+ c5 <- c2 ^ c3 ^ c4
+ c6 <- c2 %<=>% c3
+ c7 <- !c6
+})
+
+test_that("column functions", {
+ skip_on_cran()
+
+ c <- column("a")
+ c1 <- abs(c) + acos(c) + approxCountDistinct(c) + ascii(c) + asin(c) + atan(c)
+ c2 <- avg(c) + base64(c) + bin(c) + bitwiseNOT(c) + cbrt(c) + ceil(c) + cos(c)
+ c3 <- cosh(c) + count(c) + crc32(c) + hash(c) + exp(c)
+ c4 <- explode(c) + expm1(c) + factorial(c) + first(c) + floor(c) + hex(c)
+ c5 <- hour(c) + initcap(c) + last(c) + last_day(c) + length(c)
+ c6 <- log(c) + (c) + log1p(c) + log2(c) + lower(c) + ltrim(c) + max(c) + md5(c)
+ c7 <- mean(c) + min(c) + month(c) + negate(c) + posexplode(c) + quarter(c)
+ c8 <- reverse(c) + rint(c) + round(c) + rtrim(c) + sha1(c) + monotonically_increasing_id()
+ c9 <- signum(c) + sin(c) + sinh(c) + size(c) + stddev(c) + soundex(c) + sqrt(c) + sum(c)
+ c10 <- sumDistinct(c) + tan(c) + tanh(c) + toDegrees(c) + toRadians(c)
+ c11 <- to_date(c) + trim(c) + unbase64(c) + unhex(c) + upper(c)
+ c12 <- variance(c)
+ c13 <- lead("col", 1) + lead(c, 1) + lag("col", 1) + lag(c, 1)
+ c14 <- cume_dist() + ntile(1) + corr(c, c1)
+ c15 <- dense_rank() + percent_rank() + rank() + row_number()
+ c16 <- is.nan(c) + isnan(c) + isNaN(c)
+ c17 <- cov(c, c1) + cov("c", "c1") + covar_samp(c, c1) + covar_samp("c", "c1")
+ c18 <- covar_pop(c, c1) + covar_pop("c", "c1")
+ c19 <- spark_partition_id() + coalesce(c) + coalesce(c1, c2, c3)
+ c20 <- to_timestamp(c) + to_timestamp(c, "yyyy") + to_date(c, "yyyy")
+ c21 <- posexplode_outer(c) + explode_outer(c)
+ c22 <- not(c)
+
+ # Test if base::is.nan() is exposed
+ expect_equal(is.nan(c("a", "b")), c(FALSE, FALSE))
+
+ # Test if base::rank() is exposed
+ expect_equal(class(rank())[[1]], "Column")
+ expect_equal(rank(1:3), as.numeric(c(1:3)))
+
+ df <- read.json(jsonPath)
+ df2 <- select(df, between(df$age, c(20, 30)), between(df$age, c(10, 20)))
+ expect_equal(collect(df2)[[2, 1]], TRUE)
+ expect_equal(collect(df2)[[2, 2]], FALSE)
+ expect_equal(collect(df2)[[3, 1]], FALSE)
+ expect_equal(collect(df2)[[3, 2]], TRUE)
+
+ # Test input_file_name()
+ actual_names <- sort(collect(distinct(select(df, input_file_name()))))
+ expect_equal(length(actual_names), 1)
+ expect_equal(basename(actual_names[1, 1]), basename(jsonPath))
+
+ df3 <- select(df, between(df$name, c("Apache", "Spark")))
+ expect_equal(collect(df3)[[1, 1]], TRUE)
+ expect_equal(collect(df3)[[2, 1]], FALSE)
+ expect_equal(collect(df3)[[3, 1]], TRUE)
+
+ df4 <- select(df, countDistinct(df$age, df$name))
+ expect_equal(collect(df4)[[1, 1]], 2)
+
+ expect_equal(collect(select(df, sum(df$age)))[1, 1], 49)
+ expect_true(abs(collect(select(df, stddev(df$age)))[1, 1] - 7.778175) < 1e-6)
+ expect_equal(collect(select(df, var_pop(df$age)))[1, 1], 30.25)
+
+ df5 <- createDataFrame(list(list(a = "010101")))
+ expect_equal(collect(select(df5, conv(df5$a, 2, 16)))[1, 1], "15")
+
+ # Test array_contains() and sort_array()
+ df <- createDataFrame(list(list(list(1L, 2L, 3L)), list(list(6L, 5L, 4L))))
+ result <- collect(select(df, array_contains(df[[1]], 1L)))[[1]]
+ expect_equal(result, c(TRUE, FALSE))
+
+ result <- collect(select(df, sort_array(df[[1]], FALSE)))[[1]]
+ expect_equal(result, list(list(3L, 2L, 1L), list(6L, 5L, 4L)))
+ result <- collect(select(df, sort_array(df[[1]])))[[1]]
+ expect_equal(result, list(list(1L, 2L, 3L), list(4L, 5L, 6L)))
+
+ # Test that stats::lag is working
+ expect_equal(length(lag(ldeaths, 12)), 72)
+
+ # Test struct()
+ df <- createDataFrame(list(list(1L, 2L, 3L), list(4L, 5L, 6L)),
+ schema = c("a", "b", "c"))
+ result <- collect(select(df, alias(struct("a", "c"), "d")))
+ expected <- data.frame(row.names = 1:2)
+ expected$"d" <- list(listToStruct(list(a = 1L, c = 3L)),
+ listToStruct(list(a = 4L, c = 6L)))
+ expect_equal(result, expected)
+
+ result <- collect(select(df, alias(struct(df$a, df$b), "d")))
+ expected <- data.frame(row.names = 1:2)
+ expected$"d" <- list(listToStruct(list(a = 1L, b = 2L)),
+ listToStruct(list(a = 4L, b = 5L)))
+ expect_equal(result, expected)
+
+ # Test encode(), decode()
+ bytes <- as.raw(c(0xe5, 0xa4, 0xa7, 0xe5, 0x8d, 0x83, 0xe4, 0xb8, 0x96, 0xe7, 0x95, 0x8c))
+ df <- createDataFrame(list(list(markUtf8("大千世界"), "utf-8", bytes)),
+ schema = c("a", "b", "c"))
+ result <- collect(select(df, encode(df$a, "utf-8"), decode(df$c, "utf-8")))
+ expect_equal(result[[1]][[1]], bytes)
+ expect_equal(result[[2]], markUtf8("大千世界"))
+
+ # Test first(), last()
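+ # (the second, logical argument is na.rm: when TRUE, null/NA values are skipped)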
+ df <- read.json(jsonPath)
+ expect_equal(collect(select(df, first(df$age)))[[1]], NA_real_)
+ expect_equal(collect(select(df, first(df$age, TRUE)))[[1]], 30)
+ expect_equal(collect(select(df, first("age")))[[1]], NA_real_)
+ expect_equal(collect(select(df, first("age", TRUE)))[[1]], 30)
+ expect_equal(collect(select(df, last(df$age)))[[1]], 19)
+ expect_equal(collect(select(df, last(df$age, TRUE)))[[1]], 19)
+ expect_equal(collect(select(df, last("age")))[[1]], 19)
+ expect_equal(collect(select(df, last("age", TRUE)))[[1]], 19)
+
+ # Test bround()
+ df <- createDataFrame(data.frame(x = c(2.5, 3.5)))
+ expect_equal(collect(select(df, bround(df$x, 0)))[[1]][1], 2)
+ expect_equal(collect(select(df, bround(df$x, 0)))[[1]][2], 4)
+
+ # Test to_json(), from_json()
+ df <- sql("SELECT array(named_struct('name', 'Bob'), named_struct('name', 'Alice')) as people")
+ j <- collect(select(df, alias(to_json(df$people), "json")))
+ expect_equal(j[order(j$json), ][1], "[{\"name\":\"Bob\"},{\"name\":\"Alice\"}]")
+
+ df <- read.json(mapTypeJsonPath)
+ j <- collect(select(df, alias(to_json(df$info), "json")))
+ expect_equal(j[order(j$json), ][1], "{\"age\":16,\"height\":176.5}")
+ df <- as.DataFrame(j)
+ schema <- structType(structField("age", "integer"),
+ structField("height", "double"))
+ s <- collect(select(df, alias(from_json(df$json, schema), "structcol")))
+ expect_equal(ncol(s), 1)
+ expect_equal(nrow(s), 3)
+ expect_is(s[[1]][[1]], "struct")
+ expect_true(any(apply(s, 1, function(x) { x[[1]]$age == 16 } )))
+
+ # Test passing an option (dateFormat) to from_json()
+ df <- as.DataFrame(list(list("col" = "{\"date\":\"21/10/2014\"}")))
+ schema2 <- structType(structField("date", "date"))
+ s <- collect(select(df, from_json(df$col, schema2)))
+ expect_equal(s[[1]][[1]], NA)
+ s <- collect(select(df, from_json(df$col, schema2, dateFormat = "dd/MM/yyyy")))
+ expect_is(s[[1]][[1]]$date, "Date")
+ expect_equal(as.character(s[[1]][[1]]$date), "2014-10-21")
+
+ # check that unparseable input yields NA
+ df <- as.DataFrame(list(list("a" = "")))
+ expect_equal(collect(select(df, from_json(df$a, schema)))[[1]][[1]], NA)
+
+ # check if array type in string is correctly supported.
+ jsonArr <- "[{\"name\":\"Bob\"}, {\"name\":\"Alice\"}]"
+ df <- as.DataFrame(list(list("people" = jsonArr)))
+ schema <- structType(structField("name", "string"))
+ arr <- collect(select(df, alias(from_json(df$people, schema, as.json.array = TRUE), "arrcol")))
+ expect_equal(ncol(arr), 1)
+ expect_equal(nrow(arr), 1)
+ expect_is(arr[[1]][[1]], "list")
+ expect_equal(length(arr$arrcol[[1]]), 2)
+ expect_equal(arr$arrcol[[1]][[1]]$name, "Bob")
+ expect_equal(arr$arrcol[[1]][[2]]$name, "Alice")
+
+ # Test create_array() and create_map()
+ df <- as.DataFrame(data.frame(
+ x = c(1.0, 2.0), y = c(-1.0, 3.0), z = c(-2.0, 5.0)
+ ))
+
+ arrs <- collect(select(df, create_array(df$x, df$y, df$z)))
+ expect_equal(arrs[, 1], list(list(1, -1, -2), list(2, 3, 5)))
+
+ maps <- collect(select(
+ df, create_map(lit("x"), df$x, lit("y"), df$y, lit("z"), df$z)))
+
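+ # map columns collect as R environments, hence the as.environment conversion below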
+ expect_equal(
+ maps[, 1],
+ lapply(
+ list(list(x = 1, y = -1, z = -2), list(x = 2, y = 3, z = 5)),
+ as.environment))
+
+ df <- as.DataFrame(data.frame(is_true = c(TRUE, FALSE, NA)))
+ expect_equal(
+ collect(select(df, alias(not(df$is_true), "is_false"))),
+ data.frame(is_false = c(FALSE, TRUE, NA))
+ )
+})
+
+test_that("column binary mathfunctions", {
+ lines <- c("{\"a\":1, \"b\":5}",
+ "{\"a\":2, \"b\":6}",
+ "{\"a\":3, \"b\":7}",
+ "{\"a\":4, \"b\":8}")
+ jsonPathWithDup <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ writeLines(lines, jsonPathWithDup)
+ df <- read.json(jsonPathWithDup)
+ expect_equal(collect(select(df, atan2(df$a, df$b)))[1, "ATAN2(a, b)"], atan2(1, 5))
+ expect_equal(collect(select(df, atan2(df$a, df$b)))[2, "ATAN2(a, b)"], atan2(2, 6))
+ expect_equal(collect(select(df, atan2(df$a, df$b)))[3, "ATAN2(a, b)"], atan2(3, 7))
+ expect_equal(collect(select(df, atan2(df$a, df$b)))[4, "ATAN2(a, b)"], atan2(4, 8))
+ ## nolint start
+ expect_equal(collect(select(df, hypot(df$a, df$b)))[1, "HYPOT(a, b)"], sqrt(1^2 + 5^2))
+ expect_equal(collect(select(df, hypot(df$a, df$b)))[2, "HYPOT(a, b)"], sqrt(2^2 + 6^2))
+ expect_equal(collect(select(df, hypot(df$a, df$b)))[3, "HYPOT(a, b)"], sqrt(3^2 + 7^2))
+ expect_equal(collect(select(df, hypot(df$a, df$b)))[4, "HYPOT(a, b)"], sqrt(4^2 + 8^2))
+ ## nolint end
+ expect_equal(collect(select(df, shiftLeft(df$b, 1)))[4, 1], 16)
+ expect_equal(collect(select(df, shiftRight(df$b, 1)))[4, 1], 4)
+ expect_equal(collect(select(df, shiftRightUnsigned(df$b, 1)))[4, 1], 4)
+ expect_equal(class(collect(select(df, rand()))[2, 1]), "numeric")
+ expect_equal(collect(select(df, rand(1)))[1, 1], 0.134, tolerance = 0.01)
+ expect_equal(class(collect(select(df, randn()))[2, 1]), "numeric")
+ expect_equal(collect(select(df, randn(1)))[1, 1], -1.03, tolerance = 0.01)
+})
+
+test_that("string operators", {
+ df <- read.json(jsonPath)
+ expect_equal(count(where(df, like(df$name, "A%"))), 1)
+ expect_equal(count(where(df, startsWith(df$name, "A"))), 1)
+ expect_true(first(select(df, startsWith(df$name, "M")))[[1]])
+ expect_false(first(select(df, startsWith(df$name, "m")))[[1]])
+ expect_true(first(select(df, endsWith(df$name, "el")))[[1]])
+ expect_equal(first(select(df, substr(df$name, 1, 2)))[[1]], "Mi")
+ if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) {
+ expect_true(startsWith("Hello World", "Hello"))
+ expect_false(endsWith("Hello World", "a"))
+ }
+ expect_equal(collect(select(df, cast(df$age, "string")))[[2, 1]], "30")
+ expect_equal(collect(select(df, concat(df$name, lit(":"), df$age)))[[2, 1]], "Andy:30")
+ expect_equal(collect(select(df, concat_ws(":", df$name)))[[2, 1]], "Andy")
+ expect_equal(collect(select(df, concat_ws(":", df$name, df$age)))[[2, 1]], "Andy:30")
+ expect_equal(collect(select(df, instr(df$name, "i")))[, 1], c(2, 0, 5))
+ expect_equal(collect(select(df, format_number(df$age, 2)))[2, 1], "30.00")
+ expect_equal(collect(select(df, sha1(df$name)))[2, 1],
+ "ab5a000e88b5d9d0fa2575f5c6263eb93452405d")
+ expect_equal(collect(select(df, sha2(df$name, 256)))[2, 1],
+ "80f2aed3c618c423ddf05a2891229fba44942d907173152442cf6591441ed6dc")
+ expect_equal(collect(select(df, format_string("Name:%s", df$name)))[2, 1], "Name:Andy")
+ expect_equal(collect(select(df, format_string("%s, %d", df$name, df$age)))[2, 1], "Andy, 30")
+ expect_equal(collect(select(df, regexp_extract(df$name, "(n.y)", 1)))[2, 1], "ndy")
+ expect_equal(collect(select(df, regexp_replace(df$name, "(n.y)", "ydn")))[2, 1], "Aydn")
+
+ l2 <- list(list(a = "aaads"))
+ df2 <- createDataFrame(l2)
+ expect_equal(collect(select(df2, locate("aa", df2$a)))[1, 1], 1)
+ expect_equal(collect(select(df2, locate("aa", df2$a, 2)))[1, 1], 2)
+ expect_equal(collect(select(df2, lpad(df2$a, 8, "#")))[1, 1], "###aaads") # nolint
+ expect_equal(collect(select(df2, rpad(df2$a, 8, "#")))[1, 1], "aaads###") # nolint
+
+ l3 <- list(list(a = "a.b.c.d"))
+ df3 <- createDataFrame(l3)
+ expect_equal(collect(select(df3, substring_index(df3$a, ".", 2)))[1, 1], "a.b")
+ expect_equal(collect(select(df3, substring_index(df3$a, ".", -3)))[1, 1], "b.c.d")
+ expect_equal(collect(select(df3, translate(df3$a, "bc", "12")))[1, 1], "a.1.2.d")
+
+ l4 <- list(list(a = "a.b@c.d 1\\b"))
+ df4 <- createDataFrame(l4)
+ expect_equal(
+ collect(select(df4, split_string(df4$a, "\\s+")))[1, 1],
+ list(list("a.b@c.d", "1\\b"))
+ )
+ expect_equal(
+ collect(select(df4, split_string(df4$a, "\\.")))[1, 1],
+ list(list("a", "b@c", "d 1\\b"))
+ )
+ expect_equal(
+ collect(select(df4, split_string(df4$a, "@")))[1, 1],
+ list(list("a.b", "c.d 1\\b"))
+ )
+ expect_equal(
+ collect(select(df4, split_string(df4$a, "\\\\")))[1, 1],
+ list(list("a.b@c.d 1", "b"))
+ )
+
+ l5 <- list(list(a = "abc"))
+ df5 <- createDataFrame(l5)
+ expect_equal(
+ collect(select(df5, repeat_string(df5$a, 1L)))[1, 1],
+ "abc"
+ )
+ expect_equal(
+ collect(select(df5, repeat_string(df5$a, 3)))[1, 1],
+ "abcabcabc"
+ )
+ expect_equal(
+ collect(select(df5, repeat_string(df5$a, -1)))[1, 1],
+ ""
+ )
+})
+
+test_that("date functions on a DataFrame", {
+ .originalTimeZone <- Sys.getenv("TZ")
+ Sys.setenv(TZ = "UTC")
+ l <- list(list(a = 1L, b = as.Date("2012-12-13")),
+ list(a = 2L, b = as.Date("2013-12-14")),
+ list(a = 3L, b = as.Date("2014-12-15")))
+ df <- createDataFrame(l)
+ expect_equal(collect(select(df, dayofmonth(df$b)))[, 1], c(13, 14, 15))
+ expect_equal(collect(select(df, dayofyear(df$b)))[, 1], c(348, 348, 349))
+ expect_equal(collect(select(df, weekofyear(df$b)))[, 1], c(50, 50, 51))
+ expect_equal(collect(select(df, year(df$b)))[, 1], c(2012, 2013, 2014))
+ expect_equal(collect(select(df, month(df$b)))[, 1], c(12, 12, 12))
+ expect_equal(collect(select(df, last_day(df$b)))[, 1],
+ c(as.Date("2012-12-31"), as.Date("2013-12-31"), as.Date("2014-12-31")))
+ expect_equal(collect(select(df, next_day(df$b, "MONDAY")))[, 1],
+ c(as.Date("2012-12-17"), as.Date("2013-12-16"), as.Date("2014-12-22")))
+ expect_equal(collect(select(df, date_format(df$b, "y")))[, 1], c("2012", "2013", "2014"))
+ expect_equal(collect(select(df, add_months(df$b, 3)))[, 1],
+ c(as.Date("2013-03-13"), as.Date("2014-03-14"), as.Date("2015-03-15")))
+ expect_equal(collect(select(df, date_add(df$b, 1)))[, 1],
+ c(as.Date("2012-12-14"), as.Date("2013-12-15"), as.Date("2014-12-16")))
+ expect_equal(collect(select(df, date_sub(df$b, 1)))[, 1],
+ c(as.Date("2012-12-12"), as.Date("2013-12-13"), as.Date("2014-12-14")))
+
+ l2 <- list(list(a = 1L, b = as.POSIXlt("2012-12-13 12:34:00", tz = "UTC")),
+ list(a = 2L, b = as.POSIXlt("2014-12-15 01:24:34", tz = "UTC")))
+ df2 <- createDataFrame(l2)
+ expect_equal(collect(select(df2, minute(df2$b)))[, 1], c(34, 24))
+ expect_equal(collect(select(df2, second(df2$b)))[, 1], c(0, 34))
+ expect_equal(collect(select(df2, from_utc_timestamp(df2$b, "JST")))[, 1],
+ c(as.POSIXlt("2012-12-13 21:34:00 UTC"), as.POSIXlt("2014-12-15 10:24:34 UTC")))
+ expect_equal(collect(select(df2, to_utc_timestamp(df2$b, "JST")))[, 1],
+ c(as.POSIXlt("2012-12-13 03:34:00 UTC"), as.POSIXlt("2014-12-14 16:24:34 UTC")))
+ expect_gt(collect(select(df2, unix_timestamp()))[1, 1], 0)
+ expect_gt(collect(select(df2, unix_timestamp(df2$b)))[1, 1], 0)
+ expect_gt(collect(select(df2, unix_timestamp(lit("2015-01-01"), "yyyy-MM-dd")))[1, 1], 0)
+
+ l3 <- list(list(a = 1000), list(a = -1000))
+ df3 <- createDataFrame(l3)
+ result31 <- collect(select(df3, from_unixtime(df3$a)))
+ expect_equal(grep("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", result31[, 1], perl = TRUE),
+ c(1, 2))
+ result32 <- collect(select(df3, from_unixtime(df3$a, "yyyy")))
+ expect_equal(grep("\\d{4}", result32[, 1]), c(1, 2))
+ Sys.setenv(TZ = .originalTimeZone)
+})
+
+test_that("greatest() and least() on a DataFrame", {
+ l <- list(list(a = 1, b = 2), list(a = 3, b = 4))
+ df <- createDataFrame(l)
+ expect_equal(collect(select(df, greatest(df$a, df$b)))[, 1], c(2, 4))
+ expect_equal(collect(select(df, least(df$a, df$b)))[, 1], c(1, 3))
+})
+
+test_that("time windowing (window()) with all inputs", {
+ df <- createDataFrame(data.frame(t = c("2016-03-11 09:00:07"), v = c(1)))
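+ # window() arguments: windowDuration, slideDuration, startTime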
+ df$window <- window(df$t, "5 seconds", "5 seconds", "0 seconds")
+ local <- collect(df)$v
+ # Not checking time windows because of possible time zone issues. Just checking that the function
+ # works
+ expect_equal(local, c(1))
+})
+
+test_that("time windowing (window()) with slide duration", {
+ df <- createDataFrame(data.frame(t = c("2016-03-11 09:00:07"), v = c(1)))
+ df$window <- window(df$t, "5 seconds", "2 seconds")
+ local <- collect(df)$v
+ # Not checking time windows because of possible time zone issues. Just checking that the function
+ # works
+ expect_equal(local, c(1, 1))
+})
+
+test_that("time windowing (window()) with start time", {
+ df <- createDataFrame(data.frame(t = c("2016-03-11 09:00:07"), v = c(1)))
+ df$window <- window(df$t, "5 seconds", startTime = "2 seconds")
+ local <- collect(df)$v
+ # Not checking time windows because of possible time zone issues. Just checking that the function
+ # works
+ expect_equal(local, c(1))
+})
+
+test_that("time windowing (window()) with just window duration", {
+ df <- createDataFrame(data.frame(t = c("2016-03-11 09:00:07"), v = c(1)))
+ df$window <- window(df$t, "5 seconds")
+ local <- collect(df)$v
+ # Not checking time windows because of possible time zone issues. Just checking that the function
+ # works
+ expect_equal(local, c(1))
+})
+
+test_that("when(), otherwise() and ifelse() on a DataFrame", {
+ l <- list(list(a = 1, b = 2), list(a = 3, b = 4))
+ df <- createDataFrame(l)
+ expect_equal(collect(select(df, when(df$a > 1 & df$b > 2, 1)))[, 1], c(NA, 1))
+ expect_equal(collect(select(df, otherwise(when(df$a > 1, 1), 0)))[, 1], c(0, 1))
+ expect_equal(collect(select(df, ifelse(df$a > 1 & df$b > 2, 0, 1)))[, 1], c(1, 0))
+})
+
+test_that("when(), otherwise() and ifelse() with column on a DataFrame", {
+ l <- list(list(a = 1, b = 2), list(a = 3, b = 4))
+ df <- createDataFrame(l)
+ expect_equal(collect(select(df, when(df$a > 1 & df$b > 2, lit(1))))[, 1], c(NA, 1))
+ expect_equal(collect(select(df, otherwise(when(df$a > 1, lit(1)), lit(0))))[, 1], c(0, 1))
+ expect_equal(collect(select(df, ifelse(df$a > 1 & df$b > 2, lit(0), lit(1))))[, 1], c(1, 0))
+})
+
+test_that("group by, agg functions", {
+ skip_on_cran()
+
+ df <- read.json(jsonPath)
+ df1 <- agg(df, name = "max", age = "sum")
+ expect_equal(1, count(df1))
+ df1 <- agg(df, age2 = max(df$age))
+ expect_equal(1, count(df1))
+ expect_equal(columns(df1), c("age2"))
+
+ gd <- groupBy(df, "name")
+ expect_is(gd, "GroupedData")
+ df2 <- count(gd)
+ expect_is(df2, "SparkDataFrame")
+ expect_equal(3, count(df2))
+
+ # Also test group_by, summarize, mean
+ gd1 <- group_by(df, "name")
+ expect_is(gd1, "GroupedData")
+ df_summarized <- summarize(gd, mean_age = mean(df$age))
+ expect_is(df_summarized, "SparkDataFrame")
+ expect_equal(3, count(df_summarized))
+
+ df3 <- agg(gd, age = "stddev")
+ expect_is(df3, "SparkDataFrame")
+ df3_local <- collect(df3)
+ expect_true(is.nan(df3_local[df3_local$name == "Andy", ][1, 2]))
+
+ df4 <- agg(gd, sumAge = sum(df$age))
+ expect_is(df4, "SparkDataFrame")
+ expect_equal(3, count(df4))
+ expect_equal(columns(df4), c("name", "sumAge"))
+
+ df5 <- sum(gd, "age")
+ expect_is(df5, "SparkDataFrame")
+ expect_equal(3, count(df5))
+
+ expect_equal(3, count(mean(gd)))
+ expect_equal(3, count(max(gd)))
+ expect_equal(30, collect(max(gd))[2, 2])
+ expect_equal(1, collect(count(gd))[1, 2])
+
+ mockLines2 <- c("{\"name\":\"ID1\", \"value\": \"10\"}",
+ "{\"name\":\"ID1\", \"value\": \"10\"}",
+ "{\"name\":\"ID1\", \"value\": \"22\"}",
+ "{\"name\":\"ID2\", \"value\": \"-3\"}")
+ jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ writeLines(mockLines2, jsonPath2)
+ gd2 <- groupBy(read.json(jsonPath2), "name")
+ df6 <- agg(gd2, value = "sum")
+ df6_local <- collect(df6)
+ expect_equal(42, df6_local[df6_local$name == "ID1", ][1, 2])
+ expect_equal(-3, df6_local[df6_local$name == "ID2", ][1, 2])
+
+ df7 <- agg(gd2, value = "stddev")
+ df7_local <- collect(df7)
+ expect_true(abs(df7_local[df7_local$name == "ID1", ][1, 2] - 6.928203) < 1e-6)
+ expect_true(is.nan(df7_local[df7_local$name == "ID2", ][1, 2]))
+
+ mockLines3 <- c("{\"name\":\"Andy\", \"age\":30}",
+ "{\"name\":\"Andy\", \"age\":30}",
+ "{\"name\":\"Justin\", \"age\":19}",
+ "{\"name\":\"Justin\", \"age\":1}")
+ jsonPath3 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ writeLines(mockLines3, jsonPath3)
+ df8 <- read.json(jsonPath3)
+ gd3 <- groupBy(df8, "name")
+ gd3_local <- collect(sum(gd3))
+ expect_equal(60, gd3_local[gd3_local$name == "Andy", ][1, 2])
+ expect_equal(20, gd3_local[gd3_local$name == "Justin", ][1, 2])
+
+ expect_true(abs(collect(agg(df, sd(df$age)))[1, 1] - 7.778175) < 1e-6)
+ gd3_local <- collect(agg(gd3, var(df8$age)))
+ expect_equal(162, gd3_local[gd3_local$name == "Justin", ][1, 2])
+
+ # Test stats::sd, stats::var are working
+ expect_true(abs(sd(1:2) - 0.7071068) < 1e-6)
+ expect_true(abs(var(1:5, 1:5) - 2.5) < 1e-6)
+
+ # Test collect_list and collect_set
+ gd3_collections_local <- collect(
+ agg(gd3, collect_set(df8$age), collect_list(df8$age))
+ )
+
+ expect_equal(
+ unlist(gd3_collections_local[gd3_collections_local$name == "Andy", 2]),
+ c(30)
+ )
+
+ expect_equal(
+ unlist(gd3_collections_local[gd3_collections_local$name == "Andy", 3]),
+ c(30, 30)
+ )
+
+ expect_equal(
+ sort(unlist(
+ gd3_collections_local[gd3_collections_local$name == "Justin", 3]
+ )),
+ c(1, 19)
+ )
+
+ unlink(jsonPath2)
+ unlink(jsonPath3)
+})
+
+test_that("pivot GroupedData column", {
+ df <- createDataFrame(data.frame(
+ earnings = c(10000, 10000, 11000, 15000, 12000, 20000, 21000, 22000),
+ course = c("R", "Python", "R", "Python", "R", "Python", "R", "Python"),
+ year = c(2013, 2013, 2014, 2014, 2015, 2015, 2016, 2016)
+ ))
+ sum1 <- collect(sum(pivot(groupBy(df, "year"), "course"), "earnings"))
+ sum2 <- collect(sum(pivot(groupBy(df, "year"), "course", c("Python", "R")), "earnings"))
+ sum3 <- collect(sum(pivot(groupBy(df, "year"), "course", list("Python", "R")), "earnings"))
+ sum4 <- collect(sum(pivot(groupBy(df, "year"), "course", "R"), "earnings"))
+
+ correct_answer <- data.frame(
+ year = c(2013, 2014, 2015, 2016),
+ Python = c(10000, 15000, 20000, 22000),
+ R = c(10000, 11000, 12000, 21000)
+ )
+ expect_equal(sum1, correct_answer)
+ expect_equal(sum2, correct_answer)
+ expect_equal(sum3, correct_answer)
+ expect_equal(sum4, correct_answer[, c("year", "R")])
+
+ expect_error(collect(sum(pivot(groupBy(df, "year"), "course", c("R", "R")), "earnings")))
+ expect_error(collect(sum(pivot(groupBy(df, "year"), "course", list("R", "R")), "earnings")))
+})
+
+test_that("test multi-dimensional aggregations with cube and rollup", {
+ df <- createDataFrame(data.frame(
+ id = 1:6,
+ year = c(2016, 2016, 2016, 2017, 2017, 2017),
+ salary = c(10000, 15000, 20000, 22000, 32000, 21000),
+ department = c("management", "rnd", "sales", "management", "rnd", "sales")
+ ))
+
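+ # grouping_bit() is 1 when a column is aggregated away (NULL in that cube row);
+ # grouping_id() packs those bits into one integer, with the first column as the high bit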
+ actual_cube <- collect(
+ orderBy(
+ agg(
+ cube(df, "year", "department"),
+ expr("sum(salary) AS total_salary"),
+ expr("avg(salary) AS average_salary"),
+ alias(grouping_bit(df$year), "grouping_year"),
+ alias(grouping_bit(df$department), "grouping_department"),
+ alias(grouping_id(df$year, df$department), "grouping_id")
+ ),
+ "year", "department"
+ )
+ )
+
+ expected_cube <- data.frame(
+ year = c(rep(NA, 4), rep(2016, 4), rep(2017, 4)),
+ department = rep(c(NA, "management", "rnd", "sales"), times = 3),
+ total_salary = c(
+ 120000, # Total
+ 10000 + 22000, 15000 + 32000, 20000 + 21000, # Department only
+ 20000 + 15000 + 10000, # 2016
+ 10000, 15000, 20000, # 2016 each department
+ 21000 + 32000 + 22000, # 2017
+ 22000, 32000, 21000 # 2017 each department
+ ),
+ average_salary = c(
+ # Total
+ mean(c(20000, 15000, 10000, 21000, 32000, 22000)),
+ # Mean by department
+ mean(c(10000, 22000)), mean(c(15000, 32000)), mean(c(20000, 21000)),
+ mean(c(10000, 15000, 20000)), # 2016
+ 10000, 15000, 20000, # 2016 each department
+ mean(c(21000, 32000, 22000)), # 2017
+ 22000, 32000, 21000 # 2017 each department
+ ),
+ grouping_year = c(
+ 1, # global
+ 1, 1, 1, # by department
+ 0, # 2016
+ 0, 0, 0, # 2016 by department
+ 0, # 2017
+ 0, 0, 0 # 2017 by department
+ ),
+ grouping_department = c(
+ 1, # global
+ 0, 0, 0, # by department
+ 1, # 2016
+ 0, 0, 0, # 2016 by department
+ 1, # 2017
+ 0, 0, 0 # 2017 by department
+ ),
+ grouping_id = c(
+ 3, # 11
+ 2, 2, 2, # 10
+ 1, # 01
+ 0, 0, 0, # 00
+ 1, # 01
+ 0, 0, 0 # 00
+ ),
+ stringsAsFactors = FALSE
+ )
+
+ expect_equal(actual_cube, expected_cube)
+
+ # cube should accept column objects
+ expect_equal(
+ count(sum(cube(df, df$year, df$department), "salary")),
+ 12
+ )
+
+ # cube without columns should result in a single aggregate
+ expect_equal(
+ collect(agg(cube(df), expr("sum(salary) as total_salary"))),
+ data.frame(total_salary = 120000)
+ )
+
+ actual_rollup <- collect(
+ orderBy(
+ agg(
+ rollup(df, "year", "department"),
+ expr("sum(salary) AS total_salary"), expr("avg(salary) AS average_salary"),
+ alias(grouping_bit(df$year), "grouping_year"),
+ alias(grouping_bit(df$department), "grouping_department"),
+ alias(grouping_id(df$year, df$department), "grouping_id")
+ ),
+ "year", "department"
+ )
+ )
+
+ expected_rollup <- data.frame(
+ year = c(NA, rep(2016, 4), rep(2017, 4)),
+ department = c(NA, rep(c(NA, "management", "rnd", "sales"), times = 2)),
+ total_salary = c(
+ 120000, # Total
+ 20000 + 15000 + 10000, # 2016
+ 10000, 15000, 20000, # 2016 each department
+ 21000 + 32000 + 22000, # 2017
+ 22000, 32000, 21000 # 2017 each department
+ ),
+ average_salary = c(
+ # Total
+ mean(c(20000, 15000, 10000, 21000, 32000, 22000)),
+ mean(c(10000, 15000, 20000)), # 2016
+ 10000, 15000, 20000, # 2016 each department
+ mean(c(21000, 32000, 22000)), # 2017
+ 22000, 32000, 21000 # 2017 each department
+ ),
+ grouping_year = c(
+ 1, # global
+ 0, # 2016
+ 0, 0, 0, # 2016 each department
+ 0, # 2017
+ 0, 0, 0 # 2017 each department
+ ),
+ grouping_department = c(
+ 1, # global
+ 1, # 2016
+ 0, 0, 0, # 2016 each department
+ 1, # 2017
+ 0, 0, 0 # 2017 each department
+ ),
+ grouping_id = c(
+ 3, # 11
+ 1, # 01
+ 0, 0, 0, # 00
+ 1, # 01
+ 0, 0, 0 # 00
+ ),
+ stringsAsFactors = FALSE
+ )
+
+ expect_equal(actual_rollup, expected_rollup)
+
+ # rollup should accept column objects
+ expect_equal(
+ count(sum(rollup(df, df$year, df$department), "salary")),
+ 9
+ )
+
+ # rollup without columns should result in a single aggregate
+ expect_equal(
+ collect(agg(rollup(df), expr("sum(salary) as total_salary"))),
+ data.frame(total_salary = 120000)
+ )
+})
+
+test_that("arrange() and orderBy() on a DataFrame", {
+ df <- read.json(jsonPath)
+ sorted <- arrange(df, df$age)
+ expect_equal(collect(sorted)[1, 2], "Michael")
+
+ sorted2 <- arrange(df, "name", decreasing = FALSE)
+ expect_equal(collect(sorted2)[2, "age"], 19)
+
+ sorted3 <- orderBy(df, asc(df$age))
+ expect_true(is.na(first(sorted3)$age))
+ expect_equal(collect(sorted3)[2, "age"], 19)
+
+ sorted4 <- orderBy(df, desc(df$name))
+ expect_equal(first(sorted4)$name, "Michael")
+ expect_equal(collect(sorted4)[3, "name"], "Andy")
+
+ sorted5 <- arrange(df, "age", "name", decreasing = TRUE)
+ expect_equal(collect(sorted5)[1, 2], "Andy")
+
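+ # decreasing may also be a logical vector giving a per-column sort direction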
+ sorted6 <- arrange(df, "age", "name", decreasing = c(T, F))
+ expect_equal(collect(sorted6)[1, 2], "Andy")
+
+ sorted7 <- arrange(df, "name", decreasing = FALSE)
+ expect_equal(collect(sorted7)[2, "age"], 19)
+})
+
+test_that("filter() on a DataFrame", {
+ df <- read.json(jsonPath)
+ filtered <- filter(df, "age > 20")
+ expect_equal(count(filtered), 1)
+ expect_equal(collect(filtered)$name, "Andy")
+ filtered2 <- where(df, df$name != "Michael")
+ expect_equal(count(filtered2), 2)
+ expect_equal(collect(filtered2)$age[2], 19)
+
+ # test suites for %in%
+ filtered3 <- filter(df, "age in (19)")
+ expect_equal(count(filtered3), 1)
+ filtered4 <- filter(df, "age in (19, 30)")
+ expect_equal(count(filtered4), 2)
+ filtered5 <- where(df, df$age %in% c(19))
+ expect_equal(count(filtered5), 1)
+ filtered6 <- where(df, df$age %in% c(19, 30))
+ expect_equal(count(filtered6), 2)
+
+ # test suites for %<=>%
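+ # %<=>% is the null-safe equality operator: NULL %<=>% NULL evaluates to TRUE instead of NA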
+ dfNa <- read.json(jsonPathNa)
+ expect_equal(count(filter(dfNa, dfNa$age %<=>% 60)), 1)
+ expect_equal(count(filter(dfNa, !(dfNa$age %<=>% 60))), 5 - 1)
+ expect_equal(count(filter(dfNa, dfNa$age %<=>% NULL)), 3)
+ expect_equal(count(filter(dfNa, !(dfNa$age %<=>% NULL))), 5 - 3)
+ # match NA from two columns
+ expect_equal(count(filter(dfNa, dfNa$age %<=>% dfNa$height)), 2)
+ expect_equal(count(filter(dfNa, !(dfNa$age %<=>% dfNa$height))), 5 - 2)
+
+ # Test stats::filter is working
+ #expect_true(is.ts(filter(1:100, rep(1, 3)))) # nolint
+})
+
+test_that("join(), crossJoin() and merge() on a DataFrame", {
+ skip_on_cran()
+
+ df <- read.json(jsonPath)
+
+ mockLines2 <- c("{\"name\":\"Michael\", \"test\": \"yes\"}",
+ "{\"name\":\"Andy\", \"test\": \"no\"}",
+ "{\"name\":\"Justin\", \"test\": \"yes\"}",
+ "{\"name\":\"Bob\", \"test\": \"yes\"}")
+ jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ writeLines(mockLines2, jsonPath2)
+ df2 <- read.json(jsonPath2)
+
+ # inner join, not cartesian join
+ expect_equal(count(where(join(df, df2), df$name == df2$name)), 3)
+ # cartesian join
+ expect_error(tryCatch(count(join(df, df2)), error = function(e) { stop(e) }),
+ paste0(".*(org.apache.spark.sql.AnalysisException: Detected cartesian product for",
+ " INNER join between logical plans).*"))
+
+ joined <- crossJoin(df, df2)
+ expect_equal(names(joined), c("age", "name", "name", "test"))
+ expect_equal(count(joined), 12)
+ expect_equal(names(collect(joined)), c("age", "name", "name", "test"))
+
+ joined2 <- join(df, df2, df$name == df2$name)
+ expect_equal(names(joined2), c("age", "name", "name", "test"))
+ expect_equal(count(joined2), 3)
+
+ joined3 <- join(df, df2, df$name == df2$name, "rightouter")
+ expect_equal(names(joined3), c("age", "name", "name", "test"))
+ expect_equal(count(joined3), 4)
+ expect_true(is.na(collect(orderBy(joined3, joined3$age))$age[2]))
+
+ joined4 <- select(join(df, df2, df$name == df2$name, "outer"),
+ alias(df$age + 5, "newAge"), df$name, df2$test)
+ expect_equal(names(joined4), c("newAge", "name", "test"))
+ expect_equal(count(joined4), 4)
+ expect_equal(collect(orderBy(joined4, joined4$name))$newAge[3], 24)
+
+ joined5 <- join(df, df2, df$name == df2$name, "leftouter")
+ expect_equal(names(joined5), c("age", "name", "name", "test"))
+ expect_equal(count(joined5), 3)
+ expect_true(is.na(collect(orderBy(joined5, joined5$age))$age[1]))
+
+ joined6 <- join(df, df2, df$name == df2$name, "inner")
+ expect_equal(names(joined6), c("age", "name", "name", "test"))
+ expect_equal(count(joined6), 3)
+
+ joined7 <- join(df, df2, df$name == df2$name, "leftsemi")
+ expect_equal(names(joined7), c("age", "name"))
+ expect_equal(count(joined7), 3)
+
+ joined8 <- join(df, df2, df$name == df2$name, "left_outer")
+ expect_equal(names(joined8), c("age", "name", "name", "test"))
+ expect_equal(count(joined8), 3)
+ expect_true(is.na(collect(orderBy(joined8, joined8$age))$age[1]))
+
+ joined9 <- join(df, df2, df$name == df2$name, "right_outer")
+ expect_equal(names(joined9), c("age", "name", "name", "test"))
+ expect_equal(count(joined9), 4)
+ expect_true(is.na(collect(orderBy(joined9, joined9$age))$age[2]))
+
+ merged <- merge(df, df2, by.x = "name", by.y = "name", all.x = TRUE, all.y = TRUE)
+ expect_equal(count(merged), 4)
+ expect_equal(names(merged), c("age", "name_x", "name_y", "test"))
+ expect_equal(collect(orderBy(merged, merged$name_x))$age[3], 19)
+
+ merged <- merge(df, df2, suffixes = c("-X", "-Y"))
+ expect_equal(count(merged), 3)
+ expect_equal(names(merged), c("age", "name-X", "name-Y", "test"))
+ expect_equal(collect(orderBy(merged, merged$"name-X"))$age[1], 30)
+
+ merged <- merge(df, df2, by = "name", suffixes = c("-X", "-Y"), sort = FALSE)
+ expect_equal(count(merged), 3)
+ expect_equal(names(merged), c("age", "name-X", "name-Y", "test"))
+ expect_equal(collect(orderBy(merged, merged$"name-Y"))$"name-X"[3], "Michael")
+
+ merged <- merge(df, df2, by = "name", all = T, sort = T)
+ expect_equal(count(merged), 4)
+ expect_equal(names(merged), c("age", "name_x", "name_y", "test"))
+ expect_equal(collect(orderBy(merged, merged$"name_y"))$"name_x"[1], "Andy")
+
+ merged <- merge(df, df2, by = NULL)
+ expect_equal(count(merged), 12)
+ expect_equal(names(merged), c("age", "name", "name", "test"))
+
+ mockLines3 <- c("{\"name\":\"Michael\", \"name_y\":\"Michael\", \"test\": \"yes\"}",
+ "{\"name\":\"Andy\", \"name_y\":\"Andy\", \"test\": \"no\"}",
+ "{\"name\":\"Justin\", \"name_y\":\"Justin\", \"test\": \"yes\"}",
+ "{\"name\":\"Bob\", \"name_y\":\"Bob\", \"test\": \"yes\"}")
+ jsonPath3 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ writeLines(mockLines3, jsonPath3)
+ df3 <- read.json(jsonPath3)
+ expect_error(merge(df, df3),
+ paste("The following column name: name_y occurs more than once in the 'DataFrame'.",
+ "Please use different suffixes for the intersected columns.", sep = ""))
+
+ unlink(jsonPath2)
+ unlink(jsonPath3)
+
+ # Join with broadcast hint
+ df1 <- sql("SELECT * FROM range(10e10)")
+ df2 <- sql("SELECT * FROM range(10e10)")
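+ # both inputs are far larger than spark.sql.autoBroadcastJoinThreshold (10MB by default),
+ # so the planner should not pick a broadcast join unless hinted below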
+
+ execution_plan <- capture.output(explain(join(df1, df2, df1$id == df2$id)))
+ expect_false(any(grepl("BroadcastHashJoin", execution_plan)))
+
+ execution_plan_hint <- capture.output(
+ explain(join(df1, hint(df2, "broadcast"), df1$id == df2$id))
+ )
+ expect_true(any(grepl("BroadcastHashJoin", execution_plan_hint)))
+
+ execution_plan_broadcast <- capture.output(
+ explain(join(df1, broadcast(df2), df1$id == df2$id))
+ )
+ expect_true(any(grepl("BroadcastHashJoin", execution_plan_broadcast)))
+})
+
+test_that("toJSON() on DataFrame", {
+ df <- as.DataFrame(cars)
+ df_json <- toJSON(df)
+ expect_is(df_json, "SparkDataFrame")
+ expect_equal(colnames(df_json), c("value"))
+ expect_equal(head(df_json, 1),
+ data.frame(value = "{\"speed\":4.0,\"dist\":2.0}", stringsAsFactors = FALSE))
+})
+
+test_that("showDF()", {
+ df <- read.json(jsonPath)
+ expected <- paste("+----+-------+\n",
+ "| age| name|\n",
+ "+----+-------+\n",
+ "|null|Michael|\n",
+ "| 30| Andy|\n",
+ "| 19| Justin|\n",
+ "+----+-------+\n", sep = "")
+ expected2 <- paste("+---+----+\n",
+ "|age|name|\n",
+ "+---+----+\n",
+ "|nul| Mic|\n",
+ "| 30| And|\n",
+ "| 19| Jus|\n",
+ "+---+----+\n", sep = "")
+ expect_output(showDF(df), expected)
+ expect_output(showDF(df, truncate = 3), expected2)
+})
+
+test_that("isLocal()", {
+ df <- read.json(jsonPath)
+ expect_false(isLocal(df))
+})
+
+test_that("union(), rbind(), except(), and intersect() on a DataFrame", {
+ df <- read.json(jsonPath)
+
+ lines <- c("{\"name\":\"Bob\", \"age\":24}",
+ "{\"name\":\"Andy\", \"age\":30}",
+ "{\"name\":\"James\", \"age\":35}")
+ jsonPath2 <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
+ writeLines(lines, jsonPath2)
+ df2 <- read.df(jsonPath2, "json")
+
+ unioned <- arrange(union(df, df2), df$age)
+ expect_is(unioned, "SparkDataFrame")
+ expect_equal(count(unioned), 6)
+ expect_equal(first(unioned)$name, "Michael")
+ expect_equal(count(arrange(suppressWarnings(unionAll(df, df2)), df$age)), 6)
+
+ unioned2 <- arrange(rbind(unioned, df, df2), df$age)
+ expect_is(unioned2, "SparkDataFrame")
+ expect_equal(count(unioned2), 12)
+ expect_equal(first(unioned2)$name, "Michael")
+
+ df3 <- df2
+ names(df3)[1] <- "newName"
+ expect_error(rbind(df, df3),
+ "Names of input data frames are different.")
+ expect_error(rbind(df, df2, df3),
+ "Names of input data frames are different.")
+
+ excepted <- arrange(except(df, df2), desc(df$age))
+ expect_is(unioned, "SparkDataFrame")
+ expect_equal(count(excepted), 2)
+ expect_equal(first(excepted)$name, "Justin")
+
+ intersected <- arrange(intersect(df, df2), df$age)
+ expect_is(unioned, "SparkDataFrame")
+ expect_equal(count(intersected), 1)
+ expect_equal(first(intersected)$name, "Andy")
+
+ # Test base::union is working
+ expect_equal(union(c(1:3), c(3:5)), c(1:5))
+
+ # Test base::rbind is working
+ expect_equal(length(rbind(1:4, c = 2, a = 10, 10, deparse.level = 0)), 16)
+
+ # Test base::intersect is working
+ expect_equal(length(intersect(1:20, 3:23)), 18)
+
+ unlink(jsonPath2)
+})
+
+test_that("withColumn() and withColumnRenamed()", {
+ df <- read.json(jsonPath)
+ newDF <- withColumn(df, "newAge", df$age + 2)
+ expect_equal(length(columns(newDF)), 3)
+ expect_equal(columns(newDF)[3], "newAge")
+ expect_equal(first(filter(newDF, df$name != "Michael"))$newAge, 32)
+
+ # Replace existing column
+ newDF <- withColumn(df, "age", df$age + 2)
+ expect_equal(length(columns(newDF)), 2)
+ expect_equal(first(filter(newDF, df$name != "Michael"))$age, 32)
+
+ newDF <- withColumn(df, "age", 18)
+ expect_equal(length(columns(newDF)), 2)
+ expect_equal(first(newDF)$age, 18)
+
+ expect_error(withColumn(df, "age", list("a")),
+ "Literal value must be atomic in length of 1")
+
+ newDF2 <- withColumnRenamed(df, "age", "newerAge")
+ expect_equal(length(columns(newDF2)), 2)
+ expect_equal(columns(newDF2)[1], "newerAge")
+})
+
+test_that("mutate(), transform(), rename() and names()", {
+ df <- read.json(jsonPath)
+ newDF <- mutate(df, newAge = df$age + 2)
+ expect_equal(length(columns(newDF)), 3)
+ expect_equal(columns(newDF)[3], "newAge")
+ expect_equal(first(filter(newDF, df$name != "Michael"))$newAge, 32)
+
+ newDF <- mutate(df, age = df$age
<TRUNCATED>
[3/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic
tests only for CRAN
Posted by fe...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_mllib_fpm.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_mllib_fpm.R b/R/pkg/tests/fulltests/test_mllib_fpm.R
new file mode 100644
index 0000000..4e10ca1
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_mllib_fpm.R
@@ -0,0 +1,85 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("MLlib frequent pattern mining")
+
+# Tests for MLlib frequent pattern mining algorithms in SparkR
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+test_that("spark.fpGrowth", {
+ data <- selectExpr(createDataFrame(data.frame(items = c(
+ "1,2",
+ "1,2",
+ "1,2,3",
+ "1,3"
+ ))), "split(items, ',') as items")
+
+ model <- spark.fpGrowth(data, minSupport = 0.3, minConfidence = 0.8, numPartitions = 1)
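+ # with 4 transactions, minSupport = 0.3 keeps itemsets that occur at least twice;
+ # minConfidence = 0.8 filters the association rules mined from those itemsets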
+
+ itemsets <- collect(spark.freqItemsets(model))
+
+ expected_itemsets <- data.frame(
+ items = I(list(list("3"), list("3", "1"), list("2"), list("2", "1"), list("1"))),
+ freq = c(2, 2, 3, 3, 4)
+ )
+
+ expect_equivalent(expected_itemsets, itemsets)
+
+ expected_association_rules <- data.frame(
+ antecedent = I(list(list("2"), list("3"))),
+ consequent = I(list(list("1"), list("1"))),
+ confidence = c(1, 1)
+ )
+
+ expect_equivalent(expected_association_rules, collect(spark.associationRules(model)))
+
+ new_data <- selectExpr(createDataFrame(data.frame(items = c(
+ "1,2",
+ "1,3",
+ "2,3"
+ ))), "split(items, ',') as items")
+
+ expected_predictions <- data.frame(
+ items = I(list(list("1", "2"), list("1", "3"), list("2", "3"))),
+ prediction = I(list(list(), list(), list("1")))
+ )
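+ # predict() applies the mined rules to each basket: {1,2} and {1,3} already contain the only
+ # consequent "1", so nothing is predicted, while {2,3} matches 2 => 1 and 3 => 1 and yields "1"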
+
+ expect_equivalent(expected_predictions, collect(predict(model, new_data)))
+
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-fpm", fileext = ".tmp")
+ write.ml(model, modelPath, overwrite = TRUE)
+ loaded_model <- read.ml(modelPath)
+
+ expect_equivalent(
+ itemsets,
+ collect(spark.freqItemsets(loaded_model)))
+
+ unlink(modelPath)
+ }
+
+ model_without_numpartitions <- spark.fpGrowth(data, minSupport = 0.3, minConfidence = 0.8)
+ expect_equal(
+ count(spark.freqItemsets(model_without_numpartitions)),
+ count(spark.freqItemsets(model))
+ )
+
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_mllib_recommendation.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_mllib_recommendation.R b/R/pkg/tests/fulltests/test_mllib_recommendation.R
new file mode 100644
index 0000000..cc8064f
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_mllib_recommendation.R
@@ -0,0 +1,67 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("MLlib recommendation algorithms")
+
+# Tests for MLlib recommendation algorithms in SparkR
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+test_that("spark.als", {
+ data <- list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0), list(1, 2, 4.0),
+ list(2, 1, 1.0), list(2, 2, 5.0))
+ df <- createDataFrame(data, c("user", "item", "score"))
+ model <- spark.als(df, ratingCol = "score", userCol = "user", itemCol = "item",
+ rank = 10, maxIter = 5, seed = 0, regParam = 0.1)
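+ # summary() should echo the latent factor dimension requested via rank = 10 above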
+ stats <- summary(model)
+ expect_equal(stats$rank, 10)
+ test <- createDataFrame(list(list(0, 2), list(1, 0), list(2, 0)), c("user", "item"))
+ predictions <- collect(predict(model, test))
+
+ expect_equal(predictions$prediction, c(-0.1380762, 2.6258414, -1.5018409),
+ tolerance = 1e-4)
+
+ # Test model save/load
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-als", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ stats2 <- summary(model2)
+ expect_equal(stats2$rating, "score")
+ userFactors <- collect(stats$userFactors)
+ itemFactors <- collect(stats$itemFactors)
+ userFactors2 <- collect(stats2$userFactors)
+ itemFactors2 <- collect(stats2$itemFactors)
+
+ orderUser <- order(userFactors$id)
+ orderUser2 <- order(userFactors2$id)
+ expect_equal(userFactors$id[orderUser], userFactors2$id[orderUser2])
+ expect_equal(userFactors$features[orderUser], userFactors2$features[orderUser2])
+
+ orderItem <- order(itemFactors$id)
+ orderItem2 <- order(itemFactors2$id)
+ expect_equal(itemFactors$id[orderItem], itemFactors2$id[orderItem2])
+ expect_equal(itemFactors$features[orderItem], itemFactors2$features[orderItem2])
+
+ unlink(modelPath)
+ }
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_mllib_regression.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_mllib_regression.R b/R/pkg/tests/fulltests/test_mllib_regression.R
new file mode 100644
index 0000000..b05fdd3
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_mllib_regression.R
@@ -0,0 +1,480 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("MLlib regression algorithms, except for tree-based algorithms")
+
+# Tests for MLlib regression algorithms in SparkR
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+test_that("formula of spark.glm", {
+ skip_on_cran()
+
+ training <- suppressWarnings(createDataFrame(iris))
+ # directly calling the spark API
+ # dot minus and intercept vs native glm
+ model <- spark.glm(training, Sepal_Width ~ . - Species + 0)
+ vals <- collect(select(predict(model, training), "prediction"))
+ rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
+ expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+
+ # feature interaction vs native glm
+ model <- spark.glm(training, Sepal_Width ~ Species:Sepal_Length)
+ vals <- collect(select(predict(model, training), "prediction"))
+ rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
+ expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+
+ # glm should work with long formula
+ training <- suppressWarnings(createDataFrame(iris))
+ training$LongLongLongLongLongName <- training$Sepal_Width
+ training$VeryLongLongLongLonLongName <- training$Sepal_Length
+ training$AnotherLongLongLongLongName <- training$Species
+ model <- spark.glm(training, LongLongLongLongLongName ~ VeryLongLongLongLonLongName +
+ AnotherLongLongLongLongName)
+ vals <- collect(select(predict(model, training), "prediction"))
+ rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
+ expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+})
+
+test_that("spark.glm and predict", {
+ training <- suppressWarnings(createDataFrame(iris))
+ # gaussian family
+ model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species)
+ prediction <- predict(model, training)
+ expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+ vals <- collect(select(prediction, "prediction"))
+ rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
+ expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+
+ # poisson family
+ model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
+ family = poisson(link = identity))
+ prediction <- predict(model, training)
+ expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+ vals <- collect(select(prediction, "prediction"))
+ rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species,
+ data = iris, family = poisson(link = identity)), iris))
+ expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+
+ # Gamma family
+ x <- runif(100, -1, 1)
+ y <- rgamma(100, rate = 10 / exp(0.5 + 1.2 * x), shape = 10)
+ df <- as.DataFrame(as.data.frame(list(x = x, y = y)))
+ model <- glm(y ~ x, family = Gamma, df)
+ out <- capture.output(print(summary(model)))
+ expect_true(any(grepl("Dispersion parameter for gamma family", out)))
+
+ # tweedie family
+ model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
+ family = "tweedie", var.power = 1.2, link.power = 0.0)
+ prediction <- predict(model, training)
+ expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+ vals <- collect(select(prediction, "prediction"))
+
+ # manual calculation of the R predicted values to avoid dependence on statmod
+ #' library(statmod)
+ #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
+ #' family = tweedie(var.power = 1.2, link.power = 0.0))
+ #' print(coef(rModel))
+
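+ # with link.power = 0.0 the tweedie link is log, so the reference predictions are
+ # exp(model matrix %*% rCoef) as computed below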
+ rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174)
+ rVals <- exp(as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species,
+ data = iris) %*% rCoef))
+ expect_true(all(abs(rVals - vals) < 1e-5), rVals - vals)
+
+ # Test stats::predict is working
+ x <- rnorm(15)
+ y <- x + rnorm(15)
+ expect_equal(length(predict(lm(y ~ x))), 15)
+})
+
+test_that("spark.glm summary", {
+ # gaussian family
+ training <- suppressWarnings(createDataFrame(iris))
+ stats <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + Species))
+ rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
+
+ # test summary coefficients return matrix type
+ expect_true(is.matrix(stats$coefficients))
+ expect_true(is.numeric(stats$coefficients[, 1]))
+
+ coefs <- stats$coefficients
+ rCoefs <- rStats$coefficients
+ expect_true(all(abs(rCoefs - coefs) < 1e-4))
+ expect_true(all(
+ rownames(stats$coefficients) ==
+ c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
+ expect_equal(stats$dispersion, rStats$dispersion)
+ expect_equal(stats$null.deviance, rStats$null.deviance)
+ expect_equal(stats$deviance, rStats$deviance)
+ expect_equal(stats$df.null, rStats$df.null)
+ expect_equal(stats$df.residual, rStats$df.residual)
+ expect_equal(stats$aic, rStats$aic)
+
+ out <- capture.output(print(stats))
+ expect_match(out[2], "Deviance Residuals:")
+ expect_true(any(grepl("AIC: 59.22", out)))
+
+ # binomial family
+ df <- suppressWarnings(createDataFrame(iris))
+ training <- df[df$Species %in% c("versicolor", "virginica"), ]
+ stats <- summary(spark.glm(training, Species ~ Sepal_Length + Sepal_Width,
+ family = binomial(link = "logit")))
+
+ rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
+ rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
+ family = binomial(link = "logit")))
+
+ coefs <- stats$coefficients
+ rCoefs <- rStats$coefficients
+ expect_true(all(abs(rCoefs - coefs) < 1e-4))
+ expect_true(all(
+ rownames(stats$coefficients) ==
+ c("(Intercept)", "Sepal_Length", "Sepal_Width")))
+ expect_equal(stats$dispersion, rStats$dispersion)
+ expect_equal(stats$null.deviance, rStats$null.deviance)
+ expect_equal(stats$deviance, rStats$deviance)
+ expect_equal(stats$df.null, rStats$df.null)
+ expect_equal(stats$df.residual, rStats$df.residual)
+ expect_equal(stats$aic, rStats$aic)
+
+ # Test spark.glm works with weighted dataset
+ a1 <- c(0, 1, 2, 3)
+ a2 <- c(5, 2, 1, 3)
+ w <- c(1, 2, 3, 4)
+ b <- c(1, 0, 1, 0)
+ data <- as.data.frame(cbind(a1, a2, w, b))
+ df <- createDataFrame(data)
+
+ stats <- summary(spark.glm(df, b ~ a1 + a2, family = "binomial", weightCol = "w"))
+ rStats <- summary(glm(b ~ a1 + a2, family = "binomial", data = data, weights = w))
+
+ coefs <- stats$coefficients
+ rCoefs <- rStats$coefficients
+ expect_true(all(abs(rCoefs - coefs) < 1e-3))
+ expect_true(all(rownames(stats$coefficients) == c("(Intercept)", "a1", "a2")))
+ expect_equal(stats$dispersion, rStats$dispersion)
+ expect_equal(stats$null.deviance, rStats$null.deviance)
+ expect_equal(stats$deviance, rStats$deviance)
+ expect_equal(stats$df.null, rStats$df.null)
+ expect_equal(stats$df.residual, rStats$df.residual)
+ expect_equal(stats$aic, rStats$aic)
+
+ # Test summary works on base GLM models
+ baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
+ baseSummary <- summary(baseModel)
+ expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
+
+ # Test spark.glm works with regularization parameter
+ data <- as.data.frame(cbind(a1, a2, b))
+ df <- suppressWarnings(createDataFrame(data))
+ regStats <- summary(spark.glm(df, b ~ a1 + a2, regParam = 1.0))
+ expect_equal(regStats$aic, 13.32836, tolerance = 1e-4) # 13.32836 is from summary() result
+
+ # Test spark.glm works on collinear data
+ A <- matrix(c(1, 2, 3, 4, 2, 4, 6, 8), 4, 2)
+ b <- c(1, 2, 3, 4)
+ data <- as.data.frame(cbind(A, b))
+ df <- createDataFrame(data)
+ stats <- summary(spark.glm(df, b ~ . - 1))
+ coefs <- stats$coefficients
+ expect_true(all(abs(c(0.5, 0.25) - coefs) < 1e-4))
+})
+
+test_that("spark.glm save/load", {
+ skip_on_cran()
+
+ training <- suppressWarnings(createDataFrame(iris))
+ m <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species)
+ s <- summary(m)
+
+ modelPath <- tempfile(pattern = "spark-glm", fileext = ".tmp")
+ write.ml(m, modelPath)
+ expect_error(write.ml(m, modelPath))
+ write.ml(m, modelPath, overwrite = TRUE)
+ m2 <- read.ml(modelPath)
+ s2 <- summary(m2)
+
+ expect_equal(s$coefficients, s2$coefficients)
+ expect_equal(rownames(s$coefficients), rownames(s2$coefficients))
+ expect_equal(s$dispersion, s2$dispersion)
+ expect_equal(s$null.deviance, s2$null.deviance)
+ expect_equal(s$deviance, s2$deviance)
+ expect_equal(s$df.null, s2$df.null)
+ expect_equal(s$df.residual, s2$df.residual)
+ expect_equal(s$aic, s2$aic)
+ expect_equal(s$iter, s2$iter)
+ expect_true(!s$is.loaded)
+ expect_true(s2$is.loaded)
+
+ unlink(modelPath)
+})
+
+test_that("formula of glm", {
+ skip_on_cran()
+
+ training <- suppressWarnings(createDataFrame(iris))
+ # dot minus and intercept vs native glm
+ model <- glm(Sepal_Width ~ . - Species + 0, data = training)
+ vals <- collect(select(predict(model, training), "prediction"))
+ rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
+ expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+
+ # feature interaction vs native glm
+ model <- glm(Sepal_Width ~ Species:Sepal_Length, data = training)
+ vals <- collect(select(predict(model, training), "prediction"))
+ rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
+ expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+
+ # glm should work with long formula
+ training <- suppressWarnings(createDataFrame(iris))
+ training$LongLongLongLongLongName <- training$Sepal_Width
+ training$VeryLongLongLongLonLongName <- training$Sepal_Length
+ training$AnotherLongLongLongLongName <- training$Species
+ model <- glm(LongLongLongLongLongName ~ VeryLongLongLongLonLongName + AnotherLongLongLongLongName,
+ data = training)
+ vals <- collect(select(predict(model, training), "prediction"))
+ rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
+ expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+})
+
+test_that("glm and predict", {
+ skip_on_cran()
+
+ training <- suppressWarnings(createDataFrame(iris))
+ # gaussian family
+ model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training)
+ prediction <- predict(model, training)
+ expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+ vals <- collect(select(prediction, "prediction"))
+ rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
+ expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+
+ # poisson family
+ model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training,
+ family = poisson(link = identity))
+ prediction <- predict(model, training)
+ expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+ vals <- collect(select(prediction, "prediction"))
+ rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species,
+ data = iris, family = poisson(link = identity)), iris))
+ expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
+
+ # tweedie family
+ model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training,
+ family = "tweedie", var.power = 1.2, link.power = 0.0)
+ prediction <- predict(model, training)
+ expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
+ vals <- collect(select(prediction, "prediction"))
+
+ # manual calculation of the R predicted values to avoid dependence on statmod
+ #' library(statmod)
+ #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris,
+ #' family = tweedie(var.power = 1.2, link.power = 0.0))
+ #' print(coef(rModel))
+
+ rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174)
+ rVals <- exp(as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species,
+ data = iris) %*% rCoef))
+ expect_true(all(abs(rVals - vals) < 1e-5), rVals - vals)
+
+ # Test stats::predict is working
+ x <- rnorm(15)
+ y <- x + rnorm(15)
+ expect_equal(length(predict(lm(y ~ x))), 15)
+})
+
+test_that("glm summary", {
+ skip_on_cran()
+
+ # gaussian family
+ training <- suppressWarnings(createDataFrame(iris))
+ stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training))
+
+ rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
+
+ coefs <- stats$coefficients
+ rCoefs <- rStats$coefficients
+ expect_true(all(abs(rCoefs - coefs) < 1e-4))
+ expect_true(all(
+ rownames(stats$coefficients) ==
+ c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
+ expect_equal(stats$dispersion, rStats$dispersion)
+ expect_equal(stats$null.deviance, rStats$null.deviance)
+ expect_equal(stats$deviance, rStats$deviance)
+ expect_equal(stats$df.null, rStats$df.null)
+ expect_equal(stats$df.residual, rStats$df.residual)
+ expect_equal(stats$aic, rStats$aic)
+
+ # binomial family
+ df <- suppressWarnings(createDataFrame(iris))
+ training <- df[df$Species %in% c("versicolor", "virginica"), ]
+ stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training,
+ family = binomial(link = "logit")))
+
+ rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
+ rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
+ family = binomial(link = "logit")))
+
+ coefs <- stats$coefficients
+ rCoefs <- rStats$coefficients
+ expect_true(all(abs(rCoefs - coefs) < 1e-4))
+ expect_true(all(
+ rownames(stats$coefficients) ==
+ c("(Intercept)", "Sepal_Length", "Sepal_Width")))
+ expect_equal(stats$dispersion, rStats$dispersion)
+ expect_equal(stats$null.deviance, rStats$null.deviance)
+ expect_equal(stats$deviance, rStats$deviance)
+ expect_equal(stats$df.null, rStats$df.null)
+ expect_equal(stats$df.residual, rStats$df.residual)
+ expect_equal(stats$aic, rStats$aic)
+
+ # Test summary works on base GLM models
+ baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
+ baseSummary <- summary(baseModel)
+ expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
+})
+
+test_that("glm save/load", {
+ skip_on_cran()
+
+ training <- suppressWarnings(createDataFrame(iris))
+ m <- glm(Sepal_Width ~ Sepal_Length + Species, data = training)
+ s <- summary(m)
+
+ modelPath <- tempfile(pattern = "glm", fileext = ".tmp")
+ write.ml(m, modelPath)
+ expect_error(write.ml(m, modelPath))
+ write.ml(m, modelPath, overwrite = TRUE)
+ m2 <- read.ml(modelPath)
+ s2 <- summary(m2)
+
+ expect_equal(s$coefficients, s2$coefficients)
+ expect_equal(rownames(s$coefficients), rownames(s2$coefficients))
+ expect_equal(s$dispersion, s2$dispersion)
+ expect_equal(s$null.deviance, s2$null.deviance)
+ expect_equal(s$deviance, s2$deviance)
+ expect_equal(s$df.null, s2$df.null)
+ expect_equal(s$df.residual, s2$df.residual)
+ expect_equal(s$aic, s2$aic)
+ expect_equal(s$iter, s2$iter)
+ expect_true(!s$is.loaded)
+ expect_true(s2$is.loaded)
+
+ unlink(modelPath)
+})
+
+test_that("spark.isoreg", {
+ label <- c(7.0, 5.0, 3.0, 5.0, 1.0)
+ feature <- c(0.0, 1.0, 2.0, 3.0, 4.0)
+ weight <- c(1.0, 1.0, 1.0, 1.0, 1.0)
+ data <- as.data.frame(cbind(label, feature, weight))
+ df <- createDataFrame(data)
+
+ model <- spark.isoreg(df, label ~ feature, isotonic = FALSE,
+ weightCol = "weight")
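+ # isotonic = FALSE requests an antitonic (non-increasing) fit, which is why the
+ # fitted values decrease as the feature increases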
+ # only allow one variable on the right hand side of the formula
+ expect_error(model2 <- spark.isoreg(df, ~., isotonic = FALSE))
+ result <- summary(model)
+ expect_equal(result$predictions, list(7, 5, 4, 4, 1))
+
+ # Test model prediction
+ predict_data <- list(list(-2.0), list(-1.0), list(0.5),
+ list(0.75), list(1.0), list(2.0), list(9.0))
+ predict_df <- createDataFrame(predict_data, c("feature"))
+ predict_result <- collect(select(predict(model, predict_df), "prediction"))
+ expect_equal(predict_result$prediction, c(7.0, 7.0, 6.0, 5.5, 5.0, 4.0, 1.0))
+
+ # Test model save/load
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-isoreg", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ expect_equal(result, summary(model2))
+
+ unlink(modelPath)
+ }
+})
+
+test_that("spark.survreg", {
+ # R code to reproduce the result.
+ #
+ #' rData <- list(time = c(4, 3, 1, 1, 2, 2, 3), status = c(1, 1, 1, 0, 1, 1, 0),
+ #' x = c(0, 2, 1, 1, 1, 0, 0), sex = c(0, 0, 0, 0, 1, 1, 1))
+ #' library(survival)
+ #' model <- survreg(Surv(time, status) ~ x + sex, rData)
+ #' summary(model)
+ #' predict(model, data)
+ #
+ # -- output of 'summary(model)'
+ #
+ # Value Std. Error z p
+ # (Intercept) 1.315 0.270 4.88 1.07e-06
+ # x -0.190 0.173 -1.10 2.72e-01
+ # sex -0.253 0.329 -0.77 4.42e-01
+ # Log(scale) -1.160 0.396 -2.93 3.41e-03
+ #
+ # -- output of 'predict(model, data)'
+ #
+ # 1 2 3 4 5 6 7
+ # 3.724591 2.545368 3.079035 3.079035 2.390146 2.891269 2.891269
+ #
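+ # spark.survreg fits the same accelerated failure time (AFT) model as survival::survreg;
+ # its coefficient vector includes Log(scale) as the last entry, matching the summary above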
+ data <- list(list(4, 1, 0, 0), list(3, 1, 2, 0), list(1, 1, 1, 0),
+ list(1, 0, 1, 0), list(2, 1, 1, 1), list(2, 1, 0, 1), list(3, 0, 0, 1))
+ df <- createDataFrame(data, c("time", "status", "x", "sex"))
+ model <- spark.survreg(df, Surv(time, status) ~ x + sex)
+ stats <- summary(model)
+ coefs <- as.vector(stats$coefficients[, 1])
+ rCoefs <- c(1.3149571, -0.1903409, -0.2532618, -1.1599800)
+ expect_equal(coefs, rCoefs, tolerance = 1e-4)
+ expect_true(all(
+ rownames(stats$coefficients) ==
+ c("(Intercept)", "x", "sex", "Log(scale)")))
+ p <- collect(select(predict(model, df), "prediction"))
+ expect_equal(p$prediction, c(3.724591, 2.545368, 3.079035, 3.079035,
+ 2.390146, 2.891269, 2.891269), tolerance = 1e-4)
+
+ # Test model save/load
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-survreg", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ stats2 <- summary(model2)
+ coefs2 <- as.vector(stats2$coefficients[, 1])
+ expect_equal(coefs, coefs2)
+ expect_equal(rownames(stats$coefficients), rownames(stats2$coefficients))
+
+ unlink(modelPath)
+ }
+
+ # Test survival::survreg
+ if (requireNamespace("survival", quietly = TRUE)) {
+ rData <- list(time = c(4, 3, 1, 1, 2, 2, 3), status = c(1, 1, 1, 0, 1, 1, 0),
+ x = c(0, 2, 1, 1, 1, 0, 0), sex = c(0, 0, 0, 0, 1, 1, 1))
+ expect_error(
+ model <- survival::survreg(formula = survival::Surv(time, status) ~ x + sex, data = rData),
+ NA)
+ expect_equal(predict(model, rData)[[1]], 3.724591, tolerance = 1e-4)
+ }
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_mllib_stat.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_mllib_stat.R b/R/pkg/tests/fulltests/test_mllib_stat.R
new file mode 100644
index 0000000..1600833
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_mllib_stat.R
@@ -0,0 +1,53 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("MLlib statistics algorithms")
+
+# Tests for MLlib statistics algorithms in SparkR
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+test_that("spark.kstest", {
+ data <- data.frame(test = c(0.1, 0.15, 0.2, 0.3, 0.25, -1, -0.5))
+ df <- createDataFrame(data)
+ testResult <- spark.kstest(df, "test", "norm")
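+ # with no distribution parameters, "norm" defaults to a standard normal (mean 0, sd 1),
+ # matching ks.test(..., "pnorm") below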
+ stats <- summary(testResult)
+
+ rStats <- ks.test(data$test, "pnorm", alternative = "two.sided")
+
+ expect_equal(stats$p.value, rStats$p.value, tolerance = 1e-4)
+ expect_equal(stats$statistic, unname(rStats$statistic), tolerance = 1e-4)
+ expect_match(capture.output(stats)[1], "Kolmogorov-Smirnov test summary:")
+
+ testResult <- spark.kstest(df, "test", "norm", -0.5)
+ stats <- summary(testResult)
+
+ rStats <- ks.test(data$test, "pnorm", -0.5, 1, alternative = "two.sided")
+
+ expect_equal(stats$p.value, rStats$p.value, tolerance = 1e-4)
+ expect_equal(stats$statistic, unname(rStats$statistic), tolerance = 1e-4)
+ expect_match(capture.output(stats)[1], "Kolmogorov-Smirnov test summary:")
+
+ # Test print.summary.KSTest
+ printStats <- capture.output(print.summary.KSTest(stats))
+ expect_match(printStats[1], "Kolmogorov-Smirnov test summary:")
+ expect_match(printStats[5],
+ "Low presumption against null hypothesis: Sample follows theoretical distribution. ")
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_mllib_tree.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_mllib_tree.R b/R/pkg/tests/fulltests/test_mllib_tree.R
new file mode 100644
index 0000000..31427ee
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_mllib_tree.R
@@ -0,0 +1,320 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("MLlib tree-based algorithms")
+
+# Tests for MLlib tree-based algorithms in SparkR
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+absoluteSparkPath <- function(x) {
+ sparkHome <- sparkR.conf("spark.home")
+ file.path(sparkHome, x)
+}
+
+test_that("spark.gbt", {
+ skip_on_cran()
+
+ # regression
+ data <- suppressWarnings(createDataFrame(longley))
+ model <- spark.gbt(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, seed = 123)
+ predictions <- collect(predict(model, data))
+ expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
+ 63.221, 63.639, 64.989, 63.761,
+ 66.019, 67.857, 68.169, 66.513,
+ 68.655, 69.564, 69.331, 70.551),
+ tolerance = 1e-4)
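+ # the depth-5 trees fit the 16-row longley data exactly, so the expected predictions
+ # are simply the original Employed values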
+ stats <- summary(model)
+ expect_equal(stats$numTrees, 20)
+ expect_equal(stats$maxDepth, 5)
+ expect_equal(stats$formula, "Employed ~ .")
+ expect_equal(stats$numFeatures, 6)
+ expect_equal(length(stats$treeWeights), 20)
+
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-gbtRegression", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ stats2 <- summary(model2)
+ expect_equal(stats$formula, stats2$formula)
+ expect_equal(stats$numFeatures, stats2$numFeatures)
+ expect_equal(stats$features, stats2$features)
+ expect_equal(stats$featureImportances, stats2$featureImportances)
+ expect_equal(stats$maxDepth, stats2$maxDepth)
+ expect_equal(stats$numTrees, stats2$numTrees)
+ expect_equal(stats$treeWeights, stats2$treeWeights)
+
+ unlink(modelPath)
+ }
+
+ # classification
+ # label must be binary - GBTClassifier currently only supports binary classification.
+ iris2 <- iris[iris$Species != "virginica", ]
+ data <- suppressWarnings(createDataFrame(iris2))
+ model <- spark.gbt(data, Species ~ Petal_Length + Petal_Width, "classification")
+ stats <- summary(model)
+ expect_equal(stats$numFeatures, 2)
+ expect_equal(stats$numTrees, 20)
+ expect_equal(stats$maxDepth, 5)
+ expect_error(capture.output(stats), NA)
+ expect_true(length(capture.output(stats)) > 6)
+ predictions <- collect(predict(model, data))$prediction
+ # test string prediction values
+ expect_equal(length(grep("setosa", predictions)), 50)
+ expect_equal(length(grep("versicolor", predictions)), 50)
+
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-gbtClassification", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ stats2 <- summary(model2)
+ expect_equal(stats$depth, stats2$depth)
+ expect_equal(stats$numNodes, stats2$numNodes)
+ expect_equal(stats$numClasses, stats2$numClasses)
+
+ unlink(modelPath)
+ }
+
+ iris2$NumericSpecies <- ifelse(iris2$Species == "setosa", 0, 1)
+ df <- suppressWarnings(createDataFrame(iris2))
+ m <- spark.gbt(df, NumericSpecies ~ ., type = "classification")
+ s <- summary(m)
+ # test numeric prediction values
+ expect_equal(iris2$NumericSpecies, as.double(collect(predict(m, df))$prediction))
+ expect_equal(s$numFeatures, 5)
+ expect_equal(s$numTrees, 20)
+ expect_equal(s$maxDepth, 5)
+
+ # spark.gbt classification can work on libsvm data
+ if (not_cran_or_windows_with_hadoop()) {
+ data <- read.df(absoluteSparkPath("data/mllib/sample_binary_classification_data.txt"),
+ source = "libsvm")
+ model <- spark.gbt(data, label ~ features, "classification")
+ expect_equal(summary(model)$numFeatures, 692)
+ }
+})
+
+test_that("spark.randomForest", {
+ # regression
+ data <- suppressWarnings(createDataFrame(longley))
+ model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
+ numTrees = 1)
+
+ predictions <- collect(predict(model, data))
+ expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
+ 63.221, 63.639, 64.989, 63.761,
+ 66.019, 67.857, 68.169, 66.513,
+ 68.655, 69.564, 69.331, 70.551),
+ tolerance = 1e-4)
+
+ stats <- summary(model)
+ expect_equal(stats$numTrees, 1)
+ expect_equal(stats$maxDepth, 5)
+ expect_error(capture.output(stats), NA)
+ expect_true(length(capture.output(stats)) > 6)
+
+ model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16,
+ numTrees = 20, seed = 123)
+ predictions <- collect(predict(model, data))
+ expect_equal(predictions$prediction, c(60.32820, 61.22315, 60.69025, 62.11070,
+ 63.53160, 64.05470, 65.12710, 64.30450,
+ 66.70910, 67.86125, 68.08700, 67.21865,
+ 68.89275, 69.53180, 69.39640, 69.68250),
+ tolerance = 1e-4)
+ stats <- summary(model)
+ expect_equal(stats$numTrees, 20)
+ expect_equal(stats$maxDepth, 5)
+
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-randomForestRegression", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ stats2 <- summary(model2)
+ expect_equal(stats$formula, stats2$formula)
+ expect_equal(stats$numFeatures, stats2$numFeatures)
+ expect_equal(stats$features, stats2$features)
+ expect_equal(stats$featureImportances, stats2$featureImportances)
+ expect_equal(stats$numTrees, stats2$numTrees)
+ expect_equal(stats$maxDepth, stats2$maxDepth)
+ expect_equal(stats$treeWeights, stats2$treeWeights)
+
+ unlink(modelPath)
+ }
+
+ # classification
+ data <- suppressWarnings(createDataFrame(iris))
+ model <- spark.randomForest(data, Species ~ Petal_Length + Petal_Width, "classification",
+ maxDepth = 5, maxBins = 16)
+
+ stats <- summary(model)
+ expect_equal(stats$numFeatures, 2)
+ expect_equal(stats$numTrees, 20)
+ expect_equal(stats$maxDepth, 5)
+ expect_error(capture.output(stats), NA)
+ expect_true(length(capture.output(stats)) > 6)
+ # Test string prediction values
+ predictions <- collect(predict(model, data))$prediction
+ expect_equal(length(grep("setosa", predictions)), 50)
+ expect_equal(length(grep("versicolor", predictions)), 50)
+
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-randomForestClassification", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ stats2 <- summary(model2)
+ expect_equal(stats$depth, stats2$depth)
+ expect_equal(stats$numNodes, stats2$numNodes)
+ expect_equal(stats$numClasses, stats2$numClasses)
+
+ unlink(modelPath)
+ }
+
+ # Test numeric response variable
+ labelToIndex <- function(species) {
+ switch(as.character(species),
+ setosa = 0.0,
+ versicolor = 1.0,
+ virginica = 2.0
+ )
+ }
+ iris$NumericSpecies <- lapply(iris$Species, labelToIndex)
+ data <- suppressWarnings(createDataFrame(iris[-5]))
+ model <- spark.randomForest(data, NumericSpecies ~ Petal_Length + Petal_Width, "classification",
+ maxDepth = 5, maxBins = 16)
+ stats <- summary(model)
+ expect_equal(stats$numFeatures, 2)
+ expect_equal(stats$numTrees, 20)
+ expect_equal(stats$maxDepth, 5)
+
+ # Test numeric prediction values
+ predictions <- collect(predict(model, data))$prediction
+ expect_equal(length(grep("1.0", predictions)), 50)
+ expect_equal(length(grep("2.0", predictions)), 50)
+
+ # spark.randomForest classification can work on libsvm data
+ if (not_cran_or_windows_with_hadoop()) {
+ data <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
+ source = "libsvm")
+ model <- spark.randomForest(data, label ~ features, "classification")
+ expect_equal(summary(model)$numFeatures, 4)
+ }
+})
+
+test_that("spark.decisionTree", {
+ skip_on_cran()
+
+ # regression
+ data <- suppressWarnings(createDataFrame(longley))
+ model <- spark.decisionTree(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16)
+
+ predictions <- collect(predict(model, data))
+ expect_equal(predictions$prediction, c(60.323, 61.122, 60.171, 61.187,
+ 63.221, 63.639, 64.989, 63.761,
+ 66.019, 67.857, 68.169, 66.513,
+ 68.655, 69.564, 69.331, 70.551),
+ tolerance = 1e-4)
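+ # a depth-5 tree memorizes the 16-row longley data, so the expected predictions
+ # are again the original Employed values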
+
+ stats <- summary(model)
+ expect_equal(stats$maxDepth, 5)
+ expect_error(capture.output(stats), NA)
+ expect_true(length(capture.output(stats)) > 6)
+
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-decisionTreeRegression", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ stats2 <- summary(model2)
+ expect_equal(stats$formula, stats2$formula)
+ expect_equal(stats$numFeatures, stats2$numFeatures)
+ expect_equal(stats$features, stats2$features)
+ expect_equal(stats$featureImportances, stats2$featureImportances)
+ expect_equal(stats$maxDepth, stats2$maxDepth)
+
+ unlink(modelPath)
+ }
+
+ # classification
+ data <- suppressWarnings(createDataFrame(iris))
+ model <- spark.decisionTree(data, Species ~ Petal_Length + Petal_Width, "classification",
+ maxDepth = 5, maxBins = 16)
+
+ stats <- summary(model)
+ expect_equal(stats$numFeatures, 2)
+ expect_equal(stats$maxDepth, 5)
+ expect_error(capture.output(stats), NA)
+ expect_true(length(capture.output(stats)) > 6)
+ # Test string prediction values
+ predictions <- collect(predict(model, data))$prediction
+ expect_equal(length(grep("setosa", predictions)), 50)
+ expect_equal(length(grep("versicolor", predictions)), 50)
+
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-decisionTreeClassification", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ stats2 <- summary(model2)
+ expect_equal(stats$depth, stats2$depth)
+ expect_equal(stats$numNodes, stats2$numNodes)
+ expect_equal(stats$numClasses, stats2$numClasses)
+
+ unlink(modelPath)
+ }
+
+ # Test numeric response variable
+ labelToIndex <- function(species) {
+ switch(as.character(species),
+ setosa = 0.0,
+ versicolor = 1.0,
+ virginica = 2.0
+ )
+ }
+ iris$NumericSpecies <- lapply(iris$Species, labelToIndex)
+ data <- suppressWarnings(createDataFrame(iris[-5]))
+ model <- spark.decisionTree(data, NumericSpecies ~ Petal_Length + Petal_Width, "classification",
+ maxDepth = 5, maxBins = 16)
+ stats <- summary(model)
+ expect_equal(stats$numFeatures, 2)
+ expect_equal(stats$maxDepth, 5)
+
+ # Test numeric prediction values
+ predictions <- collect(predict(model, data))$prediction
+ expect_equal(length(grep("1.0", predictions)), 50)
+ expect_equal(length(grep("2.0", predictions)), 50)
+
+ # spark.decisionTree classification can work on libsvm data
+ if (not_cran_or_windows_with_hadoop()) {
+ data <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
+ source = "libsvm")
+ model <- spark.decisionTree(data, label ~ features, "classification")
+ expect_equal(summary(model)$numFeatures, 4)
+ }
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_parallelize_collect.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_parallelize_collect.R b/R/pkg/tests/fulltests/test_parallelize_collect.R
new file mode 100644
index 0000000..52d4c93
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_parallelize_collect.R
@@ -0,0 +1,120 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("parallelize() and collect()")
+
+# Mock data
+numVector <- c(-10:97)
+numList <- list(sqrt(1), sqrt(2), sqrt(3), 4 ** 10)
+strVector <- c("Dexter Morgan: I suppose I should be upset, even feel",
+ "violated, but I'm not. No, in fact, I think this is a friendly",
+ "message, like \"Hey, wanna play?\" and yes, I want to play. ",
+ "I really, really do.")
+strList <- list("Dexter Morgan: Blood. Sometimes it sets my teeth on edge, ",
+ "other times it helps me control the chaos.",
+ "Dexter Morgan: Harry and Dorris Morgan did a wonderful job ",
+ "raising me. But they're both dead now. I didn't kill them. Honest.")
+
+numPairs <- list(list(1, 1), list(1, 2), list(2, 2), list(2, 3))
+strPairs <- list(list(strList, strList), list(strList, strList))
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+jsc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
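+# the RDD API is internal to SparkR (not exported), so these tests call the *RDD helpers
+# directly through the JavaSparkContext handle obtained above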
+
+# Tests
+
+test_that("parallelize() on simple vectors and lists returns an RDD", {
+ skip_on_cran()
+
+ numVectorRDD <- parallelize(jsc, numVector, 1)
+ numVectorRDD2 <- parallelize(jsc, numVector, 10)
+ numListRDD <- parallelize(jsc, numList, 1)
+ numListRDD2 <- parallelize(jsc, numList, 4)
+ strVectorRDD <- parallelize(jsc, strVector, 2)
+ strVectorRDD2 <- parallelize(jsc, strVector, 3)
+ strListRDD <- parallelize(jsc, strList, 4)
+ strListRDD2 <- parallelize(jsc, strList, 1)
+
+ rdds <- c(numVectorRDD,
+ numVectorRDD2,
+ numListRDD,
+ numListRDD2,
+ strVectorRDD,
+ strVectorRDD2,
+ strListRDD,
+ strListRDD2)
+
+ for (rdd in rdds) {
+ expect_is(rdd, "RDD")
+ expect_true(.hasSlot(rdd, "jrdd")
+ && inherits(rdd@jrdd, "jobj")
+ && isInstanceOf(rdd@jrdd, "org.apache.spark.api.java.JavaRDD"))
+ }
+})
+
+test_that("collect(), following a parallelize(), gives back the original collections", {
+ skip_on_cran()
+
+ numVectorRDD <- parallelize(jsc, numVector, 10)
+ expect_equal(collectRDD(numVectorRDD), as.list(numVector))
+
+ numListRDD <- parallelize(jsc, numList, 1)
+ numListRDD2 <- parallelize(jsc, numList, 4)
+ expect_equal(collectRDD(numListRDD), as.list(numList))
+ expect_equal(collectRDD(numListRDD2), as.list(numList))
+
+ strVectorRDD <- parallelize(jsc, strVector, 2)
+ strVectorRDD2 <- parallelize(jsc, strVector, 3)
+ expect_equal(collectRDD(strVectorRDD), as.list(strVector))
+ expect_equal(collectRDD(strVectorRDD2), as.list(strVector))
+
+ strListRDD <- parallelize(jsc, strList, 4)
+ strListRDD2 <- parallelize(jsc, strList, 1)
+ expect_equal(collectRDD(strListRDD), as.list(strList))
+ expect_equal(collectRDD(strListRDD2), as.list(strList))
+})
+
+test_that("regression: collect() following a parallelize() does not drop elements", {
+ skip_on_cran()
+
+ # with 10 elements over 6 partitions, each partition holds 1 or 2 elements
+ # (10 %/% 6 = 1, ceiling(10 / 6) = 2); verify that none are dropped
+ collLen <- 10
+ numPart <- 6
+ expected <- runif(collLen)
+ actual <- collectRDD(parallelize(jsc, expected, numPart))
+ expect_equal(actual, as.list(expected))
+})
+
+test_that("parallelize() and collect() work for lists of pairs (pairwise data)", {
+ skip_on_cran()
+
+ # use the pairwise logical to indicate pairwise data
+ numPairsRDD1 <- parallelize(jsc, numPairs, 1)
+ numPairsRDD2 <- parallelize(jsc, numPairs, 2)
+ numPairsRDD3 <- parallelize(jsc, numPairs, 3)
+ expect_equal(collectRDD(numPairsRDD1), numPairs)
+ expect_equal(collectRDD(numPairsRDD2), numPairs)
+ expect_equal(collectRDD(numPairsRDD3), numPairs)
+ # the parameter names can also be omitted when the arguments are supplied in order
+ strPairsRDD1 <- parallelize(jsc, strPairs, 1)
+ strPairsRDD2 <- parallelize(jsc, strPairs, 2)
+ expect_equal(collectRDD(strPairsRDD1), strPairs)
+ expect_equal(collectRDD(strPairsRDD2), strPairs)
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_rdd.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_rdd.R b/R/pkg/tests/fulltests/test_rdd.R
new file mode 100644
index 0000000..fb244e1
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_rdd.R
@@ -0,0 +1,906 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("basic RDD functions")
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+# Data
+nums <- 1:10
+rdd <- parallelize(sc, nums, 2L)
+
+intPairs <- list(list(1L, -1), list(2L, 100), list(2L, 1), list(1L, 200))
+intRdd <- parallelize(sc, intPairs, 2L)
+
+test_that("get number of partitions in RDD", {
+ skip_on_cran()
+
+ expect_equal(getNumPartitionsRDD(rdd), 2)
+ expect_equal(getNumPartitionsRDD(intRdd), 2)
+})
+
+test_that("first on RDD", {
+ skip_on_cran()
+
+ expect_equal(firstRDD(rdd), 1)
+ newrdd <- lapply(rdd, function(x) x + 1)
+ expect_equal(firstRDD(newrdd), 2)
+})
+
+test_that("count and length on RDD", {
+ skip_on_cran()
+
+ expect_equal(countRDD(rdd), 10)
+ expect_equal(lengthRDD(rdd), 10)
+})
+
+test_that("count by values and keys", {
+ skip_on_cran()
+
+ mods <- lapply(rdd, function(x) { x %% 3 })
+ actual <- countByValue(mods)
+ expected <- list(list(0, 3L), list(1, 4L), list(2, 3L))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+
+ actual <- countByKey(intRdd)
+ expected <- list(list(2L, 2L), list(1L, 2L))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+})
+
+test_that("lapply on RDD", {
+ skip_on_cran()
+
+ multiples <- lapply(rdd, function(x) { 2 * x })
+ actual <- collectRDD(multiples)
+ expect_equal(actual, as.list(nums * 2))
+})
+
+test_that("lapplyPartition on RDD", {
+ skip_on_cran()
+
+ sums <- lapplyPartition(rdd, function(part) { sum(unlist(part)) })
+ actual <- collectRDD(sums)
+ expect_equal(actual, list(15, 40))
+})
+
+test_that("mapPartitions on RDD", {
+ skip_on_cran()
+
+ sums <- mapPartitions(rdd, function(part) { sum(unlist(part)) })
+ actual <- collectRDD(sums)
+ expect_equal(actual, list(15, 40))
+})
+
+test_that("flatMap() on RDDs", {
+ skip_on_cran()
+
+ flat <- flatMap(intRdd, function(x) { list(x, x) })
+ actual <- collectRDD(flat)
+ expect_equal(actual, rep(intPairs, each = 2))
+})
+
+test_that("filterRDD on RDD", {
+ skip_on_cran()
+
+ filtered.rdd <- filterRDD(rdd, function(x) { x %% 2 == 0 })
+ actual <- collectRDD(filtered.rdd)
+ expect_equal(actual, list(2, 4, 6, 8, 10))
+
+ filtered.rdd <- Filter(function(x) { x[[2]] < 0 }, intRdd)
+ actual <- collectRDD(filtered.rdd)
+ expect_equal(actual, list(list(1L, -1)))
+
+ # Filter out all elements.
+ filtered.rdd <- filterRDD(rdd, function(x) { x > 10 })
+ actual <- collectRDD(filtered.rdd)
+ expect_equal(actual, list())
+})
+
+test_that("lookup on RDD", {
+ skip_on_cran()
+
+ vals <- lookup(intRdd, 1L)
+ expect_equal(vals, list(-1, 200))
+
+ vals <- lookup(intRdd, 3L)
+ expect_equal(vals, list())
+})
+
+test_that("several transformations on RDD (a benchmark on PipelinedRDD)", {
+ skip_on_cran()
+
+ rdd2 <- rdd
+ for (i in 1:12)
+ rdd2 <- lapplyPartitionsWithIndex(
+ rdd2, function(partIndex, part) {
+ part <- as.list(unlist(part) * partIndex + i)
+ })
+ rdd2 <- lapply(rdd2, function(x) x + x)
+ actual <- collectRDD(rdd2)
+ expected <- list(24, 24, 24, 24, 24,
+ 168, 170, 172, 174, 176)
+ expect_equal(actual, expected)
+})
+
+test_that("PipelinedRDD support actions: cache(), persist(), unpersist(), checkpoint()", {
+ skip_on_cran()
+
+ # RDD
+ rdd2 <- rdd
+ # PipelinedRDD
+ rdd2 <- lapplyPartitionsWithIndex(
+ rdd2,
+ function(partIndex, part) {
+ part <- as.list(unlist(part) * partIndex)
+ })
+
+ cacheRDD(rdd2)
+ expect_true(rdd2@env$isCached)
+ rdd2 <- lapply(rdd2, function(x) x)
+ expect_false(rdd2@env$isCached)
+
+ unpersistRDD(rdd2)
+ expect_false(rdd2@env$isCached)
+
+ persistRDD(rdd2, "MEMORY_AND_DISK")
+ expect_true(rdd2@env$isCached)
+ rdd2 <- lapply(rdd2, function(x) x)
+ expect_false(rdd2@env$isCached)
+
+ unpersistRDD(rdd2)
+ expect_false(rdd2@env$isCached)
+
+ tempDir <- tempfile(pattern = "checkpoint")
+ setCheckpointDirSC(sc, tempDir)
+ checkpointRDD(rdd2)
+ expect_true(rdd2@env$isCheckpointed)
+
+ rdd2 <- lapply(rdd2, function(x) x)
+ expect_false(rdd2@env$isCached)
+ expect_false(rdd2@env$isCheckpointed)
+
+ # make sure the data is collectable
+ collectRDD(rdd2)
+
+ unlink(tempDir)
+})
+
+test_that("reduce on RDD", {
+ skip_on_cran()
+
+ sum <- reduce(rdd, "+")
+ expect_equal(sum, 55)
+
+ # Also test with an inline function
+ sumInline <- reduce(rdd, function(x, y) { x + y })
+ expect_equal(sumInline, 55)
+})
+
+test_that("lapply with dependency", {
+ skip_on_cran()
+
+ fa <- 5
+ multiples <- lapply(rdd, function(x) { fa * x })
+ actual <- collectRDD(multiples)
+
+ expect_equal(actual, as.list(nums * 5))
+})
+
+test_that("lapplyPartitionsWithIndex on RDDs", {
+ skip_on_cran()
+
+ func <- function(partIndex, part) { list(partIndex, Reduce("+", part)) }
+ actual <- collectRDD(lapplyPartitionsWithIndex(rdd, func), flatten = FALSE)
+ expect_equal(actual, list(list(0, 15), list(1, 40)))
+
+ pairsRDD <- parallelize(sc, list(list(1, 2), list(3, 4), list(4, 8)), 1L)
+ partitionByParity <- function(key) { if (key %% 2 == 1) 0 else 1 }
+ mkTup <- function(partIndex, part) { list(partIndex, part) }
+ actual <- collectRDD(lapplyPartitionsWithIndex(
+ partitionByRDD(pairsRDD, 2L, partitionByParity),
+ mkTup),
+ FALSE)
+ expect_equal(actual, list(list(0, list(list(1, 2), list(3, 4))),
+ list(1, list(list(4, 8)))))
+})
+
+test_that("sampleRDD() on RDDs", {
+ skip_on_cran()
+
+ expect_equal(unlist(collectRDD(sampleRDD(rdd, FALSE, 1.0, 2014L))), nums)
+})
+
+test_that("takeSample() on RDDs", {
+ skip_on_cran()
+
+ # ported from RDDSuite.scala, modified seeds
+ data <- parallelize(sc, 1:100, 2L)
+ for (seed in 4:5) {
+ s <- takeSample(data, FALSE, 20L, seed)
+ expect_equal(length(s), 20L)
+ expect_equal(length(unique(s)), 20L)
+ for (elem in s) {
+ expect_true(elem >= 1 && elem <= 100)
+ }
+ }
+ for (seed in 4:5) {
+ s <- takeSample(data, FALSE, 200L, seed)
+ expect_equal(length(s), 100L)
+ expect_equal(length(unique(s)), 100L)
+ for (elem in s) {
+ expect_true(elem >= 1 && elem <= 100)
+ }
+ }
+ for (seed in 4:5) {
+ s <- takeSample(data, TRUE, 20L, seed)
+ expect_equal(length(s), 20L)
+ for (elem in s) {
+ expect_true(elem >= 1 && elem <= 100)
+ }
+ }
+ for (seed in 4:5) {
+ s <- takeSample(data, TRUE, 100L, seed)
+ expect_equal(length(s), 100L)
+ # Chance of getting all distinct elements is astronomically low, so test that
+ # we got fewer than 100
+ expect_true(length(unique(s)) < 100L)
+ }
+ for (seed in 4:5) {
+ s <- takeSample(data, TRUE, 200L, seed)
+ expect_equal(length(s), 200L)
+ # Chance of getting all distinct elements is still quite low, so test that
+ # we got fewer than 100
+ expect_true(length(unique(s)) < 100L)
+ }
+})
+
+test_that("mapValues() on pairwise RDDs", {
+ skip_on_cran()
+
+ multiples <- mapValues(intRdd, function(x) { x * 2 })
+ actual <- collectRDD(multiples)
+ expected <- lapply(intPairs, function(x) {
+ list(x[[1]], x[[2]] * 2)
+ })
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+})
+
+test_that("flatMapValues() on pairwise RDDs", {
+ skip_on_cran()
+
+ l <- parallelize(sc, list(list(1, c(1, 2)), list(2, c(3, 4))))
+ actual <- collectRDD(flatMapValues(l, function(x) { x }))
+ expect_equal(actual, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
+
+ # Generate x to x+1 for every value
+ actual <- collectRDD(flatMapValues(intRdd, function(x) { x: (x + 1) }))
+ expect_equal(actual,
+ list(list(1L, -1), list(1L, 0), list(2L, 100), list(2L, 101),
+ list(2L, 1), list(2L, 2), list(1L, 200), list(1L, 201)))
+})
+
+test_that("reduceByKeyLocally() on PairwiseRDDs", {
+ skip_on_cran()
+
+ pairs <- parallelize(sc, list(list(1, 2), list(1.1, 3), list(1, 4)), 2L)
+ actual <- reduceByKeyLocally(pairs, "+")
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(list(list(1, 6), list(1.1, 3))))
+
+ pairs <- parallelize(sc, list(list("abc", 1.2), list(1.1, 0), list("abc", 1.3),
+ list("bb", 5)), 4L)
+ actual <- reduceByKeyLocally(pairs, "+")
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(list(list("abc", 2.5), list(1.1, 0), list("bb", 5))))
+})
+
+test_that("distinct() on RDDs", {
+ skip_on_cran()
+
+ nums.rep2 <- rep(1:10, 2)
+ rdd.rep2 <- parallelize(sc, nums.rep2, 2L)
+ uniques <- distinctRDD(rdd.rep2)
+ actual <- sort(unlist(collectRDD(uniques)))
+ expect_equal(actual, nums)
+})
+
+test_that("maximum() on RDDs", {
+ skip_on_cran()
+
+ max <- maximum(rdd)
+ expect_equal(max, 10)
+})
+
+test_that("minimum() on RDDs", {
+ skip_on_cran()
+
+ min <- minimum(rdd)
+ expect_equal(min, 1)
+})
+
+test_that("sumRDD() on RDDs", {
+ skip_on_cran()
+
+ sum <- sumRDD(rdd)
+ expect_equal(sum, 55)
+})
+
+test_that("keyBy on RDDs", {
+ skip_on_cran()
+
+ func <- function(x) { x * x }
+ keys <- keyBy(rdd, func)
+ actual <- collectRDD(keys)
+ expect_equal(actual, lapply(nums, function(x) { list(func(x), x) }))
+})
+
+test_that("repartition/coalesce on RDDs", {
+ skip_on_cran()
+
+ rdd <- parallelize(sc, 1:20, 4L) # each partition contains 5 elements
+
+ # repartition
+ r1 <- repartitionRDD(rdd, 2)
+ expect_equal(getNumPartitionsRDD(r1), 2L)
+ count <- length(collectPartition(r1, 0L))
+ expect_true(count >= 8 && count <= 12)
+
+ r2 <- repartitionRDD(rdd, 6)
+ expect_equal(getNumPartitionsRDD(r2), 6L)
+ count <- length(collectPartition(r2, 0L))
+ expect_true(count >= 0 && count <= 4)
+
+ # coalesce
+ r3 <- coalesceRDD(rdd, 1)
+ expect_equal(getNumPartitionsRDD(r3), 1L)
+ count <- length(collectPartition(r3, 0L))
+ expect_equal(count, 20)
+})
+
+test_that("sortBy() on RDDs", {
+ skip_on_cran()
+
+ sortedRdd <- sortBy(rdd, function(x) { x * x }, ascending = FALSE)
+ actual <- collectRDD(sortedRdd)
+ expect_equal(actual, as.list(sort(nums, decreasing = TRUE)))
+
+ rdd2 <- parallelize(sc, sort(nums, decreasing = TRUE), 2L)
+ sortedRdd2 <- sortBy(rdd2, function(x) { x * x })
+ actual <- collectRDD(sortedRdd2)
+ expect_equal(actual, as.list(nums))
+})
+
+test_that("takeOrdered() on RDDs", {
+ skip_on_cran()
+
+ l <- list(10, 1, 2, 9, 3, 4, 5, 6, 7)
+ rdd <- parallelize(sc, l)
+ actual <- takeOrdered(rdd, 6L)
+ expect_equal(actual, as.list(sort(unlist(l)))[1:6])
+
+ l <- list("e", "d", "c", "d", "a")
+ rdd <- parallelize(sc, l)
+ actual <- takeOrdered(rdd, 3L)
+ expect_equal(actual, as.list(sort(unlist(l)))[1:3])
+})
+
+test_that("top() on RDDs", {
+ skip_on_cran()
+
+ l <- list(10, 1, 2, 9, 3, 4, 5, 6, 7)
+ rdd <- parallelize(sc, l)
+ actual <- top(rdd, 6L)
+ expect_equal(actual, as.list(sort(unlist(l), decreasing = TRUE))[1:6])
+
+ l <- list("e", "d", "c", "d", "a")
+ rdd <- parallelize(sc, l)
+ actual <- top(rdd, 3L)
+ expect_equal(actual, as.list(sort(unlist(l), decreasing = TRUE))[1:3])
+})
+
+test_that("fold() on RDDs", {
+ skip_on_cran()
+
+ actual <- fold(rdd, 0, "+")
+ expect_equal(actual, Reduce("+", nums, 0))
+
+ rdd <- parallelize(sc, list())
+ actual <- fold(rdd, 0, "+")
+ expect_equal(actual, 0)
+})
+
+test_that("aggregateRDD() on RDDs", {
+ skip_on_cran()
+
+ rdd <- parallelize(sc, list(1, 2, 3, 4))
+ zeroValue <- list(0, 0)
+ seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
+ combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
+ actual <- aggregateRDD(rdd, zeroValue, seqOp, combOp)
+ expect_equal(actual, list(10, 4))
+
+ rdd <- parallelize(sc, list())
+ actual <- aggregateRDD(rdd, zeroValue, seqOp, combOp)
+ expect_equal(actual, list(0, 0))
+})
+
+test_that("zipWithUniqueId() on RDDs", {
+ skip_on_cran()
+
+ rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 3L)
+ actual <- collectRDD(zipWithUniqueId(rdd))
+ expected <- list(list("a", 0), list("b", 1), list("c", 4),
+ list("d", 2), list("e", 5))
+ expect_equal(actual, expected)
+
+ rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 1L)
+ actual <- collectRDD(zipWithUniqueId(rdd))
+ expected <- list(list("a", 0), list("b", 1), list("c", 2),
+ list("d", 3), list("e", 4))
+ expect_equal(actual, expected)
+})
+
+test_that("zipWithIndex() on RDDs", {
+ skip_on_cran()
+
+ rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 3L)
+ actual <- collectRDD(zipWithIndex(rdd))
+ expected <- list(list("a", 0), list("b", 1), list("c", 2),
+ list("d", 3), list("e", 4))
+ expect_equal(actual, expected)
+
+ rdd <- parallelize(sc, list("a", "b", "c", "d", "e"), 1L)
+ actual <- collectRDD(zipWithIndex(rdd))
+ expected <- list(list("a", 0), list("b", 1), list("c", 2),
+ list("d", 3), list("e", 4))
+ expect_equal(actual, expected)
+})
+
+test_that("glom() on RDD", {
+ skip_on_cran()
+
+ rdd <- parallelize(sc, as.list(1:4), 2L)
+ actual <- collectRDD(glom(rdd))
+ expect_equal(actual, list(list(1, 2), list(3, 4)))
+})
+
+test_that("keys() on RDDs", {
+ skip_on_cran()
+
+ keys <- keys(intRdd)
+ actual <- collectRDD(keys)
+ expect_equal(actual, lapply(intPairs, function(x) { x[[1]] }))
+})
+
+test_that("values() on RDDs", {
+ skip_on_cran()
+
+ values <- values(intRdd)
+ actual <- collectRDD(values)
+ expect_equal(actual, lapply(intPairs, function(x) { x[[2]] }))
+})
+
+test_that("pipeRDD() on RDDs", {
+ skip_on_cran()
+
+ actual <- collectRDD(pipeRDD(rdd, "more"))
+ expected <- as.list(as.character(1:10))
+ expect_equal(actual, expected)
+
+ trailed.rdd <- parallelize(sc, c("1", "", "2\n", "3\n\r\n"))
+ actual <- collectRDD(pipeRDD(trailed.rdd, "sort"))
+ expected <- list("", "1", "2", "3")
+ expect_equal(actual, expected)
+
+ rev.nums <- 9:0
+ rev.rdd <- parallelize(sc, rev.nums, 2L)
+ actual <- collectRDD(pipeRDD(rev.rdd, "sort"))
+ expected <- as.list(as.character(c(5:9, 0:4)))
+ expect_equal(actual, expected)
+})
+
+test_that("zipRDD() on RDDs", {
+ skip_on_cran()
+
+ rdd1 <- parallelize(sc, 0:4, 2)
+ rdd2 <- parallelize(sc, 1000:1004, 2)
+ actual <- collectRDD(zipRDD(rdd1, rdd2))
+ expect_equal(actual,
+ list(list(0, 1000), list(1, 1001), list(2, 1002), list(3, 1003), list(4, 1004)))
+
+ mockFile <- c("Spark is pretty.", "Spark is awesome.")
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName)
+
+ rdd <- textFile(sc, fileName, 1)
+ actual <- collectRDD(zipRDD(rdd, rdd))
+ expected <- lapply(mockFile, function(x) { list(x, x) })
+ expect_equal(actual, expected)
+
+ rdd1 <- parallelize(sc, 0:1, 1)
+ actual <- collectRDD(zipRDD(rdd1, rdd))
+ expected <- lapply(0:1, function(x) { list(x, mockFile[x + 1]) })
+ expect_equal(actual, expected)
+
+ rdd1 <- map(rdd, function(x) { x })
+ actual <- collectRDD(zipRDD(rdd, rdd1))
+ expected <- lapply(mockFile, function(x) { list(x, x) })
+ expect_equal(actual, expected)
+
+ unlink(fileName)
+})
+
+test_that("cartesian() on RDDs", {
+ skip_on_cran()
+
+ rdd <- parallelize(sc, 1:3)
+ actual <- collectRDD(cartesian(rdd, rdd))
+ expect_equal(sortKeyValueList(actual),
+ list(
+ list(1, 1), list(1, 2), list(1, 3),
+ list(2, 1), list(2, 2), list(2, 3),
+ list(3, 1), list(3, 2), list(3, 3)))
+
+ # test case where one RDD is empty
+ emptyRdd <- parallelize(sc, list())
+ actual <- collectRDD(cartesian(rdd, emptyRdd))
+ expect_equal(actual, list())
+
+ mockFile <- c("Spark is pretty.", "Spark is awesome.")
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName)
+
+ rdd <- textFile(sc, fileName)
+ actual <- collectRDD(cartesian(rdd, rdd))
+ expected <- list(
+ list("Spark is awesome.", "Spark is pretty."),
+ list("Spark is awesome.", "Spark is awesome."),
+ list("Spark is pretty.", "Spark is pretty."),
+ list("Spark is pretty.", "Spark is awesome."))
+ expect_equal(sortKeyValueList(actual), expected)
+
+ rdd1 <- parallelize(sc, 0:1)
+ actual <- collectRDD(cartesian(rdd1, rdd))
+ expect_equal(sortKeyValueList(actual),
+ list(
+ list(0, "Spark is pretty."),
+ list(0, "Spark is awesome."),
+ list(1, "Spark is pretty."),
+ list(1, "Spark is awesome.")))
+
+ rdd1 <- map(rdd, function(x) { x })
+ actual <- collectRDD(cartesian(rdd, rdd1))
+ expect_equal(sortKeyValueList(actual), expected)
+
+ unlink(fileName)
+})
+
+test_that("subtract() on RDDs", {
+ skip_on_cran()
+
+ l <- list(1, 1, 2, 2, 3, 4)
+ rdd1 <- parallelize(sc, l)
+
+ # subtract by itself
+ actual <- collectRDD(subtract(rdd1, rdd1))
+ expect_equal(actual, list())
+
+ # subtract by an empty RDD
+ rdd2 <- parallelize(sc, list())
+ actual <- collectRDD(subtract(rdd1, rdd2))
+ expect_equal(as.list(sort(as.vector(actual, mode = "integer"))),
+ l)
+
+ rdd2 <- parallelize(sc, list(2, 4))
+ actual <- collectRDD(subtract(rdd1, rdd2))
+ expect_equal(as.list(sort(as.vector(actual, mode = "integer"))),
+ list(1, 1, 3))
+
+ l <- list("a", "a", "b", "b", "c", "d")
+ rdd1 <- parallelize(sc, l)
+ rdd2 <- parallelize(sc, list("b", "d"))
+ actual <- collectRDD(subtract(rdd1, rdd2))
+ expect_equal(as.list(sort(as.vector(actual, mode = "character"))),
+ list("a", "a", "c"))
+})
+
+test_that("subtractByKey() on pairwise RDDs", {
+ skip_on_cran()
+
+ l <- list(list("a", 1), list("b", 4),
+ list("b", 5), list("a", 2))
+ rdd1 <- parallelize(sc, l)
+
+ # subtractByKey by itself
+ actual <- collectRDD(subtractByKey(rdd1, rdd1))
+ expect_equal(actual, list())
+
+ # subtractByKey by an empty RDD
+ rdd2 <- parallelize(sc, list())
+ actual <- collectRDD(subtractByKey(rdd1, rdd2))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(l))
+
+ rdd2 <- parallelize(sc, list(list("a", 3), list("c", 1)))
+ actual <- collectRDD(subtractByKey(rdd1, rdd2))
+ expect_equal(actual,
+ list(list("b", 4), list("b", 5)))
+
+ l <- list(list(1, 1), list(2, 4),
+ list(2, 5), list(1, 2))
+ rdd1 <- parallelize(sc, l)
+ rdd2 <- parallelize(sc, list(list(1, 3), list(3, 1)))
+ actual <- collectRDD(subtractByKey(rdd1, rdd2))
+ expect_equal(actual,
+ list(list(2, 4), list(2, 5)))
+})
+
+test_that("intersection() on RDDs", {
+ skip_on_cran()
+
+ # intersection with self
+ actual <- collectRDD(intersection(rdd, rdd))
+ expect_equal(sort(as.integer(actual)), nums)
+
+ # intersection with an empty RDD
+ emptyRdd <- parallelize(sc, list())
+ actual <- collectRDD(intersection(rdd, emptyRdd))
+ expect_equal(actual, list())
+
+ rdd1 <- parallelize(sc, list(1, 10, 2, 3, 4, 5))
+ rdd2 <- parallelize(sc, list(1, 6, 2, 3, 7, 8))
+ actual <- collectRDD(intersection(rdd1, rdd2))
+ expect_equal(sort(as.integer(actual)), 1:3)
+})
+
+test_that("join() on pairwise RDDs", {
+ skip_on_cran()
+
+ rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4)))
+ rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3)))
+ actual <- collectRDD(joinRDD(rdd1, rdd2, 2L))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(list(list(1, list(1, 2)), list(1, list(1, 3)))))
+
+ rdd1 <- parallelize(sc, list(list("a", 1), list("b", 4)))
+ rdd2 <- parallelize(sc, list(list("a", 2), list("a", 3)))
+ actual <- collectRDD(joinRDD(rdd1, rdd2, 2L))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(list(list("a", list(1, 2)), list("a", list(1, 3)))))
+
+ rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2)))
+ rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4)))
+ actual <- collectRDD(joinRDD(rdd1, rdd2, 2L))
+ expect_equal(actual, list())
+
+ rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2)))
+ rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4)))
+ actual <- collectRDD(joinRDD(rdd1, rdd2, 2L))
+ expect_equal(actual, list())
+})
+
+test_that("leftOuterJoin() on pairwise RDDs", {
+ skip_on_cran()
+
+ rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4)))
+ rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3)))
+ actual <- collectRDD(leftOuterJoin(rdd1, rdd2, 2L))
+ expected <- list(list(1, list(1, 2)), list(1, list(1, 3)), list(2, list(4, NULL)))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(expected))
+
+ rdd1 <- parallelize(sc, list(list("a", 1), list("b", 4)))
+ rdd2 <- parallelize(sc, list(list("a", 2), list("a", 3)))
+ actual <- collectRDD(leftOuterJoin(rdd1, rdd2, 2L))
+ expected <- list(list("b", list(4, NULL)), list("a", list(1, 2)), list("a", list(1, 3)))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(expected))
+
+ rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2)))
+ rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4)))
+ actual <- collectRDD(leftOuterJoin(rdd1, rdd2, 2L))
+ expected <- list(list(1, list(1, NULL)), list(2, list(2, NULL)))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(expected))
+
+ rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2)))
+ rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4)))
+ actual <- collectRDD(leftOuterJoin(rdd1, rdd2, 2L))
+ expected <- list(list("b", list(2, NULL)), list("a", list(1, NULL)))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(expected))
+})
+
+test_that("rightOuterJoin() on pairwise RDDs", {
+ skip_on_cran()
+
+ rdd1 <- parallelize(sc, list(list(1, 2), list(1, 3)))
+ rdd2 <- parallelize(sc, list(list(1, 1), list(2, 4)))
+ actual <- collectRDD(rightOuterJoin(rdd1, rdd2, 2L))
+ expected <- list(list(1, list(2, 1)), list(1, list(3, 1)), list(2, list(NULL, 4)))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+
+ rdd1 <- parallelize(sc, list(list("a", 2), list("a", 3)))
+ rdd2 <- parallelize(sc, list(list("a", 1), list("b", 4)))
+ actual <- collectRDD(rightOuterJoin(rdd1, rdd2, 2L))
+ expected <- list(list("b", list(NULL, 4)), list("a", list(2, 1)), list("a", list(3, 1)))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(expected))
+
+ rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2)))
+ rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4)))
+ actual <- collectRDD(rightOuterJoin(rdd1, rdd2, 2L))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(list(list(3, list(NULL, 3)), list(4, list(NULL, 4)))))
+
+ rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2)))
+ rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4)))
+ actual <- collectRDD(rightOuterJoin(rdd1, rdd2, 2L))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(list(list("d", list(NULL, 4)), list("c", list(NULL, 3)))))
+})
+
+test_that("fullOuterJoin() on pairwise RDDs", {
+ skip_on_cran()
+
+ rdd1 <- parallelize(sc, list(list(1, 2), list(1, 3), list(3, 3)))
+ rdd2 <- parallelize(sc, list(list(1, 1), list(2, 4)))
+ actual <- collectRDD(fullOuterJoin(rdd1, rdd2, 2L))
+ expected <- list(list(1, list(2, 1)), list(1, list(3, 1)),
+ list(2, list(NULL, 4)), list(3, list(3, NULL)))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+
+ rdd1 <- parallelize(sc, list(list("a", 2), list("a", 3), list("c", 1)))
+ rdd2 <- parallelize(sc, list(list("a", 1), list("b", 4)))
+ actual <- collectRDD(fullOuterJoin(rdd1, rdd2, 2L))
+ expected <- list(list("b", list(NULL, 4)), list("a", list(2, 1)),
+ list("a", list(3, 1)), list("c", list(1, NULL)))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(expected))
+
+ rdd1 <- parallelize(sc, list(list(1, 1), list(2, 2)))
+ rdd2 <- parallelize(sc, list(list(3, 3), list(4, 4)))
+ actual <- collectRDD(fullOuterJoin(rdd1, rdd2, 2L))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(list(list(1, list(1, NULL)), list(2, list(2, NULL)),
+ list(3, list(NULL, 3)), list(4, list(NULL, 4)))))
+
+ rdd1 <- parallelize(sc, list(list("a", 1), list("b", 2)))
+ rdd2 <- parallelize(sc, list(list("c", 3), list("d", 4)))
+ actual <- collectRDD(fullOuterJoin(rdd1, rdd2, 2L))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(list(list("a", list(1, NULL)), list("b", list(2, NULL)),
+ list("d", list(NULL, 4)), list("c", list(NULL, 3)))))
+})
+
+test_that("sortByKey() on pairwise RDDs", {
+ skip_on_cran()
+
+ numPairsRdd <- map(rdd, function(x) { list (x, x) })
+ sortedRdd <- sortByKey(numPairsRdd, ascending = FALSE)
+ actual <- collectRDD(sortedRdd)
+ numPairs <- lapply(nums, function(x) { list (x, x) })
+ expect_equal(actual, sortKeyValueList(numPairs, decreasing = TRUE))
+
+ rdd2 <- parallelize(sc, sort(nums, decreasing = TRUE), 2L)
+ numPairsRdd2 <- map(rdd2, function(x) { list (x, x) })
+ sortedRdd2 <- sortByKey(numPairsRdd2)
+ actual <- collectRDD(sortedRdd2)
+ expect_equal(actual, numPairs)
+
+ # sort by string keys
+ l <- list(list("a", 1), list("b", 2), list("1", 3), list("d", 4), list("2", 5))
+ rdd3 <- parallelize(sc, l, 2L)
+ sortedRdd3 <- sortByKey(rdd3)
+ actual <- collectRDD(sortedRdd3)
+ expect_equal(actual, list(list("1", 3), list("2", 5), list("a", 1), list("b", 2), list("d", 4)))
+
+ # test on the boundary cases
+
+ # boundary case 1: the RDD to be sorted has only 1 partition
+ rdd4 <- parallelize(sc, l, 1L)
+ sortedRdd4 <- sortByKey(rdd4)
+ actual <- collectRDD(sortedRdd4)
+ expect_equal(actual, list(list("1", 3), list("2", 5), list("a", 1), list("b", 2), list("d", 4)))
+
+ # boundary case 2: the sorted RDD has only 1 partition
+ rdd5 <- parallelize(sc, l, 2L)
+ sortedRdd5 <- sortByKey(rdd5, numPartitions = 1L)
+ actual <- collectRDD(sortedRdd5)
+ expect_equal(actual, list(list("1", 3), list("2", 5), list("a", 1), list("b", 2), list("d", 4)))
+
+ # boundary case 3: the RDD to be sorted has only 1 element
+ l2 <- list(list("a", 1))
+ rdd6 <- parallelize(sc, l2, 2L)
+ sortedRdd6 <- sortByKey(rdd6)
+ actual <- collectRDD(sortedRdd6)
+ expect_equal(actual, l2)
+
+ # boundary case 4: the RDD to be sorted has 0 element
+ l3 <- list()
+ rdd7 <- parallelize(sc, l3, 2L)
+ sortedRdd7 <- sortByKey(rdd7)
+ actual <- collectRDD(sortedRdd7)
+ expect_equal(actual, l3)
+})
+
+test_that("collectAsMap() on a pairwise RDD", {
+ skip_on_cran()
+
+ rdd <- parallelize(sc, list(list(1, 2), list(3, 4)))
+ vals <- collectAsMap(rdd)
+ expect_equal(vals, list(`1` = 2, `3` = 4))
+
+ rdd <- parallelize(sc, list(list("a", 1), list("b", 2)))
+ vals <- collectAsMap(rdd)
+ expect_equal(vals, list(a = 1, b = 2))
+
+ rdd <- parallelize(sc, list(list(1.1, 2.2), list(1.2, 2.4)))
+ vals <- collectAsMap(rdd)
+ expect_equal(vals, list(`1.1` = 2.2, `1.2` = 2.4))
+
+ rdd <- parallelize(sc, list(list(1, "a"), list(2, "b")))
+ vals <- collectAsMap(rdd)
+ expect_equal(vals, list(`1` = "a", `2` = "b"))
+})
+
+test_that("show()", {
+ skip_on_cran()
+
+ rdd <- parallelize(sc, list(1:10))
+ expect_output(showRDD(rdd), "ParallelCollectionRDD\\[\\d+\\] at parallelize at RRDD\\.scala:\\d+")
+})
+
+test_that("sampleByKey() on pairwise RDDs", {
+ skip_on_cran()
+
+ rdd <- parallelize(sc, 1:2000)
+ pairsRDD <- lapply(rdd, function(x) { if (x %% 2 == 0) list("a", x) else list("b", x) })
+ fractions <- list(a = 0.2, b = 0.1)
+ sample <- sampleByKey(pairsRDD, FALSE, fractions, 1618L)
+ expect_equal(100 < length(lookup(sample, "a")) && 300 > length(lookup(sample, "a")), TRUE)
+ expect_equal(50 < length(lookup(sample, "b")) && 150 > length(lookup(sample, "b")), TRUE)
+ expect_equal(lookup(sample, "a")[which.min(lookup(sample, "a"))] >= 0, TRUE)
+ expect_equal(lookup(sample, "a")[which.max(lookup(sample, "a"))] <= 2000, TRUE)
+ expect_equal(lookup(sample, "b")[which.min(lookup(sample, "b"))] >= 0, TRUE)
+ expect_equal(lookup(sample, "b")[which.max(lookup(sample, "b"))] <= 2000, TRUE)
+
+ rdd <- parallelize(sc, 1:2000)
+ pairsRDD <- lapply(rdd, function(x) { if (x %% 2 == 0) list(2, x) else list(3, x) })
+ fractions <- list(`2` = 0.2, `3` = 0.1)
+ sample <- sampleByKey(pairsRDD, TRUE, fractions, 1618L)
+ expect_equal(100 < length(lookup(sample, 2)) && 300 > length(lookup(sample, 2)), TRUE)
+ expect_equal(50 < length(lookup(sample, 3)) && 150 > length(lookup(sample, 3)), TRUE)
+ expect_equal(lookup(sample, 2)[which.min(lookup(sample, 2))] >= 0, TRUE)
+ expect_equal(lookup(sample, 2)[which.max(lookup(sample, 2))] <= 2000, TRUE)
+ expect_equal(lookup(sample, 3)[which.min(lookup(sample, 3))] >= 0, TRUE)
+ expect_equal(lookup(sample, 3)[which.max(lookup(sample, 3))] <= 2000, TRUE)
+})
+
+test_that("Test correct concurrency of RRDD.compute()", {
+ skip_on_cran()
+
+ rdd <- parallelize(sc, 1:1000, 100)
+ jrdd <- getJRDD(lapply(rdd, function(x) { x }), "row")
+ zrdd <- callJMethod(jrdd, "zip", jrdd)
+ count <- callJMethod(zrdd, "count")
+ expect_equal(count, 1000)
+})
+
+sparkR.session.stop()
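Many of the pair-RDD assertions in the file above compare results through sortKeyValueList, because output order depends on partitioning. A rough plain-R sketch of that kind of helper (hypothetical, not the actual SparkR test utility) looks like this:

sortKeyValueListSketch <- function(kvList, decreasing = FALSE) {
  # Order a list of list(key, value) pairs by the string form of their keys,
  # so comparisons do not depend on which partition produced each pair.
  keys <- vapply(kvList, function(kv) as.character(kv[[1]]), character(1))
  kvList[order(keys, decreasing = decreasing)]
}

sortKeyValueListSketch(list(list("b", 4), list("a", 1)))
# list(list("a", 1), list("b", 4))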
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_shuffle.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_shuffle.R b/R/pkg/tests/fulltests/test_shuffle.R
new file mode 100644
index 0000000..18320ea
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_shuffle.R
@@ -0,0 +1,248 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("partitionBy, groupByKey, reduceByKey etc.")
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+# Data
+intPairs <- list(list(1L, -1), list(2L, 100), list(2L, 1), list(1L, 200))
+intRdd <- parallelize(sc, intPairs, 2L)
+
+doublePairs <- list(list(1.5, -1), list(2.5, 100), list(2.5, 1), list(1.5, 200))
+doubleRdd <- parallelize(sc, doublePairs, 2L)
+
+numPairs <- list(list(1L, 100), list(2L, 200), list(4L, -1), list(3L, 1),
+ list(3L, 0))
+numPairsRdd <- parallelize(sc, numPairs, length(numPairs))
+
+strList <- list("Dexter Morgan: Blood. Sometimes it sets my teeth on edge and ",
+ "Dexter Morgan: Harry and Dorris Morgan did a wonderful job ")
+strListRDD <- parallelize(sc, strList, 4)
+
+test_that("groupByKey for integers", {
+ skip_on_cran()
+
+ grouped <- groupByKey(intRdd, 2L)
+
+ actual <- collectRDD(grouped)
+
+ expected <- list(list(2L, list(100, 1)), list(1L, list(-1, 200)))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+})
+
+test_that("groupByKey for doubles", {
+ skip_on_cran()
+
+ grouped <- groupByKey(doubleRdd, 2L)
+
+ actual <- collectRDD(grouped)
+
+ expected <- list(list(1.5, list(-1, 200)), list(2.5, list(100, 1)))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+})
+
+test_that("reduceByKey for ints", {
+ skip_on_cran()
+
+ reduced <- reduceByKey(intRdd, "+", 2L)
+
+ actual <- collectRDD(reduced)
+
+ expected <- list(list(2L, 101), list(1L, 199))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+})
+
+test_that("reduceByKey for doubles", {
+ skip_on_cran()
+
+ reduced <- reduceByKey(doubleRdd, "+", 2L)
+ actual <- collectRDD(reduced)
+
+ expected <- list(list(1.5, 199), list(2.5, 101))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+})
+
+test_that("combineByKey for ints", {
+ skip_on_cran()
+
+ reduced <- combineByKey(intRdd, function(x) { x }, "+", "+", 2L)
+
+ actual <- collectRDD(reduced)
+
+ expected <- list(list(2L, 101), list(1L, 199))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+})
+
+test_that("combineByKey for doubles", {
+ skip_on_cran()
+
+ reduced <- combineByKey(doubleRdd, function(x) { x }, "+", "+", 2L)
+ actual <- collectRDD(reduced)
+
+ expected <- list(list(1.5, 199), list(2.5, 101))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+})
+
+test_that("combineByKey for characters", {
+ skip_on_cran()
+
+ stringKeyRDD <- parallelize(sc,
+ list(list("max", 1L), list("min", 2L),
+ list("other", 3L), list("max", 4L)), 2L)
+ reduced <- combineByKey(stringKeyRDD,
+ function(x) { x }, "+", "+", 2L)
+ actual <- collectRDD(reduced)
+
+ expected <- list(list("max", 5L), list("min", 2L), list("other", 3L))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+})
+
+test_that("aggregateByKey", {
+ skip_on_cran()
+
+ # test aggregateByKey for int keys
+ rdd <- parallelize(sc, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
+
+ zeroValue <- list(0, 0)
+ seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
+ combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
+ aggregatedRDD <- aggregateByKey(rdd, zeroValue, seqOp, combOp, 2L)
+
+ actual <- collectRDD(aggregatedRDD)
+
+ expected <- list(list(1, list(3, 2)), list(2, list(7, 2)))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+
+ # test aggregateByKey for string keys
+ rdd <- parallelize(sc, list(list("a", 1), list("a", 2), list("b", 3), list("b", 4)))
+
+ zeroValue <- list(0, 0)
+ seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
+ combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
+ aggregatedRDD <- aggregateByKey(rdd, zeroValue, seqOp, combOp, 2L)
+
+ actual <- collectRDD(aggregatedRDD)
+
+ expected <- list(list("a", list(3, 2)), list("b", list(7, 2)))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+})
+
+test_that("foldByKey", {
+ skip_on_cran()
+
+ # test foldByKey for int keys
+ folded <- foldByKey(intRdd, 0, "+", 2L)
+
+ actual <- collectRDD(folded)
+
+ expected <- list(list(2L, 101), list(1L, 199))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+
+ # test foldByKey for double keys
+ folded <- foldByKey(doubleRdd, 0, "+", 2L)
+
+ actual <- collectRDD(folded)
+
+ expected <- list(list(1.5, 199), list(2.5, 101))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+
+ # test foldByKey for string keys
+ stringKeyPairs <- list(list("a", -1), list("b", 100), list("b", 1), list("a", 200))
+
+ stringKeyRDD <- parallelize(sc, stringKeyPairs)
+ folded <- foldByKey(stringKeyRDD, 0, "+", 2L)
+
+ actual <- collectRDD(folded)
+
+ expected <- list(list("b", 101), list("a", 199))
+ expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+
+ # test foldByKey for empty pair RDD
+ rdd <- parallelize(sc, list())
+ folded <- foldByKey(rdd, 0, "+", 2L)
+ actual <- collectRDD(folded)
+ expected <- list()
+ expect_equal(actual, expected)
+
+ # test foldByKey for RDD with only 1 pair
+ rdd <- parallelize(sc, list(list(1, 1)))
+ folded <- foldByKey(rdd, 0, "+", 2L)
+ actual <- collectRDD(folded)
+ expected <- list(list(1, 1))
+ expect_equal(actual, expected)
+})
+
+test_that("partitionBy() partitions data correctly", {
+ skip_on_cran()
+
+ # Partition by magnitude
+ partitionByMagnitude <- function(key) { if (key >= 3) 1 else 0 }
+
+ resultRDD <- partitionByRDD(numPairsRdd, 2L, partitionByMagnitude)
+
+ expected_first <- list(list(1, 100), list(2, 200)) # key less than 3
+ expected_second <- list(list(4, -1), list(3, 1), list(3, 0)) # key greater than or equal to 3
+ actual_first <- collectPartition(resultRDD, 0L)
+ actual_second <- collectPartition(resultRDD, 1L)
+
+ expect_equal(sortKeyValueList(actual_first), sortKeyValueList(expected_first))
+ expect_equal(sortKeyValueList(actual_second), sortKeyValueList(expected_second))
+})
+
+test_that("partitionBy works with dependencies", {
+ skip_on_cran()
+
+ kOne <- 1
+ partitionByParity <- function(key) { if (key %% 2 == kOne) 7 else 4 }
+
+ # Partition by parity
+ resultRDD <- partitionByRDD(numPairsRdd, numPartitions = 2L, partitionByParity)
+
+ # keys even; e.g. 4 %% 2 == 0
+ expected_first <- list(list(2, 200), list(4, -1))
+ # keys odd; 3 %% 2 == 1
+ expected_second <- list(list(1, 100), list(3, 1), list(3, 0))
+ actual_first <- collectPartition(resultRDD, 0L)
+ actual_second <- collectPartition(resultRDD, 1L)
+
+ expect_equal(sortKeyValueList(actual_first), sortKeyValueList(expected_first))
+ expect_equal(sortKeyValueList(actual_second), sortKeyValueList(expected_second))
+})
+
+test_that("test partitionBy with string keys", {
+ skip_on_cran()
+
+ words <- flatMap(strListRDD, function(line) { strsplit(line, " ")[[1]] })
+ wordCount <- lapply(words, function(word) { list(word, 1L) })
+
+ resultRDD <- partitionByRDD(wordCount, 2L)
+ expected_first <- list(list("Dexter", 1), list("Dexter", 1))
+ expected_second <- list(list("and", 1), list("and", 1))
+
+ actual_first <- Filter(function(item) { item[[1]] == "Dexter" },
+ collectPartition(resultRDD, 0L))
+ actual_second <- Filter(function(item) { item[[1]] == "and" },
+ collectPartition(resultRDD, 1L))
+
+ expect_equal(sortKeyValueList(actual_first), sortKeyValueList(expected_first))
+ expect_equal(sortKeyValueList(actual_second), sortKeyValueList(expected_second))
+})
+
+sparkR.session.stop()
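The two partitionBy tests above can be checked in plain R. The sketch below only reproduces what the expectations assert, assuming the value returned by the partition function is taken modulo the number of partitions (the actual routing happens on the JVM side):

numPairs <- list(list(1L, 100), list(2L, 200), list(4L, -1), list(3L, 1), list(3L, 0))
partitionByParity <- function(key) { if (key %% 2 == 1) 7 else 4 }

# Partition index each pair would land in with 2 partitions.
partitionIndex <- vapply(numPairs, function(kv) partitionByParity(kv[[1]]) %% 2, numeric(1))
split(numPairs, partitionIndex)
# $`0` holds the even keys (2 and 4); $`1` holds the odd keys (1, 3, 3)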
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_sparkR.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_sparkR.R b/R/pkg/tests/fulltests/test_sparkR.R
new file mode 100644
index 0000000..a40981c
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_sparkR.R
@@ -0,0 +1,48 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("functions in sparkR.R")
+
+test_that("sparkCheckInstall", {
+ skip_on_cran()
+
+ # "local, yarn-client, mesos-client" mode, SPARK_HOME was set correctly,
+ # and the SparkR job was submitted by "spark-submit"
+ sparkHome <- paste0(tempdir(), "/", "sparkHome")
+ dir.create(sparkHome)
+ master <- ""
+ deployMode <- ""
+ expect_true(is.null(sparkCheckInstall(sparkHome, master, deployMode)))
+ unlink(sparkHome, recursive = TRUE)
+
+ # "yarn-cluster, mesos-cluster" mode, SPARK_HOME was not set,
+ # and the SparkR job was submitted by "spark-submit"
+ sparkHome <- ""
+ master <- ""
+ deployMode <- ""
+ expect_true(is.null(sparkCheckInstall(sparkHome, master, deployMode)))
+
+ # "yarn-client, mesos-client" mode, SPARK_HOME was not set
+ sparkHome <- ""
+ master <- "yarn-client"
+ deployMode <- ""
+ expect_error(sparkCheckInstall(sparkHome, master, deployMode))
+ sparkHome <- ""
+ master <- ""
+ deployMode <- "client"
+ expect_error(sparkCheckInstall(sparkHome, master, deployMode))
+})
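The sparkCheckInstall cases above reduce to one decision: fail only when SPARK_HOME is unset and a client deploy mode is requested. A toy reconstruction of just those four assertions (not the actual SparkR implementation):

checkInstallSketch <- function(sparkHome, master, deployMode) {
  # Hypothetical logic covering only the cases exercised by the test above.
  clientMode <- grepl("-client$", master) || identical(deployMode, "client")
  if (nzchar(sparkHome) || !clientMode) {
    NULL  # SPARK_HOME is set, or the job runs in cluster mode: nothing to check
  } else {
    stop("SPARK_HOME is not set for a client-mode job")
  }
}

is.null(checkInstallSketch(tempdir(), "", ""))  # TRUE
is.null(checkInstallSketch("", "", ""))         # TRUE
inherits(try(checkInstallSketch("", "yarn-client", ""), silent = TRUE), "try-error")  # TRUE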
[4/7] spark git commit: [SPARK-20877][SPARKR] refactor tests to basic
tests only for CRAN
Posted by fe...@apache.org.
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_streaming.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_streaming.R b/R/pkg/inst/tests/testthat/test_streaming.R
deleted file mode 100644
index b20b431..0000000
--- a/R/pkg/inst/tests/testthat/test_streaming.R
+++ /dev/null
@@ -1,167 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-library(testthat)
-
-context("Structured Streaming")
-
-# Tests for Structured Streaming functions in SparkR
-
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-
-jsonSubDir <- file.path("sparkr-test", "json", "")
-if (.Platform$OS.type == "windows") {
- # file.path removes the empty separator on Windows, adds it back
- jsonSubDir <- paste0(jsonSubDir, .Platform$file.sep)
-}
-jsonDir <- file.path(tempdir(), jsonSubDir)
-dir.create(jsonDir, recursive = TRUE)
-
-mockLines <- c("{\"name\":\"Michael\"}",
- "{\"name\":\"Andy\", \"age\":30}",
- "{\"name\":\"Justin\", \"age\":19}")
-jsonPath <- tempfile(pattern = jsonSubDir, fileext = ".tmp")
-writeLines(mockLines, jsonPath)
-
-mockLinesNa <- c("{\"name\":\"Bob\",\"age\":16,\"height\":176.5}",
- "{\"name\":\"Alice\",\"age\":null,\"height\":164.3}",
- "{\"name\":\"David\",\"age\":60,\"height\":null}")
-jsonPathNa <- tempfile(pattern = jsonSubDir, fileext = ".tmp")
-
-schema <- structType(structField("name", "string"),
- structField("age", "integer"),
- structField("count", "double"))
-
-test_that("read.stream, write.stream, awaitTermination, stopQuery", {
- skip_on_cran()
-
- df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1)
- expect_true(isStreaming(df))
- counts <- count(group_by(df, "name"))
- q <- write.stream(counts, "memory", queryName = "people", outputMode = "complete")
-
- expect_false(awaitTermination(q, 5 * 1000))
- callJMethod(q@ssq, "processAllAvailable")
- expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 3)
-
- writeLines(mockLinesNa, jsonPathNa)
- awaitTermination(q, 5 * 1000)
- callJMethod(q@ssq, "processAllAvailable")
- expect_equal(head(sql("SELECT count(*) FROM people"))[[1]], 6)
-
- stopQuery(q)
- expect_true(awaitTermination(q, 1))
- expect_error(awaitTermination(q), NA)
-})
-
-test_that("print from explain, lastProgress, status, isActive", {
- skip_on_cran()
-
- df <- read.stream("json", path = jsonDir, schema = schema)
- expect_true(isStreaming(df))
- counts <- count(group_by(df, "name"))
- q <- write.stream(counts, "memory", queryName = "people2", outputMode = "complete")
-
- awaitTermination(q, 5 * 1000)
- callJMethod(q@ssq, "processAllAvailable")
-
- expect_equal(capture.output(explain(q))[[1]], "== Physical Plan ==")
- expect_true(any(grepl("\"description\" : \"MemorySink\"", capture.output(lastProgress(q)))))
- expect_true(any(grepl("\"isTriggerActive\" : ", capture.output(status(q)))))
-
- expect_equal(queryName(q), "people2")
- expect_true(isActive(q))
-
- stopQuery(q)
-})
-
-test_that("Stream other format", {
- skip_on_cran()
-
- parquetPath <- tempfile(pattern = "sparkr-test", fileext = ".parquet")
- df <- read.df(jsonPath, "json", schema)
- write.df(df, parquetPath, "parquet", "overwrite")
-
- df <- read.stream(path = parquetPath, schema = schema)
- expect_true(isStreaming(df))
- counts <- count(group_by(df, "name"))
- q <- write.stream(counts, "memory", queryName = "people3", outputMode = "complete")
-
- expect_false(awaitTermination(q, 5 * 1000))
- callJMethod(q@ssq, "processAllAvailable")
- expect_equal(head(sql("SELECT count(*) FROM people3"))[[1]], 3)
-
- expect_equal(queryName(q), "people3")
- expect_true(any(grepl("\"description\" : \"FileStreamSource[[:print:]]+parquet",
- capture.output(lastProgress(q)))))
- expect_true(isActive(q))
-
- stopQuery(q)
- expect_true(awaitTermination(q, 1))
- expect_false(isActive(q))
-
- unlink(parquetPath)
-})
-
-test_that("Non-streaming DataFrame", {
- skip_on_cran()
-
- c <- as.DataFrame(cars)
- expect_false(isStreaming(c))
-
- expect_error(write.stream(c, "memory", queryName = "people", outputMode = "complete"),
- paste0(".*(writeStream : analysis error - 'writeStream' can be called only on ",
- "streaming Dataset/DataFrame).*"))
-})
-
-test_that("Unsupported operation", {
- skip_on_cran()
-
- # memory sink without aggregation
- df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = 1)
- expect_error(write.stream(df, "memory", queryName = "people", outputMode = "complete"),
- paste0(".*(start : analysis error - Complete output mode not supported when there ",
- "are no streaming aggregations on streaming DataFrames/Datasets).*"))
-})
-
-test_that("Terminated by error", {
- skip_on_cran()
-
- df <- read.stream("json", path = jsonDir, schema = schema, maxFilesPerTrigger = -1)
- counts <- count(group_by(df, "name"))
- # This would not fail before returning with a StreamingQuery,
- # but could dump error log at just about the same time
- expect_error(q <- write.stream(counts, "memory", queryName = "people4", outputMode = "complete"),
- NA)
-
- expect_error(awaitTermination(q, 5 * 1000),
- paste0(".*(awaitTermination : streaming query error - Invalid value '-1' for option",
- " 'maxFilesPerTrigger', must be a positive integer).*"))
-
- expect_true(any(grepl("\"message\" : \"Terminated with exception: Invalid value",
- capture.output(status(q)))))
- expect_true(any(grepl("Streaming query has no progress", capture.output(lastProgress(q)))))
- expect_equal(queryName(q), "people4")
- expect_false(isActive(q))
-
- stopQuery(q)
-})
-
-unlink(jsonPath)
-unlink(jsonPathNa)
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_take.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_take.R b/R/pkg/inst/tests/testthat/test_take.R
deleted file mode 100644
index c00723b..0000000
--- a/R/pkg/inst/tests/testthat/test_take.R
+++ /dev/null
@@ -1,71 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("tests RDD function take()")
-
-# Mock data
-numVector <- c(-10:97)
-numList <- list(sqrt(1), sqrt(2), sqrt(3), 4 ** 10)
-strVector <- c("Dexter Morgan: I suppose I should be upset, even feel",
- "violated, but I'm not. No, in fact, I think this is a friendly",
- "message, like \"Hey, wanna play?\" and yes, I want to play. ",
- "I really, really do.")
-strList <- list("Dexter Morgan: Blood. Sometimes it sets my teeth on edge, ",
- "other times it helps me control the chaos.",
- "Dexter Morgan: Harry and Dorris Morgan did a wonderful job ",
- "raising me. But they're both dead now. I didn't kill them. Honest.")
-
-# JavaSparkContext handle
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-test_that("take() gives back the original elements in correct count and order", {
- skip_on_cran()
-
- numVectorRDD <- parallelize(sc, numVector, 10)
- # case: number of elements to take is less than the size of the first partition
- expect_equal(takeRDD(numVectorRDD, 1), as.list(head(numVector, n = 1)))
- # case: number of elements to take is the same as the size of the first partition
- expect_equal(takeRDD(numVectorRDD, 11), as.list(head(numVector, n = 11)))
- # case: number of elements to take is greater than all elements
- expect_equal(takeRDD(numVectorRDD, length(numVector)), as.list(numVector))
- expect_equal(takeRDD(numVectorRDD, length(numVector) + 1), as.list(numVector))
-
- numListRDD <- parallelize(sc, numList, 1)
- numListRDD2 <- parallelize(sc, numList, 4)
- expect_equal(takeRDD(numListRDD, 3), takeRDD(numListRDD2, 3))
- expect_equal(takeRDD(numListRDD, 5), takeRDD(numListRDD2, 5))
- expect_equal(takeRDD(numListRDD, 1), as.list(head(numList, n = 1)))
- expect_equal(takeRDD(numListRDD2, 999), numList)
-
- strVectorRDD <- parallelize(sc, strVector, 2)
- strVectorRDD2 <- parallelize(sc, strVector, 3)
- expect_equal(takeRDD(strVectorRDD, 4), as.list(strVector))
- expect_equal(takeRDD(strVectorRDD2, 2), as.list(head(strVector, n = 2)))
-
- strListRDD <- parallelize(sc, strList, 4)
- strListRDD2 <- parallelize(sc, strList, 1)
- expect_equal(takeRDD(strListRDD, 3), as.list(head(strList, n = 3)))
- expect_equal(takeRDD(strListRDD2, 1), as.list(head(strList, n = 1)))
-
- expect_equal(length(takeRDD(strListRDD, 0)), 0)
- expect_equal(length(takeRDD(strVectorRDD, 0)), 0)
- expect_equal(length(takeRDD(numListRDD, 0)), 0)
- expect_equal(length(takeRDD(numVectorRDD, 0)), 0)
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_textFile.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_textFile.R b/R/pkg/inst/tests/testthat/test_textFile.R
deleted file mode 100644
index e8a961c..0000000
--- a/R/pkg/inst/tests/testthat/test_textFile.R
+++ /dev/null
@@ -1,182 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("the textFile() function")
-
-# JavaSparkContext handle
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-mockFile <- c("Spark is pretty.", "Spark is awesome.")
-
-test_that("textFile() on a local file returns an RDD", {
- skip_on_cran()
-
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName)
-
- rdd <- textFile(sc, fileName)
- expect_is(rdd, "RDD")
- expect_true(countRDD(rdd) > 0)
- expect_equal(countRDD(rdd), 2)
-
- unlink(fileName)
-})
-
-test_that("textFile() followed by a collect() returns the same content", {
- skip_on_cran()
-
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName)
-
- rdd <- textFile(sc, fileName)
- expect_equal(collectRDD(rdd), as.list(mockFile))
-
- unlink(fileName)
-})
-
-test_that("textFile() word count works as expected", {
- skip_on_cran()
-
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName)
-
- rdd <- textFile(sc, fileName)
-
- words <- flatMap(rdd, function(line) { strsplit(line, " ")[[1]] })
- wordCount <- lapply(words, function(word) { list(word, 1L) })
-
- counts <- reduceByKey(wordCount, "+", 2L)
- output <- collectRDD(counts)
- expected <- list(list("pretty.", 1), list("is", 2), list("awesome.", 1),
- list("Spark", 2))
- expect_equal(sortKeyValueList(output), sortKeyValueList(expected))
-
- unlink(fileName)
-})
-
-test_that("several transformations on RDD created by textFile()", {
- skip_on_cran()
-
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName)
-
- rdd <- textFile(sc, fileName) # RDD
- for (i in 1:10) {
- # PipelinedRDD initially created from RDD
- rdd <- lapply(rdd, function(x) paste(x, x))
- }
- collectRDD(rdd)
-
- unlink(fileName)
-})
-
-test_that("textFile() followed by a saveAsTextFile() returns the same content", {
- skip_on_cran()
-
- fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
- fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName1)
-
- rdd <- textFile(sc, fileName1, 1L)
- saveAsTextFile(rdd, fileName2)
- rdd <- textFile(sc, fileName2)
- expect_equal(collectRDD(rdd), as.list(mockFile))
-
- unlink(fileName1)
- unlink(fileName2)
-})
-
-test_that("saveAsTextFile() on a parallelized list works as expected", {
- skip_on_cran()
-
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
- l <- list(1, 2, 3)
- rdd <- parallelize(sc, l, 1L)
- saveAsTextFile(rdd, fileName)
- rdd <- textFile(sc, fileName)
- expect_equal(collectRDD(rdd), lapply(l, function(x) {toString(x)}))
-
- unlink(fileName)
-})
-
-test_that("textFile() and saveAsTextFile() word count works as expected", {
- skip_on_cran()
-
- fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
- fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName1)
-
- rdd <- textFile(sc, fileName1)
-
- words <- flatMap(rdd, function(line) { strsplit(line, " ")[[1]] })
- wordCount <- lapply(words, function(word) { list(word, 1L) })
-
- counts <- reduceByKey(wordCount, "+", 2L)
-
- saveAsTextFile(counts, fileName2)
- rdd <- textFile(sc, fileName2)
-
- output <- collectRDD(rdd)
- expected <- list(list("awesome.", 1), list("Spark", 2),
- list("pretty.", 1), list("is", 2))
- expectedStr <- lapply(expected, function(x) { toString(x) })
- expect_equal(sortKeyValueList(output), sortKeyValueList(expectedStr))
-
- unlink(fileName1)
- unlink(fileName2)
-})
-
-test_that("textFile() on multiple paths", {
- skip_on_cran()
-
- fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
- fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines("Spark is pretty.", fileName1)
- writeLines("Spark is awesome.", fileName2)
-
- rdd <- textFile(sc, c(fileName1, fileName2))
- expect_equal(countRDD(rdd), 2)
-
- unlink(fileName1)
- unlink(fileName2)
-})
-
-test_that("Pipelined operations on RDDs created using textFile", {
- skip_on_cran()
-
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName)
-
- rdd <- textFile(sc, fileName)
-
- lengths <- lapply(rdd, function(x) { length(x) })
- expect_equal(collectRDD(lengths), list(1, 1))
-
- lengthsPipelined <- lapply(lengths, function(x) { x + 10 })
- expect_equal(collectRDD(lengthsPipelined), list(11, 11))
-
- lengths30 <- lapply(lengthsPipelined, function(x) { x + 20 })
- expect_equal(collectRDD(lengths30), list(31, 31))
-
- lengths20 <- lapply(lengths, function(x) { x + 20 })
- expect_equal(collectRDD(lengths20), list(21, 21))
-
- unlink(fileName)
-})
-
-sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/inst/tests/testthat/test_utils.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/testthat/test_utils.R b/R/pkg/inst/tests/testthat/test_utils.R
deleted file mode 100644
index 6197ae7..0000000
--- a/R/pkg/inst/tests/testthat/test_utils.R
+++ /dev/null
@@ -1,248 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-context("functions in utils.R")
-
-# JavaSparkContext handle
-sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
-sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
-
-test_that("convertJListToRList() gives back (deserializes) the original JLists
- of strings and integers", {
- skip_on_cran()
- # It's hard to manually create a Java List using rJava, since it does not
- # support generics well. Instead, we rely on collectRDD() returning a
- # JList.
- nums <- as.list(1:10)
- rdd <- parallelize(sc, nums, 1L)
- jList <- callJMethod(rdd@jrdd, "collect")
- rList <- convertJListToRList(jList, flatten = TRUE)
- expect_equal(rList, nums)
-
- strs <- as.list("hello", "spark")
- rdd <- parallelize(sc, strs, 2L)
- jList <- callJMethod(rdd@jrdd, "collect")
- rList <- convertJListToRList(jList, flatten = TRUE)
- expect_equal(rList, strs)
-})
-
-test_that("serializeToBytes on RDD", {
- skip_on_cran()
- # File content
- mockFile <- c("Spark is pretty.", "Spark is awesome.")
- fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
- writeLines(mockFile, fileName)
-
- text.rdd <- textFile(sc, fileName)
- expect_equal(getSerializedMode(text.rdd), "string")
- ser.rdd <- serializeToBytes(text.rdd)
- expect_equal(collectRDD(ser.rdd), as.list(mockFile))
- expect_equal(getSerializedMode(ser.rdd), "byte")
-
- unlink(fileName)
-})
-
-test_that("cleanClosure on R functions", {
- y <- c(1, 2, 3)
- g <- function(x) { x + 1 }
- f <- function(x) { g(x) + y }
- newF <- cleanClosure(f)
- env <- environment(newF)
- expect_equal(length(ls(env)), 2) # y, g
- actual <- get("y", envir = env, inherits = FALSE)
- expect_equal(actual, y)
- actual <- get("g", envir = env, inherits = FALSE)
- expect_equal(actual, g)
-
- # Test for nested enclosures and package variables.
- env2 <- new.env()
- funcEnv <- new.env(parent = env2)
- f <- function(x) { log(g(x) + y) }
- environment(f) <- funcEnv # enclosing relationship: f -> funcEnv -> env2 -> .GlobalEnv
- newF <- cleanClosure(f)
- env <- environment(newF)
- expect_equal(length(ls(env)), 2) # "min" should not be included
- actual <- get("y", envir = env, inherits = FALSE)
- expect_equal(actual, y)
- actual <- get("g", envir = env, inherits = FALSE)
- expect_equal(actual, g)
-
- base <- c(1, 2, 3)
- l <- list(field = matrix(1))
- field <- matrix(2)
- defUse <- 3
- g <- function(x) { x + y }
- f <- function(x) {
- defUse <- base::as.integer(x) + 1 # Test for access operators `::`.
- lapply(x, g) + 1 # Test for capturing function call "g"'s closure as an argument of lapply.
- l$field[1, 1] <- 3 # Test for access operators `$`.
- res <- defUse + l$field[1, ] # Test for def-use chain of "defUse", and "" symbol.
- f(res) # Test for recursive calls.
- }
- newF <- cleanClosure(f)
- env <- environment(newF)
- # TODO(shivaram): length(ls(env)) is 4 here for some reason and `lapply` is included in `env`.
- # Disabling this test till we debug this.
- #
- # nolint start
- # expect_equal(length(ls(env)), 3) # Only "g", "l" and "f". No "base", "field" or "defUse".
- # nolint end
- expect_true("g" %in% ls(env))
- expect_true("l" %in% ls(env))
- expect_true("f" %in% ls(env))
- expect_equal(get("l", envir = env, inherits = FALSE), l)
- # "y" should be in the environemnt of g.
- newG <- get("g", envir = env, inherits = FALSE)
- env <- environment(newG)
- expect_equal(length(ls(env)), 1)
- actual <- get("y", envir = env, inherits = FALSE)
- expect_equal(actual, y)
-
- # Test for function (and variable) definitions.
- f <- function(x) {
- g <- function(y) { y * 2 }
- g(x)
- }
- newF <- cleanClosure(f)
- env <- environment(newF)
- expect_equal(length(ls(env)), 0) # "y" and "g" should not be included.
-
- # Test for overriding variables in base namespace (Issue: SparkR-196).
- nums <- as.list(1:10)
- rdd <- parallelize(sc, nums, 2L)
- t <- 4 # Override base::t in .GlobalEnv.
- f <- function(x) { x > t }
- newF <- cleanClosure(f)
- env <- environment(newF)
- expect_equal(ls(env), "t")
- expect_equal(get("t", envir = env, inherits = FALSE), t)
- actual <- collectRDD(lapply(rdd, f))
- expected <- as.list(c(rep(FALSE, 4), rep(TRUE, 6)))
- expect_equal(actual, expected)
-
- # Test for broadcast variables.
- a <- matrix(nrow = 10, ncol = 10, data = rnorm(100))
- aBroadcast <- broadcastRDD(sc, a)
- normMultiply <- function(x) { norm(aBroadcast$value) * x }
- newnormMultiply <- SparkR:::cleanClosure(normMultiply)
- env <- environment(newnormMultiply)
- expect_equal(ls(env), "aBroadcast")
- expect_equal(get("aBroadcast", envir = env, inherits = FALSE), aBroadcast)
-})
-
-test_that("varargsToJProperties", {
- jprops <- newJObject("java.util.Properties")
- expect_true(class(jprops) == "jobj")
-
- jprops <- varargsToJProperties(abc = "123")
- expect_true(class(jprops) == "jobj")
- expect_equal(callJMethod(jprops, "getProperty", "abc"), "123")
-
- jprops <- varargsToJProperties(abc = "abc", b = 1)
- expect_equal(callJMethod(jprops, "getProperty", "abc"), "abc")
- expect_equal(callJMethod(jprops, "getProperty", "b"), "1")
-
- jprops <- varargsToJProperties()
- expect_equal(callJMethod(jprops, "size"), 0L)
-})
-
-test_that("convertToJSaveMode", {
- s <- convertToJSaveMode("error")
- expect_true(class(s) == "jobj")
- expect_match(capture.output(print.jobj(s)), "Java ref type org.apache.spark.sql.SaveMode id ")
- expect_error(convertToJSaveMode("foo"),
- 'mode should be one of "append", "overwrite", "error", "ignore"') #nolint
-})
-
-test_that("captureJVMException", {
- skip_on_cran()
-
- method <- "createStructField"
- expect_error(tryCatch(callJStatic("org.apache.spark.sql.api.r.SQLUtils", method,
- "col", "unknown", TRUE),
- error = function(e) {
- captureJVMException(e, method)
- }),
- "parse error - .*DataType unknown.*not supported.")
-})
-
-test_that("hashCode", {
- skip_on_cran()
-
- expect_error(hashCode("bc53d3605e8a5b7de1e8e271c2317645"), NA)
-})
-
-test_that("overrideEnvs", {
- config <- new.env()
- config[["spark.master"]] <- "foo"
- config[["config_only"]] <- "ok"
- param <- new.env()
- param[["spark.master"]] <- "local"
- param[["param_only"]] <- "blah"
- overrideEnvs(config, param)
- expect_equal(config[["spark.master"]], "local")
- expect_equal(config[["param_only"]], "blah")
- expect_equal(config[["config_only"]], "ok")
-})
-
-test_that("rbindRaws", {
-
- # Mixed Column types
- r <- serialize(1:5, connection = NULL)
- r1 <- serialize(1, connection = NULL)
- r2 <- serialize(letters, connection = NULL)
- r3 <- serialize(1:10, connection = NULL)
- inputData <- list(list(1L, r1, "a", r), list(2L, r2, "b", r),
- list(3L, r3, "c", r))
- expected <- data.frame(V1 = 1:3)
- expected$V2 <- list(r1, r2, r3)
- expected$V3 <- c("a", "b", "c")
- expected$V4 <- list(r, r, r)
- result <- rbindRaws(inputData)
- expect_equal(expected, result)
-
- # Single binary column
- input <- list(list(r1), list(r2), list(r3))
- expected <- subset(expected, select = "V2")
- result <- setNames(rbindRaws(input), "V2")
- expect_equal(expected, result)
-
-})
-
-test_that("varargsToStrEnv", {
- strenv <- varargsToStrEnv(a = 1, b = 1.1, c = TRUE, d = "abcd")
- env <- varargsToEnv(a = "1", b = "1.1", c = "true", d = "abcd")
- expect_equal(strenv, env)
- expect_error(varargsToStrEnv(a = list(1, "a")),
- paste0("Unsupported type for a : list. Supported types are logical, ",
- "numeric, character and NULL."))
- expect_warning(varargsToStrEnv(a = 1, 2, 3, 4), "Unnamed arguments ignored: 2, 3, 4.")
- expect_warning(varargsToStrEnv(1, 2, 3, 4), "Unnamed arguments ignored: 1, 2, 3, 4.")
-})
-
-test_that("basenameSansExtFromUrl", {
- x <- paste0("http://people.apache.org/~pwendell/spark-nightly/spark-branch-2.1-bin/spark-2.1.1-",
- "SNAPSHOT-2016_12_09_11_08-eb2d9bf-bin/spark-2.1.1-SNAPSHOT-bin-hadoop2.7.tgz")
- expect_equal(basenameSansExtFromUrl(x), "spark-2.1.1-SNAPSHOT-bin-hadoop2.7")
- z <- "http://people.apache.org/~pwendell/spark-releases/spark-2.1.0--hive.tar.gz"
- expect_equal(basenameSansExtFromUrl(z), "spark-2.1.0--hive")
-})
-
-sparkR.session.stop()
-
-message("--- End test (utils) ", as.POSIXct(Sys.time(), tz = "GMT"))
-message("elapsed ", (proc.time() - timer_ptm)[3])
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/jarTest.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/jarTest.R b/R/pkg/tests/fulltests/jarTest.R
new file mode 100644
index 0000000..e2241e0
--- /dev/null
+++ b/R/pkg/tests/fulltests/jarTest.R
@@ -0,0 +1,32 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+library(SparkR)
+
+sc <- sparkR.session(master = "local[1]")
+
+helloTest <- SparkR:::callJStatic("sparkrtest.DummyClass",
+ "helloWorld",
+ "Dave")
+stopifnot(identical(helloTest, "Hello Dave"))
+
+basicFunction <- SparkR:::callJStatic("sparkrtest.DummyClass",
+ "addStuff",
+ 2L,
+ 2L)
+stopifnot(basicFunction == 4L)
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/packageInAJarTest.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/packageInAJarTest.R b/R/pkg/tests/fulltests/packageInAJarTest.R
new file mode 100644
index 0000000..ac70626
--- /dev/null
+++ b/R/pkg/tests/fulltests/packageInAJarTest.R
@@ -0,0 +1,30 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+library(SparkR)
+library(sparkPackageTest)
+
+sparkR.session(master = "local[1]")
+
+run1 <- myfunc(5L)
+
+run2 <- myfunc(-4L)
+
+sparkR.session.stop()
+
+if (run1 != 6) quit(save = "no", status = 1)
+
+if (run2 != -3) quit(save = "no", status = 1)
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_Serde.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_Serde.R b/R/pkg/tests/fulltests/test_Serde.R
new file mode 100644
index 0000000..6e160fa
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_Serde.R
@@ -0,0 +1,85 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("SerDe functionality")
+
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+test_that("SerDe of primitive types", {
+ skip_on_cran()
+
+ x <- callJStatic("SparkRHandler", "echo", 1L)
+ expect_equal(x, 1L)
+ expect_equal(class(x), "integer")
+
+ x <- callJStatic("SparkRHandler", "echo", 1)
+ expect_equal(x, 1)
+ expect_equal(class(x), "numeric")
+
+ x <- callJStatic("SparkRHandler", "echo", TRUE)
+ expect_true(x)
+ expect_equal(class(x), "logical")
+
+ x <- callJStatic("SparkRHandler", "echo", "abc")
+ expect_equal(x, "abc")
+ expect_equal(class(x), "character")
+})
+
+test_that("SerDe of list of primitive types", {
+ skip_on_cran()
+
+ x <- list(1L, 2L, 3L)
+ y <- callJStatic("SparkRHandler", "echo", x)
+ expect_equal(x, y)
+ expect_equal(class(y[[1]]), "integer")
+
+ x <- list(1, 2, 3)
+ y <- callJStatic("SparkRHandler", "echo", x)
+ expect_equal(x, y)
+ expect_equal(class(y[[1]]), "numeric")
+
+ x <- list(TRUE, FALSE)
+ y <- callJStatic("SparkRHandler", "echo", x)
+ expect_equal(x, y)
+ expect_equal(class(y[[1]]), "logical")
+
+ x <- list("a", "b", "c")
+ y <- callJStatic("SparkRHandler", "echo", x)
+ expect_equal(x, y)
+ expect_equal(class(y[[1]]), "character")
+
+ # Empty list
+ x <- list()
+ y <- callJStatic("SparkRHandler", "echo", x)
+ expect_equal(x, y)
+})
+
+test_that("SerDe of list of lists", {
+ skip_on_cran()
+
+ x <- list(list(1L, 2L, 3L), list(1, 2, 3),
+ list(TRUE, FALSE), list("a", "b", "c"))
+ y <- callJStatic("SparkRHandler", "echo", x)
+ expect_equal(x, y)
+
+ # List of empty lists
+ x <- list(list(), list())
+ y <- callJStatic("SparkRHandler", "echo", x)
+ expect_equal(x, y)
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_Windows.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_Windows.R b/R/pkg/tests/fulltests/test_Windows.R
new file mode 100644
index 0000000..00d684e
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_Windows.R
@@ -0,0 +1,32 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+context("Windows-specific tests")
+
+test_that("sparkJars tag in SparkContext", {
+ skip_on_cran()
+
+ if (.Platform$OS.type != "windows") {
+ skip("This test is only for Windows, skipped")
+ }
+
+ testOutput <- launchScript("ECHO", "a/b/c", wait = TRUE)
+ abcPath <- testOutput[1]
+ expect_equal(abcPath, "a\\b\\c")
+})
+
+message("--- End test (Windows) ", as.POSIXct(Sys.time(), tz = "GMT"))
+message("elapsed ", (proc.time() - timer_ptm)[3])
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_binaryFile.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_binaryFile.R b/R/pkg/tests/fulltests/test_binaryFile.R
new file mode 100644
index 0000000..00954fa
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_binaryFile.R
@@ -0,0 +1,100 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("functions on binary files")
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+mockFile <- c("Spark is pretty.", "Spark is awesome.")
+
+test_that("saveAsObjectFile()/objectFile() following textFile() works", {
+ skip_on_cran()
+
+ fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName1)
+
+ rdd <- textFile(sc, fileName1, 1)
+ saveAsObjectFile(rdd, fileName2)
+ rdd <- objectFile(sc, fileName2)
+ expect_equal(collectRDD(rdd), as.list(mockFile))
+
+ unlink(fileName1)
+ unlink(fileName2, recursive = TRUE)
+})
+
+test_that("saveAsObjectFile()/objectFile() works on a parallelized list", {
+ skip_on_cran()
+
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+
+ l <- list(1, 2, 3)
+ rdd <- parallelize(sc, l, 1)
+ saveAsObjectFile(rdd, fileName)
+ rdd <- objectFile(sc, fileName)
+ expect_equal(collectRDD(rdd), l)
+
+ unlink(fileName, recursive = TRUE)
+})
+
+test_that("saveAsObjectFile()/objectFile() following RDD transformations works", {
+ skip_on_cran()
+
+ fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName1)
+
+ rdd <- textFile(sc, fileName1)
+
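+ # word count: split each line into words, map each word to (word, 1L), then sum counts per word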
+ words <- flatMap(rdd, function(line) { strsplit(line, " ")[[1]] })
+ wordCount <- lapply(words, function(word) { list(word, 1L) })
+
+ counts <- reduceByKey(wordCount, "+", 2L)
+
+ saveAsObjectFile(counts, fileName2)
+ counts <- objectFile(sc, fileName2)
+
+ output <- collectRDD(counts)
+ expected <- list(list("awesome.", 1), list("Spark", 2), list("pretty.", 1),
+ list("is", 2))
+ expect_equal(sortKeyValueList(output), sortKeyValueList(expected))
+
+ unlink(fileName1)
+ unlink(fileName2, recursive = TRUE)
+})
+
+test_that("saveAsObjectFile()/objectFile() works with multiple paths", {
+ skip_on_cran()
+
+ fileName1 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ fileName2 <- tempfile(pattern = "spark-test", fileext = ".tmp")
+
+ rdd1 <- parallelize(sc, "Spark is pretty.")
+ saveAsObjectFile(rdd1, fileName1)
+ rdd2 <- parallelize(sc, "Spark is awesome.")
+ saveAsObjectFile(rdd2, fileName2)
+
+ rdd <- objectFile(sc, c(fileName1, fileName2))
+ expect_equal(countRDD(rdd), 2)
+
+ unlink(fileName1, recursive = TRUE)
+ unlink(fileName2, recursive = TRUE)
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_binary_function.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_binary_function.R b/R/pkg/tests/fulltests/test_binary_function.R
new file mode 100644
index 0000000..236cb38
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_binary_function.R
@@ -0,0 +1,110 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("binary functions")
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+# Data
+nums <- 1:10
+rdd <- parallelize(sc, nums, 2L)
+
+# File content
+mockFile <- c("Spark is pretty.", "Spark is awesome.")
+
+test_that("union on two RDDs", {
+ skip_on_cran()
+
+ actual <- collectRDD(unionRDD(rdd, rdd))
+ expect_equal(actual, as.list(rep(nums, 2)))
+
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName)
+
+ text.rdd <- textFile(sc, fileName)
+ union.rdd <- unionRDD(rdd, text.rdd)
+ actual <- collectRDD(union.rdd)
+ expect_equal(actual, c(as.list(nums), mockFile))
+ expect_equal(getSerializedMode(union.rdd), "byte")
+
+ rdd <- map(text.rdd, function(x) {x})
+ union.rdd <- unionRDD(rdd, text.rdd)
+ actual <- collectRDD(union.rdd)
+ expect_equal(actual, as.list(c(mockFile, mockFile)))
+ expect_equal(getSerializedMode(union.rdd), "byte")
+
+ unlink(fileName)
+})
+
+test_that("cogroup on two RDDs", {
+ skip_on_cran()
+
+ rdd1 <- parallelize(sc, list(list(1, 1), list(2, 4)))
+ rdd2 <- parallelize(sc, list(list(1, 2), list(1, 3)))
+ cogroup.rdd <- cogroup(rdd1, rdd2, numPartitions = 2L)
+ actual <- collectRDD(cogroup.rdd)
+ expect_equal(actual,
+ list(list(1, list(list(1), list(2, 3))), list(2, list(list(4), list()))))
+
+ rdd1 <- parallelize(sc, list(list("a", 1), list("a", 4)))
+ rdd2 <- parallelize(sc, list(list("b", 2), list("a", 3)))
+ cogroup.rdd <- cogroup(rdd1, rdd2, numPartitions = 2L)
+ actual <- collectRDD(cogroup.rdd)
+
+ expected <- list(list("b", list(list(), list(2))), list("a", list(list(1, 4), list(3))))
+ expect_equal(sortKeyValueList(actual),
+ sortKeyValueList(expected))
+})
+
+test_that("zipPartitions() on RDDs", {
+ skip_on_cran()
+
+ rdd1 <- parallelize(sc, 1:2, 2L) # 1, 2
+ rdd2 <- parallelize(sc, 1:4, 2L) # 1:2, 3:4
+ rdd3 <- parallelize(sc, 1:6, 2L) # 1:3, 4:6
+ actual <- collectRDD(zipPartitions(rdd1, rdd2, rdd3,
+ func = function(x, y, z) { list(list(x, y, z))} ))
+ expect_equal(actual,
+ list(list(1, c(1, 2), c(1, 2, 3)), list(2, c(3, 4), c(4, 5, 6))))
+
+ mockFile <- c("Spark is pretty.", "Spark is awesome.")
+ fileName <- tempfile(pattern = "spark-test", fileext = ".tmp")
+ writeLines(mockFile, fileName)
+
+ rdd <- textFile(sc, fileName, 1)
+ actual <- collectRDD(zipPartitions(rdd, rdd,
+ func = function(x, y) { list(paste(x, y, sep = "\n")) }))
+ expected <- list(paste(mockFile, mockFile, sep = "\n"))
+ expect_equal(actual, expected)
+
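+ # both RDDs below have a single partition, so zipPartitions pairs their full contents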
+ rdd1 <- parallelize(sc, 0:1, 1)
+ actual <- collectRDD(zipPartitions(rdd1, rdd,
+ func = function(x, y) { list(x + nchar(y)) }))
+ expected <- list(0:1 + nchar(mockFile))
+ expect_equal(actual, expected)
+
+ rdd <- map(rdd, function(x) { x })
+ actual <- collectRDD(zipPartitions(rdd, rdd1,
+ func = function(x, y) { list(y + nchar(x)) }))
+ expect_equal(actual, expected)
+
+ unlink(fileName)
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_broadcast.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_broadcast.R b/R/pkg/tests/fulltests/test_broadcast.R
new file mode 100644
index 0000000..2c96740
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_broadcast.R
@@ -0,0 +1,55 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("broadcast variables")
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+# Partitioned data
+nums <- 1:2
+rrdd <- parallelize(sc, nums, 2L)
+
+test_that("using broadcast variable", {
+ skip_on_cran()
+
+ randomMat <- matrix(nrow = 10, ncol = 10, data = rnorm(100))
+ randomMatBr <- broadcastRDD(sc, randomMat)
+
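+ # value() reads the broadcast value inside the worker-side closure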
+ useBroadcast <- function(x) {
+ sum(SparkR:::value(randomMatBr) * x)
+ }
+ actual <- collectRDD(lapply(rrdd, useBroadcast))
+ expected <- list(sum(randomMat) * 1, sum(randomMat) * 2)
+ expect_equal(actual, expected)
+})
+
+test_that("without using broadcast variable", {
+ skip_on_cran()
+
+ randomMat <- matrix(nrow = 10, ncol = 10, data = rnorm(100))
+
+ useBroadcast <- function(x) {
+ sum(randomMat * x)
+ }
+ actual <- collectRDD(lapply(rrdd, useBroadcast))
+ expected <- list(sum(randomMat) * 1, sum(randomMat) * 2)
+ expect_equal(actual, expected)
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_client.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_client.R b/R/pkg/tests/fulltests/test_client.R
new file mode 100644
index 0000000..3d53beb
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_client.R
@@ -0,0 +1,51 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("functions in client.R")
+
+test_that("adding spark-testing-base as a package works", {
+ skip_on_cran()
+
+ args <- generateSparkSubmitArgs("", "", "", "",
+ "holdenk:spark-testing-base:1.3.0_0.0.5")
+ expect_equal(gsub("[[:space:]]", "", args),
+ gsub("[[:space:]]", "",
+ "--packages holdenk:spark-testing-base:1.3.0_0.0.5"))
+})
+
+test_that("no package specified doesn't add packages flag", {
+ skip_on_cran()
+
+ args <- generateSparkSubmitArgs("", "", "", "", "")
+ expect_equal(gsub("[[:space:]]", "", args),
+ "")
+})
+
+test_that("multiple packages don't produce a warning", {
+ skip_on_cran()
+
+ expect_warning(generateSparkSubmitArgs("", "", "", "", c("A", "B")), NA)
+})
+
+test_that("sparkJars sparkPackages as character vectors", {
+ skip_on_cran()
+
+ args <- generateSparkSubmitArgs("", "", c("one.jar", "two.jar", "three.jar"), "",
+ c("com.databricks:spark-avro_2.10:2.0.1"))
+ expect_match(args, "--jars one.jar,two.jar,three.jar")
+ expect_match(args, "--packages com.databricks:spark-avro_2.10:2.0.1")
+})
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_context.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_context.R b/R/pkg/tests/fulltests/test_context.R
new file mode 100644
index 0000000..f6d9f54
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_context.R
@@ -0,0 +1,226 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("test functions in sparkR.R")
+
+test_that("Check masked functions", {
+ skip_on_cran()
+
+ # Check that we are not masking any new function from base, stats, testthat unexpectedly
+ # NOTE: We should avoid adding entries to *namesOfMaskedCompletely* as masked functions make it
+ # hard for users to use base R functions. Please check when in doubt.
+ namesOfMaskedCompletely <- c("cov", "filter", "sample", "not")
+ namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var",
+ "colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset",
+ "summary", "transform", "drop", "window", "as.data.frame", "union", "not")
+ if (as.numeric(R.version$major) >= 3 && as.numeric(R.version$minor) >= 3) {
+ namesOfMasked <- c("endsWith", "startsWith", namesOfMasked)
+ }
+ masked <- conflicts(detail = TRUE)$`package:SparkR`
+ expect_true("describe" %in% masked) # only when with testthat..
+ func <- lapply(masked, function(x) { capture.output(showMethods(x))[[1]] })
+ funcSparkROrEmpty <- grepl("\\(package SparkR\\)$|^$", func)
+ maskedBySparkR <- masked[funcSparkROrEmpty]
+ expect_equal(length(maskedBySparkR), length(namesOfMasked))
+ # make the 2 lists the same length so expect_equal will print their content
+ l <- max(length(maskedBySparkR), length(namesOfMasked))
+ length(maskedBySparkR) <- l
+ length(namesOfMasked) <- l
+ expect_equal(sort(maskedBySparkR, na.last = TRUE), sort(namesOfMasked, na.last = TRUE))
+ # above are those reported as masked when `library(SparkR)`
+ # note that many of these methods are still callable without base:: or stats:: prefix
+ # there should be a test for each of these, except the following, which are currently "broken"
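+ # generics that keep an "ANY" method still dispatch on plain R objects; names without one are masked completely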
+ funcHasAny <- unlist(lapply(masked, function(x) {
+ any(grepl("=\"ANY\"", capture.output(showMethods(x)[-1])))
+ }))
+ maskedCompletely <- masked[!funcHasAny]
+ expect_equal(length(maskedCompletely), length(namesOfMaskedCompletely))
+ l <- max(length(maskedCompletely), length(namesOfMaskedCompletely))
+ length(maskedCompletely) <- l
+ length(namesOfMaskedCompletely) <- l
+ expect_equal(sort(maskedCompletely, na.last = TRUE),
+ sort(namesOfMaskedCompletely, na.last = TRUE))
+})
+
+test_that("repeatedly starting and stopping SparkR", {
+ skip_on_cran()
+
+ for (i in 1:4) {
+ sc <- suppressWarnings(sparkR.init(master = sparkRTestMaster))
+ rdd <- parallelize(sc, 1:20, 2L)
+ expect_equal(countRDD(rdd), 20)
+ suppressWarnings(sparkR.stop())
+ }
+})
+
+test_that("repeatedly starting and stopping SparkSession", {
+ for (i in 1:4) {
+ sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+ df <- createDataFrame(data.frame(dummy = 1:i))
+ expect_equal(count(df), i)
+ sparkR.session.stop()
+ }
+})
+
+test_that("rdd GC across sparkR.stop", {
+ skip_on_cran()
+
+ sc <- sparkR.sparkContext(master = sparkRTestMaster) # sc should get id 0
+ rdd1 <- parallelize(sc, 1:20, 2L) # rdd1 should get id 1
+ rdd2 <- parallelize(sc, 1:10, 2L) # rdd2 should get id 2
+ sparkR.session.stop()
+
+ sc <- sparkR.sparkContext(master = sparkRTestMaster) # sc should get id 0 again
+
+ # GC rdd1 before creating rdd3 and rdd2 after
+ rm(rdd1)
+ gc()
+
+ rdd3 <- parallelize(sc, 1:20, 2L) # rdd3 should get id 1 now
+ rdd4 <- parallelize(sc, 1:10, 2L) # rdd4 should get id 2 now
+
+ rm(rdd2)
+ gc()
+
+ countRDD(rdd3)
+ countRDD(rdd4)
+ sparkR.session.stop()
+})
+
+test_that("job group functions can be called", {
+ skip_on_cran()
+
+ sc <- sparkR.sparkContext(master = sparkRTestMaster)
+ setJobGroup("groupId", "job description", TRUE)
+ cancelJobGroup("groupId")
+ clearJobGroup()
+
+ suppressWarnings(setJobGroup(sc, "groupId", "job description", TRUE))
+ suppressWarnings(cancelJobGroup(sc, "groupId"))
+ suppressWarnings(clearJobGroup(sc))
+ sparkR.session.stop()
+})
+
+test_that("utility function can be called", {
+ skip_on_cran()
+
+ sparkR.sparkContext(master = sparkRTestMaster)
+ setLogLevel("ERROR")
+ sparkR.session.stop()
+})
+
+test_that("getClientModeSparkSubmitOpts() returns spark-submit args from whitelist", {
+ skip_on_cran()
+
+ e <- new.env()
+ e[["spark.driver.memory"]] <- "512m"
+ ops <- getClientModeSparkSubmitOpts("sparkrmain", e)
+ expect_equal("--driver-memory \"512m\" sparkrmain", ops)
+
+ e[["spark.driver.memory"]] <- "5g"
+ e[["spark.driver.extraClassPath"]] <- "/opt/class_path" # nolint
+ e[["spark.driver.extraJavaOptions"]] <- "-XX:+UseCompressedOops -XX:+UseCompressedStrings"
+ e[["spark.driver.extraLibraryPath"]] <- "/usr/local/hadoop/lib" # nolint
+ e[["random"]] <- "skipthis"
+ ops2 <- getClientModeSparkSubmitOpts("sparkr-shell", e)
+ # nolint start
+ expect_equal(ops2, paste0("--driver-class-path \"/opt/class_path\" --driver-java-options \"",
+ "-XX:+UseCompressedOops -XX:+UseCompressedStrings\" --driver-library-path \"",
+ "/usr/local/hadoop/lib\" --driver-memory \"5g\" sparkr-shell"))
+ # nolint end
+
+ e[["spark.driver.extraClassPath"]] <- "/" # too short
+ ops3 <- getClientModeSparkSubmitOpts("--driver-memory 4g sparkr-shell2", e)
+ # nolint start
+ expect_equal(ops3, paste0("--driver-java-options \"-XX:+UseCompressedOops ",
+ "-XX:+UseCompressedStrings\" --driver-library-path \"/usr/local/hadoop/lib\"",
+ " --driver-memory 4g sparkr-shell2"))
+ # nolint end
+})
+
+test_that("sparkJars sparkPackages as comma-separated strings", {
+ skip_on_cran()
+
+ expect_warning(processSparkJars(" a, b "))
+ jars <- suppressWarnings(processSparkJars(" a, b "))
+ expect_equal(lapply(jars, basename), list("a", "b"))
+
+ jars <- suppressWarnings(processSparkJars(" abc ,, def "))
+ expect_equal(lapply(jars, basename), list("abc", "def"))
+
+ jars <- suppressWarnings(processSparkJars(c(" abc ,, def ", "", "xyz", " ", "a,b")))
+ expect_equal(lapply(jars, basename), list("abc", "def", "xyz", "a", "b"))
+
+ p <- processSparkPackages(c("ghi", "lmn"))
+ expect_equal(p, c("ghi", "lmn"))
+
+ # check normalizePath
+ f <- dir()[[1]]
+ expect_warning(processSparkJars(f), NA)
+ expect_match(processSparkJars(f), f)
+})
+
+test_that("spark.lapply should perform simple transforms", {
+ sparkR.sparkContext(master = sparkRTestMaster)
+ doubled <- spark.lapply(1:10, function(x) { 2 * x })
+ expect_equal(doubled, as.list(2 * 1:10))
+ sparkR.session.stop()
+})
+
+test_that("add and get file to be downloaded with Spark job on every node", {
+ skip_on_cran()
+
+ sparkR.sparkContext(master = sparkRTestMaster)
+ # Test add file.
+ path <- tempfile(pattern = "hello", fileext = ".txt")
+ filename <- basename(path)
+ words <- "Hello World!"
+ writeLines(words, path)
+ spark.addFile(path)
+ download_path <- spark.getSparkFiles(filename)
+ expect_equal(readLines(download_path), words)
+
+ # Test spark.getSparkFiles works well on executors.
+ seq <- seq(from = 1, to = 10, length.out = 5)
+ f <- function(seq) { spark.getSparkFiles(filename) }
+ results <- spark.lapply(seq, f)
+ for (i in 1:5) { expect_equal(basename(results[[i]]), filename) }
+
+ unlink(path)
+
+ # Test add directory recursively.
+ path <- paste0(tempdir(), "/", "recursive_dir")
+ dir.create(path)
+ dir_name <- basename(path)
+ path1 <- paste0(path, "/", "hello.txt")
+ file.create(path1)
+ sub_path <- paste0(path, "/", "sub_hello")
+ dir.create(sub_path)
+ path2 <- paste0(sub_path, "/", "sub_hello.txt")
+ file.create(path2)
+ words <- "Hello World!"
+ sub_words <- "Sub Hello World!"
+ writeLines(words, path1)
+ writeLines(sub_words, path2)
+ spark.addFile(path, recursive = TRUE)
+ download_path1 <- spark.getSparkFiles(paste0(dir_name, "/", "hello.txt"))
+ expect_equal(readLines(download_path1), words)
+ download_path2 <- spark.getSparkFiles(paste0(dir_name, "/", "sub_hello/sub_hello.txt"))
+ expect_equal(readLines(download_path2), sub_words)
+ unlink(path, recursive = TRUE)
+ sparkR.session.stop()
+})
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_includePackage.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_includePackage.R b/R/pkg/tests/fulltests/test_includePackage.R
new file mode 100644
index 0000000..d7d9eee
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_includePackage.R
@@ -0,0 +1,64 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("include R packages")
+
+# JavaSparkContext handle
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+sc <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "getJavaSparkContext", sparkSession)
+
+# Partitioned data
+nums <- 1:2
+rdd <- parallelize(sc, nums, 2L)
+
+test_that("include inside function", {
+ skip_on_cran()
+
+ # Only run the test if plyr is installed.
+ if ("plyr" %in% rownames(installed.packages())) {
+ suppressPackageStartupMessages(library(plyr))
+ generateData <- function(x) {
+ suppressPackageStartupMessages(library(plyr))
+ attach(airquality)
+ result <- transform(Ozone, logOzone = log(Ozone))
+ result
+ }
+
+ data <- lapplyPartition(rdd, generateData)
+ actual <- collectRDD(data)
+ }
+})
+
+test_that("use include package", {
+ skip_on_cran()
+
+ # Only run the test if plyr is installed.
+ if ("plyr" %in% rownames(installed.packages())) {
+ suppressPackageStartupMessages(library(plyr))
+ generateData <- function(x) {
+ attach(airquality)
+ result <- transform(Ozone, logOzone = log(Ozone))
+ result
+ }
+
+ includePackage(sc, plyr)
+ data <- lapplyPartition(rdd, generateData)
+ actual <- collectRDD(data)
+ }
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_jvm_api.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_jvm_api.R b/R/pkg/tests/fulltests/test_jvm_api.R
new file mode 100644
index 0000000..8b3b4f7
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_jvm_api.R
@@ -0,0 +1,36 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+context("JVM API")
+
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+test_that("Create and call methods on object", {
+ jarr <- sparkR.newJObject("java.util.ArrayList")
+ # Add an element to the array
+ sparkR.callJMethod(jarr, "add", 1L)
+ # Check if get returns the same element
+ expect_equal(sparkR.callJMethod(jarr, "get", 0L), 1L)
+})
+
+test_that("Call static methods", {
+ # Convert a boolean to a string
+ strTrue <- sparkR.callJStatic("java.lang.String", "valueOf", TRUE)
+ expect_equal(strTrue, "true")
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_mllib_classification.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_mllib_classification.R b/R/pkg/tests/fulltests/test_mllib_classification.R
new file mode 100644
index 0000000..82e588d
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_mllib_classification.R
@@ -0,0 +1,396 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("MLlib classification algorithms, except for tree-based algorithms")
+
+# Tests for MLlib classification algorithms in SparkR
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+absoluteSparkPath <- function(x) {
+ sparkHome <- sparkR.conf("spark.home")
+ file.path(sparkHome, x)
+}
+
+test_that("spark.svmLinear", {
+ skip_on_cran()
+
+ df <- suppressWarnings(createDataFrame(iris))
+ training <- df[df$Species %in% c("versicolor", "virginica"), ]
+ model <- spark.svmLinear(training, Species ~ ., regParam = 0.01, maxIter = 10)
+ summary <- summary(model)
+
+ # test summary coefficients return matrix type
+ expect_true(class(summary$coefficients) == "matrix")
+ expect_true(class(summary$coefficients[, 1]) == "numeric")
+
+ coefs <- summary$coefficients[, "Estimate"]
+ expected_coefs <- c(-0.06004978, -0.1563083, -0.460648, 0.2276626, 1.055085)
+ expect_true(all(abs(coefs - expected_coefs) < 0.1))
+
+ # Test prediction with string label
+ prediction <- predict(model, training)
+ expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character")
+ expected <- c("versicolor", "versicolor", "versicolor", "virginica", "virginica",
+ "virginica", "virginica", "virginica", "virginica", "virginica")
+ expect_equal(sort(as.list(take(select(prediction, "prediction"), 10))[[1]]), expected)
+
+ # Test model save and load
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-svm-linear", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ coefs <- summary(model)$coefficients
+ coefs2 <- summary(model2)$coefficients
+ expect_equal(coefs, coefs2)
+ unlink(modelPath)
+ }
+
+ # Test prediction with numeric label
+ label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
+ feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
+ data <- as.data.frame(cbind(label, feature))
+ df <- createDataFrame(data)
+ model <- spark.svmLinear(df, label ~ feature, regParam = 0.1)
+ prediction <- collect(select(predict(model, df), "prediction"))
+ expect_equal(sort(prediction$prediction), c("0.0", "0.0", "0.0", "1.0", "1.0"))
+
+})
+
+test_that("spark.logit", {
+ # R code to reproduce the result.
+ # nolint start
+ #' library(glmnet)
+ #' iris.x = as.matrix(iris[, 1:4])
+ #' iris.y = as.factor(as.character(iris[, 5]))
+ #' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5)
+ #' coef(logit)
+ #
+ # $setosa
+ # 5 x 1 sparse Matrix of class "dgCMatrix"
+ # s0
+ # 1.0981324
+ # Sepal.Length -0.2909860
+ # Sepal.Width 0.5510907
+ # Petal.Length -0.1915217
+ # Petal.Width -0.4211946
+ #
+ # $versicolor
+ # 5 x 1 sparse Matrix of class "dgCMatrix"
+ # s0
+ # 1.520061e+00
+ # Sepal.Length 2.524501e-02
+ # Sepal.Width -5.310313e-01
+ # Petal.Length 3.656543e-02
+ # Petal.Width -3.144464e-05
+ #
+ # $virginica
+ # 5 x 1 sparse Matrix of class "dgCMatrix"
+ # s0
+ # -2.61819385
+ # Sepal.Length 0.26574097
+ # Sepal.Width -0.02005932
+ # Petal.Length 0.15495629
+ # Petal.Width 0.42122607
+ # nolint end
+
+ # Test multinomial logistic regression against three classes
+ df <- suppressWarnings(createDataFrame(iris))
+ model <- spark.logit(df, Species ~ ., regParam = 0.5)
+ summary <- summary(model)
+
+ # test summary coefficients return matrix type
+ expect_true(class(summary$coefficients) == "matrix")
+ expect_true(class(summary$coefficients[, 1]) == "numeric")
+
+ versicolorCoefsR <- c(1.52, 0.03, -0.53, 0.04, 0.00)
+ virginicaCoefsR <- c(-2.62, 0.27, -0.02, 0.16, 0.42)
+ setosaCoefsR <- c(1.10, -0.29, 0.55, -0.19, -0.42)
+ versicolorCoefs <- summary$coefficients[, "versicolor"]
+ virginicaCoefs <- summary$coefficients[, "virginica"]
+ setosaCoefs <- summary$coefficients[, "setosa"]
+ expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
+ expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
+ expect_true(all(abs(setosaCoefsR - setosaCoefs) < 0.1))
+
+ # Test model save and load
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-logit", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ coefs <- summary(model)$coefficients
+ coefs2 <- summary(model2)$coefficients
+ expect_equal(coefs, coefs2)
+ unlink(modelPath)
+ }
+
+ # R code to reproduce the result.
+ # nolint start
+ #' library(glmnet)
+ #' iris2 <- iris[iris$Species %in% c("versicolor", "virginica"), ]
+ #' iris.x = as.matrix(iris2[, 1:4])
+ #' iris.y = as.factor(as.character(iris2[, 5]))
+ #' logit = glmnet(iris.x, iris.y, family="multinomial", alpha=0, lambda=0.5)
+ #' coef(logit)
+ #
+ # $versicolor
+ # 5 x 1 sparse Matrix of class "dgCMatrix"
+ # s0
+ # 3.93844796
+ # Sepal.Length -0.13538675
+ # Sepal.Width -0.02386443
+ # Petal.Length -0.35076451
+ # Petal.Width -0.77971954
+ #
+ # $virginica
+ # 5 x 1 sparse Matrix of class "dgCMatrix"
+ # s0
+ # -3.93844796
+ # Sepal.Length 0.13538675
+ # Sepal.Width 0.02386443
+ # Petal.Length 0.35076451
+ # Petal.Width 0.77971954
+ #
+ #' logit = glmnet(iris.x, iris.y, family="binomial", alpha=0, lambda=0.5)
+ #' coef(logit)
+ #
+ # 5 x 1 sparse Matrix of class "dgCMatrix"
+ # s0
+ # (Intercept) -6.0824412
+ # Sepal.Length 0.2458260
+ # Sepal.Width 0.1642093
+ # Petal.Length 0.4759487
+ # Petal.Width 1.0383948
+ #
+ # nolint end
+
+ # Test multinomial logistic regression against two classes
+ df <- suppressWarnings(createDataFrame(iris))
+ training <- df[df$Species %in% c("versicolor", "virginica"), ]
+ model <- spark.logit(training, Species ~ ., regParam = 0.5, family = "multinomial")
+ summary <- summary(model)
+ versicolorCoefsR <- c(3.94, -0.16, -0.02, -0.35, -0.78)
+ virginicaCoefsR <- c(-3.94, 0.16, -0.02, 0.35, 0.78)
+ versicolorCoefs <- summary$coefficients[, "versicolor"]
+ virginicaCoefs <- summary$coefficients[, "virginica"]
+ expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
+ expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
+
+ # Test binomial logistic regression against two classes
+ model <- spark.logit(training, Species ~ ., regParam = 0.5)
+ summary <- summary(model)
+ coefsR <- c(-6.08, 0.25, 0.16, 0.48, 1.04)
+ coefs <- summary$coefficients[, "Estimate"]
+ expect_true(all(abs(coefsR - coefs) < 0.1))
+
+ # Test prediction with string label
+ prediction <- predict(model, training)
+ expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "character")
+ expected <- c("versicolor", "versicolor", "virginica", "versicolor", "versicolor",
+ "versicolor", "versicolor", "versicolor", "versicolor", "versicolor")
+ expect_equal(as.list(take(select(prediction, "prediction"), 10))[[1]], expected)
+
+ # Test prediction with numeric label
+ label <- c(0.0, 0.0, 0.0, 1.0, 1.0)
+ feature <- c(1.1419053, 0.9194079, -0.9498666, -1.1069903, 0.2809776)
+ data <- as.data.frame(cbind(label, feature))
+ df <- createDataFrame(data)
+ model <- spark.logit(df, label ~ feature)
+ prediction <- collect(select(predict(model, df), "prediction"))
+ expect_equal(sort(prediction$prediction), c("0.0", "0.0", "0.0", "1.0", "1.0"))
+
+ # Test prediction with weightCol
+ weight <- c(2.0, 2.0, 2.0, 1.0, 1.0)
+ data2 <- as.data.frame(cbind(label, feature, weight))
+ df2 <- createDataFrame(data2)
+ model2 <- spark.logit(df2, label ~ feature, weightCol = "weight")
+ prediction2 <- collect(select(predict(model2, df2), "prediction"))
+ expect_equal(sort(prediction2$prediction), c("0.0", "0.0", "0.0", "0.0", "0.0"))
+})
+
+test_that("spark.mlp", {
+ skip_on_cran()
+
+ df <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"),
+ source = "libsvm")
+ model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 5, 4, 3),
+ solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1)
+
+ # Test summary method
+ summary <- summary(model)
+ expect_equal(summary$numOfInputs, 4)
+ expect_equal(summary$numOfOutputs, 3)
+ expect_equal(summary$layers, c(4, 5, 4, 3))
+ expect_equal(length(summary$weights), 64)
+ expect_equal(head(summary$weights, 5), list(-0.878743, 0.2154151, -1.16304, -0.6583214, 1.009825),
+ tolerance = 1e-6)
+
+ # Test predict method
+ mlpTestDF <- df
+ mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
+ expect_equal(head(mlpPredictions$prediction, 6), c("1.0", "0.0", "0.0", "0.0", "0.0", "0.0"))
+
+ # Test model save/load
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-mlp", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ summary2 <- summary(model2)
+
+ expect_equal(summary2$numOfInputs, 4)
+ expect_equal(summary2$numOfOutputs, 3)
+ expect_equal(summary2$layers, c(4, 5, 4, 3))
+ expect_equal(length(summary2$weights), 64)
+
+ unlink(modelPath)
+ }
+
+ # Test default parameter
+ model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3))
+ mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
+ expect_equal(head(mlpPredictions$prediction, 10),
+ c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
+
+ # Test illegal parameter
+ expect_error(spark.mlp(df, label ~ features, layers = NULL),
+ "layers must be a integer vector with length > 1.")
+ expect_error(spark.mlp(df, label ~ features, layers = c()),
+ "layers must be a integer vector with length > 1.")
+ expect_error(spark.mlp(df, label ~ features, layers = c(3)),
+ "layers must be a integer vector with length > 1.")
+
+ # Test random seed
+ # default seed
+ model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10)
+ mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
+ expect_equal(head(mlpPredictions$prediction, 10),
+ c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
+ # seed equals 10
+ model <- spark.mlp(df, label ~ features, layers = c(4, 5, 4, 3), maxIter = 10, seed = 10)
+ mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
+ expect_equal(head(mlpPredictions$prediction, 10),
+ c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
+
+ # test initialWeights
+ model <- spark.mlp(df, label ~ features, layers = c(4, 3), initialWeights =
+ c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9))
+ mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction"))
+ expect_equal(head(mlpPredictions$prediction, 10),
+ c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0", "2.0", "2.0", "1.0", "0.0"))
+
+ # Test formula works well
+ df <- suppressWarnings(createDataFrame(iris))
+ model <- spark.mlp(df, Species ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width,
+ layers = c(4, 3))
+ summary <- summary(model)
+ expect_equal(summary$numOfInputs, 4)
+ expect_equal(summary$numOfOutputs, 3)
+ expect_equal(summary$layers, c(4, 3))
+ expect_equal(length(summary$weights), 15)
+})
+
+test_that("spark.naiveBayes", {
+ # R code to reproduce the result.
+ # We do not support instance weights yet. So we ignore the frequencies.
+ #
+ #' library(e1071)
+ #' t <- as.data.frame(Titanic)
+ #' t1 <- t[t$Freq > 0, -5]
+ #' m <- naiveBayes(Survived ~ ., data = t1)
+ #' m
+ #' predict(m, t1)
+ #
+ # -- output of 'm'
+ #
+ # A-priori probabilities:
+ # Y
+ # No Yes
+ # 0.4166667 0.5833333
+ #
+ # Conditional probabilities:
+ # Class
+ # Y 1st 2nd 3rd Crew
+ # No 0.2000000 0.2000000 0.4000000 0.2000000
+ # Yes 0.2857143 0.2857143 0.2857143 0.1428571
+ #
+ # Sex
+ # Y Male Female
+ # No 0.5 0.5
+ # Yes 0.5 0.5
+ #
+ # Age
+ # Y Child Adult
+ # No 0.2000000 0.8000000
+ # Yes 0.4285714 0.5714286
+ #
+ # -- output of 'predict(m, t1)'
+ #
+ # Yes Yes Yes Yes No No Yes Yes No No Yes Yes Yes Yes Yes Yes Yes Yes No No Yes Yes No No
+ #
+
+ t <- as.data.frame(Titanic)
+ t1 <- t[t$Freq > 0, -5]
+ df <- suppressWarnings(createDataFrame(t1))
+ m <- spark.naiveBayes(df, Survived ~ ., smoothing = 0.0)
+ s <- summary(m)
+ expect_equal(as.double(s$apriori[1, "Yes"]), 0.5833333, tolerance = 1e-6)
+ expect_equal(sum(s$apriori), 1)
+ expect_equal(as.double(s$tables["Yes", "Age_Adult"]), 0.5714286, tolerance = 1e-6)
+ p <- collect(select(predict(m, df), "prediction"))
+ expect_equal(p$prediction, c("Yes", "Yes", "Yes", "Yes", "No", "No", "Yes", "Yes", "No", "No",
+ "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "No", "No",
+ "Yes", "Yes", "No", "No"))
+
+ # Test model save/load
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-naiveBayes", fileext = ".tmp")
+ write.ml(m, modelPath)
+ expect_error(write.ml(m, modelPath))
+ write.ml(m, modelPath, overwrite = TRUE)
+ m2 <- read.ml(modelPath)
+ s2 <- summary(m2)
+ expect_equal(s$apriori, s2$apriori)
+ expect_equal(s$tables, s2$tables)
+
+ unlink(modelPath)
+ }
+
+ # Test e1071::naiveBayes
+ if (requireNamespace("e1071", quietly = TRUE)) {
+ expect_error(m <- e1071::naiveBayes(Survived ~ ., data = t1), NA)
+ expect_equal(as.character(predict(m, t1[1, ])), "Yes")
+ }
+
+ # Test numeric response variable
+ t1$NumericSurvived <- ifelse(t1$Survived == "No", 0, 1)
+ t2 <- t1[-4]
+ df <- suppressWarnings(createDataFrame(t2))
+ m <- spark.naiveBayes(df, NumericSurvived ~ ., smoothing = 0.0)
+ s <- summary(m)
+ expect_equal(as.double(s$apriori[1, 1]), 0.5833333, tolerance = 1e-6)
+ expect_equal(sum(s$apriori), 1)
+ expect_equal(as.double(s$tables[1, "Age_Adult"]), 0.5714286, tolerance = 1e-6)
+})
+
+sparkR.session.stop()
http://git-wip-us.apache.org/repos/asf/spark/blob/dc4c3518/R/pkg/tests/fulltests/test_mllib_clustering.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_mllib_clustering.R b/R/pkg/tests/fulltests/test_mllib_clustering.R
new file mode 100644
index 0000000..e827e96
--- /dev/null
+++ b/R/pkg/tests/fulltests/test_mllib_clustering.R
@@ -0,0 +1,328 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+library(testthat)
+
+context("MLlib clustering algorithms")
+
+# Tests for MLlib clustering algorithms in SparkR
+sparkSession <- sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE)
+
+absoluteSparkPath <- function(x) {
+ sparkHome <- sparkR.conf("spark.home")
+ file.path(sparkHome, x)
+}
+
+test_that("spark.bisectingKmeans", {
+ skip_on_cran()
+
+ newIris <- iris
+ newIris$Species <- NULL
+ training <- suppressWarnings(createDataFrame(newIris))
+
+ take(training, 1)
+
+ model <- spark.bisectingKmeans(data = training, ~ .)
+ sample <- take(select(predict(model, training), "prediction"), 1)
+ expect_equal(typeof(sample$prediction), "integer")
+ expect_equal(sample$prediction, 1)
+
+ # Test fitted works on Bisecting KMeans
+ fitted.model <- fitted(model)
+ expect_equal(sort(collect(distinct(select(fitted.model, "prediction")))$prediction),
+ c(0, 1, 2, 3))
+
+ # Test summary works on KMeans
+ summary.model <- summary(model)
+ cluster <- summary.model$cluster
+ k <- summary.model$k
+ expect_equal(k, 4)
+ expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction),
+ c(0, 1, 2, 3))
+
+ # Test model save/load
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-bisectingkmeans", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ summary2 <- summary(model2)
+ expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size)))
+ expect_equal(summary.model$coefficients, summary2$coefficients)
+ expect_true(!summary.model$is.loaded)
+ expect_true(summary2$is.loaded)
+
+ unlink(modelPath)
+ }
+})
+
+test_that("spark.gaussianMixture", {
+ # R code to reproduce the result.
+ # nolint start
+ #' library(mvtnorm)
+ #' set.seed(1)
+ #' a <- rmvnorm(7, c(0, 0))
+ #' b <- rmvnorm(8, c(10, 10))
+ #' data <- rbind(a, b)
+ #' model <- mvnormalmixEM(data, k = 2)
+ #' model$lambda
+ #
+ # [1] 0.4666667 0.5333333
+ #
+ #' model$mu
+ #
+ # [1] 0.11731091 -0.06192351
+ # [1] 10.363673 9.897081
+ #
+ #' model$sigma
+ #
+ # [[1]]
+ # [,1] [,2]
+ # [1,] 0.62049934 0.06880802
+ # [2,] 0.06880802 1.27431874
+ #
+ # [[2]]
+ # [,1] [,2]
+ # [1,] 0.2961543 0.160783
+ # [2,] 0.1607830 1.008878
+ #
+ #' model$loglik
+ #
+ # [1] -46.89499
+ # nolint end
+ data <- list(list(-0.6264538, 0.1836433), list(-0.8356286, 1.5952808),
+ list(0.3295078, -0.8204684), list(0.4874291, 0.7383247),
+ list(0.5757814, -0.3053884), list(1.5117812, 0.3898432),
+ list(-0.6212406, -2.2146999), list(11.1249309, 9.9550664),
+ list(9.9838097, 10.9438362), list(10.8212212, 10.5939013),
+ list(10.9189774, 10.7821363), list(10.0745650, 8.0106483),
+ list(10.6198257, 9.9438713), list(9.8442045, 8.5292476),
+ list(9.5218499, 10.4179416))
+ df <- createDataFrame(data, c("x1", "x2"))
+ model <- spark.gaussianMixture(df, ~ x1 + x2, k = 2)
+ stats <- summary(model)
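+ # Expected values taken from the mixtools reference run documented above.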
+ rLambda <- c(0.4666667, 0.5333333)
+ rMu <- c(0.11731091, -0.06192351, 10.363673, 9.897081)
+ rSigma <- c(0.62049934, 0.06880802, 0.06880802, 1.27431874,
+ 0.2961543, 0.160783, 0.1607830, 1.008878)
+ rLoglik <- -46.89499
+ expect_equal(stats$lambda, rLambda, tolerance = 1e-3)
+ expect_equal(unlist(stats$mu), rMu, tolerance = 1e-3)
+ expect_equal(unlist(stats$sigma), rSigma, tolerance = 1e-3)
+ expect_equal(unlist(stats$loglik), rLoglik, tolerance = 1e-3)
+ p <- collect(select(predict(model, df), "prediction"))
+ expect_equal(p$prediction, c(0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1))
+
+ # Test model save/load
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-gaussianMixture", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ stats2 <- summary(model2)
+ expect_equal(stats$lambda, stats2$lambda)
+ expect_equal(unlist(stats$mu), unlist(stats2$mu))
+ expect_equal(unlist(stats$sigma), unlist(stats2$sigma))
+ expect_equal(unlist(stats$loglik), unlist(stats2$loglik))
+
+ unlink(modelPath)
+ }
+})
+
+test_that("spark.kmeans", {
+ newIris <- iris
+ newIris$Species <- NULL
+ training <- suppressWarnings(createDataFrame(newIris))
+
+ take(training, 1)
+
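+ # initMode = "random" picks random initial centers instead of the default k-means||.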
+ model <- spark.kmeans(data = training, ~ ., k = 2, maxIter = 10, initMode = "random")
+ sample <- take(select(predict(model, training), "prediction"), 1)
+ expect_equal(typeof(sample$prediction), "integer")
+ expect_equal(sample$prediction, 1)
+
+ # Test stats::kmeans is working
+ statsModel <- kmeans(x = newIris, centers = 2)
+ expect_equal(sort(unique(statsModel$cluster)), c(1, 2))
+
+ # Test fitted works on KMeans
+ fitted.model <- fitted(model)
+ expect_equal(sort(collect(distinct(select(fitted.model, "prediction")))$prediction), c(0, 1))
+
+ # Test summary works on KMeans
+ summary.model <- summary(model)
+ cluster <- summary.model$cluster
+ k <- summary.model$k
+ expect_equal(k, 2)
+ expect_equal(sort(collect(distinct(select(cluster, "prediction")))$prediction), c(0, 1))
+
+ # Test that summary returns the coefficients as a numeric matrix
+ expect_true(is.matrix(summary.model$coefficients))
+ expect_true(is.numeric(summary.model$coefficients[1, ]))
+
+ # Test model save/load
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-kmeans", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ summary2 <- summary(model2)
+ expect_equal(sort(unlist(summary.model$size)), sort(unlist(summary2$size)))
+ expect_equal(summary.model$coefficients, summary2$coefficients)
+ expect_true(!summary.model$is.loaded)
+ expect_true(summary2$is.loaded)
+
+ unlink(modelPath)
+ }
+
+ # Test k-means on a dataset whose clustering is sensitive to the seed value
+ col1 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0)
+ col2 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0)
+ col3 <- c(1, 2, 3, 4, 0, 1, 2, 3, 4, 0)
+ cols <- as.data.frame(cbind(col1, col2, col3))
+ df <- createDataFrame(cols)
+
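+ # Only 5 distinct points (each duplicated), so with k = 5 and random initialization
+ # the number of non-empty clusters depends on the seed.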
+ model1 <- spark.kmeans(data = df, ~ ., k = 5, maxIter = 10,
+ initMode = "random", seed = 1, tol = 1E-5)
+ model2 <- spark.kmeans(data = df, ~ ., k = 5, maxIter = 10,
+ initMode = "random", seed = 22222, tol = 1E-5)
+
+ summary.model1 <- summary(model1)
+ summary.model2 <- summary(model2)
+ cluster1 <- summary.model1$cluster
+ cluster2 <- summary.model2$cluster
+ clusterSize1 <- summary.model1$clusterSize
+ clusterSize2 <- summary.model2$clusterSize
+
+ # Different seeds converge to different assignments: 4 vs. 3 non-empty clusters
+ expect_equal(sort(collect(distinct(select(cluster1, "prediction")))$prediction),
+ c(0, 1, 2, 3))
+ expect_equal(sort(collect(distinct(select(cluster2, "prediction")))$prediction),
+ c(0, 1, 2))
+ expect_equal(clusterSize1, 4)
+ expect_equal(clusterSize2, 3)
+})
+
+test_that("spark.lda with libsvm", {
+ text <- read.df(absoluteSparkPath("data/mllib/sample_lda_libsvm_data.txt"), source = "libsvm")
+ model <- spark.lda(text, optimizer = "em")
+
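+ # The second argument to summary is the number of top terms to return per topic.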
+ stats <- summary(model, 10)
+ isDistributed <- stats$isDistributed
+ logLikelihood <- stats$logLikelihood
+ logPerplexity <- stats$logPerplexity
+ vocabSize <- stats$vocabSize
+ topics <- stats$topicTopTerms
+ weights <- stats$topicTopTermsWeights
+ vocabulary <- stats$vocabulary
+ trainingLogLikelihood <- stats$trainingLogLikelihood
+ logPrior <- stats$logPrior
+
+ expect_true(isDistributed)
+ expect_true(logLikelihood <= 0 & is.finite(logLikelihood))
+ expect_true(logPerplexity >= 0 & is.finite(logPerplexity))
+ expect_equal(vocabSize, 11)
+ expect_true(is.null(vocabulary))
+ expect_true(trainingLogLikelihood <= 0 & !is.na(trainingLogLikelihood))
+ expect_true(logPrior <= 0 & !is.na(logPrior))
+
+ # Test model save/load
+ if (not_cran_or_windows_with_hadoop()) {
+ modelPath <- tempfile(pattern = "spark-lda", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ stats2 <- summary(model2)
+
+ expect_true(stats2$isDistributed)
+ expect_equal(logLikelihood, stats2$logLikelihood)
+ expect_equal(logPerplexity, stats2$logPerplexity)
+ expect_equal(vocabSize, stats2$vocabSize)
+ expect_equal(vocabulary, stats2$vocabulary)
+ expect_equal(trainingLogLikelihood, stats2$trainingLogLikelihood)
+ expect_equal(logPrior, stats2$logPrior)
+
+ unlink(modelPath)
+ }
+})
+
+test_that("spark.lda with text input", {
+ skip_on_cran()
+
+ text <- read.text(absoluteSparkPath("data/mllib/sample_lda_data.txt"))
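+ # read.text loads each line into a single string column named "value", which is
+ # passed to spark.lda as the features column.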
+ model <- spark.lda(text, optimizer = "online", features = "value")
+
+ stats <- summary(model)
+ isDistributed <- stats$isDistributed
+ logLikelihood <- stats$logLikelihood
+ logPerplexity <- stats$logPerplexity
+ vocabSize <- stats$vocabSize
+ topics <- stats$topicTopTerms
+ weights <- stats$topicTopTermsWeights
+ vocabulary <- stats$vocabulary
+ trainingLogLikelihood <- stats$trainingLogLikelihood
+ logPrior <- stats$logPrior
+
+ expect_false(isDistributed)
+ expect_true(logLikelihood <= 0 & is.finite(logLikelihood))
+ expect_true(logPerplexity >= 0 & is.finite(logPerplexity))
+ expect_equal(vocabSize, 10)
+ expect_true(setequal(stats$vocabulary, c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")))
+ expect_true(is.na(trainingLogLikelihood))
+ expect_true(is.na(logPrior))
+
+ # Test model save/load
+ modelPath <- tempfile(pattern = "spark-lda-text", fileext = ".tmp")
+ write.ml(model, modelPath)
+ expect_error(write.ml(model, modelPath))
+ write.ml(model, modelPath, overwrite = TRUE)
+ model2 <- read.ml(modelPath)
+ stats2 <- summary(model2)
+
+ expect_false(stats2$isDistributed)
+ expect_equal(logLikelihood, stats2$logLikelihood)
+ expect_equal(logPerplexity, stats2$logPerplexity)
+ expect_equal(vocabSize, stats2$vocabSize)
+ expect_true(all.equal(vocabulary, stats2$vocabulary))
+ expect_true(is.na(stats2$trainingLogLikelihood))
+ expect_true(is.na(stats2$logPrior))
+
+ unlink(modelPath)
+})
+
+test_that("spark.posterior and spark.perplexity", {
+ skip_on_cran()
+
+ text <- read.text(absoluteSparkPath("data/mllib/sample_lda_data.txt"))
+ model <- spark.lda(text, features = "value", k = 3)
+
+ # spark.perplexity on the training data should match the logPerplexity from summary
+ stats <- summary(model)
+ logPerplexity <- spark.perplexity(model, text)
+ expect_equal(logPerplexity, stats$logPerplexity)
+
+ # Assert each document's topic distribution sums to 1, so the grand total equals the row count
+ posterior <- spark.posterior(model, text)
+ local.posterior <- collect(posterior)$topicDistribution
+ expect_equal(length(local.posterior), sum(unlist(local.posterior)))
+})
+
+sparkR.session.stop()