You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@toree.apache.org by ch...@apache.org on 2017/01/23 01:06:30 UTC
[7/7] incubator-toree git commit: Remove SparkR fork (mariusvniekerk via chipsenkbeil) closes #87
Remove SparkR fork (mariusvniekerk via chipsenkbeil) closes #87
Project: http://git-wip-us.apache.org/repos/asf/incubator-toree/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-toree/commit/2eb26cd3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-toree/tree/2eb26cd3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-toree/diff/2eb26cd3
Branch: refs/heads/0.1.x
Commit: 2eb26cd3cc923c238e03c01276bba94452d645df
Parents: acc0285
Author: mariusvniekerk <ma...@gmail.com>
Authored: Sun Jan 22 19:04:31 2017 -0600
Committer: Chip Senkbeil <ch...@gmail.com>
Committed: Sun Jan 22 19:04:31 2017 -0600
----------------------------------------------------------------------
etc/examples/notebooks/magic-tutorial.ipynb | 8 +-
.../notebooks/meetup-streaming-toree.ipynb | 12 +-
.../src/main/resources/R/.gitignore | 6 -
.../src/main/resources/R/DOCUMENTATION.md | 12 -
.../src/main/resources/R/README.md | 67 -
.../src/main/resources/R/WINDOWS.md | 13 -
.../src/main/resources/R/create-docs.sh | 46 -
.../src/main/resources/R/install-dev.bat | 27 -
.../src/main/resources/R/install-dev.sh | 45 -
.../src/main/resources/R/log4j.properties | 28 -
.../src/main/resources/R/pkg/.lintr | 2 -
.../src/main/resources/R/pkg/DESCRIPTION | 36 -
.../src/main/resources/R/pkg/NAMESPACE | 259 ---
.../src/main/resources/R/pkg/R/DataFrame.R | 1819 ----------------
.../src/main/resources/R/pkg/R/RDD.R | 1644 ---------------
.../src/main/resources/R/pkg/R/SQLContext.R | 513 -----
.../src/main/resources/R/pkg/R/backend.R | 117 --
.../src/main/resources/R/pkg/R/broadcast.R | 86 -
.../src/main/resources/R/pkg/R/client.R | 69 -
.../src/main/resources/R/pkg/R/column.R | 234 ---
.../src/main/resources/R/pkg/R/context.R | 225 --
.../src/main/resources/R/pkg/R/deserialize.R | 189 --
.../src/main/resources/R/pkg/R/functions.R | 1988 ------------------
.../src/main/resources/R/pkg/R/generics.R | 985 ---------
.../src/main/resources/R/pkg/R/group.R | 138 --
.../src/main/resources/R/pkg/R/jobj.R | 104 -
.../src/main/resources/R/pkg/R/mllib.R | 99 -
.../src/main/resources/R/pkg/R/pairRDD.R | 908 --------
.../src/main/resources/R/pkg/R/schema.R | 166 --
.../src/main/resources/R/pkg/R/serialize.R | 208 --
.../src/main/resources/R/pkg/R/sparkR.R | 360 ----
.../src/main/resources/R/pkg/R/utils.R | 600 ------
.../main/resources/R/pkg/inst/profile/general.R | 22 -
.../main/resources/R/pkg/inst/profile/shell.R | 47 -
.../inst/test_support/sparktestjar_2.10-1.0.jar | Bin 2886 -> 0 bytes
.../main/resources/R/pkg/inst/tests/jarTest.R | 32 -
.../R/pkg/inst/tests/packageInAJarTest.R | 30 -
.../R/pkg/inst/tests/test_binaryFile.R | 89 -
.../R/pkg/inst/tests/test_binary_function.R | 101 -
.../resources/R/pkg/inst/tests/test_broadcast.R | 48 -
.../resources/R/pkg/inst/tests/test_client.R | 36 -
.../resources/R/pkg/inst/tests/test_context.R | 57 -
.../R/pkg/inst/tests/test_includeJAR.R | 37 -
.../R/pkg/inst/tests/test_includePackage.R | 57 -
.../resources/R/pkg/inst/tests/test_mllib.R | 61 -
.../R/pkg/inst/tests/test_parallelize_collect.R | 109 -
.../main/resources/R/pkg/inst/tests/test_rdd.R | 793 -------
.../resources/R/pkg/inst/tests/test_shuffle.R | 221 --
.../resources/R/pkg/inst/tests/test_sparkSQL.R | 1244 -----------
.../main/resources/R/pkg/inst/tests/test_take.R | 66 -
.../resources/R/pkg/inst/tests/test_textFile.R | 161 --
.../resources/R/pkg/inst/tests/test_utils.R | 140 --
.../main/resources/R/pkg/inst/worker/daemon.R | 52 -
.../main/resources/R/pkg/inst/worker/worker.R | 177 --
.../main/resources/R/pkg/src-native/Makefile | 27 -
.../resources/R/pkg/src-native/Makefile.win | 27 -
.../R/pkg/src-native/string_hash_code.c | 49 -
.../src/main/resources/R/pkg/tests/run-all.R | 21 -
.../src/main/resources/R/run-tests.sh | 39 -
sparkr-interpreter/src/main/resources/README.md | 24 +-
.../src/main/resources/kernelR/sparkr_runner.R | 64 +-
.../src/main/resources/package-sparkR.sh | 20 -
.../src/main/resources/sparkr_bundle.tar.gz | Bin 455638 -> 0 bytes
.../interpreter/sparkr/ReflectiveRBackend.scala | 4 +-
.../interpreter/sparkr/SparkRProcess.scala | 2 +-
.../interpreter/sparkr/SparkRService.scala | 3 +-
66 files changed, 70 insertions(+), 14803 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-toree/blob/2eb26cd3/etc/examples/notebooks/magic-tutorial.ipynb
----------------------------------------------------------------------
diff --git a/etc/examples/notebooks/magic-tutorial.ipynb b/etc/examples/notebooks/magic-tutorial.ipynb
index e6d6a62..d128ea5 100644
--- a/etc/examples/notebooks/magic-tutorial.ipynb
+++ b/etc/examples/notebooks/magic-tutorial.ipynb
@@ -724,11 +724,7 @@
},
"outputs": [
{
- "data": {
- "application/javascript": [
- "alert(\"Hello, Magics!\")"
- ]
- },
+ "data": {},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
@@ -950,4 +946,4 @@
},
"nbformat": 4,
"nbformat_minor": 0
-}
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-toree/blob/2eb26cd3/etc/examples/notebooks/meetup-streaming-toree.ipynb
----------------------------------------------------------------------
diff --git a/etc/examples/notebooks/meetup-streaming-toree.ipynb b/etc/examples/notebooks/meetup-streaming-toree.ipynb
index eec9072..5ce6c07 100644
--- a/etc/examples/notebooks/meetup-streaming-toree.ipynb
+++ b/etc/examples/notebooks/meetup-streaming-toree.ipynb
@@ -720,7 +720,9 @@
"collapsed": true
},
"outputs": [],
- "source": []
+ "source": [
+ ""
+ ]
}
],
"metadata": {
@@ -734,13 +736,13 @@
},
"urth": {
"dashboard": {
- "cellMargin": 10,
- "defaultCellHeight": 20,
+ "cellMargin": 10.0,
+ "defaultCellHeight": 20.0,
"layoutStrategy": "packed",
- "maxColumns": 12
+ "maxColumns": 12.0
}
}
},
"nbformat": 4,
"nbformat_minor": 0
-}
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-toree/blob/2eb26cd3/sparkr-interpreter/src/main/resources/R/.gitignore
----------------------------------------------------------------------
diff --git a/sparkr-interpreter/src/main/resources/R/.gitignore b/sparkr-interpreter/src/main/resources/R/.gitignore
deleted file mode 100644
index 9a5889b..0000000
--- a/sparkr-interpreter/src/main/resources/R/.gitignore
+++ /dev/null
@@ -1,6 +0,0 @@
-*.o
-*.so
-*.Rd
-lib
-pkg/man
-pkg/html
http://git-wip-us.apache.org/repos/asf/incubator-toree/blob/2eb26cd3/sparkr-interpreter/src/main/resources/R/DOCUMENTATION.md
----------------------------------------------------------------------
diff --git a/sparkr-interpreter/src/main/resources/R/DOCUMENTATION.md b/sparkr-interpreter/src/main/resources/R/DOCUMENTATION.md
deleted file mode 100644
index 931d015..0000000
--- a/sparkr-interpreter/src/main/resources/R/DOCUMENTATION.md
+++ /dev/null
@@ -1,12 +0,0 @@
-# SparkR Documentation
-
-SparkR documentation is generated using in-source comments annotated using using
-`roxygen2`. After making changes to the documentation, to generate man pages,
-you can run the following from an R console in the SparkR home directory
-
- library(devtools)
- devtools::document(pkg="./pkg", roclets=c("rd"))
-
-You can verify if your changes are good by running
-
- R CMD check pkg/
http://git-wip-us.apache.org/repos/asf/incubator-toree/blob/2eb26cd3/sparkr-interpreter/src/main/resources/R/README.md
----------------------------------------------------------------------
diff --git a/sparkr-interpreter/src/main/resources/R/README.md b/sparkr-interpreter/src/main/resources/R/README.md
deleted file mode 100644
index 005f56d..0000000
--- a/sparkr-interpreter/src/main/resources/R/README.md
+++ /dev/null
@@ -1,67 +0,0 @@
-# R on Spark
-
-SparkR is an R package that provides a light-weight frontend to use Spark from R.
-
-### SparkR development
-
-#### Build Spark
-
-Build Spark with [Maven](http://spark.apache.org/docs/latest/building-spark.html#building-with-buildmvn) and include the `-Psparkr` profile to build the R package. For example to use the default Hadoop versions you can run
-```
- build/mvn -DskipTests -Psparkr package
-```
-
-#### Running sparkR
-
-You can start using SparkR by launching the SparkR shell with
-
- ./bin/sparkR
-
-The `sparkR` script automatically creates a SparkContext with Spark by default in
-local mode. To specify the Spark master of a cluster for the automatically created
-SparkContext, you can run
-
- ./bin/sparkR --master "local[2]"
-
-To set other options like driver memory, executor memory etc. you can pass in the [spark-submit](http://spark.apache.org/docs/latest/submitting-applications.html) arguments to `./bin/sparkR`
-
-#### Using SparkR from RStudio
-
-If you wish to use SparkR from RStudio or other R frontends you will need to set some environment variables which point SparkR to your Spark installation. For example
-```
-# Set this to where Spark is installed
-Sys.setenv(SPARK_HOME="/Users/shivaram/spark")
-# This line loads SparkR from the installed directory
-.libPaths(c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib"), .libPaths()))
-library(SparkR)
-sc <- sparkR.init(master="local")
-```
-
-#### Making changes to SparkR
-
-The [instructions](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark) for making contributions to Spark also apply to SparkR.
-If you only make R file changes (i.e. no Scala changes) then you can just re-install the R package using `R/install-dev.sh` and test your changes.
-Once you have made your changes, please include unit tests for them and run existing unit tests using the `run-tests.sh` script as described below.
-
-#### Generating documentation
-
-The SparkR documentation (Rd files and HTML files) are not a part of the source repository. To generate them you can run the script `R/create-docs.sh`. This script uses `devtools` and `knitr` to generate the docs and these packages need to be installed on the machine before using the script.
-
-### Examples, Unit tests
-
-SparkR comes with several sample programs in the `examples/src/main/r` directory.
-To run one of them, use `./bin/sparkR <filename> <args>`. For example:
-
- ./bin/sparkR examples/src/main/r/dataframe.R
-
-You can also run the unit-tests for SparkR by running (you need to install the [testthat](http://cran.r-project.org/web/packages/testthat/index.html) package first):
-
- R -e 'install.packages("testthat", repos="http://cran.us.r-project.org")'
- ./R/run-tests.sh
-
-### Running on YARN
-The `./bin/spark-submit` and `./bin/sparkR` can also be used to submit jobs to YARN clusters. You will need to set YARN conf dir before doing so. For example on CDH you can run
-```
-export YARN_CONF_DIR=/etc/hadoop/conf
-./bin/spark-submit --master yarn examples/src/main/r/dataframe.R
-```
http://git-wip-us.apache.org/repos/asf/incubator-toree/blob/2eb26cd3/sparkr-interpreter/src/main/resources/R/WINDOWS.md
----------------------------------------------------------------------
diff --git a/sparkr-interpreter/src/main/resources/R/WINDOWS.md b/sparkr-interpreter/src/main/resources/R/WINDOWS.md
deleted file mode 100644
index 3f889c0..0000000
--- a/sparkr-interpreter/src/main/resources/R/WINDOWS.md
+++ /dev/null
@@ -1,13 +0,0 @@
-## Building SparkR on Windows
-
-To build SparkR on Windows, the following steps are required
-
-1. Install R (>= 3.1) and [Rtools](http://cran.r-project.org/bin/windows/Rtools/). Make sure to
-include Rtools and R in `PATH`.
-2. Install
-[JDK7](http://www.oracle.com/technetwork/java/javase/downloads/jdk7-downloads-1880260.html) and set
-`JAVA_HOME` in the system environment variables.
-3. Download and install [Maven](http://maven.apache.org/download.html). Also include the `bin`
-directory in Maven in `PATH`.
-4. Set `MAVEN_OPTS` as described in [Building Spark](http://spark.apache.org/docs/latest/building-spark.html).
-5. Open a command shell (`cmd`) in the Spark directory and run `mvn -DskipTests -Psparkr package`
http://git-wip-us.apache.org/repos/asf/incubator-toree/blob/2eb26cd3/sparkr-interpreter/src/main/resources/R/create-docs.sh
----------------------------------------------------------------------
diff --git a/sparkr-interpreter/src/main/resources/R/create-docs.sh b/sparkr-interpreter/src/main/resources/R/create-docs.sh
deleted file mode 100755
index d2ae160..0000000
--- a/sparkr-interpreter/src/main/resources/R/create-docs.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Script to create API docs for SparkR
-# This requires `devtools` and `knitr` to be installed on the machine.
-
-# After running this script the html docs can be found in
-# $SPARK_HOME/R/pkg/html
-
-set -o pipefail
-set -e
-
-# Figure out where the script is
-export FWDIR="$(cd "`dirname "$0"`"; pwd)"
-pushd $FWDIR
-
-# Install the package (this will also generate the Rd files)
-./install-dev.sh
-
-# Now create HTML files
-
-# knit_rd puts html in current working directory
-mkdir -p pkg/html
-pushd pkg/html
-
-Rscript -e 'libDir <- "../../lib"; library(SparkR, lib.loc=libDir); library(knitr); knit_rd("SparkR", links = tools::findHTMLlinks(paste(libDir, "SparkR", sep="/")))'
-
-popd
-
-popd
http://git-wip-us.apache.org/repos/asf/incubator-toree/blob/2eb26cd3/sparkr-interpreter/src/main/resources/R/install-dev.bat
----------------------------------------------------------------------
diff --git a/sparkr-interpreter/src/main/resources/R/install-dev.bat b/sparkr-interpreter/src/main/resources/R/install-dev.bat
deleted file mode 100644
index 008a5c6..0000000
--- a/sparkr-interpreter/src/main/resources/R/install-dev.bat
+++ /dev/null
@@ -1,27 +0,0 @@
-@echo off
-
-rem
-rem Licensed to the Apache Software Foundation (ASF) under one or more
-rem contributor license agreements. See the NOTICE file distributed with
-rem this work for additional information regarding copyright ownership.
-rem The ASF licenses this file to You under the Apache License, Version 2.0
-rem (the "License"); you may not use this file except in compliance with
-rem the License. You may obtain a copy of the License at
-rem
-rem http://www.apache.org/licenses/LICENSE-2.0
-rem
-rem Unless required by applicable law or agreed to in writing, software
-rem distributed under the License is distributed on an "AS IS" BASIS,
-rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-rem See the License for the specific language governing permissions and
-rem limitations under the License.
-rem
-
-rem Install development version of SparkR
-rem
-
-set SPARK_HOME=%~dp0..
-
-MKDIR %SPARK_HOME%\R\lib
-
-R.exe CMD INSTALL --library="%SPARK_HOME%\R\lib" %SPARK_HOME%\R\pkg\
http://git-wip-us.apache.org/repos/asf/incubator-toree/blob/2eb26cd3/sparkr-interpreter/src/main/resources/R/install-dev.sh
----------------------------------------------------------------------
diff --git a/sparkr-interpreter/src/main/resources/R/install-dev.sh b/sparkr-interpreter/src/main/resources/R/install-dev.sh
deleted file mode 100755
index 59d98c9..0000000
--- a/sparkr-interpreter/src/main/resources/R/install-dev.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/bin/bash
-
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# This scripts packages the SparkR source files (R and C files) and
-# creates a package that can be loaded in R. The package is by default installed to
-# $FWDIR/lib and the package can be loaded by using the following command in R:
-#
-# library(SparkR, lib.loc="$FWDIR/lib")
-#
-# NOTE(shivaram): Right now we use $SPARK_HOME/R/lib to be the installation directory
-# to load the SparkR package on the worker nodes.
-
-set -o pipefail
-set -e
-
-FWDIR="$(cd `dirname $0`; pwd)"
-LIB_DIR="$FWDIR/lib"
-
-mkdir -p $LIB_DIR
-
-pushd $FWDIR > /dev/null
-
-# Generate Rd files if devtools is installed
-Rscript -e ' if("devtools" %in% rownames(installed.packages())) { library(devtools); devtools::document(pkg="./pkg", roclets=c("rd")) }'
-
-# Install SparkR to $LIB_DIR
-R CMD INSTALL --library=$LIB_DIR $FWDIR/pkg/
-
-popd > /dev/null
http://git-wip-us.apache.org/repos/asf/incubator-toree/blob/2eb26cd3/sparkr-interpreter/src/main/resources/R/log4j.properties
----------------------------------------------------------------------
diff --git a/sparkr-interpreter/src/main/resources/R/log4j.properties b/sparkr-interpreter/src/main/resources/R/log4j.properties
deleted file mode 100644
index fa3e996..0000000
--- a/sparkr-interpreter/src/main/resources/R/log4j.properties
+++ /dev/null
@@ -1,28 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License
-#
-
-# Set everything to be logged to the file target/unit-tests.log
-log4j.rootCategory=INFO, file
-log4j.appender.file=org.apache.log4j.FileAppender
-log4j.appender.file.append=true
-log4j.appender.file.file=R/target/unit-tests.log
-log4j.appender.file.layout=org.apache.log4j.PatternLayout
-log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n
-
-# Ignore messages below warning level from Jetty, because it's a bit verbose
-log4j.logger.org.eclipse.jetty=WARN
-org.eclipse.jetty.LEVEL=WARN
http://git-wip-us.apache.org/repos/asf/incubator-toree/blob/2eb26cd3/sparkr-interpreter/src/main/resources/R/pkg/.lintr
----------------------------------------------------------------------
diff --git a/sparkr-interpreter/src/main/resources/R/pkg/.lintr b/sparkr-interpreter/src/main/resources/R/pkg/.lintr
deleted file mode 100644
index 038236f..0000000
--- a/sparkr-interpreter/src/main/resources/R/pkg/.lintr
+++ /dev/null
@@ -1,2 +0,0 @@
-linters: with_defaults(line_length_linter(100), camel_case_linter = NULL, open_curly_linter(allow_single_line = TRUE), closed_curly_linter(allow_single_line = TRUE))
-exclusions: list("inst/profile/general.R" = 1, "inst/profile/shell.R")
http://git-wip-us.apache.org/repos/asf/incubator-toree/blob/2eb26cd3/sparkr-interpreter/src/main/resources/R/pkg/DESCRIPTION
----------------------------------------------------------------------
diff --git a/sparkr-interpreter/src/main/resources/R/pkg/DESCRIPTION b/sparkr-interpreter/src/main/resources/R/pkg/DESCRIPTION
deleted file mode 100644
index d0d7201..0000000
--- a/sparkr-interpreter/src/main/resources/R/pkg/DESCRIPTION
+++ /dev/null
@@ -1,36 +0,0 @@
-Package: SparkR
-Type: Package
-Title: R frontend for Spark
-Version: 1.5.0
-Date: 2013-09-09
-Author: The Apache Software Foundation
-Maintainer: Shivaram Venkataraman <sh...@cs.berkeley.edu>
-Imports:
- methods
-Depends:
- R (>= 3.0),
- methods,
-Suggests:
- testthat
-Description: R frontend for Spark
-License: Apache License (== 2.0)
-Collate:
- 'schema.R'
- 'generics.R'
- 'jobj.R'
- 'RDD.R'
- 'pairRDD.R'
- 'column.R'
- 'group.R'
- 'DataFrame.R'
- 'SQLContext.R'
- 'backend.R'
- 'broadcast.R'
- 'client.R'
- 'context.R'
- 'deserialize.R'
- 'functions.R'
- 'mllib.R'
- 'serialize.R'
- 'sparkR.R'
- 'utils.R'
http://git-wip-us.apache.org/repos/asf/incubator-toree/blob/2eb26cd3/sparkr-interpreter/src/main/resources/R/pkg/NAMESPACE
----------------------------------------------------------------------
diff --git a/sparkr-interpreter/src/main/resources/R/pkg/NAMESPACE b/sparkr-interpreter/src/main/resources/R/pkg/NAMESPACE
deleted file mode 100644
index e7f9770..0000000
--- a/sparkr-interpreter/src/main/resources/R/pkg/NAMESPACE
+++ /dev/null
@@ -1,259 +0,0 @@
-# Imports from base R
-importFrom(methods, setGeneric, setMethod, setOldClass)
-
-# Disable native libraries till we figure out how to package it
-# See SPARKR-7839
-#useDynLib(SparkR, stringHashCode)
-
-# S3 methods exported
-export("sparkR.init")
-export("sparkR.stop")
-export("print.jobj")
-
-# Needed exports for runner
-export("sparkR.connect")
-export("isInstanceOf")
-export("callJMethod")
-export("callJStatic")
-export("newJObject")
-export("removeJObject")
-export("isRemoveMethod")
-export("invokeJava")
-
-# MLlib integration
-exportMethods("glm",
- "predict",
- "summary")
-
-# Job group lifecycle management methods
-export("setJobGroup",
- "clearJobGroup",
- "cancelJobGroup")
-
-exportClasses("DataFrame")
-
-exportMethods("arrange",
- "cache",
- "collect",
- "columns",
- "count",
- "crosstab",
- "describe",
- "dim",
- "distinct",
- "dropna",
- "dtypes",
- "except",
- "explain",
- "fillna",
- "filter",
- "first",
- "group_by",
- "groupBy",
- "head",
- "insertInto",
- "intersect",
- "isLocal",
- "join",
- "limit",
- "merge",
- "mutate",
- "na.omit",
- "names",
- "ncol",
- "nrow",
- "orderBy",
- "persist",
- "printSchema",
- "rbind",
- "registerTempTable",
- "rename",
- "repartition",
- "sample",
- "sample_frac",
- "saveAsParquetFile",
- "saveAsTable",
- "saveDF",
- "schema",
- "select",
- "selectExpr",
- "show",
- "showDF",
- "subset",
- "summarize",
- "summary",
- "take",
- "transform",
- "unionAll",
- "unique",
- "unpersist",
- "where",
- "withColumn",
- "withColumnRenamed",
- "write.df")
-
-exportClasses("Column")
-
-exportMethods("%in%",
- "abs",
- "acos",
- "add_months",
- "alias",
- "approxCountDistinct",
- "asc",
- "ascii",
- "asin",
- "atan",
- "atan2",
- "avg",
- "base64",
- "between",
- "bin",
- "bitwiseNOT",
- "cast",
- "cbrt",
- "ceil",
- "ceiling",
- "concat",
- "concat_ws",
- "contains",
- "conv",
- "cos",
- "cosh",
- "count",
- "countDistinct",
- "crc32",
- "date_add",
- "date_format",
- "date_sub",
- "datediff",
- "dayofmonth",
- "dayofyear",
- "desc",
- "endsWith",
- "exp",
- "explode",
- "expm1",
- "expr",
- "factorial",
- "first",
- "floor",
- "format_number",
- "format_string",
- "from_unixtime",
- "from_utc_timestamp",
- "getField",
- "getItem",
- "greatest",
- "hex",
- "hour",
- "hypot",
- "ifelse",
- "initcap",
- "instr",
- "isNaN",
- "isNotNull",
- "isNull",
- "last",
- "last_day",
- "least",
- "length",
- "levenshtein",
- "like",
- "lit",
- "locate",
- "log",
- "log10",
- "log1p",
- "log2",
- "lower",
- "lpad",
- "ltrim",
- "max",
- "md5",
- "mean",
- "min",
- "minute",
- "month",
- "months_between",
- "n",
- "n_distinct",
- "nanvl",
- "negate",
- "next_day",
- "otherwise",
- "pmod",
- "quarter",
- "rand",
- "randn",
- "regexp_extract",
- "regexp_replace",
- "reverse",
- "rint",
- "rlike",
- "round",
- "rpad",
- "rtrim",
- "second",
- "sha1",
- "sha2",
- "shiftLeft",
- "shiftRight",
- "shiftRightUnsigned",
- "sign",
- "signum",
- "sin",
- "sinh",
- "size",
- "soundex",
- "sqrt",
- "startsWith",
- "substr",
- "substring_index",
- "sum",
- "sumDistinct",
- "tan",
- "tanh",
- "toDegrees",
- "toRadians",
- "to_date",
- "to_utc_timestamp",
- "translate",
- "trim",
- "unbase64",
- "unhex",
- "unix_timestamp",
- "upper",
- "weekofyear",
- "when",
- "year")
-
-exportClasses("GroupedData")
-exportMethods("agg")
-
-export("sparkRSQL.init",
- "sparkRHive.init")
-
-export("cacheTable",
- "clearCache",
- "createDataFrame",
- "createExternalTable",
- "dropTempTable",
- "jsonFile",
- "loadDF",
- "parquetFile",
- "read.df",
- "sql",
- "table",
- "tableNames",
- "tables",
- "uncacheTable")
-
-export("structField",
- "structField.jobj",
- "structField.character",
- "print.structField",
- "structType",
- "structType.jobj",
- "structType.structField",
- "print.structType")
http://git-wip-us.apache.org/repos/asf/incubator-toree/blob/2eb26cd3/sparkr-interpreter/src/main/resources/R/pkg/R/DataFrame.R
----------------------------------------------------------------------
diff --git a/sparkr-interpreter/src/main/resources/R/pkg/R/DataFrame.R b/sparkr-interpreter/src/main/resources/R/pkg/R/DataFrame.R
deleted file mode 100644
index 5605124..0000000
--- a/sparkr-interpreter/src/main/resources/R/pkg/R/DataFrame.R
+++ /dev/null
@@ -1,1819 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# DataFrame.R - DataFrame class and methods implemented in S4 OO classes
-
-#' @include generics.R jobj.R schema.R RDD.R pairRDD.R column.R group.R
-NULL
-
-setOldClass("jobj")
-
-#' @title S4 class that represents a DataFrame
-#' @description DataFrames can be created using functions like
-#' \code{jsonFile}, \code{table} etc.
-#' @rdname DataFrame
-#' @seealso jsonFile, table
-#' @docType class
-#'
-#' @slot env An R environment that stores bookkeeping states of the DataFrame
-#' @slot sdf A Java object reference to the backing Scala DataFrame
-#' @export
-setClass("DataFrame",
- slots = list(env = "environment",
- sdf = "jobj"))
-
-setMethod("initialize", "DataFrame", function(.Object, sdf, isCached) {
- .Object@env <- new.env()
- .Object@env$isCached <- isCached
-
- .Object@sdf <- sdf
- .Object
-})
-
-#' @rdname DataFrame
-#' @export
-#'
-#' @param sdf A Java object reference to the backing Scala DataFrame
-#' @param isCached TRUE if the dataFrame is cached
-dataFrame <- function(sdf, isCached = FALSE) {
- new("DataFrame", sdf, isCached)
-}
-
-############################ DataFrame Methods ##############################################
-
-#' Print Schema of a DataFrame
-#'
-#' Prints out the schema in tree format
-#'
-#' @param x A SparkSQL DataFrame
-#'
-#' @rdname printSchema
-#' @name printSchema
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' printSchema(df)
-#'}
-setMethod("printSchema",
- signature(x = "DataFrame"),
- function(x) {
- schemaString <- callJMethod(schema(x)$jobj, "treeString")
- cat(schemaString)
- })
-
-#' Get schema object
-#'
-#' Returns the schema of this DataFrame as a structType object.
-#'
-#' @param x A SparkSQL DataFrame
-#'
-#' @rdname schema
-#' @name schema
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' dfSchema <- schema(df)
-#'}
-setMethod("schema",
- signature(x = "DataFrame"),
- function(x) {
- structType(callJMethod(x@sdf, "schema"))
- })
-
-#' Explain
-#'
-#' Print the logical and physical Catalyst plans to the console for debugging.
-#'
-#' @param x A SparkSQL DataFrame
-#' @param extended Logical. If extended is False, explain() only prints the physical plan.
-#' @rdname explain
-#' @name explain
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' explain(df, TRUE)
-#'}
-setMethod("explain",
- signature(x = "DataFrame"),
- function(x, extended = FALSE) {
- queryExec <- callJMethod(x@sdf, "queryExecution")
- if (extended) {
- cat(callJMethod(queryExec, "toString"))
- } else {
- execPlan <- callJMethod(queryExec, "executedPlan")
- cat(callJMethod(execPlan, "toString"))
- }
- })
-
-#' isLocal
-#'
-#' Returns True if the `collect` and `take` methods can be run locally
-#' (without any Spark executors).
-#'
-#' @param x A SparkSQL DataFrame
-#'
-#' @rdname isLocal
-#' @name isLocal
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' isLocal(df)
-#'}
-setMethod("isLocal",
- signature(x = "DataFrame"),
- function(x) {
- callJMethod(x@sdf, "isLocal")
- })
-
-#' showDF
-#'
-#' Print the first numRows rows of a DataFrame
-#'
-#' @param x A SparkSQL DataFrame
-#' @param numRows The number of rows to print. Defaults to 20.
-#'
-#' @rdname showDF
-#' @name showDF
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' showDF(df)
-#'}
-setMethod("showDF",
- signature(x = "DataFrame"),
- function(x, numRows = 20, truncate = TRUE) {
- s <- callJMethod(x@sdf, "showString", numToInt(numRows), truncate)
- cat(s)
- })
-
-#' show
-#'
-#' Print the DataFrame column names and types
-#'
-#' @param x A SparkSQL DataFrame
-#'
-#' @rdname show
-#' @name show
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' df
-#'}
-setMethod("show", "DataFrame",
- function(object) {
- cols <- lapply(dtypes(object), function(l) {
- paste(l, collapse = ":")
- })
- s <- paste(cols, collapse = ", ")
- cat(paste("DataFrame[", s, "]\n", sep = ""))
- })
-
-#' DataTypes
-#'
-#' Return all column names and their data types as a list
-#'
-#' @param x A SparkSQL DataFrame
-#'
-#' @rdname dtypes
-#' @name dtypes
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' dtypes(df)
-#'}
-setMethod("dtypes",
- signature(x = "DataFrame"),
- function(x) {
- lapply(schema(x)$fields(), function(f) {
- c(f$name(), f$dataType.simpleString())
- })
- })
-
-#' Column names
-#'
-#' Return all column names as a list
-#'
-#' @param x A SparkSQL DataFrame
-#'
-#' @rdname columns
-#' @name columns
-#' @aliases names
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' columns(df)
-#'}
-setMethod("columns",
- signature(x = "DataFrame"),
- function(x) {
- sapply(schema(x)$fields(), function(f) {
- f$name()
- })
- })
-
-#' @rdname columns
-#' @name names
-setMethod("names",
- signature(x = "DataFrame"),
- function(x) {
- columns(x)
- })
-
-#' @rdname columns
-#' @name names<-
-setMethod("names<-",
- signature(x = "DataFrame"),
- function(x, value) {
- if (!is.null(value)) {
- sdf <- callJMethod(x@sdf, "toDF", listToSeq(as.list(value)))
- dataFrame(sdf)
- }
- })
-
-#' Register Temporary Table
-#'
-#' Registers a DataFrame as a Temporary Table in the SQLContext
-#'
-#' @param x A SparkSQL DataFrame
-#' @param tableName A character vector containing the name of the table
-#'
-#' @rdname registerTempTable
-#' @name registerTempTable
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' registerTempTable(df, "json_df")
-#' new_df <- sql(sqlContext, "SELECT * FROM json_df")
-#'}
-setMethod("registerTempTable",
- signature(x = "DataFrame", tableName = "character"),
- function(x, tableName) {
- invisible(callJMethod(x@sdf, "registerTempTable", tableName))
- })
-
-#' insertInto
-#'
-#' Insert the contents of a DataFrame into a table registered in the current SQL Context.
-#'
-#' @param x A SparkSQL DataFrame
-#' @param tableName A character vector containing the name of the table
-#' @param overwrite A logical argument indicating whether or not to overwrite
-#' the existing rows in the table.
-#'
-#' @rdname insertInto
-#' @name insertInto
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' df <- read.df(sqlContext, path, "parquet")
-#' df2 <- read.df(sqlContext, path2, "parquet")
-#' registerTempTable(df, "table1")
-#' insertInto(df2, "table1", overwrite = TRUE)
-#'}
-setMethod("insertInto",
- signature(x = "DataFrame", tableName = "character"),
- function(x, tableName, overwrite = FALSE) {
- callJMethod(x@sdf, "insertInto", tableName, overwrite)
- })
-
-#' Cache
-#'
-#' Persist with the default storage level (MEMORY_ONLY).
-#'
-#' @param x A SparkSQL DataFrame
-#'
-#' @rdname cache
-#' @name cache
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' cache(df)
-#'}
-setMethod("cache",
- signature(x = "DataFrame"),
- function(x) {
- cached <- callJMethod(x@sdf, "cache")
- x@env$isCached <- TRUE
- x
- })
-
-#' Persist
-#'
-#' Persist this DataFrame with the specified storage level. For details of the
-#' supported storage levels, refer to
-#' http://spark.apache.org/docs/latest/programming-guide.html#rdd-persistence.
-#'
-#' @param x The DataFrame to persist
-#' @rdname persist
-#' @name persist
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' persist(df, "MEMORY_AND_DISK")
-#'}
-setMethod("persist",
- signature(x = "DataFrame", newLevel = "character"),
- function(x, newLevel) {
- callJMethod(x@sdf, "persist", getStorageLevel(newLevel))
- x@env$isCached <- TRUE
- x
- })
-
-#' Unpersist
-#'
-#' Mark this DataFrame as non-persistent, and remove all blocks for it from memory and
-#' disk.
-#'
-#' @param x The DataFrame to unpersist
-#' @param blocking Whether to block until all blocks are deleted
-#' @rdname unpersist-methods
-#' @name unpersist
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' persist(df, "MEMORY_AND_DISK")
-#' unpersist(df)
-#'}
-setMethod("unpersist",
- signature(x = "DataFrame"),
- function(x, blocking = TRUE) {
- callJMethod(x@sdf, "unpersist", blocking)
- x@env$isCached <- FALSE
- x
- })
-
-#' Repartition
-#'
-#' Return a new DataFrame that has exactly numPartitions partitions.
-#'
-#' @param x A SparkSQL DataFrame
-#' @param numPartitions The number of partitions to use.
-#' @rdname repartition
-#' @name repartition
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' newDF <- repartition(df, 2L)
-#'}
-setMethod("repartition",
- signature(x = "DataFrame", numPartitions = "numeric"),
- function(x, numPartitions) {
- sdf <- callJMethod(x@sdf, "repartition", numToInt(numPartitions))
- dataFrame(sdf)
- })
-
-# toJSON
-#
-# Convert the rows of a DataFrame into JSON objects and return an RDD where
-# each element contains a JSON string.
-#
-#@param x A SparkSQL DataFrame
-# @return A StringRRDD of JSON objects
-# @rdname tojson
-# @export
-# @examples
-#\dontrun{
-# sc <- sparkR.init()
-# sqlContext <- sparkRSQL.init(sc)
-# path <- "path/to/file.json"
-# df <- jsonFile(sqlContext, path)
-# newRDD <- toJSON(df)
-#}
-setMethod("toJSON",
- signature(x = "DataFrame"),
- function(x) {
- rdd <- callJMethod(x@sdf, "toJSON")
- jrdd <- callJMethod(rdd, "toJavaRDD")
- RDD(jrdd, serializedMode = "string")
- })
-
-#' saveAsParquetFile
-#'
-#' Save the contents of a DataFrame as a Parquet file, preserving the schema. Files written out
-#' with this method can be read back in as a DataFrame using parquetFile().
-#'
-#' @param x A SparkSQL DataFrame
-#' @param path The directory where the file is saved
-#' @rdname saveAsParquetFile
-#' @name saveAsParquetFile
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' saveAsParquetFile(df, "/tmp/sparkr-tmp/")
-#'}
-setMethod("saveAsParquetFile",
- signature(x = "DataFrame", path = "character"),
- function(x, path) {
- invisible(callJMethod(x@sdf, "saveAsParquetFile", path))
- })
-
-#' Distinct
-#'
-#' Return a new DataFrame containing the distinct rows in this DataFrame.
-#'
-#' @param x A SparkSQL DataFrame
-#' @rdname distinct
-#' @name distinct
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' distinctDF <- distinct(df)
-#'}
-setMethod("distinct",
- signature(x = "DataFrame"),
- function(x) {
- sdf <- callJMethod(x@sdf, "distinct")
- dataFrame(sdf)
- })
-
-#' @title Distinct rows in a DataFrame
-#
-#' @description Returns a new DataFrame containing distinct rows in this DataFrame
-#'
-#' @rdname unique
-#' @name unique
-#' @aliases distinct
-setMethod("unique",
- signature(x = "DataFrame"),
- function(x) {
- distinct(x)
- })
-
-#' Sample
-#'
-#' Return a sampled subset of this DataFrame using a random seed.
-#'
-#' @param x A SparkSQL DataFrame
-#' @param withReplacement Sampling with replacement or not
-#' @param fraction The (rough) sample target fraction
-#' @rdname sample
-#' @aliases sample_frac
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' collect(sample(df, FALSE, 0.5))
-#' collect(sample(df, TRUE, 0.5))
-#'}
-setMethod("sample",
- # TODO : Figure out how to send integer as java.lang.Long to JVM so
- # we can send seed as an argument through callJMethod
- signature(x = "DataFrame", withReplacement = "logical",
- fraction = "numeric"),
- function(x, withReplacement, fraction) {
- if (fraction < 0.0) stop(cat("Negative fraction value:", fraction))
- sdf <- callJMethod(x@sdf, "sample", withReplacement, fraction)
- dataFrame(sdf)
- })
-
-#' @rdname sample
-#' @name sample_frac
-setMethod("sample_frac",
- signature(x = "DataFrame", withReplacement = "logical",
- fraction = "numeric"),
- function(x, withReplacement, fraction) {
- sample(x, withReplacement, fraction)
- })
-
-#' Count
-#'
-#' Returns the number of rows in a DataFrame
-#'
-#' @param x A SparkSQL DataFrame
-#'
-#' @rdname count
-#' @name count
-#' @aliases nrow
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' count(df)
-#' }
-setMethod("count",
- signature(x = "DataFrame"),
- function(x) {
- callJMethod(x@sdf, "count")
- })
-
-#' @title Number of rows for a DataFrame
-#' @description Returns number of rows in a DataFrames
-#'
-#' @name nrow
-#'
-#' @rdname nrow
-#' @aliases count
-setMethod("nrow",
- signature(x = "DataFrame"),
- function(x) {
- count(x)
- })
-
-#' Returns the number of columns in a DataFrame
-#'
-#' @param x a SparkSQL DataFrame
-#'
-#' @rdname ncol
-#' @name ncol
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' ncol(df)
-#' }
-setMethod("ncol",
- signature(x = "DataFrame"),
- function(x) {
- length(columns(x))
- })
-
-#' Returns the dimensions (number of rows and columns) of a DataFrame
-#' @param x a SparkSQL DataFrame
-#'
-#' @rdname dim
-#' @name dim
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' dim(df)
-#' }
-setMethod("dim",
- signature(x = "DataFrame"),
- function(x) {
- c(count(x), ncol(x))
- })
-
-#' Collects all the elements of a Spark DataFrame and coerces them into an R data.frame.
-#'
-#' @param x A SparkSQL DataFrame
-#' @param stringsAsFactors (Optional) A logical indicating whether or not string columns
-#' should be converted to factors. FALSE by default.
-#' @rdname collect
-#' @name collect
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' collected <- collect(df)
-#' firstName <- collected[[1]]$name
-#' }
-setMethod("collect",
- signature(x = "DataFrame"),
- function(x, stringsAsFactors = FALSE) {
- # listCols is a list of raw vectors, one per column
- listCols <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "dfToCols", x@sdf)
- cols <- lapply(listCols, function(col) {
- objRaw <- rawConnection(col)
- numRows <- readInt(objRaw)
- col <- readCol(objRaw, numRows)
- close(objRaw)
- col
- })
- names(cols) <- columns(x)
- do.call(cbind.data.frame, list(cols, stringsAsFactors = stringsAsFactors))
- })
-
-#' Limit
-#'
-#' Limit the resulting DataFrame to the number of rows specified.
-#'
-#' @param x A SparkSQL DataFrame
-#' @param num The number of rows to return
-#' @return A new DataFrame containing the number of rows specified.
-#'
-#' @rdname limit
-#' @name limit
-#' @export
-#' @examples
-#' \dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' limitedDF <- limit(df, 10)
-#' }
-setMethod("limit",
- signature(x = "DataFrame", num = "numeric"),
- function(x, num) {
- res <- callJMethod(x@sdf, "limit", as.integer(num))
- dataFrame(res)
- })
-
-#' Take the first NUM rows of a DataFrame and return a the results as a data.frame
-#'
-#' @rdname take
-#' @name take
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' take(df, 2)
-#' }
-setMethod("take",
- signature(x = "DataFrame", num = "numeric"),
- function(x, num) {
- limited <- limit(x, num)
- collect(limited)
- })
-
-#' Head
-#'
-#' Return the first NUM rows of a DataFrame as a data.frame. If NUM is NULL,
-#' then head() returns the first 6 rows in keeping with the current data.frame
-#' convention in R.
-#'
-#' @param x A SparkSQL DataFrame
-#' @param num The number of rows to return. Default is 6.
-#' @return A data.frame
-#'
-#' @rdname head
-#' @name head
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' head(df)
-#' }
-setMethod("head",
- signature(x = "DataFrame"),
- function(x, num = 6L) {
- # Default num is 6L in keeping with R's data.frame convention
- take(x, num)
- })
-
-#' Return the first row of a DataFrame
-#'
-#' @param x A SparkSQL DataFrame
-#'
-#' @rdname first
-#' @name first
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' first(df)
-#' }
-setMethod("first",
- signature(x = "DataFrame"),
- function(x) {
- take(x, 1)
- })
-
-# toRDD
-#
-# Converts a Spark DataFrame to an RDD while preserving column names.
-#
-# @param x A Spark DataFrame
-#
-# @rdname DataFrame
-# @export
-# @examples
-#\dontrun{
-# sc <- sparkR.init()
-# sqlContext <- sparkRSQL.init(sc)
-# path <- "path/to/file.json"
-# df <- jsonFile(sqlContext, path)
-# rdd <- toRDD(df)
-# }
-setMethod("toRDD",
- signature(x = "DataFrame"),
- function(x) {
- jrdd <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "dfToRowRDD", x@sdf)
- colNames <- callJMethod(x@sdf, "columns")
- rdd <- RDD(jrdd, serializedMode = "row")
- lapply(rdd, function(row) {
- names(row) <- colNames
- row
- })
- })
-
-#' GroupBy
-#'
-#' Groups the DataFrame using the specified columns, so we can run aggregation on them.
-#'
-#' @param x a DataFrame
-#' @return a GroupedData
-#' @seealso GroupedData
-#' @aliases group_by
-#' @rdname groupBy
-#' @name groupBy
-#' @export
-#' @examples
-#' \dontrun{
-#' # Compute the average for all numeric columns grouped by department.
-#' avg(groupBy(df, "department"))
-#'
-#' # Compute the max age and average salary, grouped by department and gender.
-#' agg(groupBy(df, "department", "gender"), salary="avg", "age" -> "max")
-#' }
-setMethod("groupBy",
- signature(x = "DataFrame"),
- function(x, ...) {
- cols <- list(...)
- if (length(cols) >= 1 && class(cols[[1]]) == "character") {
- sgd <- callJMethod(x@sdf, "groupBy", cols[[1]], listToSeq(cols[-1]))
- } else {
- jcol <- lapply(cols, function(c) { c@jc })
- sgd <- callJMethod(x@sdf, "groupBy", listToSeq(jcol))
- }
- groupedData(sgd)
- })
-
-#' @rdname groupBy
-#' @name group_by
-setMethod("group_by",
- signature(x = "DataFrame"),
- function(x, ...) {
- groupBy(x, ...)
- })
-
-#' Summarize data across columns
-#'
-#' Compute aggregates by specifying a list of columns
-#'
-#' @param x a DataFrame
-#' @rdname agg
-#' @name agg
-#' @aliases summarize
-#' @export
-setMethod("agg",
- signature(x = "DataFrame"),
- function(x, ...) {
- agg(groupBy(x), ...)
- })
-
-#' @rdname agg
-#' @name summarize
-setMethod("summarize",
- signature(x = "DataFrame"),
- function(x, ...) {
- agg(x, ...)
- })
-
-
-############################## RDD Map Functions ##################################
-# All of the following functions mirror the existing RDD map functions, #
-# but allow for use with DataFrames by first converting to an RRDD before calling #
-# the requested map function. #
-###################################################################################
-
-# @rdname lapply
-setMethod("lapply",
- signature(X = "DataFrame", FUN = "function"),
- function(X, FUN) {
- rdd <- toRDD(X)
- lapply(rdd, FUN)
- })
-
-# @rdname lapply
-setMethod("map",
- signature(X = "DataFrame", FUN = "function"),
- function(X, FUN) {
- lapply(X, FUN)
- })
-
-# @rdname flatMap
-setMethod("flatMap",
- signature(X = "DataFrame", FUN = "function"),
- function(X, FUN) {
- rdd <- toRDD(X)
- flatMap(rdd, FUN)
- })
-
-# @rdname lapplyPartition
-setMethod("lapplyPartition",
- signature(X = "DataFrame", FUN = "function"),
- function(X, FUN) {
- rdd <- toRDD(X)
- lapplyPartition(rdd, FUN)
- })
-
-# @rdname lapplyPartition
-setMethod("mapPartitions",
- signature(X = "DataFrame", FUN = "function"),
- function(X, FUN) {
- lapplyPartition(X, FUN)
- })
-
-# @rdname foreach
-setMethod("foreach",
- signature(x = "DataFrame", func = "function"),
- function(x, func) {
- rdd <- toRDD(x)
- foreach(rdd, func)
- })
-
-# @rdname foreach
-setMethod("foreachPartition",
- signature(x = "DataFrame", func = "function"),
- function(x, func) {
- rdd <- toRDD(x)
- foreachPartition(rdd, func)
- })
-
-
-############################## SELECT ##################################
-
-getColumn <- function(x, c) {
- column(callJMethod(x@sdf, "col", c))
-}
-
-#' @rdname select
-#' @name $
-setMethod("$", signature(x = "DataFrame"),
- function(x, name) {
- getColumn(x, name)
- })
-
-#' @rdname select
-#' @name $<-
-setMethod("$<-", signature(x = "DataFrame"),
- function(x, name, value) {
- stopifnot(class(value) == "Column" || is.null(value))
- cols <- columns(x)
- if (name %in% cols) {
- if (is.null(value)) {
- cols <- Filter(function(c) { c != name }, cols)
- }
- cols <- lapply(cols, function(c) {
- if (c == name) {
- alias(value, name)
- } else {
- col(c)
- }
- })
- nx <- select(x, cols)
- } else {
- if (is.null(value)) {
- return(x)
- }
- nx <- withColumn(x, name, value)
- }
- x@sdf <- nx@sdf
- x
- })
-
-setClassUnion("numericOrcharacter", c("numeric", "character"))
-
-#' @rdname subset
-#' @name [[
-setMethod("[[", signature(x = "DataFrame", i = "numericOrcharacter"),
- function(x, i) {
- if (is.numeric(i)) {
- cols <- columns(x)
- i <- cols[[i]]
- }
- getColumn(x, i)
- })
-
-#' @rdname subset
-#' @name [
-setMethod("[", signature(x = "DataFrame", i = "missing"),
- function(x, i, j, ...) {
- if (is.numeric(j)) {
- cols <- columns(x)
- j <- cols[j]
- }
- if (length(j) > 1) {
- j <- as.list(j)
- }
- select(x, j)
- })
-
-#' @rdname subset
-#' @name [
-setMethod("[", signature(x = "DataFrame", i = "Column"),
- function(x, i, j, ...) {
- # It could handle i as "character" but it seems confusing and not required
- # https://stat.ethz.ch/R-manual/R-devel/library/base/html/Extract.data.frame.html
- filtered <- filter(x, i)
- if (!missing(j)) {
- filtered[, j, ...]
- } else {
- filtered
- }
- })
-
-#' Subset
-#'
-#' Return subsets of DataFrame according to given conditions
-#' @param x A DataFrame
-#' @param subset A logical expression to filter on rows
-#' @param select expression for the single Column or a list of columns to select from the DataFrame
-#' @return A new DataFrame containing only the rows that meet the condition with selected columns
-#' @export
-#' @rdname subset
-#' @name subset
-#' @aliases [
-#' @family subsetting functions
-#' @examples
-#' \dontrun{
-#' # Columns can be selected using `[[` and `[`
-#' df[[2]] == df[["age"]]
-#' df[,2] == df[,"age"]
-#' df[,c("name", "age")]
-#' # Or to filter rows
-#' df[df$age > 20,]
-#' # DataFrame can be subset on both rows and Columns
-#' df[df$name == "Smith", c(1,2)]
-#' df[df$age %in% c(19, 30), 1:2]
-#' subset(df, df$age %in% c(19, 30), 1:2)
-#' subset(df, df$age %in% c(19), select = c(1,2))
-#' }
-setMethod("subset", signature(x = "DataFrame"),
- function(x, subset, select, ...) {
- x[subset, select, ...]
- })
-
-#' Select
-#'
-#' Selects a set of columns with names or Column expressions.
-#' @param x A DataFrame
-#' @param col A list of columns or single Column or name
-#' @return A new DataFrame with selected columns
-#' @export
-#' @rdname select
-#' @name select
-#' @family subsetting functions
-#' @examples
-#' \dontrun{
-#' select(df, "*")
-#' select(df, "col1", "col2")
-#' select(df, df$name, df$age + 1)
-#' select(df, c("col1", "col2"))
-#' select(df, list(df$name, df$age + 1))
-#' # Similar to R data frames columns can also be selected using `$`
-#' df$age
-#' }
-setMethod("select", signature(x = "DataFrame", col = "character"),
- function(x, col, ...) {
- sdf <- callJMethod(x@sdf, "select", col, toSeq(...))
- dataFrame(sdf)
- })
-
-#' @rdname select
-#' @export
-setMethod("select", signature(x = "DataFrame", col = "Column"),
- function(x, col, ...) {
- jcols <- lapply(list(col, ...), function(c) {
- c@jc
- })
- sdf <- callJMethod(x@sdf, "select", listToSeq(jcols))
- dataFrame(sdf)
- })
-
-#' @rdname select
-#' @export
-setMethod("select",
- signature(x = "DataFrame", col = "list"),
- function(x, col) {
- cols <- lapply(col, function(c) {
- if (class(c) == "Column") {
- c@jc
- } else {
- col(c)@jc
- }
- })
- sdf <- callJMethod(x@sdf, "select", listToSeq(cols))
- dataFrame(sdf)
- })
-
-#' SelectExpr
-#'
-#' Select from a DataFrame using a set of SQL expressions.
-#'
-#' @param x A DataFrame to be selected from.
-#' @param expr A string containing a SQL expression
-#' @param ... Additional expressions
-#' @return A DataFrame
-#' @rdname selectExpr
-#' @name selectExpr
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' selectExpr(df, "col1", "(col2 * 5) as newCol")
-#' }
-setMethod("selectExpr",
- signature(x = "DataFrame", expr = "character"),
- function(x, expr, ...) {
- exprList <- list(expr, ...)
- sdf <- callJMethod(x@sdf, "selectExpr", listToSeq(exprList))
- dataFrame(sdf)
- })
-
-#' WithColumn
-#'
-#' Return a new DataFrame with the specified column added.
-#'
-#' @param x A DataFrame
-#' @param colName A string containing the name of the new column.
-#' @param col A Column expression.
-#' @return A DataFrame with the new column added.
-#' @rdname withColumn
-#' @name withColumn
-#' @aliases mutate transform
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' newDF <- withColumn(df, "newCol", df$col1 * 5)
-#' }
-setMethod("withColumn",
- signature(x = "DataFrame", colName = "character", col = "Column"),
- function(x, colName, col) {
- select(x, x$"*", alias(col, colName))
- })
-
-#' Mutate
-#'
-#' Return a new DataFrame with the specified columns added.
-#'
-#' @param .data A DataFrame
-#' @param col a named argument of the form name = col
-#' @return A new DataFrame with the new columns added.
-#' @rdname withColumn
-#' @name mutate
-#' @aliases withColumn transform
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' newDF <- mutate(df, newCol = df$col1 * 5, newCol2 = df$col1 * 2)
-#' names(newDF) # Will contain newCol, newCol2
-#' newDF2 <- transform(df, newCol = df$col1 / 5, newCol2 = df$col1 * 2)
-#' }
-setMethod("mutate",
- signature(.data = "DataFrame"),
- function(.data, ...) {
- x <- .data
- cols <- list(...)
- stopifnot(length(cols) > 0)
- stopifnot(class(cols[[1]]) == "Column")
- ns <- names(cols)
- if (!is.null(ns)) {
- for (n in ns) {
- if (n != "") {
- cols[[n]] <- alias(cols[[n]], n)
- }
- }
- }
- do.call(select, c(x, x$"*", cols))
- })
-
-#' @export
-#' @rdname withColumn
-#' @name transform
-#' @aliases withColumn mutate
-setMethod("transform",
- signature(`_data` = "DataFrame"),
- function(`_data`, ...) {
- mutate(`_data`, ...)
- })
-
-#' WithColumnRenamed
-#'
-#' Rename an existing column in a DataFrame.
-#'
-#' @param x A DataFrame
-#' @param existingCol The name of the column you want to change.
-#' @param newCol The new column name.
-#' @return A DataFrame with the column name changed.
-#' @rdname withColumnRenamed
-#' @name withColumnRenamed
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' newDF <- withColumnRenamed(df, "col1", "newCol1")
-#' }
-setMethod("withColumnRenamed",
- signature(x = "DataFrame", existingCol = "character", newCol = "character"),
- function(x, existingCol, newCol) {
- cols <- lapply(columns(x), function(c) {
- if (c == existingCol) {
- alias(col(c), newCol)
- } else {
- col(c)
- }
- })
- select(x, cols)
- })
-
-#' Rename
-#'
-#' Rename an existing column in a DataFrame.
-#'
-#' @param x A DataFrame
-#' @param newCol A named pair of the form new_column_name = existing_column
-#' @return A DataFrame with the column name changed.
-#' @rdname withColumnRenamed
-#' @name rename
-#' @aliases withColumnRenamed
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' newDF <- rename(df, col1 = df$newCol1)
-#' }
-setMethod("rename",
- signature(x = "DataFrame"),
- function(x, ...) {
- renameCols <- list(...)
- stopifnot(length(renameCols) > 0)
- stopifnot(class(renameCols[[1]]) == "Column")
- newNames <- names(renameCols)
- oldNames <- lapply(renameCols, function(col) {
- callJMethod(col@jc, "toString")
- })
- cols <- lapply(columns(x), function(c) {
- if (c %in% oldNames) {
- alias(col(c), newNames[[match(c, oldNames)]])
- } else {
- col(c)
- }
- })
- select(x, cols)
- })
-
-setClassUnion("characterOrColumn", c("character", "Column"))
-
-#' Arrange
-#'
-#' Sort a DataFrame by the specified column(s).
-#'
-#' @param x A DataFrame to be sorted.
-#' @param col Either a Column object or character vector indicating the field to sort on
-#' @param ... Additional sorting fields
-#' @return A DataFrame where all elements are sorted.
-#' @rdname arrange
-#' @name arrange
-#' @aliases orderby
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' arrange(df, df$col1)
-#' arrange(df, "col1")
-#' arrange(df, asc(df$col1), desc(abs(df$col2)))
-#' }
-setMethod("arrange",
- signature(x = "DataFrame", col = "characterOrColumn"),
- function(x, col, ...) {
- if (class(col) == "character") {
- sdf <- callJMethod(x@sdf, "sort", col, toSeq(...))
- } else if (class(col) == "Column") {
- jcols <- lapply(list(col, ...), function(c) {
- c@jc
- })
- sdf <- callJMethod(x@sdf, "sort", listToSeq(jcols))
- }
- dataFrame(sdf)
- })
-
-#' @rdname arrange
-#' @name orderby
-setMethod("orderBy",
- signature(x = "DataFrame", col = "characterOrColumn"),
- function(x, col) {
- arrange(x, col)
- })
-
-#' Filter
-#'
-#' Filter the rows of a DataFrame according to a given condition.
-#'
-#' @param x A DataFrame to be sorted.
-#' @param condition The condition to filter on. This may either be a Column expression
-#' or a string containing a SQL statement
-#' @return A DataFrame containing only the rows that meet the condition.
-#' @rdname filter
-#' @name filter
-#' @family subsetting functions
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' filter(df, "col1 > 0")
-#' filter(df, df$col2 != "abcdefg")
-#' }
-setMethod("filter",
- signature(x = "DataFrame", condition = "characterOrColumn"),
- function(x, condition) {
- if (class(condition) == "Column") {
- condition <- condition@jc
- }
- sdf <- callJMethod(x@sdf, "filter", condition)
- dataFrame(sdf)
- })
-
-#' @rdname filter
-#' @name where
-setMethod("where",
- signature(x = "DataFrame", condition = "characterOrColumn"),
- function(x, condition) {
- filter(x, condition)
- })
-
-#' Join
-#'
-#' Join two DataFrames based on the given join expression.
-#'
-#' @param x A Spark DataFrame
-#' @param y A Spark DataFrame
-#' @param joinExpr (Optional) The expression used to perform the join. joinExpr must be a
-#' Column expression. If joinExpr is omitted, join() wil perform a Cartesian join
-#' @param joinType The type of join to perform. The following join types are available:
-#' 'inner', 'outer', 'left_outer', 'right_outer', 'semijoin'. The default joinType is "inner".
-#' @return A DataFrame containing the result of the join operation.
-#' @rdname join
-#' @name join
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' df1 <- jsonFile(sqlContext, path)
-#' df2 <- jsonFile(sqlContext, path2)
-#' join(df1, df2) # Performs a Cartesian
-#' join(df1, df2, df1$col1 == df2$col2) # Performs an inner join based on expression
-#' join(df1, df2, df1$col1 == df2$col2, "right_outer")
-#' }
-setMethod("join",
- signature(x = "DataFrame", y = "DataFrame"),
- function(x, y, joinExpr = NULL, joinType = NULL) {
- if (is.null(joinExpr)) {
- sdf <- callJMethod(x@sdf, "join", y@sdf)
- } else {
- if (class(joinExpr) != "Column") stop("joinExpr must be a Column")
- if (is.null(joinType)) {
- sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc)
- } else {
- if (joinType %in% c("inner", "outer", "left_outer", "right_outer", "semijoin")) {
- sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc, joinType)
- } else {
- stop("joinType must be one of the following types: ",
- "'inner', 'outer', 'left_outer', 'right_outer', 'semijoin'")
- }
- }
- }
- dataFrame(sdf)
- })
-
-#' @rdname merge
-#' @name merge
-#' @aliases join
-setMethod("merge",
- signature(x = "DataFrame", y = "DataFrame"),
- function(x, y, joinExpr = NULL, joinType = NULL, ...) {
- join(x, y, joinExpr, joinType)
- })
-
-
-#' UnionAll
-#'
-#' Return a new DataFrame containing the union of rows in this DataFrame
-#' and another DataFrame. This is equivalent to `UNION ALL` in SQL.
-#' Note that this does not remove duplicate rows across the two DataFrames.
-#'
-#' @param x A Spark DataFrame
-#' @param y A Spark DataFrame
-#' @return A DataFrame containing the result of the union.
-#' @rdname unionAll
-#' @name unionAll
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' df1 <- jsonFile(sqlContext, path)
-#' df2 <- jsonFile(sqlContext, path2)
-#' unioned <- unionAll(df, df2)
-#' }
-setMethod("unionAll",
- signature(x = "DataFrame", y = "DataFrame"),
- function(x, y) {
- unioned <- callJMethod(x@sdf, "unionAll", y@sdf)
- dataFrame(unioned)
- })
-
-#' @title Union two or more DataFrames
-#
-#' @description Returns a new DataFrame containing rows of all parameters.
-#
-#' @rdname rbind
-#' @name rbind
-#' @aliases unionAll
-setMethod("rbind",
- signature(... = "DataFrame"),
- function(x, ..., deparse.level = 1) {
- if (nargs() == 3) {
- unionAll(x, ...)
- } else {
- unionAll(x, Recall(..., deparse.level = 1))
- }
- })
-
-#' Intersect
-#'
-#' Return a new DataFrame containing rows only in both this DataFrame
-#' and another DataFrame. This is equivalent to `INTERSECT` in SQL.
-#'
-#' @param x A Spark DataFrame
-#' @param y A Spark DataFrame
-#' @return A DataFrame containing the result of the intersect.
-#' @rdname intersect
-#' @name intersect
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' df1 <- jsonFile(sqlContext, path)
-#' df2 <- jsonFile(sqlContext, path2)
-#' intersectDF <- intersect(df, df2)
-#' }
-setMethod("intersect",
- signature(x = "DataFrame", y = "DataFrame"),
- function(x, y) {
- intersected <- callJMethod(x@sdf, "intersect", y@sdf)
- dataFrame(intersected)
- })
-
-#' except
-#'
-#' Return a new DataFrame containing rows in this DataFrame
-#' but not in another DataFrame. This is equivalent to `EXCEPT` in SQL.
-#'
-#' @param x A Spark DataFrame
-#' @param y A Spark DataFrame
-#' @return A DataFrame containing the result of the except operation.
-#' @rdname except
-#' @name except
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' df1 <- jsonFile(sqlContext, path)
-#' df2 <- jsonFile(sqlContext, path2)
-#' exceptDF <- except(df, df2)
-#' }
-#' @rdname except
-#' @export
-setMethod("except",
- signature(x = "DataFrame", y = "DataFrame"),
- function(x, y) {
- excepted <- callJMethod(x@sdf, "except", y@sdf)
- dataFrame(excepted)
- })
-
-#' Save the contents of the DataFrame to a data source
-#'
-#' The data source is specified by the `source` and a set of options (...).
-#' If `source` is not specified, the default data source configured by
-#' spark.sql.sources.default will be used.
-#'
-#' Additionally, mode is used to specify the behavior of the save operation when
-#' data already exists in the data source. There are four modes:
-#' append: Contents of this DataFrame are expected to be appended to existing data.
-#' overwrite: Existing data is expected to be overwritten by the contents of
-# this DataFrame.
-#' error: An exception is expected to be thrown.
-#' ignore: The save operation is expected to not save the contents of the DataFrame
-# and to not change the existing data.
-#'
-#' @param df A SparkSQL DataFrame
-#' @param path A name for the table
-#' @param source A name for external data source
-#' @param mode One of 'append', 'overwrite', 'error', 'ignore'
-#'
-#' @rdname write.df
-#' @name write.df
-#' @aliases saveDF
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' write.df(df, "myfile", "parquet", "overwrite")
-#' }
-setMethod("write.df",
- signature(df = "DataFrame", path = "character"),
- function(df, path, source = NULL, mode = "append", ...){
- if (is.null(source)) {
- sqlContext <- get(".sparkRSQLsc", envir = .sparkREnv)
- source <- callJMethod(sqlContext, "getConf", "spark.sql.sources.default",
- "org.apache.spark.sql.parquet")
- }
- allModes <- c("append", "overwrite", "error", "ignore")
- # nolint start
- if (!(mode %in% allModes)) {
- stop('mode should be one of "append", "overwrite", "error", "ignore"')
- }
- # nolint end
- jmode <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "saveMode", mode)
- options <- varargsToEnv(...)
- if (!is.null(path)) {
- options[["path"]] <- path
- }
- callJMethod(df@sdf, "save", source, jmode, options)
- })
-
-#' @rdname write.df
-#' @name saveDF
-#' @export
-setMethod("saveDF",
- signature(df = "DataFrame", path = "character"),
- function(df, path, source = NULL, mode = "append", ...){
- write.df(df, path, source, mode, ...)
- })
-
-#' saveAsTable
-#'
-#' Save the contents of the DataFrame to a data source as a table
-#'
-#' The data source is specified by the `source` and a set of options (...).
-#' If `source` is not specified, the default data source configured by
-#' spark.sql.sources.default will be used.
-#'
-#' Additionally, mode is used to specify the behavior of the save operation when
-#' data already exists in the data source. There are four modes:
-#' append: Contents of this DataFrame are expected to be appended to existing data.
-#' overwrite: Existing data is expected to be overwritten by the contents of
-# this DataFrame.
-#' error: An exception is expected to be thrown.
-#' ignore: The save operation is expected to not save the contents of the DataFrame
-# and to not change the existing data.
-#'
-#' @param df A SparkSQL DataFrame
-#' @param tableName A name for the table
-#' @param source A name for external data source
-#' @param mode One of 'append', 'overwrite', 'error', 'ignore'
-#'
-#' @rdname saveAsTable
-#' @name saveAsTable
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' saveAsTable(df, "myfile")
-#' }
-setMethod("saveAsTable",
- signature(df = "DataFrame", tableName = "character", source = "character",
- mode = "character"),
- function(df, tableName, source = NULL, mode="append", ...){
- if (is.null(source)) {
- sqlContext <- get(".sparkRSQLsc", envir = .sparkREnv)
- source <- callJMethod(sqlContext, "getConf", "spark.sql.sources.default",
- "org.apache.spark.sql.parquet")
- }
- allModes <- c("append", "overwrite", "error", "ignore")
- # nolint start
- if (!(mode %in% allModes)) {
- stop('mode should be one of "append", "overwrite", "error", "ignore"')
- }
- # nolint end
- jmode <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "saveMode", mode)
- options <- varargsToEnv(...)
- callJMethod(df@sdf, "saveAsTable", tableName, source, jmode, options)
- })
-
-#' describe
-#'
-#' Computes statistics for numeric columns.
-#' If no columns are given, this function computes statistics for all numerical columns.
-#'
-#' @param x A DataFrame to be computed.
-#' @param col A string of name
-#' @param ... Additional expressions
-#' @return A DataFrame
-#' @rdname describe
-#' @name describe
-#' @aliases summary
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlContext <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlContext, path)
-#' describe(df)
-#' describe(df, "col1")
-#' describe(df, "col1", "col2")
-#' }
-setMethod("describe",
- signature(x = "DataFrame", col = "character"),
- function(x, col, ...) {
- colList <- list(col, ...)
- sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
- dataFrame(sdf)
- })
-
-#' @rdname describe
-#' @name describe
-setMethod("describe",
- signature(x = "DataFrame"),
- function(x) {
- colList <- as.list(c(columns(x)))
- sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
- dataFrame(sdf)
- })
-
-#' @title Summary
-#'
-#' @description Computes statistics for numeric columns of the DataFrame
-#'
-#' @rdname summary
-#' @name summary
-setMethod("summary",
- signature(x = "DataFrame"),
- function(x) {
- describe(x)
- })
-
-
-#' dropna
-#'
-#' Returns a new DataFrame omitting rows with null values.
-#'
-#' @param x A SparkSQL DataFrame.
-#' @param how "any" or "all".
-#' if "any", drop a row if it contains any nulls.
-#' if "all", drop a row only if all its values are null.
-#' if minNonNulls is specified, how is ignored.
-#' @param minNonNulls If specified, drop rows that have less than
-#' minNonNulls non-null values.
-#' This overwrites the how parameter.
-#' @param cols Optional list of column names to consider.
-#' @return A DataFrame
-#'
-#' @rdname nafunctions
-#' @name dropna
-#' @aliases na.omit
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
-#' dropna(df)
-#' }
-setMethod("dropna",
- signature(x = "DataFrame"),
- function(x, how = c("any", "all"), minNonNulls = NULL, cols = NULL) {
- how <- match.arg(how)
- if (is.null(cols)) {
- cols <- columns(x)
- }
- if (is.null(minNonNulls)) {
- minNonNulls <- if (how == "any") { length(cols) } else { 1 }
- }
-
- naFunctions <- callJMethod(x@sdf, "na")
- sdf <- callJMethod(naFunctions, "drop",
- as.integer(minNonNulls), listToSeq(as.list(cols)))
- dataFrame(sdf)
- })
-
-#' @rdname nafunctions
-#' @name na.omit
-#' @export
-setMethod("na.omit",
- signature(object = "DataFrame"),
- function(object, how = c("any", "all"), minNonNulls = NULL, cols = NULL) {
- dropna(object, how, minNonNulls, cols)
- })
-
-#' fillna
-#'
-#' Replace null values.
-#'
-#' @param x A SparkSQL DataFrame.
-#' @param value Value to replace null values with.
-#' Should be an integer, numeric, character or named list.
-#' If the value is a named list, then cols is ignored and
-#' value must be a mapping from column name (character) to
-#' replacement value. The replacement value must be an
-#' integer, numeric or character.
-#' @param cols optional list of column names to consider.
-#' Columns specified in cols that do not have matching data
-#' type are ignored. For example, if value is a character, and
-#' subset contains a non-character column, then the non-character
-#' column is simply ignored.
-#' @return A DataFrame
-#'
-#' @rdname nafunctions
-#' @name fillna
-#' @export
-#' @examples
-#'\dontrun{
-#' sc <- sparkR.init()
-#' sqlCtx <- sparkRSQL.init(sc)
-#' path <- "path/to/file.json"
-#' df <- jsonFile(sqlCtx, path)
-#' fillna(df, 1)
-#' fillna(df, list("age" = 20, "name" = "unknown"))
-#' }
-setMethod("fillna",
- signature(x = "DataFrame"),
- function(x, value, cols = NULL) {
- if (!(class(value) %in% c("integer", "numeric", "character", "list"))) {
- stop("value should be an integer, numeric, charactor or named list.")
- }
-
- if (class(value) == "list") {
- # Check column names in the named list
- colNames <- names(value)
- if (length(colNames) == 0 || !all(colNames != "")) {
- stop("value should be an a named list with each name being a column name.")
- }
-
- # Convert to the named list to an environment to be passed to JVM
- valueMap <- new.env()
- for (col in colNames) {
- # Check each item in the named list is of valid type
- v <- value[[col]]
- if (!(class(v) %in% c("integer", "numeric", "character"))) {
- stop("Each item in value should be an integer, numeric or charactor.")
- }
- valueMap[[col]] <- v
- }
-
- # When value is a named list, caller is expected not to pass in cols
- if (!is.null(cols)) {
- warning("When value is a named list, cols is ignored!")
- cols <- NULL
- }
-
- value <- valueMap
- } else if (is.integer(value)) {
- # Cast an integer to a numeric
- value <- as.numeric(value)
- }
-
- naFunctions <- callJMethod(x@sdf, "na")
- sdf <- if (length(cols) == 0) {
- callJMethod(naFunctions, "fill", value)
- } else {
- callJMethod(naFunctions, "fill", value, listToSeq(as.list(cols)))
- }
- dataFrame(sdf)
- })
-
-#' crosstab
-#'
-#' Computes a pair-wise frequency table of the given columns. Also known as a contingency
-#' table. The number of distinct values for each column should be less than 1e4. At most 1e6
-#' non-zero pair frequencies will be returned.
-#'
-#' @param col1 name of the first column. Distinct items will make the first item of each row.
-#' @param col2 name of the second column. Distinct items will make the column names of the output.
-#' @return a local R data.frame representing the contingency table. The first column of each row
-#' will be the distinct values of `col1` and the column names will be the distinct values
-#' of `col2`. The name of the first column will be `$col1_$col2`. Pairs that have no
-#' occurrences will have zero as their counts.
-#'
-#' @rdname statfunctions
-#' @name crosstab
-#' @export
-#' @examples
-#' \dontrun{
-#' df <- jsonFile(sqlCtx, "/path/to/file.json")
-#' ct = crosstab(df, "title", "gender")
-#' }
-setMethod("crosstab",
- signature(x = "DataFrame", col1 = "character", col2 = "character"),
- function(x, col1, col2) {
- statFunctions <- callJMethod(x@sdf, "stat")
- sct <- callJMethod(statFunctions, "crosstab", col1, col2)
- collect(dataFrame(sct))
- })