You are viewing a plain text version of this content; the canonical (hyperlinked) version is available in the original mailing-list archive.
Posted to commits@arrow.apache.org by fs...@apache.org on 2019/06/18 16:35:19 UTC
[arrow] branch master updated: ARROW-5509: [R] Add basic write_parquet
This is an automated email from the ASF dual-hosted git repository.
fsaintjacques pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 028a6a0 ARROW-5509: [R] Add basic write_parquet
028a6a0 is described below
commit 028a6a0b0b4cd899177ee4f6a11522cfde90cf8e
Author: Romain Francois <ro...@rstudio.com>
AuthorDate: Tue Jun 18 12:34:59 2019 -0400
ARROW-5509: [R] Add basic write_parquet
Be careful, my first ever R contribution ;)
Author: Romain Francois <ro...@rstudio.com>
Author: François Saint-Jacques <fs...@gmail.com>
Author: Uwe L. Korn <uw...@quantco.com>
Closes #4492 from xhochy/ARROW-5509 and squashes the following commits:
b13639fc6 <Romain Francois> wrap write_parquet() example in \dontrun{}
e3328554b <François Saint-Jacques> Remove set -e in configure
f20ade0b8 <Romain Francois> update read_table() documentation
714d1d406 <Romain Francois> document()
e925d93e1 <Uwe L. Korn> ARROW-5509: Add basic write_parquet
---
dev/lint/run_linters.sh | 4 ++++
r/NAMESPACE | 1 +
r/R/arrowExports.R | 4 ++++
r/R/parquet.R | 20 ++++++++++++++++++++
r/man/write_parquet.Rd | 25 +++++++++++++++++++++++++
r/src/arrowExports.cpp | 18 ++++++++++++++++++
r/src/parquet.cpp | 12 ++++++++++++
r/tests/testthat/test-parquet.R | 10 ++++++++++
8 files changed, 94 insertions(+)
diff --git a/dev/lint/run_linters.sh b/dev/lint/run_linters.sh
index 1b549aa..ba618a7 100755
--- a/dev/lint/run_linters.sh
+++ b/dev/lint/run_linters.sh
@@ -29,3 +29,7 @@ pushd /arrow/python
flake8 --count pyarrow
flake8 --count --config=.flake8.cython pyarrow
popd
+
+pushd /arrow/r
+ ./lint.sh
+popd
diff --git a/r/NAMESPACE b/r/NAMESPACE
index 78cdfd5..799cbe1 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -168,6 +168,7 @@ export(utf8)
export(write_arrow)
export(write_feather)
export(write_feather_RecordBatch)
+export(write_parquet)
importFrom(R6,R6Class)
importFrom(Rcpp,sourceCpp)
importFrom(assertthat,assert_that)
diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
index 8609f9b..a6fd72d 100644
--- a/r/R/arrowExports.R
+++ b/r/R/arrowExports.R
@@ -708,6 +708,10 @@ read_parquet_file <- function(filename){
.Call(`_arrow_read_parquet_file` , filename)
}
+write_parquet_file <- function(table, filename){
+ invisible(.Call(`_arrow_write_parquet_file` , table, filename))
+}
+
RecordBatch__num_columns <- function(x){
.Call(`_arrow_RecordBatch__num_columns` , x)
}
diff --git a/r/R/parquet.R b/r/R/parquet.R
index 0efed1f..b5e5884 100644
--- a/r/R/parquet.R
+++ b/r/R/parquet.R
@@ -41,3 +41,23 @@ read_parquet <- function(file, as_tibble = TRUE, ...) {
}
tab
}
+
+#' Write Parquet file to disk
+#'
+#' [Parquet](https://parquet.apache.org/) is a columnar storage file format.
+#' This function enables you to write Parquet files from R.
+#'
+#' @param table An [arrow::Table][arrow__Table], or an object convertible to it
+#' @param file a file path
+#'
+#' @examples
+#'
+#' \dontrun{
+#' tf <- tempfile(fileext = ".parquet")
+#' write_parquet(tibble::tibble(x = 1:5), tf)
+#' }
+#'
+#' @export
+write_parquet <- function(table, file) {
+ write_parquet_file(to_arrow(table), file)
+}
diff --git a/r/man/write_parquet.Rd b/r/man/write_parquet.Rd
new file mode 100644
index 0000000..b5393a6
--- /dev/null
+++ b/r/man/write_parquet.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/parquet.R
+\name{write_parquet}
+\alias{write_parquet}
+\title{Write Parquet file to disk}
+\usage{
+write_parquet(table, file)
+}
+\arguments{
+\item{table}{An \link[=arrow__Table]{arrow::Table}, or an object convertible to it}
+
+\item{file}{a file path}
+}
+\description{
+\href{https://parquet.apache.org/}{Parquet} is a columnar storage file format.
+This function enables you to write Parquet files from R.
+}
+\examples{
+
+\dontrun{
+ tf <- tempfile(fileext = ".parquet")
+ write_parquet(tibble::tibble(x = 1:5), tf)
+}
+
+}
diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
index 2352184..8d5ed3e 100644
--- a/r/src/arrowExports.cpp
+++ b/r/src/arrowExports.cpp
@@ -2719,6 +2719,23 @@ RcppExport SEXP _arrow_read_parquet_file(SEXP filename_sexp){
}
#endif
+// parquet.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void write_parquet_file(const std::shared_ptr<arrow::Table>& table, std::string filename);
+RcppExport SEXP _arrow_write_parquet_file(SEXP table_sexp, SEXP filename_sexp){
+BEGIN_RCPP
+ Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+ Rcpp::traits::input_parameter<std::string>::type filename(filename_sexp);
+ write_parquet_file(table, filename);
+ return R_NilValue;
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_write_parquet_file(SEXP table_sexp, SEXP filename_sexp){
+ Rf_error("Cannot call write_parquet_file(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
// recordbatch.cpp
#if defined(ARROW_R_WITH_ARROW)
int RecordBatch__num_columns(const std::shared_ptr<arrow::RecordBatch>& x);
@@ -3528,6 +3545,7 @@ static const R_CallMethodDef CallEntries[] = {
{ "_arrow_ipc___MessageReader__ReadNextMessage", (DL_FUNC) &_arrow_ipc___MessageReader__ReadNextMessage, 1},
{ "_arrow_ipc___ReadMessage", (DL_FUNC) &_arrow_ipc___ReadMessage, 1},
{ "_arrow_read_parquet_file", (DL_FUNC) &_arrow_read_parquet_file, 1},
+ { "_arrow_write_parquet_file", (DL_FUNC) &_arrow_write_parquet_file, 2},
{ "_arrow_RecordBatch__num_columns", (DL_FUNC) &_arrow_RecordBatch__num_columns, 1},
{ "_arrow_RecordBatch__num_rows", (DL_FUNC) &_arrow_RecordBatch__num_rows, 1},
{ "_arrow_RecordBatch__schema", (DL_FUNC) &_arrow_RecordBatch__schema, 1},
diff --git a/r/src/parquet.cpp b/r/src/parquet.cpp
index f4f5fab..a89801e 100644
--- a/r/src/parquet.cpp
+++ b/r/src/parquet.cpp
@@ -47,4 +47,16 @@ std::shared_ptr<arrow::Table> read_parquet_file(std::string filename) {
#endif
}
+// [[arrow::export]]
+void write_parquet_file(const std::shared_ptr<arrow::Table>& table,
+ std::string filename) {
+#ifdef ARROW_R_WITH_PARQUET
+ std::shared_ptr<arrow::io::OutputStream> sink;
+ PARQUET_THROW_NOT_OK(arrow::io::FileOutputStream::Open(filename, &sink));
+ PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, arrow::default_memory_pool(),
+ sink, table->num_rows()));
+#else
+ Rcpp::stop("Support for Parquet is not available.");
+#endif
+}
#endif
diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R
index 5ad573c..554744e 100644
--- a/r/tests/testthat/test-parquet.R
+++ b/r/tests/testthat/test-parquet.R
@@ -25,3 +25,13 @@ test_that("reading a known Parquet file to tibble", {
expect_identical(dim(df), c(10L, 11L))
# TODO: assert more about the contents
})
+
+test_that("simple int column roundtrip", {
+ df <- tibble::tibble(x = 1:5)
+ pq_tmp_file <- tempfile() # You can specify the .parquet here but that's probably not necessary
+ on.exit(unlink(pq_tmp_file))
+
+ write_parquet(df, pq_tmp_file)
+ df_read <- read_parquet(pq_tmp_file)
+ expect_identical(df, df_read)
+})