You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by fs...@apache.org on 2019/06/18 16:35:19 UTC

[arrow] branch master updated: ARROW-5509: [R] Add basic write_parquet

This is an automated email from the ASF dual-hosted git repository.

fsaintjacques pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 028a6a0  ARROW-5509: [R] Add basic write_parquet
028a6a0 is described below

commit 028a6a0b0b4cd899177ee4f6a11522cfde90cf8e
Author: Romain Francois <ro...@rstudio.com>
AuthorDate: Tue Jun 18 12:34:59 2019 -0400

    ARROW-5509: [R] Add basic write_parquet
    
    Be careful, my first ever R contribution ;)
    
    Author: Romain Francois <ro...@rstudio.com>
    Author: François Saint-Jacques <fs...@gmail.com>
    Author: Uwe L. Korn <uw...@quantco.com>
    
    Closes #4492 from xhochy/ARROW-5509 and squashes the following commits:
    
    b13639fc6 <Romain Francois> wrap write_parquet() example in \dontrun{}
    e3328554b <François Saint-Jacques> Remove set -e in configure
    f20ade0b8 <Romain Francois> update read_table() documentation
    714d1d406 <Romain Francois> document()
    e925d93e1 <Uwe L. Korn> ARROW-5509: Add basic write_parquet
---
 dev/lint/run_linters.sh         |  4 ++++
 r/NAMESPACE                     |  1 +
 r/R/arrowExports.R              |  4 ++++
 r/R/parquet.R                   | 20 ++++++++++++++++++++
 r/man/write_parquet.Rd          | 25 +++++++++++++++++++++++++
 r/src/arrowExports.cpp          | 18 ++++++++++++++++++
 r/src/parquet.cpp               | 12 ++++++++++++
 r/tests/testthat/test-parquet.R | 10 ++++++++++
 8 files changed, 94 insertions(+)

diff --git a/dev/lint/run_linters.sh b/dev/lint/run_linters.sh
index 1b549aa..ba618a7 100755
--- a/dev/lint/run_linters.sh
+++ b/dev/lint/run_linters.sh
@@ -29,3 +29,7 @@ pushd /arrow/python
   flake8 --count pyarrow
   flake8 --count --config=.flake8.cython pyarrow
 popd
+
+pushd /arrow/r
+  ./lint.sh
+popd
diff --git a/r/NAMESPACE b/r/NAMESPACE
index 78cdfd5..799cbe1 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -168,6 +168,7 @@ export(utf8)
 export(write_arrow)
 export(write_feather)
 export(write_feather_RecordBatch)
+export(write_parquet)
 importFrom(R6,R6Class)
 importFrom(Rcpp,sourceCpp)
 importFrom(assertthat,assert_that)
diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
index 8609f9b..a6fd72d 100644
--- a/r/R/arrowExports.R
+++ b/r/R/arrowExports.R
@@ -708,6 +708,10 @@ read_parquet_file <- function(filename){
     .Call(`_arrow_read_parquet_file` , filename)
 }
 
+write_parquet_file <- function(table, filename){
+    invisible(.Call(`_arrow_write_parquet_file` , table, filename))
+}
+
 RecordBatch__num_columns <- function(x){
     .Call(`_arrow_RecordBatch__num_columns` , x)
 }
diff --git a/r/R/parquet.R b/r/R/parquet.R
index 0efed1f..b5e5884 100644
--- a/r/R/parquet.R
+++ b/r/R/parquet.R
@@ -41,3 +41,23 @@ read_parquet <- function(file, as_tibble = TRUE, ...) {
   }
   tab
 }
+
+#' Write Parquet file to disk
+#'
+#' [Parquet](https://parquet.apache.org/) is a columnar storage file format.
+#' This function enables you to write Parquet files from R.
+#'
+#' @param table An [arrow::Table][arrow__Table], or an object convertible to it
+#' @param file a file path
+#'
+#' @examples
+#'
+#' \dontrun{
+#'   tf <- tempfile(fileext = ".parquet")
+#'   write_parquet(tibble::tibble(x = 1:5), tf)
+#' }
+#'
+#' @export
+write_parquet <- function(table, file) {
+  write_parquet_file(to_arrow(table), file)
+}
diff --git a/r/man/write_parquet.Rd b/r/man/write_parquet.Rd
new file mode 100644
index 0000000..b5393a6
--- /dev/null
+++ b/r/man/write_parquet.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/parquet.R
+\name{write_parquet}
+\alias{write_parquet}
+\title{Write Parquet file to disk}
+\usage{
+write_parquet(table, file)
+}
+\arguments{
+\item{table}{An \link[=arrow__Table]{arrow::Table}, or an object convertible to it}
+
+\item{file}{a file path}
+}
+\description{
+\href{https://parquet.apache.org/}{Parquet} is a columnar storage file format.
+This function enables you to write Parquet files from R.
+}
+\examples{
+
+\dontrun{
+  tf <- tempfile(fileext = ".parquet")
+  write_parquet(tibble::tibble(x = 1:5), tf)
+}
+
+}
diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
index 2352184..8d5ed3e 100644
--- a/r/src/arrowExports.cpp
+++ b/r/src/arrowExports.cpp
@@ -2719,6 +2719,23 @@ RcppExport SEXP _arrow_read_parquet_file(SEXP filename_sexp){
 }
 #endif
 
+// parquet.cpp
+#if defined(ARROW_R_WITH_ARROW)
+void write_parquet_file(const std::shared_ptr<arrow::Table>& table, std::string filename);
+RcppExport SEXP _arrow_write_parquet_file(SEXP table_sexp, SEXP filename_sexp){
+BEGIN_RCPP
+	Rcpp::traits::input_parameter<const std::shared_ptr<arrow::Table>&>::type table(table_sexp);
+	Rcpp::traits::input_parameter<std::string>::type filename(filename_sexp);
+	write_parquet_file(table, filename);
+	return R_NilValue;
+END_RCPP
+}
+#else
+RcppExport SEXP _arrow_write_parquet_file(SEXP table_sexp, SEXP filename_sexp){
+	Rf_error("Cannot call write_parquet_file(). Please use arrow::install_arrow() to install required runtime libraries. ");
+}
+#endif
+
 // recordbatch.cpp
 #if defined(ARROW_R_WITH_ARROW)
 int RecordBatch__num_columns(const std::shared_ptr<arrow::RecordBatch>& x);
@@ -3528,6 +3545,7 @@ static const R_CallMethodDef CallEntries[] = {
 		{ "_arrow_ipc___MessageReader__ReadNextMessage", (DL_FUNC) &_arrow_ipc___MessageReader__ReadNextMessage, 1}, 
 		{ "_arrow_ipc___ReadMessage", (DL_FUNC) &_arrow_ipc___ReadMessage, 1}, 
 		{ "_arrow_read_parquet_file", (DL_FUNC) &_arrow_read_parquet_file, 1}, 
+		{ "_arrow_write_parquet_file", (DL_FUNC) &_arrow_write_parquet_file, 2}, 
 		{ "_arrow_RecordBatch__num_columns", (DL_FUNC) &_arrow_RecordBatch__num_columns, 1}, 
 		{ "_arrow_RecordBatch__num_rows", (DL_FUNC) &_arrow_RecordBatch__num_rows, 1}, 
 		{ "_arrow_RecordBatch__schema", (DL_FUNC) &_arrow_RecordBatch__schema, 1}, 
diff --git a/r/src/parquet.cpp b/r/src/parquet.cpp
index f4f5fab..a89801e 100644
--- a/r/src/parquet.cpp
+++ b/r/src/parquet.cpp
@@ -47,4 +47,16 @@ std::shared_ptr<arrow::Table> read_parquet_file(std::string filename) {
 #endif
 }
 
+// [[arrow::export]]
+void write_parquet_file(const std::shared_ptr<arrow::Table>& table,
+                        std::string filename) {
+#ifdef ARROW_R_WITH_PARQUET
+  std::shared_ptr<arrow::io::OutputStream> sink;
+  PARQUET_THROW_NOT_OK(arrow::io::FileOutputStream::Open(filename, &sink));
+  PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, arrow::default_memory_pool(),
+                                                  sink, table->num_rows()));
+#else
+  Rcpp::stop("Support for Parquet is not available.");
+#endif
+}
 #endif
diff --git a/r/tests/testthat/test-parquet.R b/r/tests/testthat/test-parquet.R
index 5ad573c..554744e 100644
--- a/r/tests/testthat/test-parquet.R
+++ b/r/tests/testthat/test-parquet.R
@@ -25,3 +25,13 @@ test_that("reading a known Parquet file to tibble", {
   expect_identical(dim(df), c(10L, 11L))
   # TODO: assert more about the contents
 })
+
+test_that("simple int column roundtrip", {
+  df <- tibble::tibble(x = 1:5)
+  pq_tmp_file <- tempfile() # You can specify the .parquet here but that's probably not necessary
+  on.exit(unlink(pq_tmp_file))
+
+  write_parquet(df, pq_tmp_file)
+  df_read <- read_parquet(pq_tmp_file)
+  expect_identical(df, df_read)
+})