Posted to commits@arrow.apache.org by uw...@apache.org on 2018/12/28 14:57:12 UTC

[arrow] branch master updated: ARROW-3020: [C++/Python] Allow empty arrow::Table objects to be written as empty Parquet row groups

This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 68daba2  ARROW-3020: [C++/Python] Allow empty arrow::Table objects to be written as empty Parquet row groups
68daba2 is described below

commit 68daba2ba7390d0afee072aa00271a60d8ad4b07
Author: Wes McKinney <we...@apache.org>
AuthorDate: Fri Dec 28 15:56:55 2018 +0100

    ARROW-3020: [C++/Python] Allow empty arrow::Table objects to be written as empty Parquet row groups
    
    While it's unclear how useful this is, it at least preserves the user's intent when they call `write_table` with an empty table. A usage sketch follows the diffstat below.
    
    Author: Wes McKinney <we...@apache.org>
    
    Closes #3269 from wesm/ARROW-3020 and squashes the following commits:
    
    b8c0cc2d <Wes McKinney> Revert changes to CMakeLists.txt
    12b92cf6 <Wes McKinney> Allow empty arrow::Table objects to be written as empty Parquet row groups, and read back
---
 cpp/src/parquet/arrow/writer.cc      | 30 ++++++++++++++++++++++--------
 python/pyarrow/_parquet.pyx          | 13 ++++++-------
 python/pyarrow/tests/test_parquet.py | 18 ++++++++++++++++++
 3 files changed, 46 insertions(+), 15 deletions(-)
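
A minimal usage sketch of the behavior this patch enables, assuming pyarrow
built from this commit (the file name is illustrative). Before this change,
the row-group loop in FileWriter::WriteTable never executed for a zero-row
table, so no row group was emitted at all:

    import pyarrow as pa
    import pyarrow.parquet as pq

    # Zero-row table: the schema is known, but there is no data
    table = pa.Table.from_arrays([pa.array([], type='int32')], ['f0'])

    # With this patch, the empty table is written as a single row group
    # containing 0 rows instead of a file with no row groups
    pq.write_table(table, 'empty.parquet')

    reader = pq.ParquetFile('empty.parquet')
    assert reader.metadata.num_row_groups == 1
    assert reader.read_row_group(0).equals(table)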

diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc
index a8153ca..a5c0a62 100644
--- a/cpp/src/parquet/arrow/writer.cc
+++ b/cpp/src/parquet/arrow/writer.cc
@@ -312,6 +312,10 @@ class ArrowColumnWriter {
   Status Write(const Array& data);
 
   Status Write(const ChunkedArray& data, int64_t offset, const int64_t size) {
+    if (data.length() == 0) {
+      return Status::OK();
+    }
+
     int64_t absolute_position = 0;
     int chunk_index = 0;
     int64_t chunk_offset = 0;
@@ -1134,22 +1138,32 @@ Status WriteFileMetaData(const FileMetaData& file_metadata,
 namespace {}  // namespace
 
 Status FileWriter::WriteTable(const Table& table, int64_t chunk_size) {
-  if (chunk_size <= 0) {
+  if (chunk_size <= 0 && table.num_rows() > 0) {
     return Status::Invalid("chunk size per row_group must be greater than 0");
   } else if (chunk_size > impl_->properties().max_row_group_length()) {
     chunk_size = impl_->properties().max_row_group_length();
   }
 
-  for (int chunk = 0; chunk * chunk_size < table.num_rows(); chunk++) {
-    int64_t offset = chunk * chunk_size;
-    int64_t size = std::min(chunk_size, table.num_rows() - offset);
-
-    RETURN_NOT_OK_ELSE(NewRowGroup(size), PARQUET_IGNORE_NOT_OK(Close()));
+  auto WriteRowGroup = [&](int64_t offset, int64_t size) {
+    RETURN_NOT_OK(NewRowGroup(size));
     for (int i = 0; i < table.num_columns(); i++) {
       auto chunked_data = table.column(i)->data();
-      RETURN_NOT_OK_ELSE(WriteColumnChunk(chunked_data, offset, size),
-                         PARQUET_IGNORE_NOT_OK(Close()));
+      RETURN_NOT_OK(WriteColumnChunk(chunked_data, offset, size));
     }
+    return Status::OK();
+  };
+
+  if (table.num_rows() == 0) {
+    // Append a row group with 0 rows
+    RETURN_NOT_OK_ELSE(WriteRowGroup(0, 0), PARQUET_IGNORE_NOT_OK(Close()));
+    return Status::OK();
+  }
+
+  for (int chunk = 0; chunk * chunk_size < table.num_rows(); chunk++) {
+    int64_t offset = chunk * chunk_size;
+    RETURN_NOT_OK_ELSE(
+        WriteRowGroup(offset, std::min(chunk_size, table.num_rows() - offset)),
+        PARQUET_IGNORE_NOT_OK(Close()));
   }
   return Status::OK();
 }
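
For clarity, the row-group chunking that the rewritten WriteTable performs
can be sketched in Python; the helper name below is illustrative, not part
of any API:

    def row_group_bounds(num_rows, chunk_size):
        # Mirrors FileWriter::WriteTable: yields one (offset, size)
        # pair per row group to be written
        if num_rows == 0:
            # New case added by this commit: a single row group with 0 rows
            yield (0, 0)
            return
        chunk = 0
        while chunk * chunk_size < num_rows:
            offset = chunk * chunk_size
            yield (offset, min(chunk_size, num_rows - offset))
            chunk += 1

    assert list(row_group_bounds(0, 1000)) == [(0, 0)]
    assert list(row_group_bounds(2500, 1000)) == [(0, 1000), (1000, 1000),
                                                  (2000, 500)]
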
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 2e92bac..fcecaf5 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -909,17 +909,16 @@ cdef class ParquetWriter:
                 check_status(self.sink.get().Close())
 
     def write_table(self, Table table, row_group_size=None):
-        cdef CTable* ctable = table.table
+        cdef:
+            CTable* ctable = table.table
+            int64_t c_row_group_size
 
         if row_group_size is None or row_group_size == -1:
-            if ctable.num_rows() > 0:
-                row_group_size = ctable.num_rows()
-            else:
-                row_group_size = 1
+            c_row_group_size = ctable.num_rows()
         elif row_group_size == 0:
             raise ValueError('Row group size cannot be 0')
-
-        cdef int64_t c_row_group_size = row_group_size
+        else:
+            c_row_group_size = row_group_size
 
         with nogil:
             check_status(self.writer.get()
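
With this change, row_group_size=None (the default) or -1 now maps to the
table's own row count, zero included, while an explicit 0 is still rejected.
A sketch of the resulting behavior, reusing the empty table from above (the
file name is illustrative):

    import pyarrow as pa
    import pyarrow.parquet as pq

    empty = pa.Table.from_arrays([pa.array([], type='int32')], ['f0'])

    with pq.ParquetWriter('t.parquet', empty.schema) as writer:
        # None resolves to num_rows == 0: appends one empty row group
        writer.write_table(empty)
        # -1 is treated the same as None
        writer.write_table(empty, row_group_size=-1)

    # An explicit size of 0 still raises:
    #   writer.write_table(empty, row_group_size=0)  ->  ValueError
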
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 82c80e9..9f05170 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -2251,6 +2251,24 @@ def test_merging_parquet_tables_with_different_pandas_metadata(tempdir):
     writer.write_table(table2)
 
 
+def test_empty_row_groups(tempdir):
+    # ARROW-3020
+    table = pa.Table.from_arrays([pa.array([], type='int32')], ['f0'])
+
+    path = tempdir / 'empty_row_groups.parquet'
+
+    num_groups = 3
+    with pq.ParquetWriter(path, table.schema) as writer:
+        for i in range(num_groups):
+            writer.write_table(table)
+
+    reader = pq.ParquetFile(path)
+    assert reader.metadata.num_row_groups == num_groups
+
+    for i in range(num_groups):
+        assert reader.read_row_group(i).equals(table)
+
+
 def test_writing_empty_lists():
     # ARROW-2591: [Python] Segmentation fault issue in pq.write_table
     arr1 = pa.array([[], []], pa.list_(pa.int32()))