You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2020/06/08 17:15:32 UTC

[arrow] branch master updated: ARROW-8799: [C++][Parquet] NestedListReader needs to handle empty item batches

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 5c8f215  ARROW-8799: [C++][Parquet] NestedListReader needs to handle empty item batches
5c8f215 is described below

commit 5c8f2158f7f22793c8130fe544ab9f70fa0b92c6
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Mon Jun 8 12:14:59 2020 -0500

    ARROW-8799: [C++][Parquet] NestedListReader needs to handle empty item batches
    
    Closes #7181 from bkietz/8799-Reading-list-column-as-ne
    
    Authored-by: Benjamin Kietzman <be...@gmail.com>
    Signed-off-by: Wes McKinney <we...@apache.org>
---
 cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 24 +++++++++++++++++++++++
 cpp/src/parquet/arrow/reader.cc                   | 21 ++++++++++++++------
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index f5ebd2d..e26cf4e 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -3031,6 +3031,30 @@ TEST_P(TestArrowReadDictionary, ReadWholeFileDict) {
   CheckReadWholeFile(*ex_table);
 }
 
+TEST_P(TestArrowReadDictionary, ZeroChunksListOfDictionary) {
+  // ARROW-8799
+  properties_.set_read_dictionary(0, true);
+  dense_values_.reset();
+  auto values = std::make_shared<ChunkedArray>(::arrow::ArrayVector{},
+                                               ::arrow::list(::arrow::utf8()));
+  options.num_rows = 0;
+  options.num_row_groups = 1;
+  expected_dense_ = MakeSimpleTable(values, false);
+
+  WriteSimple();
+
+  ASSERT_OK_AND_ASSIGN(auto reader, GetReader());
+
+  std::unique_ptr<ColumnReader> column_reader;
+  ASSERT_OK_NO_THROW(reader->GetColumn(0, &column_reader));
+
+  std::shared_ptr<ChunkedArray> chunked_out;
+  ASSERT_OK(column_reader->NextBatch(1 << 15, &chunked_out));
+
+  ASSERT_EQ(chunked_out->length(), 0);
+  ASSERT_EQ(chunked_out->num_chunks(), 1);
+}
+
 TEST_P(TestArrowReadDictionary, IncrementalReads) {
   // ARROW-6895
   options.num_rows = 100;
diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc
index 3096ee8..9c6cc10 100644
--- a/cpp/src/parquet/arrow/reader.cc
+++ b/cpp/src/parquet/arrow/reader.cc
@@ -489,11 +489,20 @@ class NestedListReader : public ColumnReaderImpl {
 
     RETURN_NOT_OK(item_reader_->NextBatch(records_to_read, out));
 
-    // ARROW-3762(wesm): If item reader yields a chunked array, we reject as
-    // this is not yet implemented
-    if ((*out)->num_chunks() > 1) {
-      return Status::NotImplemented(
-          "Nested data conversions not implemented for chunked array outputs");
+    std::shared_ptr<Array> item_chunk;
+    switch ((*out)->num_chunks()) {
+      case 0: {
+        ARROW_ASSIGN_OR_RAISE(item_chunk, ::arrow::MakeArrayOfNull((*out)->type(), 0));
+        break;
+      }
+      case 1:
+        item_chunk = (*out)->chunk(0);
+        break;
+      default:
+        // ARROW-3762(wesm): If item reader yields a chunked array, we reject as
+        // this is not yet implemented
+        return Status::NotImplemented(
+            "Nested data conversions not implemented for chunked array outputs");
     }
 
     const int16_t* def_levels;
@@ -502,7 +511,7 @@ class NestedListReader : public ColumnReaderImpl {
     RETURN_NOT_OK(item_reader_->GetDefLevels(&def_levels, &num_levels));
     RETURN_NOT_OK(item_reader_->GetRepLevels(&rep_levels, &num_levels));
     std::shared_ptr<Array> result;
-    RETURN_NOT_OK(ReconstructNestedList((*out)->chunk(0), field_, max_definition_level_,
+    RETURN_NOT_OK(ReconstructNestedList(item_chunk, field_, max_definition_level_,
                                         max_repetition_level_, def_levels, rep_levels,
                                         num_levels, ctx_->pool, &result));
     *out = std::make_shared<ChunkedArray>(result);