You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2020/06/08 17:15:32 UTC
[arrow] branch master updated: ARROW-8799: [C++][Parquet] NestedListReader needs to handle empty item batches
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 5c8f215 ARROW-8799: [C++][Parquet] NestedListReader needs to handle empty item batches
5c8f215 is described below
commit 5c8f2158f7f22793c8130fe544ab9f70fa0b92c6
Author: Benjamin Kietzman <be...@gmail.com>
AuthorDate: Mon Jun 8 12:14:59 2020 -0500
ARROW-8799: [C++][Parquet] NestedListReader needs to handle empty item batches
Closes #7181 from bkietz/8799-Reading-list-column-as-ne
Authored-by: Benjamin Kietzman <be...@gmail.com>
Signed-off-by: Wes McKinney <we...@apache.org>
---
cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 24 +++++++++++++++++++++++
cpp/src/parquet/arrow/reader.cc | 21 ++++++++++++++------
2 files changed, 39 insertions(+), 6 deletions(-)
diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index f5ebd2d..e26cf4e 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -3031,6 +3031,30 @@ TEST_P(TestArrowReadDictionary, ReadWholeFileDict) {
CheckReadWholeFile(*ex_table);
}
+TEST_P(TestArrowReadDictionary, ZeroChunksListOfDictionary) {
+ // ARROW-8799
+ properties_.set_read_dictionary(0, true);
+ dense_values_.reset();
+ auto values = std::make_shared<ChunkedArray>(::arrow::ArrayVector{},
+ ::arrow::list(::arrow::utf8()));
+ options.num_rows = 0;
+ options.num_row_groups = 1;
+ expected_dense_ = MakeSimpleTable(values, false);
+
+ WriteSimple();
+
+ ASSERT_OK_AND_ASSIGN(auto reader, GetReader());
+
+ std::unique_ptr<ColumnReader> column_reader;
+ ASSERT_OK_NO_THROW(reader->GetColumn(0, &column_reader));
+
+ std::shared_ptr<ChunkedArray> chunked_out;
+ ASSERT_OK(column_reader->NextBatch(1 << 15, &chunked_out));
+
+ ASSERT_EQ(chunked_out->length(), 0);
+ ASSERT_EQ(chunked_out->num_chunks(), 1);
+}
+
TEST_P(TestArrowReadDictionary, IncrementalReads) {
// ARROW-6895
options.num_rows = 100;
diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc
index 3096ee8..9c6cc10 100644
--- a/cpp/src/parquet/arrow/reader.cc
+++ b/cpp/src/parquet/arrow/reader.cc
@@ -489,11 +489,20 @@ class NestedListReader : public ColumnReaderImpl {
RETURN_NOT_OK(item_reader_->NextBatch(records_to_read, out));
- // ARROW-3762(wesm): If item reader yields a chunked array, we reject as
- // this is not yet implemented
- if ((*out)->num_chunks() > 1) {
- return Status::NotImplemented(
- "Nested data conversions not implemented for chunked array outputs");
+ std::shared_ptr<Array> item_chunk;
+ switch ((*out)->num_chunks()) {
+ case 0: {
+ ARROW_ASSIGN_OR_RAISE(item_chunk, ::arrow::MakeArrayOfNull((*out)->type(), 0));
+ break;
+ }
+ case 1:
+ item_chunk = (*out)->chunk(0);
+ break;
+ default:
+ // ARROW-3762(wesm): If item reader yields a chunked array, we reject as
+ // this is not yet implemented
+ return Status::NotImplemented(
+ "Nested data conversions not implemented for chunked array outputs");
}
const int16_t* def_levels;
@@ -502,7 +511,7 @@ class NestedListReader : public ColumnReaderImpl {
RETURN_NOT_OK(item_reader_->GetDefLevels(&def_levels, &num_levels));
RETURN_NOT_OK(item_reader_->GetRepLevels(&rep_levels, &num_levels));
std::shared_ptr<Array> result;
- RETURN_NOT_OK(ReconstructNestedList((*out)->chunk(0), field_, max_definition_level_,
+ RETURN_NOT_OK(ReconstructNestedList(item_chunk, field_, max_definition_level_,
max_repetition_level_, def_levels, rep_levels,
num_levels, ctx_->pool, &result));
*out = std::make_shared<ChunkedArray>(result);