You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by we...@apache.org on 2018/08/15 13:04:00 UTC
[parquet-cpp] branch master updated: PARQUET-1378: Allow RowGroups
with zero rows to be written
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-cpp.git
The following commit(s) were added to refs/heads/master by this push:
new 310ef06 PARQUET-1378: Allow RowGroups with zero rows to be written
310ef06 is described below
commit 310ef063727610cfcfee2b9511ff65f7bf517f72
Author: Deepak Majeti <de...@hpe.com>
AuthorDate: Wed Aug 15 09:03:56 2018 -0400
PARQUET-1378: Allow RowGroups with zero rows to be written
Author: Deepak Majeti <de...@hpe.com>
Closes #485 from majetideepak/PARQUET-1378 and squashes the following commits:
a4db300 [Deepak Majeti] Parquet-1378: Allow RowGroups with zero rows to be written
---
src/parquet/column_writer.cc | 3 ++-
src/parquet/file-serialize-test.cc | 23 +++++++++++++++++++++++
src/parquet/metadata.cc | 2 +-
3 files changed, 26 insertions(+), 2 deletions(-)
diff --git a/src/parquet/column_writer.cc b/src/parquet/column_writer.cc
index 48fba55..934530c 100644
--- a/src/parquet/column_writer.cc
+++ b/src/parquet/column_writer.cc
@@ -432,12 +432,13 @@ int64_t ColumnWriter::Close() {
FlushBufferedDataPages();
EncodedStatistics chunk_statistics = GetChunkStatistics();
+ // Write stats only if the column has at least one row written
// From parquet-mr
// Don't write stats larger than the max size rather than truncating. The
// rationale is that some engines may use the minimum value in the page as
// the true minimum for aggregations and there is no way to mark that a
// value has been truncated and is a lower bound and not in the page.
- if (chunk_statistics.is_set() &&
+ if (rows_written_ > 0 && chunk_statistics.is_set() &&
chunk_statistics.max_stat_length() <=
properties_->max_statistics_size(descr_->path())) {
metadata_->SetStatistics(SortOrder::SIGNED == descr_->sort_order(),
diff --git a/src/parquet/file-serialize-test.cc b/src/parquet/file-serialize-test.cc
index 31d2bd4..1993404 100644
--- a/src/parquet/file-serialize-test.cc
+++ b/src/parquet/file-serialize-test.cc
@@ -176,6 +176,27 @@ class TestSerialize : public PrimitiveTypedTest<TestType> {
column_writer->Close();
}
}
+
+ void ZeroRowsRowGroup() {
+ std::shared_ptr<InMemoryOutputStream> sink(new InMemoryOutputStream());
+ auto gnode = std::static_pointer_cast<GroupNode>(this->node_);
+
+ std::shared_ptr<WriterProperties> props = WriterProperties::Builder().build();
+
+ auto file_writer = ParquetFileWriter::Open(sink, gnode, props);
+
+ RowGroupWriter* row_group_writer;
+ row_group_writer = file_writer->AppendRowGroup();
+
+ for (int col = 0; col < num_columns_; ++col) {
+ auto column_writer =
+ static_cast<TypedColumnWriter<TestType>*>(row_group_writer->NextColumn());
+ column_writer->Close();
+ }
+
+ row_group_writer->Close();
+ file_writer->Close();
+ }
};
typedef ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType,
@@ -198,6 +219,8 @@ TYPED_TEST(TestSerialize, TooManyRows) {
ASSERT_THROW(this->UnequalNumRows(101, num_rows), ParquetException);
}
+TYPED_TEST(TestSerialize, ZeroRows) { ASSERT_NO_THROW(this->ZeroRowsRowGroup()); }
+
TYPED_TEST(TestSerialize, RepeatedTooFewRows) {
ASSERT_THROW(this->RepeatedUnequalRows(), ParquetException);
}
diff --git a/src/parquet/metadata.cc b/src/parquet/metadata.cc
index 39dee63..1cab51f 100644
--- a/src/parquet/metadata.cc
+++ b/src/parquet/metadata.cc
@@ -731,7 +731,7 @@ class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl {
int64_t total_byte_size = 0;
for (int i = 0; i < schema_->num_columns(); i++) {
- if (!(row_group_->columns[i].file_offset > 0)) {
+ if (!(row_group_->columns[i].file_offset >= 0)) {
std::stringstream ss;
ss << "Column " << i << " is not complete.";
throw ParquetException(ss.str());