You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by we...@apache.org on 2017/06/19 15:26:43 UTC
parquet-cpp git commit: PARQUET-1033: Improve documentation about
WriteBatchSpaced
Repository: parquet-cpp
Updated Branches:
refs/heads/master 792f858c9 -> 99759a38b
PARQUET-1033: Improve documentation about WriteBatchSpaced
Author: Uwe L. Korn <uw...@apache.org>
Closes #354 from xhochy/PARQUET-1033 and squashes the following commits:
895676a [Uwe L. Korn] Remove trailing comment line
709ef32 [Uwe L. Korn] PARQUET-1033: Improve documentation about WriteBatchSpaced
Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/99759a38
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/99759a38
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/99759a38
Branch: refs/heads/master
Commit: 99759a38b7dabf2520070949713a1e5d6853caf4
Parents: 792f858
Author: Uwe L. Korn <uw...@apache.org>
Authored: Mon Jun 19 11:26:37 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon Jun 19 11:26:37 2017 -0400
----------------------------------------------------------------------
src/parquet/column/column-writer-test.cc | 30 +++++++++++++++++++++++++++
src/parquet/column/writer.h | 28 +++++++++++++++++++++++--
2 files changed, 56 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/99759a38/src/parquet/column/column-writer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/column-writer-test.cc b/src/parquet/column/column-writer-test.cc
index 33eefac..6f47f3b 100644
--- a/src/parquet/column/column-writer-test.cc
+++ b/src/parquet/column/column-writer-test.cc
@@ -347,6 +347,36 @@ TYPED_TEST(TestPrimitiveWriter, Optional) {
ASSERT_EQ(this->values_, this->values_out_);
}
+TYPED_TEST(TestPrimitiveWriter, OptionalSpaced) {
+ // Optional and non-repeated, with definition levels
+ // but no repetition levels
+ this->SetUpSchema(Repetition::OPTIONAL);
+
+ this->GenerateData(SMALL_SIZE);
+ std::vector<int16_t> definition_levels(SMALL_SIZE, 1);
+ std::vector<uint8_t> valid_bits(::arrow::BitUtil::BytesForBits(SMALL_SIZE), 255);
+
+ definition_levels[SMALL_SIZE - 1] = 0;
+ ::arrow::BitUtil::ClearBit(valid_bits.data(), SMALL_SIZE - 1);
+ definition_levels[1] = 0;
+ ::arrow::BitUtil::ClearBit(valid_bits.data(), 1);
+
+ auto writer = this->BuildWriter();
+ writer->WriteBatchSpaced(this->values_.size(), definition_levels.data(), nullptr,
+ valid_bits.data(), 0, this->values_ptr_);
+ writer->Close();
+
+ // PARQUET-703
+ ASSERT_EQ(100, this->metadata_num_values());
+
+ this->ReadColumn();
+ ASSERT_EQ(98, this->values_read_);
+ this->values_out_.resize(98);
+ this->values_.resize(99);
+ this->values_.erase(this->values_.cbegin() + 1);
+ ASSERT_EQ(this->values_, this->values_out_);
+}
+
TYPED_TEST(TestPrimitiveWriter, Repeated) {
// Optional and repeated, so definition and repetition levels
this->SetUpSchema(Repetition::REPEATED);
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/99759a38/src/parquet/column/writer.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/writer.h b/src/parquet/column/writer.h
index 5ffcf73..407e808 100644
--- a/src/parquet/column/writer.h
+++ b/src/parquet/column/writer.h
@@ -166,8 +166,32 @@ class PARQUET_EXPORT TypedColumnWriter : public ColumnWriter {
void WriteBatch(int64_t num_values, const int16_t* def_levels,
const int16_t* rep_levels, const T* values);
- // Write a batch of repetition levels, definition levels, and values to the
- // column.
+ /// Write a batch of repetition levels, definition levels, and values to the
+ /// column.
+ ///
+ /// In comparison to WriteBatch the length of repetition and definition levels
+ /// is the same as the number of values read for max_definition_level == 1.
+ /// In the case of max_definition_level > 1, the repetition and definition
+ /// levels are larger than the values but the values include the null entries
+ /// with definition_level == (max_definition_level - 1). Thus we have to differentiate
+ /// in the parameters of this function if the input has the length of num_values or the
+ /// _number of rows in the lowest nesting level_.
+ ///
+ /// In the case that the innermost node in the Parquet schema is required, the _number of
+ /// rows in the lowest nesting level_ is equal to the number of non-null values. If the
+ /// inner-most schema node is optional, the _number of rows in the lowest nesting level_
+ /// also includes all values with definition_level == (max_definition_level - 1).
+ ///
+ /// @param num_values number of levels to write.
+ /// @param def_levels The Parquet definition levels, length is num_values
+ /// @param rep_levels The Parquet repetition levels, length is num_values
+ /// @param valid_bits Bitmap that indicates if the row is null on the lowest nesting
+ /// level. The length is the number of rows in the lowest nesting level.
+ /// @param valid_bits_offset The offset in bits of the valid_bits where the
+ /// first relevant bit resides.
+ /// @param values The values in the lowest nested level including
+ /// spacing for nulls on the lowest levels; input has the length
+ /// of the number of rows on the lowest nesting level.
void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels,
const int16_t* rep_levels, const uint8_t* valid_bits, int64_t valid_bits_offset,
const T* values);