You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by we...@apache.org on 2017/06/19 15:26:43 UTC

parquet-cpp git commit: PARQUET-1033: Improve documentation about WriteBatchSpaced

Repository: parquet-cpp
Updated Branches:
  refs/heads/master 792f858c9 -> 99759a38b


PARQUET-1033: Improve documentation about WriteBatchSpaced

Author: Uwe L. Korn <uw...@apache.org>

Closes #354 from xhochy/PARQUET-1033 and squashes the following commits:

895676a [Uwe L. Korn] Remove trailing comment line
709ef32 [Uwe L. Korn] PARQUET-1033: Improve documentation about WriteBatchSpaced


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/99759a38
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/99759a38
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/99759a38

Branch: refs/heads/master
Commit: 99759a38b7dabf2520070949713a1e5d6853caf4
Parents: 792f858
Author: Uwe L. Korn <uw...@apache.org>
Authored: Mon Jun 19 11:26:37 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon Jun 19 11:26:37 2017 -0400

----------------------------------------------------------------------
 src/parquet/column/column-writer-test.cc | 30 +++++++++++++++++++++++++++
 src/parquet/column/writer.h              | 28 +++++++++++++++++++++++--
 2 files changed, 56 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/99759a38/src/parquet/column/column-writer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/column-writer-test.cc b/src/parquet/column/column-writer-test.cc
index 33eefac..6f47f3b 100644
--- a/src/parquet/column/column-writer-test.cc
+++ b/src/parquet/column/column-writer-test.cc
@@ -347,6 +347,36 @@ TYPED_TEST(TestPrimitiveWriter, Optional) {
   ASSERT_EQ(this->values_, this->values_out_);
 }
 
+TYPED_TEST(TestPrimitiveWriter, OptionalSpaced) {
+  // Optional, non-repeated column written through WriteBatchSpaced: definition
+  // levels and a validity bitmap are passed, repetition levels are nullptr
+  this->SetUpSchema(Repetition::OPTIONAL);
+
+  this->GenerateData(SMALL_SIZE);
+  std::vector<int16_t> definition_levels(SMALL_SIZE, 1);
+  std::vector<uint8_t> valid_bits(::arrow::BitUtil::BytesForBits(SMALL_SIZE), 255);
+
+  definition_levels[SMALL_SIZE - 1] = 0;
+  ::arrow::BitUtil::ClearBit(valid_bits.data(), SMALL_SIZE - 1);
+  definition_levels[1] = 0;
+  ::arrow::BitUtil::ClearBit(valid_bits.data(), 1);
+
+  auto writer = this->BuildWriter();
+  writer->WriteBatchSpaced(this->values_.size(), definition_levels.data(), nullptr,
+      valid_bits.data(), 0, this->values_ptr_);
+  writer->Close();
+
+  // PARQUET-703: metadata num_values must be 100 even though two entries are null
+  ASSERT_EQ(100, this->metadata_num_values());
+
+  this->ReadColumn();
+  ASSERT_EQ(98, this->values_read_);
+  this->values_out_.resize(98);
+  this->values_.resize(99);
+  this->values_.erase(this->values_.cbegin() + 1);
+  ASSERT_EQ(this->values_, this->values_out_);
+}
+
 TYPED_TEST(TestPrimitiveWriter, Repeated) {
   // Optional and repeated, so definition and repetition levels
   this->SetUpSchema(Repetition::REPEATED);

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/99759a38/src/parquet/column/writer.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/writer.h b/src/parquet/column/writer.h
index 5ffcf73..407e808 100644
--- a/src/parquet/column/writer.h
+++ b/src/parquet/column/writer.h
@@ -166,8 +166,32 @@ class PARQUET_EXPORT TypedColumnWriter : public ColumnWriter {
   void WriteBatch(int64_t num_values, const int16_t* def_levels,
       const int16_t* rep_levels, const T* values);
 
-  // Write a batch of repetition levels, definition levels, and values to the
-  // column.
+  /// Write a batch of repetition levels, definition levels, and values to the
+  /// column.
+  ///
+  /// In comparison to WriteBatch, the length of repetition and definition levels
+  /// is the same as the number of values read for max_definition_level == 1.
+  /// In the case of max_definition_level > 1, the repetition and definition
+  /// levels are larger than the values but the values include the null entries
+  /// with definition_level == (max_definition_level - 1). Thus we have to differentiate
+  /// in the parameters of this function if the input has the length of num_values or the
+  /// _number of rows in the lowest nesting level_.
+  ///
+  /// In the case that the innermost node in the Parquet schema is required, the _number of rows
+  /// in the lowest nesting level_ is equal to the number of non-null values. If the
+  /// inner-most schema node is optional, the _number of rows in the lowest nesting level_
+  /// also includes all values with definition_level == (max_definition_level - 1).
+  ///
+  /// @param num_values number of levels to write.
+  /// @param def_levels The Parquet definition levels, length is num_values
+  /// @param rep_levels The Parquet repetition levels, length is num_values
+  /// @param valid_bits Bitmap that indicates if the row is null on the lowest nesting
+  ///   level. The length is number of rows in the lowest nesting level.
+  /// @param valid_bits_offset The offset in bits of the valid_bits where the
+  ///   first relevant bit resides.
+  /// @param values The values in the lowest nested level including
+  ///   spacing for nulls on the lowest levels; input has the length
+  ///   of the number of rows on the lowest nesting level.
   void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels,
       const int16_t* rep_levels, const uint8_t* valid_bits, int64_t valid_bits_offset,
       const T* values);