You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by we...@apache.org on 2016/09/15 16:30:54 UTC

parquet-cpp git commit: PARQUET-719: Fix WriterBatch API to handle NULL values

Repository: parquet-cpp
Updated Branches:
  refs/heads/master 0bf72a96b -> 942f2aedb


PARQUET-719: Fix WriterBatch API to handle NULL values

Fixed bug and added a test case

Author: Deepak Majeti <de...@hpe.com>

Closes #160 from majetideepak/PARQUET-719 and squashes the following commits:

ff56788 [Deepak Majeti] use using
c34ce78 [Deepak Majeti] added comments
f38d6ed [Deepak Majeti] Fixed bug and added test case


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/942f2aed
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/942f2aed
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/942f2aed

Branch: refs/heads/master
Commit: 942f2aedb5262580c7dfcb511641b20e9bbb367d
Parents: 0bf72a9
Author: Deepak Majeti <de...@hpe.com>
Authored: Thu Sep 15 12:30:45 2016 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Thu Sep 15 12:30:45 2016 -0400

----------------------------------------------------------------------
 src/parquet/column/column-writer-test.cc | 23 +++++++++++++++++++++++
 src/parquet/column/scanner-test.cc       | 12 +++++-------
 src/parquet/column/writer.h              | 14 +++++++++-----
 3 files changed, 37 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/942f2aed/src/parquet/column/column-writer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/column-writer-test.cc b/src/parquet/column/column-writer-test.cc
index 230a843..57f1d1b 100644
--- a/src/parquet/column/column-writer-test.cc
+++ b/src/parquet/column/column-writer-test.cc
@@ -150,6 +150,8 @@ typedef ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType,
 
 TYPED_TEST_CASE(TestPrimitiveWriter, TestTypes);
 
+using TestNullValuesWriter = TestPrimitiveWriter<Int32Type>;
+
 TYPED_TEST(TestPrimitiveWriter, RequiredPlain) {
   this->TestRequiredWithEncoding(Encoding::PLAIN);
 }
@@ -303,5 +305,26 @@ TYPED_TEST(TestPrimitiveWriter, RequiredVeryLargeChunk) {
   }
 }
 
+// PARQUET-719
+// Test case for NULL values
+TEST_F(TestNullValuesWriter, OptionalNullValueChunk) {
+  this->SetUpSchemaOptional();
+
+  this->GenerateData(LARGE_SIZE);
+
+  std::vector<int16_t> definition_levels(LARGE_SIZE, 0);
+  std::vector<int16_t> repetition_levels(LARGE_SIZE, 0);
+
+  auto writer = this->BuildWriter(LARGE_SIZE);
+  // All values being written are NULL
+  writer->WriteBatch(
+      this->values_.size(), definition_levels.data(), repetition_levels.data(), NULL);
+  writer->Close();
+
+  // Just read the first SMALL_SIZE rows to ensure we could read it back in
+  this->ReadColumn();
+  ASSERT_EQ(0, this->values_read_);
+}
+
 }  // namespace test
 }  // namespace parquet

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/942f2aed/src/parquet/column/scanner-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/scanner-test.cc b/src/parquet/column/scanner-test.cc
index 78cd16c..fb5178a 100644
--- a/src/parquet/column/scanner-test.cc
+++ b/src/parquet/column/scanner-test.cc
@@ -141,8 +141,6 @@ class TestFlatScanner : public ::testing::Test {
   vector<uint8_t> data_buffer_;  // For BA and FLBA
 };
 
-typedef TestFlatScanner<FLBAType> TestFlatFLBAScanner;
-
 static int num_levels_per_page = 100;
 static int num_pages = 20;
 static int batch_size = 32;
@@ -150,8 +148,8 @@ static int batch_size = 32;
 typedef ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType,
     ByteArrayType> TestTypes;
 
-typedef TestFlatScanner<BooleanType> TestBooleanFlatScanner;
-typedef TestFlatScanner<FLBAType> TestFLBAFlatScanner;
+using TestBooleanFlatScanner = TestFlatScanner<BooleanType>;
+using TestFLBAFlatScanner = TestFlatScanner<FLBAType>;
 
 TYPED_TEST_CASE(TestFlatScanner, TestTypes);
 
@@ -183,7 +181,7 @@ TEST_F(TestFLBAFlatScanner, TestPlainDictScanner) {
 }
 
 // PARQUET 502
-TEST_F(TestFlatFLBAScanner, TestSmallBatch) {
+TEST_F(TestFLBAFlatScanner, TestSmallBatch) {
   NodePtr type = schema::PrimitiveNode::Make("c1", Repetition::REQUIRED,
       Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, FLBA_LENGTH, 10, 2);
   const ColumnDescriptor d(type, 0, 0);
@@ -194,7 +192,7 @@ TEST_F(TestFlatFLBAScanner, TestSmallBatch) {
   CheckResults(1, &d);
 }
 
-TEST_F(TestFlatFLBAScanner, TestDescriptorAPI) {
+TEST_F(TestFLBAFlatScanner, TestDescriptorAPI) {
   NodePtr type = schema::PrimitiveNode::Make("c1", Repetition::OPTIONAL,
       Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, FLBA_LENGTH, 10, 2);
   const ColumnDescriptor d(type, 4, 0);
@@ -209,7 +207,7 @@ TEST_F(TestFlatFLBAScanner, TestDescriptorAPI) {
   ASSERT_EQ(FLBA_LENGTH, scanner->descr()->type_length());
 }
 
-TEST_F(TestFlatFLBAScanner, TestFLBAPrinterNext) {
+TEST_F(TestFLBAFlatScanner, TestFLBAPrinterNext) {
   NodePtr type = schema::PrimitiveNode::Make("c1", Repetition::OPTIONAL,
       Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, FLBA_LENGTH, 10, 2);
   const ColumnDescriptor d(type, 4, 0);

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/942f2aed/src/parquet/column/writer.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/writer.h b/src/parquet/column/writer.h
index 6a6ee5f..4b2a021 100644
--- a/src/parquet/column/writer.h
+++ b/src/parquet/column/writer.h
@@ -156,7 +156,7 @@ class PARQUET_EXPORT TypedColumnWriter : public ColumnWriter {
   void CheckDictionarySizeLimit() override;
 
  private:
-  void WriteMiniBatch(int64_t num_values, const int16_t* def_levels,
+  int64_t WriteMiniBatch(int64_t num_values, const int16_t* def_levels,
       const int16_t* rep_levels, const T* values);
 
   typedef Encoder<DType> EncoderType;
@@ -167,7 +167,7 @@ class PARQUET_EXPORT TypedColumnWriter : public ColumnWriter {
 };
 
 template <typename DType>
-inline void TypedColumnWriter<DType>::WriteMiniBatch(int64_t num_values,
+inline int64_t TypedColumnWriter<DType>::WriteMiniBatch(int64_t num_values,
     const int16_t* def_levels, const int16_t* rep_levels, const T* values) {
   int64_t values_to_write = 0;
   // If the field is required and non-repeated, there are no definition levels
@@ -209,6 +209,8 @@ inline void TypedColumnWriter<DType>::WriteMiniBatch(int64_t num_values,
     AddDataPage();
   }
   if (has_dictionary_ && !fallback_) { CheckDictionarySizeLimit(); }
+
+  return values_to_write;
 }
 
 template <typename DType>
@@ -222,15 +224,17 @@ inline void TypedColumnWriter<DType>::WriteBatch(int64_t num_values,
   int64_t write_batch_size = properties_->write_batch_size();
   int num_batches = num_values / write_batch_size;
   int64_t num_remaining = num_values % write_batch_size;
+  int64_t value_offset = 0;
   for (int round = 0; round < num_batches; round++) {
     int64_t offset = round * write_batch_size;
-    WriteMiniBatch(
-        write_batch_size, &def_levels[offset], &rep_levels[offset], &values[offset]);
+    int64_t num_values = WriteMiniBatch(write_batch_size, &def_levels[offset],
+        &rep_levels[offset], &values[value_offset]);
+    value_offset += num_values;
   }
   // Write the remaining values
   int64_t offset = num_batches * write_batch_size;
   WriteMiniBatch(
-      num_remaining, &def_levels[offset], &rep_levels[offset], &values[offset]);
+      num_remaining, &def_levels[offset], &rep_levels[offset], &values[value_offset]);
 }
 
 template <typename DType>