You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2022/10/18 18:49:22 UTC

[arrow] branch master updated: PARQUET-2179: [C++][Parquet] Add a test for skipping repeated fields (#14366)

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 261930791f PARQUET-2179: [C++][Parquet] Add a test for skipping repeated fields (#14366)
261930791f is described below

commit 261930791f3c22d3ce6e3bbe0826ff03ccae320f
Author: Fatemah Panahi <fa...@users.noreply.github.com>
AuthorDate: Tue Oct 18 18:49:11 2022 +0000

    PARQUET-2179: [C++][Parquet] Add a test for skipping repeated fields (#14366)
    
    Add a test for `TypedColumnReader::Skip` with repeated values to make it clear that we are skipping values and not records.
    Also, add some comments to the existing test for Skip of non-repeated values.
    
    Lead-authored-by: Fatemah Panahi <pa...@google.com>
    Co-authored-by: Antoine Pitrou <an...@python.org>
    Co-authored-by: Fatemah Panahi <fa...@users.noreply.github.com>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 cpp/src/parquet/column_reader_test.cc | 69 ++++++++++++++++++++++++++++++++---
 1 file changed, 64 insertions(+), 5 deletions(-)

diff --git a/cpp/src/parquet/column_reader_test.cc b/cpp/src/parquet/column_reader_test.cc
index e7162eb981..b2f947eea4 100644
--- a/cpp/src/parquet/column_reader_test.cc
+++ b/cpp/src/parquet/column_reader_test.cc
@@ -260,7 +260,8 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRepeated) {
   ASSERT_NO_FATAL_FAILURE(ExecuteDict(num_pages, levels_per_page, &descr));
 }
 
-TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
+// Tests skipping around page boundaries.
+TEST_F(TestPrimitiveReader, TestSkipAroundPageBoundries) {
   int levels_per_page = 100;
   int num_pages = 5;
   max_def_level_ = 0;
@@ -289,10 +290,10 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
       values_.begin() + static_cast<int>(2.5 * static_cast<double>(levels_per_page)));
   ASSERT_TRUE(vector_equal(sub_values, vresult));
 
-  // 2) skip_size == page_size (skip across two pages)
+  // 2) skip_size == page_size (skip across two pages from page 2.5 to 3.5)
   levels_skipped = reader->Skip(levels_per_page);
   ASSERT_EQ(levels_per_page, levels_skipped);
-  // Read half a page
+  // Read half a page (page 3.5 to 4)
   reader->ReadBatch(levels_per_page / 2, dresult.data(), rresult.data(), vresult.data(),
                     &values_read);
   sub_values.clear();
@@ -303,10 +304,10 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
   ASSERT_TRUE(vector_equal(sub_values, vresult));
 
   // 3) skip_size < page_size (skip limited to a single page)
-  // Skip half a page
+  // Skip half a page (page 4 to 4.5)
   levels_skipped = reader->Skip(levels_per_page / 2);
   ASSERT_EQ(0.5 * levels_per_page, levels_skipped);
-  // Read half a page
+  // Read half a page (page 4.5 to 5)
   reader->ReadBatch(levels_per_page / 2, dresult.data(), rresult.data(), vresult.data(),
                     &values_read);
   sub_values.clear();
@@ -316,6 +317,15 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
       values_.end());
   ASSERT_TRUE(vector_equal(sub_values, vresult));
 
+  // 4) skip_size = 0
+  levels_skipped = reader->Skip(0);
+  ASSERT_EQ(0, levels_skipped);
+
+  // 5) Skip past the last page. There are 5 pages and we have either skipped
+  // or read all of them, so there is nothing left to skip.
+  levels_skipped = reader->Skip(10);
+  ASSERT_EQ(0, levels_skipped);
+
   values_.clear();
   def_levels_.clear();
   rep_levels_.clear();
@@ -323,6 +333,55 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
   reader_.reset();
 }
 
+// Skip with repeated field. This test makes it clear that we are skipping
+// values and not records.
+TEST_F(TestPrimitiveReader, TestSkipRepeatedField) {
+  // Example schema: message M { repeated int32 b = 1 }
+  max_def_level_ = 1;
+  max_rep_level_ = 1;
+  NodePtr type = schema::Int32("b", Repetition::REPEATED);
+  const ColumnDescriptor descr(type, max_def_level_, max_rep_level_);
+  // Example rows: {}, {[10, 10]}, {[20, 20, 20]}
+  std::vector<int32_t> values = {10, 10, 20, 20, 20};
+  std::vector<int16_t> def_levels = {0, 1, 1, 1, 1, 1};
+  std::vector<int16_t> rep_levels = {0, 0, 1, 0, 1, 1};
+  num_values_ = static_cast<int>(def_levels.size());
+  std::shared_ptr<DataPageV1> page = MakeDataPage<Int32Type>(
+      &descr, values, num_values_, Encoding::PLAIN, /*indices=*/{},
+      /*indices_size=*/0, def_levels, max_def_level_, rep_levels, max_rep_level_);
+
+  pages_.push_back(std::move(page));
+
+  InitReader(&descr);
+  Int32Reader* reader = static_cast<Int32Reader*>(reader_.get());
+
+  // Vectors to hold read values, definition levels, and repetition levels.
+  std::vector<int32_t> read_vals(4, -1);
+  std::vector<int16_t> read_defs(4, -1);
+  std::vector<int16_t> read_reps(4, -1);
+
+  // Skip two levels.
+  int64_t levels_skipped = reader->Skip(2);
+  ASSERT_EQ(2, levels_skipped);
+
+  int64_t num_read_values = 0;
+  // Read the next set of values
+  reader->ReadBatch(10, read_defs.data(), read_reps.data(), read_vals.data(),
+                    &num_read_values);
+  ASSERT_EQ(num_read_values, 4);
+  // Note that we resume in the middle of the record {[10, 10]}
+  ASSERT_TRUE(vector_equal({10, 20, 20, 20}, read_vals));
+  ASSERT_TRUE(vector_equal({1, 1, 1, 1}, read_defs));
+  ASSERT_TRUE(vector_equal({1, 0, 1, 1}, read_reps));
+
+  // No values remain in data page
+  levels_skipped = reader->Skip(2);
+  ASSERT_EQ(0, levels_skipped);
+  reader->ReadBatch(10, read_defs.data(), read_reps.data(), read_vals.data(),
+                    &num_read_values);
+  ASSERT_EQ(num_read_values, 0);
+}
+
 // Page claims to have two values but only 1 is present.
 TEST_F(TestPrimitiveReader, TestReadValuesMissing) {
   max_def_level_ = 1;