You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2022/10/18 18:49:22 UTC
[arrow] branch master updated: PARQUET-2179: [C++][Parquet] Add a test for skipping repeated fields (#14366)
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 261930791f PARQUET-2179: [C++][Parquet] Add a test for skipping repeated fields (#14366)
261930791f is described below
commit 261930791f3c22d3ce6e3bbe0826ff03ccae320f
Author: Fatemah Panahi <fa...@users.noreply.github.com>
AuthorDate: Tue Oct 18 18:49:11 2022 +0000
PARQUET-2179: [C++][Parquet] Add a test for skipping repeated fields (#14366)
Add a test for `TypedColumnReader::Skip` with repeated values to make it clear that we are skipping values and not records.
Also, add some comments to the existing test for Skip of non-repeated values.
Lead-authored-by: Fatemah Panahi <pa...@google.com>
Co-authored-by: Antoine Pitrou <an...@python.org>
Co-authored-by: Fatemah Panahi <fa...@users.noreply.github.com>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
cpp/src/parquet/column_reader_test.cc | 69 ++++++++++++++++++++++++++++++++---
1 file changed, 64 insertions(+), 5 deletions(-)
diff --git a/cpp/src/parquet/column_reader_test.cc b/cpp/src/parquet/column_reader_test.cc
index e7162eb981..b2f947eea4 100644
--- a/cpp/src/parquet/column_reader_test.cc
+++ b/cpp/src/parquet/column_reader_test.cc
@@ -260,7 +260,8 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRepeated) {
ASSERT_NO_FATAL_FAILURE(ExecuteDict(num_pages, levels_per_page, &descr));
}
-TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
+// Tests skipping around page boundaries.
+TEST_F(TestPrimitiveReader, TestSkipAroundPageBoundries) {
int levels_per_page = 100;
int num_pages = 5;
max_def_level_ = 0;
@@ -289,10 +290,10 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
values_.begin() + static_cast<int>(2.5 * static_cast<double>(levels_per_page)));
ASSERT_TRUE(vector_equal(sub_values, vresult));
- // 2) skip_size == page_size (skip across two pages)
+ // 2) skip_size == page_size (skip across two pages from page 2.5 to 3.5)
levels_skipped = reader->Skip(levels_per_page);
ASSERT_EQ(levels_per_page, levels_skipped);
- // Read half a page
+ // Read half a page (page 3.5 to 4)
reader->ReadBatch(levels_per_page / 2, dresult.data(), rresult.data(), vresult.data(),
&values_read);
sub_values.clear();
@@ -303,10 +304,10 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
ASSERT_TRUE(vector_equal(sub_values, vresult));
// 3) skip_size < page_size (skip limited to a single page)
- // Skip half a page
+ // Skip half a page (page 4 to 4.5)
levels_skipped = reader->Skip(levels_per_page / 2);
ASSERT_EQ(0.5 * levels_per_page, levels_skipped);
- // Read half a page
+ // Read half a page (page 4.5 to 5)
reader->ReadBatch(levels_per_page / 2, dresult.data(), rresult.data(), vresult.data(),
&values_read);
sub_values.clear();
@@ -316,6 +317,15 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
values_.end());
ASSERT_TRUE(vector_equal(sub_values, vresult));
+ // 4) skip_size = 0
+ levels_skipped = reader->Skip(0);
+ ASSERT_EQ(0, levels_skipped);
+
+ // 5) Skip past the end page. There are 5 pages and we have either skipped
+ // or read all of them, so there is nothing left to skip.
+ levels_skipped = reader->Skip(10);
+ ASSERT_EQ(0, levels_skipped);
+
values_.clear();
def_levels_.clear();
rep_levels_.clear();
@@ -323,6 +333,55 @@ TEST_F(TestPrimitiveReader, TestInt32FlatRequiredSkip) {
reader_.reset();
}
+// Skip with a repeated field. This test makes it clear that Skip() counts
+// levels/values and not records: a skip may end in the middle of a record.
+TEST_F(TestPrimitiveReader, TestSkipRepeatedField) {
+ // Example schema: message M { repeated int32 b = 1 }
+ max_def_level_ = 1;
+ max_rep_level_ = 1;
+ NodePtr type = schema::Int32("b", Repetition::REPEATED);
+ const ColumnDescriptor descr(type, max_def_level_, max_rep_level_);
+ // Example rows: {}, {[10, 10]}, {[20, 20, 20]}. The empty row has a level but no value.
+ std::vector<int32_t> values = {10, 10, 20, 20, 20};
+ std::vector<int16_t> def_levels = {0, 1, 1, 1, 1, 1};
+ std::vector<int16_t> rep_levels = {0, 0, 1, 0, 1, 1};
+ num_values_ = static_cast<int>(def_levels.size());
+ std::shared_ptr<DataPageV1> page = MakeDataPage<Int32Type>(
+ &descr, values, num_values_, Encoding::PLAIN, /*indices=*/{},
+ /*indices_size=*/0, def_levels, max_def_level_, rep_levels, max_rep_level_);
+
+ pages_.push_back(std::move(page));
+
+ InitReader(&descr);
+ Int32Reader* reader = static_cast<Int32Reader*>(reader_.get());
+
+ // Vectors to hold read values, definition levels, and repetition levels (pre-filled with -1).
+ std::vector<int32_t> read_vals(4, -1);
+ std::vector<int16_t> read_defs(4, -1);
+ std::vector<int16_t> read_reps(4, -1);
+
+ // Skip two levels: the empty row and the first 10 of the second row.
+ int64_t levels_skipped = reader->Skip(2);
+ ASSERT_EQ(2, levels_skipped);
+
+ int64_t num_read_values = 0;
+ // Read the next set of values: asks for up to 10 levels, but only 4 remain in the page.
+ reader->ReadBatch(10, read_defs.data(), read_reps.data(), read_vals.data(),
+ &num_read_values);
+ ASSERT_EQ(num_read_values, 4);
+ // Note that reading resumes inside the record {[10, 10]}: the first rep level read is 1.
+ ASSERT_TRUE(vector_equal({10, 20, 20, 20}, read_vals));
+ ASSERT_TRUE(vector_equal({1, 1, 1, 1}, read_defs));
+ ASSERT_TRUE(vector_equal({1, 0, 1, 1}, read_reps));
+
+ // No values remain in the data page, so neither Skip nor ReadBatch makes progress.
+ levels_skipped = reader->Skip(2);
+ ASSERT_EQ(0, levels_skipped);
+ reader->ReadBatch(10, read_defs.data(), read_reps.data(), read_vals.data(),
+ &num_read_values);
+ ASSERT_EQ(num_read_values, 0);
+}
+
// Page claims to have two values but only 1 is present.
TEST_F(TestPrimitiveReader, TestReadValuesMissing) {
max_def_level_ = 1;