You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "pitrou (via GitHub)" <gi...@apache.org> on 2023/06/29 15:29:33 UTC

[GitHub] [arrow] pitrou commented on a diff in pull request #36191: GH-36189: [C++][Parquet] Parquet StreamReader::SkipRows() skips to incorrect place in multi-row-group files

pitrou commented on code in PR #36191:
URL: https://github.com/apache/arrow/pull/36191#discussion_r1246775416


##########
cpp/src/parquet/stream_reader_test.cc:
##########
@@ -947,5 +947,110 @@ TEST_F(TestReadingDataFiles, ByteArrayDecimal) {
   EXPECT_EQ(i, 25);
 }
 
+class TestMultiRowGroupStreamReader : public ::testing::Test {
+ public:
+  TestMultiRowGroupStreamReader() { createTestFile(); }
+
+ protected:
+  const char* GetDataFile() const { return "stream_reader_multirowgroup_test.parquet"; }
+
+  void SetUp() {
+    PARQUET_ASSIGN_OR_THROW(auto infile, ::arrow::io::ReadableFile::Open(GetDataFile()));
+    auto file_reader = parquet::ParquetFileReader::Open(infile);
+    reader_ = StreamReader{std::move(file_reader)};
+  }
+
+  void TearDown() { reader_ = StreamReader{}; }
+
+  std::shared_ptr<schema::GroupNode> GetSchema() {
+    schema::NodeVector fields;
+    fields.push_back(schema::PrimitiveNode::Make("row_group_number", Repetition::REQUIRED,
+                                                 Type::INT32, ConvertedType::UINT_16));
+
+    fields.push_back(schema::PrimitiveNode::Make("row_number", Repetition::REQUIRED,
+                                                 Type::INT64, ConvertedType::UINT_64));
+
+    return std::static_pointer_cast<schema::GroupNode>(
+        schema::GroupNode::Make("schema", Repetition::REQUIRED, fields));
+  }
+
+  void createTestFile() {
+    PARQUET_ASSIGN_OR_THROW(auto outfile,
+                            ::arrow::io::FileOutputStream::Open(GetDataFile()));
+
+    auto file_writer = ParquetFileWriter::Open(outfile, GetSchema());
+
+    StreamWriter os{std::move(file_writer)};
+
+    int nrows = 0;
+    for (auto group = 0; group < num_row_groups; ++group) {
+      for (auto i = 0; i < num_rows_per_group; ++i) {
+        os << static_cast<uint16_t>(group);
+        os << static_cast<uint64_t>(nrows);
+        os << EndRow;
+        nrows++;
+      }
+      os.EndRowGroup();
+    }
+  }
+
+  StreamReader reader_;
+  static constexpr int num_row_groups = 5;
+  static constexpr int num_rows_per_group = 10;

Review Comment:
   Let's use our common conventions for constants:
   ```suggestion
     static constexpr int kNumGroups = 5;
     static constexpr int kNumRowsPerGroup = 10;
   ```



##########
cpp/src/parquet/stream_reader_test.cc:
##########
@@ -947,5 +947,110 @@ TEST_F(TestReadingDataFiles, ByteArrayDecimal) {
   EXPECT_EQ(i, 25);
 }
 
+class TestMultiRowGroupStreamReader : public ::testing::Test {
+ public:
+  TestMultiRowGroupStreamReader() { createTestFile(); }
+
+ protected:
+  const char* GetDataFile() const { return "stream_reader_multirowgroup_test.parquet"; }
+
+  void SetUp() {
+    PARQUET_ASSIGN_OR_THROW(auto infile, ::arrow::io::ReadableFile::Open(GetDataFile()));
+    auto file_reader = parquet::ParquetFileReader::Open(infile);
+    reader_ = StreamReader{std::move(file_reader)};
+  }
+
+  void TearDown() { reader_ = StreamReader{}; }
+
+  std::shared_ptr<schema::GroupNode> GetSchema() {
+    schema::NodeVector fields;
+    fields.push_back(schema::PrimitiveNode::Make("row_group_number", Repetition::REQUIRED,
+                                                 Type::INT32, ConvertedType::UINT_16));
+
+    fields.push_back(schema::PrimitiveNode::Make("row_number", Repetition::REQUIRED,
+                                                 Type::INT64, ConvertedType::UINT_64));
+
+    return std::static_pointer_cast<schema::GroupNode>(
+        schema::GroupNode::Make("schema", Repetition::REQUIRED, fields));
+  }
+
+  void createTestFile() {
+    PARQUET_ASSIGN_OR_THROW(auto outfile,
+                            ::arrow::io::FileOutputStream::Open(GetDataFile()));
+
+    auto file_writer = ParquetFileWriter::Open(outfile, GetSchema());
+
+    StreamWriter os{std::move(file_writer)};
+
+    int nrows = 0;
+    for (auto group = 0; group < num_row_groups; ++group) {
+      for (auto i = 0; i < num_rows_per_group; ++i) {
+        os << static_cast<uint16_t>(group);
+        os << static_cast<uint64_t>(nrows);
+        os << EndRow;
+        nrows++;
+      }
+      os.EndRowGroup();
+    }
+  }
+
+  StreamReader reader_;
+  static constexpr int num_row_groups = 5;
+  static constexpr int num_rows_per_group = 10;
+};
+
+TEST_F(TestMultiRowGroupStreamReader, SkipRows) {
+  // skip somewhere into the middle of a row group somewhere in the middle of the file
+  auto current_row = 33;
+
+  auto retval = reader_.SkipRows(current_row);
+  ASSERT_GE(retval, 0);
+
+  // there are 50 total rows, so definitely not EOF
+  ASSERT_FALSE(reader_.eof());
+
+  // make sure we skipped to where we expected
+  uint16_t current_row_group = 0;
+  uint64_t current_global_row = 0;
+  reader_ >> current_row_group;
+  EXPECT_EQ(current_row_group, current_row / 10);

Review Comment:
   ```suggestion
     EXPECT_EQ(current_row_group, current_row / kNumRowsPerGroup);
   ```



##########
cpp/src/parquet/stream_reader_test.cc:
##########
@@ -947,5 +947,110 @@ TEST_F(TestReadingDataFiles, ByteArrayDecimal) {
   EXPECT_EQ(i, 25);
 }
 
+class TestMultiRowGroupStreamReader : public ::testing::Test {
+ public:
+  TestMultiRowGroupStreamReader() { createTestFile(); }
+
+ protected:
+  const char* GetDataFile() const { return "stream_reader_multirowgroup_test.parquet"; }
+
+  void SetUp() {
+    PARQUET_ASSIGN_OR_THROW(auto infile, ::arrow::io::ReadableFile::Open(GetDataFile()));
+    auto file_reader = parquet::ParquetFileReader::Open(infile);
+    reader_ = StreamReader{std::move(file_reader)};
+  }
+
+  void TearDown() { reader_ = StreamReader{}; }
+
+  std::shared_ptr<schema::GroupNode> GetSchema() {
+    schema::NodeVector fields;
+    fields.push_back(schema::PrimitiveNode::Make("row_group_number", Repetition::REQUIRED,
+                                                 Type::INT32, ConvertedType::UINT_16));
+
+    fields.push_back(schema::PrimitiveNode::Make("row_number", Repetition::REQUIRED,
+                                                 Type::INT64, ConvertedType::UINT_64));
+
+    return std::static_pointer_cast<schema::GroupNode>(
+        schema::GroupNode::Make("schema", Repetition::REQUIRED, fields));
+  }
+
+  void createTestFile() {
+    PARQUET_ASSIGN_OR_THROW(auto outfile,
+                            ::arrow::io::FileOutputStream::Open(GetDataFile()));
+
+    auto file_writer = ParquetFileWriter::Open(outfile, GetSchema());
+
+    StreamWriter os{std::move(file_writer)};
+
+    int nrows = 0;
+    for (auto group = 0; group < num_row_groups; ++group) {
+      for (auto i = 0; i < num_rows_per_group; ++i) {
+        os << static_cast<uint16_t>(group);
+        os << static_cast<uint64_t>(nrows);
+        os << EndRow;
+        nrows++;
+      }
+      os.EndRowGroup();
+    }
+  }
+
+  StreamReader reader_;
+  static constexpr int num_row_groups = 5;
+  static constexpr int num_rows_per_group = 10;
+};
+
+TEST_F(TestMultiRowGroupStreamReader, SkipRows) {
+  // skip somewhere into the middle of a row group somewhere in the middle of the file
+  auto current_row = 33;
+
+  auto retval = reader_.SkipRows(current_row);
+  ASSERT_GE(retval, 0);
+
+  // there are 50 total rows, so definitely not EOF
+  ASSERT_FALSE(reader_.eof());
+
+  // make sure we skipped to where we expected
+  uint16_t current_row_group = 0;
+  uint64_t current_global_row = 0;
+  reader_ >> current_row_group;
+  EXPECT_EQ(current_row_group, current_row / 10);
+
+  reader_ >> current_global_row;
+  EXPECT_EQ(current_global_row, current_row);
+  reader_.EndRow();
+
+  // skip a few more rows but stay inside of this row group
+  retval = reader_.SkipRows(4);
+  ASSERT_GE(retval, 0);
+
+  // we read row 33 (were at 34, then skipped 4 => 38)
+  current_row = 38;
+  reader_ >> current_row_group;
+  EXPECT_EQ(current_row_group, current_row / 10);

Review Comment:
   ```suggestion
     EXPECT_EQ(current_row_group, current_row / kNumRowsPerGroup);
   ```



##########
cpp/src/parquet/stream_reader_test.cc:
##########
@@ -947,5 +947,110 @@ TEST_F(TestReadingDataFiles, ByteArrayDecimal) {
   EXPECT_EQ(i, 25);
 }
 
+class TestMultiRowGroupStreamReader : public ::testing::Test {
+ public:
+  TestMultiRowGroupStreamReader() { createTestFile(); }
+
+ protected:
+  const char* GetDataFile() const { return "stream_reader_multirowgroup_test.parquet"; }
+
+  void SetUp() {
+    PARQUET_ASSIGN_OR_THROW(auto infile, ::arrow::io::ReadableFile::Open(GetDataFile()));
+    auto file_reader = parquet::ParquetFileReader::Open(infile);
+    reader_ = StreamReader{std::move(file_reader)};
+  }
+
+  void TearDown() { reader_ = StreamReader{}; }
+
+  std::shared_ptr<schema::GroupNode> GetSchema() {
+    schema::NodeVector fields;
+    fields.push_back(schema::PrimitiveNode::Make("row_group_number", Repetition::REQUIRED,
+                                                 Type::INT32, ConvertedType::UINT_16));
+
+    fields.push_back(schema::PrimitiveNode::Make("row_number", Repetition::REQUIRED,
+                                                 Type::INT64, ConvertedType::UINT_64));
+
+    return std::static_pointer_cast<schema::GroupNode>(
+        schema::GroupNode::Make("schema", Repetition::REQUIRED, fields));
+  }
+
+  void createTestFile() {
+    PARQUET_ASSIGN_OR_THROW(auto outfile,
+                            ::arrow::io::FileOutputStream::Open(GetDataFile()));
+
+    auto file_writer = ParquetFileWriter::Open(outfile, GetSchema());
+
+    StreamWriter os{std::move(file_writer)};
+
+    int nrows = 0;
+    for (auto group = 0; group < num_row_groups; ++group) {
+      for (auto i = 0; i < num_rows_per_group; ++i) {
+        os << static_cast<uint16_t>(group);
+        os << static_cast<uint64_t>(nrows);
+        os << EndRow;
+        nrows++;
+      }
+      os.EndRowGroup();
+    }
+  }
+
+  StreamReader reader_;
+  static constexpr int num_row_groups = 5;
+  static constexpr int num_rows_per_group = 10;
+};
+
+TEST_F(TestMultiRowGroupStreamReader, SkipRows) {
+  // skip somewhere into the middle of a row group somewhere in the middle of the file
+  auto current_row = 33;
+
+  auto retval = reader_.SkipRows(current_row);
+  ASSERT_GE(retval, 0);
+
+  // there are 50 total rows, so definitely not EOF
+  ASSERT_FALSE(reader_.eof());
+
+  // make sure we skipped to where we expected
+  uint16_t current_row_group = 0;
+  uint64_t current_global_row = 0;
+  reader_ >> current_row_group;
+  EXPECT_EQ(current_row_group, current_row / 10);
+
+  reader_ >> current_global_row;
+  EXPECT_EQ(current_global_row, current_row);
+  reader_.EndRow();
+
+  // skip a few more rows but stay inside of this row group
+  retval = reader_.SkipRows(4);
+  ASSERT_GE(retval, 0);
+
+  // we read row 33 (were at 34, then skipped 4 => 38)

Review Comment:
   ```suggestion
     // we read row 38 (were at 34, then skipped 4 => 38)
   ```



##########
cpp/src/parquet/stream_reader_test.cc:
##########
@@ -947,5 +947,77 @@ TEST_F(TestReadingDataFiles, ByteArrayDecimal) {
   EXPECT_EQ(i, 25);
 }
 
+class TestMultiRowGroupStreamReader : public ::testing::Test {
+ public:
+  TestMultiRowGroupStreamReader() { createTestFile(); }
+
+ protected:
+  const char* GetDataFile() const { return "stream_reader_multirowgroup_test.parquet"; }
+
+  void SetUp() {
+    PARQUET_ASSIGN_OR_THROW(auto infile, ::arrow::io::ReadableFile::Open(GetDataFile()));
+    auto file_reader = parquet::ParquetFileReader::Open(infile);
+    reader_ = StreamReader{std::move(file_reader)};
+  }
+
+  void TearDown() { reader_ = StreamReader{}; }
+
+  std::shared_ptr<schema::GroupNode> GetSchema() {
+    schema::NodeVector fields;
+    fields.push_back(schema::PrimitiveNode::Make("row_group_number", Repetition::REQUIRED,
+                                                 Type::INT32, ConvertedType::UINT_16));
+
+    fields.push_back(schema::PrimitiveNode::Make("row_number", Repetition::REQUIRED,
+                                                 Type::INT64, ConvertedType::UINT_64));
+
+    return std::static_pointer_cast<schema::GroupNode>(
+        schema::GroupNode::Make("schema", Repetition::REQUIRED, fields));
+  }
+
+  void createTestFile() {

Review Comment:
   This change doesn't seem to have been made?



##########
cpp/src/parquet/stream_reader_test.cc:
##########
@@ -947,5 +947,110 @@ TEST_F(TestReadingDataFiles, ByteArrayDecimal) {
   EXPECT_EQ(i, 25);
 }
 
+class TestMultiRowGroupStreamReader : public ::testing::Test {
+ public:
+  TestMultiRowGroupStreamReader() { createTestFile(); }
+
+ protected:
+  const char* GetDataFile() const { return "stream_reader_multirowgroup_test.parquet"; }
+
+  void SetUp() {
+    PARQUET_ASSIGN_OR_THROW(auto infile, ::arrow::io::ReadableFile::Open(GetDataFile()));
+    auto file_reader = parquet::ParquetFileReader::Open(infile);
+    reader_ = StreamReader{std::move(file_reader)};
+  }
+
+  void TearDown() { reader_ = StreamReader{}; }
+
+  std::shared_ptr<schema::GroupNode> GetSchema() {
+    schema::NodeVector fields;
+    fields.push_back(schema::PrimitiveNode::Make("row_group_number", Repetition::REQUIRED,
+                                                 Type::INT32, ConvertedType::UINT_16));
+
+    fields.push_back(schema::PrimitiveNode::Make("row_number", Repetition::REQUIRED,
+                                                 Type::INT64, ConvertedType::UINT_64));
+
+    return std::static_pointer_cast<schema::GroupNode>(
+        schema::GroupNode::Make("schema", Repetition::REQUIRED, fields));
+  }
+
+  void createTestFile() {
+    PARQUET_ASSIGN_OR_THROW(auto outfile,
+                            ::arrow::io::FileOutputStream::Open(GetDataFile()));
+
+    auto file_writer = ParquetFileWriter::Open(outfile, GetSchema());
+
+    StreamWriter os{std::move(file_writer)};
+
+    int nrows = 0;
+    for (auto group = 0; group < num_row_groups; ++group) {
+      for (auto i = 0; i < num_rows_per_group; ++i) {
+        os << static_cast<uint16_t>(group);
+        os << static_cast<uint64_t>(nrows);
+        os << EndRow;
+        nrows++;
+      }
+      os.EndRowGroup();
+    }
+  }
+
+  StreamReader reader_;
+  static constexpr int num_row_groups = 5;
+  static constexpr int num_rows_per_group = 10;
+};
+
+TEST_F(TestMultiRowGroupStreamReader, SkipRows) {
+  // skip somewhere into the middle of a row group somewhere in the middle of the file
+  auto current_row = 33;
+

Review Comment:
   Can we make this test generally more compact and more readable through some kind of test helper on the fixture? For example:
   ```c++
   class TestMultiRowGroupStreamReader : public ::testing::Test {
     ...
     void AssertNextRow(uint16_t expected_group_num, uint64_t expected_row_num) {
       ASSERT_FALSE(reader_.eof());
       uint16_t group_num = 0;
       uint64_t row_num = 0;
       reader_ >> group_num >> row_num >> EndRow;
       ASSERT_EQ(group_num, expected_group_num);
       ASSERT_EQ(row_num, expected_row_num);
     }
     ...
   };
   ```



##########
cpp/src/parquet/stream_reader_test.cc:
##########
@@ -947,5 +947,110 @@ TEST_F(TestReadingDataFiles, ByteArrayDecimal) {
   EXPECT_EQ(i, 25);
 }
 
+class TestMultiRowGroupStreamReader : public ::testing::Test {
+ public:
+  TestMultiRowGroupStreamReader() { createTestFile(); }
+
+ protected:
+  const char* GetDataFile() const { return "stream_reader_multirowgroup_test.parquet"; }
+
+  void SetUp() {
+    PARQUET_ASSIGN_OR_THROW(auto infile, ::arrow::io::ReadableFile::Open(GetDataFile()));
+    auto file_reader = parquet::ParquetFileReader::Open(infile);
+    reader_ = StreamReader{std::move(file_reader)};
+  }
+
+  void TearDown() { reader_ = StreamReader{}; }
+
+  std::shared_ptr<schema::GroupNode> GetSchema() {
+    schema::NodeVector fields;
+    fields.push_back(schema::PrimitiveNode::Make("row_group_number", Repetition::REQUIRED,
+                                                 Type::INT32, ConvertedType::UINT_16));
+
+    fields.push_back(schema::PrimitiveNode::Make("row_number", Repetition::REQUIRED,
+                                                 Type::INT64, ConvertedType::UINT_64));
+
+    return std::static_pointer_cast<schema::GroupNode>(
+        schema::GroupNode::Make("schema", Repetition::REQUIRED, fields));
+  }
+
+  void createTestFile() {
+    PARQUET_ASSIGN_OR_THROW(auto outfile,
+                            ::arrow::io::FileOutputStream::Open(GetDataFile()));
+
+    auto file_writer = ParquetFileWriter::Open(outfile, GetSchema());
+
+    StreamWriter os{std::move(file_writer)};
+
+    int nrows = 0;
+    for (auto group = 0; group < num_row_groups; ++group) {
+      for (auto i = 0; i < num_rows_per_group; ++i) {
+        os << static_cast<uint16_t>(group);
+        os << static_cast<uint64_t>(nrows);
+        os << EndRow;
+        nrows++;
+      }
+      os.EndRowGroup();
+    }
+  }
+
+  StreamReader reader_;
+  static constexpr int num_row_groups = 5;
+  static constexpr int num_rows_per_group = 10;
+};
+
+TEST_F(TestMultiRowGroupStreamReader, SkipRows) {
+  // skip somewhere into the middle of a row group somewhere in the middle of the file
+  auto current_row = 33;
+
+  auto retval = reader_.SkipRows(current_row);
+  ASSERT_GE(retval, 0);
+
+  // there are 50 total rows, so definitely not EOF
+  ASSERT_FALSE(reader_.eof());
+
+  // make sure we skipped to where we expected
+  uint16_t current_row_group = 0;
+  uint64_t current_global_row = 0;
+  reader_ >> current_row_group;
+  EXPECT_EQ(current_row_group, current_row / 10);
+
+  reader_ >> current_global_row;
+  EXPECT_EQ(current_global_row, current_row);
+  reader_.EndRow();
+
+  // skip a few more rows but stay inside of this row group
+  retval = reader_.SkipRows(4);
+  ASSERT_GE(retval, 0);

Review Comment:
   Again, test `retval` more precisely: exactly 4 rows are skipped here, so for example `ASSERT_EQ(retval, 4);`.



##########
cpp/src/parquet/stream_reader_test.cc:
##########
@@ -947,5 +947,110 @@ TEST_F(TestReadingDataFiles, ByteArrayDecimal) {
   EXPECT_EQ(i, 25);
 }
 
+class TestMultiRowGroupStreamReader : public ::testing::Test {
+ public:
+  TestMultiRowGroupStreamReader() { createTestFile(); }

Review Comment:
   Can you do this in `SetUp` instead?



##########
cpp/src/parquet/stream_reader_test.cc:
##########
@@ -947,5 +947,110 @@ TEST_F(TestReadingDataFiles, ByteArrayDecimal) {
   EXPECT_EQ(i, 25);
 }
 
+class TestMultiRowGroupStreamReader : public ::testing::Test {
+ public:
+  TestMultiRowGroupStreamReader() { createTestFile(); }
+
+ protected:
+  const char* GetDataFile() const { return "stream_reader_multirowgroup_test.parquet"; }
+
+  void SetUp() {
+    PARQUET_ASSIGN_OR_THROW(auto infile, ::arrow::io::ReadableFile::Open(GetDataFile()));
+    auto file_reader = parquet::ParquetFileReader::Open(infile);
+    reader_ = StreamReader{std::move(file_reader)};
+  }
+
+  void TearDown() { reader_ = StreamReader{}; }
+
+  std::shared_ptr<schema::GroupNode> GetSchema() {
+    schema::NodeVector fields;
+    fields.push_back(schema::PrimitiveNode::Make("row_group_number", Repetition::REQUIRED,
+                                                 Type::INT32, ConvertedType::UINT_16));
+
+    fields.push_back(schema::PrimitiveNode::Make("row_number", Repetition::REQUIRED,
+                                                 Type::INT64, ConvertedType::UINT_64));
+
+    return std::static_pointer_cast<schema::GroupNode>(
+        schema::GroupNode::Make("schema", Repetition::REQUIRED, fields));
+  }
+
+  void createTestFile() {
+    PARQUET_ASSIGN_OR_THROW(auto outfile,
+                            ::arrow::io::FileOutputStream::Open(GetDataFile()));
+
+    auto file_writer = ParquetFileWriter::Open(outfile, GetSchema());
+
+    StreamWriter os{std::move(file_writer)};
+
+    int nrows = 0;
+    for (auto group = 0; group < num_row_groups; ++group) {
+      for (auto i = 0; i < num_rows_per_group; ++i) {
+        os << static_cast<uint16_t>(group);
+        os << static_cast<uint64_t>(nrows);
+        os << EndRow;
+        nrows++;
+      }
+      os.EndRowGroup();
+    }
+  }
+
+  StreamReader reader_;
+  static constexpr int num_row_groups = 5;
+  static constexpr int num_rows_per_group = 10;
+};
+
+TEST_F(TestMultiRowGroupStreamReader, SkipRows) {
+  // skip somewhere into the middle of a row group somewhere in the middle of the file
+  auto current_row = 33;
+
+  auto retval = reader_.SkipRows(current_row);
+  ASSERT_GE(retval, 0);

Review Comment:
   Let this assertion be more precise, for example:
   ```suggestion
     ASSERT_EQ(retval, current_row);
   ```



##########
cpp/src/parquet/stream_reader_test.cc:
##########
@@ -947,5 +947,110 @@ TEST_F(TestReadingDataFiles, ByteArrayDecimal) {
   EXPECT_EQ(i, 25);
 }
 
+class TestMultiRowGroupStreamReader : public ::testing::Test {
+ public:
+  TestMultiRowGroupStreamReader() { createTestFile(); }
+
+ protected:
+  const char* GetDataFile() const { return "stream_reader_multirowgroup_test.parquet"; }
+
+  void SetUp() {
+    PARQUET_ASSIGN_OR_THROW(auto infile, ::arrow::io::ReadableFile::Open(GetDataFile()));
+    auto file_reader = parquet::ParquetFileReader::Open(infile);
+    reader_ = StreamReader{std::move(file_reader)};
+  }
+
+  void TearDown() { reader_ = StreamReader{}; }
+
+  std::shared_ptr<schema::GroupNode> GetSchema() {
+    schema::NodeVector fields;
+    fields.push_back(schema::PrimitiveNode::Make("row_group_number", Repetition::REQUIRED,
+                                                 Type::INT32, ConvertedType::UINT_16));
+
+    fields.push_back(schema::PrimitiveNode::Make("row_number", Repetition::REQUIRED,
+                                                 Type::INT64, ConvertedType::UINT_64));
+
+    return std::static_pointer_cast<schema::GroupNode>(
+        schema::GroupNode::Make("schema", Repetition::REQUIRED, fields));
+  }
+
+  void createTestFile() {
+    PARQUET_ASSIGN_OR_THROW(auto outfile,
+                            ::arrow::io::FileOutputStream::Open(GetDataFile()));
+
+    auto file_writer = ParquetFileWriter::Open(outfile, GetSchema());
+
+    StreamWriter os{std::move(file_writer)};
+
+    int nrows = 0;
+    for (auto group = 0; group < num_row_groups; ++group) {
+      for (auto i = 0; i < num_rows_per_group; ++i) {
+        os << static_cast<uint16_t>(group);
+        os << static_cast<uint64_t>(nrows);
+        os << EndRow;
+        nrows++;
+      }
+      os.EndRowGroup();
+    }
+  }
+
+  StreamReader reader_;
+  static constexpr int num_row_groups = 5;
+  static constexpr int num_rows_per_group = 10;
+};
+
+TEST_F(TestMultiRowGroupStreamReader, SkipRows) {
+  // skip somewhere into the middle of a row group somewhere in the middle of the file
+  auto current_row = 33;
+
+  auto retval = reader_.SkipRows(current_row);
+  ASSERT_GE(retval, 0);
+
+  // there are 50 total rows, so definitely not EOF
+  ASSERT_FALSE(reader_.eof());
+
+  // make sure we skipped to where we expected
+  uint16_t current_row_group = 0;
+  uint64_t current_global_row = 0;
+  reader_ >> current_row_group;
+  EXPECT_EQ(current_row_group, current_row / 10);
+
+  reader_ >> current_global_row;
+  EXPECT_EQ(current_global_row, current_row);
+  reader_.EndRow();
+
+  // skip a few more rows but stay inside of this row group
+  retval = reader_.SkipRows(4);
+  ASSERT_GE(retval, 0);
+
+  // we read row 33 (were at 34, then skipped 4 => 38)
+  current_row = 38;
+  reader_ >> current_row_group;
+  EXPECT_EQ(current_row_group, current_row / 10);
+
+  reader_ >> current_global_row;
+  EXPECT_EQ(current_global_row, current_row);
+  reader_.EndRow();
+
+  // skip to the exact start of the next row group 4
+  // read 38 (were at 39, then skip 1 => 40)
+  retval = reader_.SkipRows(1);

Review Comment:
   Similar comments apply to the remaining `SkipRows` calls and their assertions below as well...



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org