You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "wgtmac (via GitHub)" <gi...@apache.org> on 2023/06/21 05:22:20 UTC
[GitHub] [arrow] wgtmac commented on a diff in pull request #36191: GH-36189: [C++][Parquet] Parquet StreamReader::SkipRows() skips to incorrect place in multi-row-group files

wgtmac commented on code in PR #36191:
URL: https://github.com/apache/arrow/pull/36191#discussion_r1236291778


##########
cpp/src/parquet/stream_reader_test.cc:
##########
@@ -947,5 +947,77 @@ TEST_F(TestReadingDataFiles, ByteArrayDecimal) {
   EXPECT_EQ(i, 25);
 }
 
+class TestMultiRowGroupStreamReader : public ::testing::Test {
+ public:
+  TestMultiRowGroupStreamReader() { createTestFile(); }
+
+ protected:
+  const char* GetDataFile() const { return "stream_reader_multirowgroup_test.parquet"; }

Review Comment:
   It would be better to write this to an in-memory file.



##########
cpp/src/parquet/stream_reader_test.cc:
##########
@@ -947,5 +947,77 @@ TEST_F(TestReadingDataFiles, ByteArrayDecimal) {
   EXPECT_EQ(i, 25);
 }
 
+class TestMultiRowGroupStreamReader : public ::testing::Test {
+ public:
+  TestMultiRowGroupStreamReader() { createTestFile(); }
+
+ protected:
+  const char* GetDataFile() const { return "stream_reader_multirowgroup_test.parquet"; }
+
+  void SetUp() {
+    PARQUET_ASSIGN_OR_THROW(auto infile, ::arrow::io::ReadableFile::Open(GetDataFile()));
+    auto file_reader = parquet::ParquetFileReader::Open(infile);
+    reader_ = StreamReader{std::move(file_reader)};
+  }
+
+  void TearDown() { reader_ = StreamReader{}; }
+
+  std::shared_ptr<schema::GroupNode> GetSchema() {
+    schema::NodeVector fields;
+    fields.push_back(schema::PrimitiveNode::Make("row_group_number", Repetition::REQUIRED,
+                                                 Type::INT32, ConvertedType::UINT_16));
+
+    fields.push_back(schema::PrimitiveNode::Make("row_number", Repetition::REQUIRED,
+                                                 Type::INT64, ConvertedType::UINT_64));
+
+    return std::static_pointer_cast<schema::GroupNode>(
+        schema::GroupNode::Make("schema", Repetition::REQUIRED, fields));
+  }
+
+  void createTestFile() {

Review Comment:
   ```suggestion
     void CreateTestFile() {
   ```



##########
cpp/src/parquet/stream_reader_test.cc:
##########
@@ -947,5 +947,77 @@ TEST_F(TestReadingDataFiles, ByteArrayDecimal) {
   EXPECT_EQ(i, 25);
 }
 
+class TestMultiRowGroupStreamReader : public ::testing::Test {
+ public:
+  TestMultiRowGroupStreamReader() { createTestFile(); }
+
+ protected:
+  const char* GetDataFile() const { return "stream_reader_multirowgroup_test.parquet"; }
+
+  void SetUp() {
+    PARQUET_ASSIGN_OR_THROW(auto infile, ::arrow::io::ReadableFile::Open(GetDataFile()));
+    auto file_reader = parquet::ParquetFileReader::Open(infile);
+    reader_ = StreamReader{std::move(file_reader)};
+  }
+
+  void TearDown() { reader_ = StreamReader{}; }
+
+  std::shared_ptr<schema::GroupNode> GetSchema() {
+    schema::NodeVector fields;
+    fields.push_back(schema::PrimitiveNode::Make("row_group_number", Repetition::REQUIRED,
+                                                 Type::INT32, ConvertedType::UINT_16));
+
+    fields.push_back(schema::PrimitiveNode::Make("row_number", Repetition::REQUIRED,
+                                                 Type::INT64, ConvertedType::UINT_64));
+
+    return std::static_pointer_cast<schema::GroupNode>(
+        schema::GroupNode::Make("schema", Repetition::REQUIRED, fields));
+  }
+
+  void createTestFile() {
+    PARQUET_ASSIGN_OR_THROW(auto outfile,
+                            ::arrow::io::FileOutputStream::Open(GetDataFile()));
+
+    auto file_writer = ParquetFileWriter::Open(outfile, GetSchema());
+
+    StreamWriter os{std::move(file_writer)};
+
+    int nrows = 0;
+    for (auto group = 0; group < num_row_groups; ++group) {
+        for (auto i = 0; i < num_rows_per_group; ++i) {
+          os << static_cast<uint16_t>(group);
+          os << static_cast<uint64_t>(nrows);
+          os << EndRow;
+          nrows++;
+        }
+        os.EndRowGroup();
+    }
+  }
+
+  StreamReader reader_;
+  static constexpr int num_row_groups = 5;
+  static constexpr int num_rows_per_group = 10;
+};
+
+TEST_F(TestMultiRowGroupStreamReader, SkipRows) {
+  // skip somewhere in the middle that is a few row groups in to the file
+  auto num_rows_to_skip = 33;
+
+  auto retval = reader_.SkipRows(num_rows_to_skip);

Review Comment:
   Could you add more `SkipRows` calls here to make sure multiple skips are correct?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org