Posted to github@arrow.apache.org by "wgtmac (via GitHub)" <gi...@apache.org> on 2023/03/12 15:28:31 UTC

[GitHub] [arrow] wgtmac commented on a diff in pull request #34054: GH-34053: [C++][Parquet] Write parquet page index

wgtmac commented on code in PR #34054:
URL: https://github.com/apache/arrow/pull/34054#discussion_r1133275238


##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -5146,5 +5158,96 @@ TEST(TestArrowReadWrite, FuzzReader) {
   }
 }
 
+TEST(TestArrowReadWrite, WriteReadPageIndexRoundTrip) {
+  // Enable the page index in the writer properties and cap row groups at 4 rows.
+  auto writer_properties = WriterProperties::Builder()
+                               .enable_write_page_index()
+                               ->max_row_group_length(4)
+                               ->build();
+  auto arrow_writer_properties = default_arrow_writer_properties();
+  auto pool = ::arrow::default_memory_pool();
+  auto sink = CreateOutputStream();
+  auto schema = ::arrow::schema(
+      {::arrow::field("c0", ::arrow::int64()), ::arrow::field("c1", ::arrow::utf8())});
+  std::shared_ptr<SchemaDescriptor> parquet_schema;
+  ASSERT_OK_NO_THROW(ToParquetSchema(schema.get(), *writer_properties,
+                                     *arrow_writer_properties, &parquet_schema));
+  auto schema_node = std::static_pointer_cast<GroupNode>(parquet_schema->schema_root());
+
+  // Prepare 6 rows of data; with max_row_group_length(4) they split into
+  // row groups of 4 and 2 rows.
+  auto record_batch = ::arrow::RecordBatchFromJSON(schema, R"([
+      [1,     "a"],
+      [2,     "b"],
+      [3,     "c"],
+      [null,  "d"],
+      [5,     null],
+      [6,     "f"]
+    ])");
+
+  // Create a writer and write the data as a single RecordBatch.
+  auto writer = ParquetFileWriter::Open(sink, schema_node, writer_properties);
+  std::unique_ptr<FileWriter> arrow_writer;
+  ASSERT_OK(FileWriter::Make(pool, std::move(writer), record_batch->schema(),
+                             arrow_writer_properties, &arrow_writer));
+  ASSERT_OK_NO_THROW(arrow_writer->WriteRecordBatch(*record_batch));
+  ASSERT_OK_NO_THROW(arrow_writer->Close());
+  ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish());
+
+  // Create a reader to read back the page index.
+  auto read_properties = default_arrow_reader_properties();
+  auto reader = ParquetFileReader::Open(std::make_shared<BufferReader>(buffer));
+  auto metadata = reader->metadata();
+  ASSERT_EQ(2, metadata->num_row_groups());
+
+  // Make sure the page index reader is not null.
+  auto page_index_reader = reader->GetPageIndexReader();
+  ASSERT_NE(page_index_reader, nullptr);
+
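+  // The column index stores min/max as PLAIN-encoded bytes; for INT64 that is
+  // the little-endian representation, so copying the native bytes matches on a
+  // little-endian host (assumed here).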
+  auto encode_int64 = [=](int64_t value) {
+    return std::string(reinterpret_cast<const char*>(&value), sizeof(int64_t));
+  };
+
+  const std::vector<std::string> c0_min_values = {encode_int64(1), encode_int64(5)};
+  const std::vector<std::string> c0_max_values = {encode_int64(3), encode_int64(6)};
+  const std::vector<std::string> c1_min_values = {"a", "f"};
+  const std::vector<std::string> c1_max_values = {"d", "f"};
+  const std::vector<int64_t> c0_null_counts = {1, 0};
+  const std::vector<int64_t> c1_null_counts = {0, 1};
+
+  const size_t num_pages = 1;
+  for (int rg = 0; rg < metadata->num_row_groups(); ++rg) {
+    auto row_group_index_reader = page_index_reader->RowGroup(rg);
+    ASSERT_NE(row_group_index_reader, nullptr);
+
+    // Verify offset index.
+    for (int c = 0; c < metadata->num_columns(); ++c) {
+      auto offset_index = row_group_index_reader->GetOffsetIndex(c);
+      ASSERT_NE(offset_index, nullptr);
+      ASSERT_EQ(num_pages, offset_index->page_locations().size());
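+      // The single page of each column chunk starts at row 0 of its row group.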
+      ASSERT_EQ(0, offset_index->page_locations()[0].first_row_index);
+    }
+
+    // Verify column index of c0.
+    auto c0_column_index = row_group_index_reader->GetColumnIndex(0);
+    ASSERT_NE(c0_column_index, nullptr);
+    ASSERT_EQ(num_pages, c0_column_index->null_pages().size());

Review Comment:
   `null_pages` is a required list with exactly one entry per page, indicating whether that page contains only nulls. Because it is always fully populated, its size gives the page count here.
   
   ```thrift
   struct ColumnIndex {
     /**
      * A list of Boolean values to determine the validity of the corresponding
      * min and max values. If true, a page contains only null values, and writers
      * have to set the corresponding entries in min_values and max_values to
      * byte[0], so that all lists have the same length. If false, the
      * corresponding entries in min_values and max_values must be valid.
      */
     1: required list<bool> null_pages
   
     /**
      * Two lists containing lower and upper bounds for the values of each page
      * determined by the ColumnOrder of the column. These may be the actual
      * minimum and maximum values found on a page, but can also be (more compact)
      * values that do not exist on a page. For example, instead of storing "Blart
      * Versenwald III", a writer may set min_values[i]="B", max_values[i]="C".
      * Such more compact values must still be valid values within the column's
      * logical type. Readers must make sure that list entries are populated before
      * using them by inspecting null_pages.
      */
     2: required list<binary> min_values
     3: required list<binary> max_values
   
     /**
      * Stores whether both min_values and max_values are ordered and if so, in
      * which direction. This allows readers to perform binary searches in both
      * lists. Readers cannot assume that max_values[i] <= min_values[i+1], even
      * if the lists are ordered.
      */
     4: required BoundaryOrder boundary_order
   
     /** A list containing the number of null values for each page **/
     5: optional list<i64> null_counts
   }
   ```
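   
   To make this concrete, here is a minimal sketch (not part of the PR) of how a reader can rely on `null_pages` both for the page count and to guard access to `min_values`/`max_values`. `ColumnIndexView` and `SelectPagesWithValidBounds` are hypothetical names that simply mirror the Thrift struct above.
   
   ```cpp
   #include <cstdint>
   #include <string>
   #include <vector>
   
   // Hypothetical in-memory mirror of the Thrift ColumnIndex above.
   struct ColumnIndexView {
     std::vector<bool> null_pages;         // required: one entry per page
     std::vector<std::string> min_values;  // required: byte[0] for null pages
     std::vector<std::string> max_values;  // required: byte[0] for null pages
     std::vector<int64_t> null_counts;     // optional: may be empty
   };
   
   // Returns the indices of pages whose min/max entries are valid to read.
   std::vector<size_t> SelectPagesWithValidBounds(const ColumnIndexView& index) {
     // null_pages is the only per-page list guaranteed to be populated with
     // meaningful entries, so its size is the number of pages.
     const size_t num_pages = index.null_pages.size();
     std::vector<size_t> valid;
     for (size_t i = 0; i < num_pages; ++i) {
       if (!index.null_pages[i]) {
         valid.push_back(i);  // skip null-only pages whose bounds are byte[0]
       }
     }
     return valid;
   }
   ```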


