You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/05/14 12:56:11 UTC

[GitHub] [arrow] pitrou commented on a change in pull request #7175: ARROW-8794: [C++] Expand performance coverage of parquet to arrow reading

pitrou commented on a change in pull request #7175:
URL: https://github.com/apache/arrow/pull/7175#discussion_r425110880



##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -118,13 +134,22 @@ std::shared_ptr<::arrow::Table> TableFromVector(
 
 template <>
 std::shared_ptr<::arrow::Table> TableFromVector<BooleanType>(const std::vector<bool>& vec,
-                                                             bool nullable) {
+                                                             bool nullable,
+                                                             int null_percentage) {
   BooleanBuilder builder;
   if (nullable) {
     std::vector<bool> valid_bytes(BENCHMARK_SIZE, 0);
-    int n = {0};
-    std::generate(valid_bytes.begin(), valid_bytes.end(),
-                  [&n] { return (n++ % 2) != 0; });
+    if (null_percentage == -1) {

Review comment:
       Perhaps the null generation can be factored out?

##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -95,15 +97,29 @@ void SetBytesProcessed(::benchmark::State& state) {
   state.SetBytesProcessed(bytes_processed);
 }
 
+constexpr int64_t kAlternatingOrNa = -1;
+
 template <typename ParquetType>
 std::shared_ptr<::arrow::Table> TableFromVector(
-    const std::vector<typename ParquetType::c_type>& vec, bool nullable) {
+    const std::vector<typename ParquetType::c_type>& vec, bool nullable,
+    int null_percentage = kAlternatingOrNa) {
+  if (!nullable) {
+    DCHECK(null_percentage = kAlternatingOrNa);
+  }
   std::shared_ptr<::arrow::DataType> type = std::make_shared<ArrowType<ParquetType>>();
   NumericBuilder<ArrowType<ParquetType>> builder;
   if (nullable) {
     std::vector<uint8_t> valid_bytes(BENCHMARK_SIZE, 0);
-    int n = {0};
-    std::generate(valid_bytes.begin(), valid_bytes.end(), [&n] { return n++ % 2; });
+    if (null_percentage == -1) {

Review comment:
       Should this be `kAlternatingOrNa`?

##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -187,17 +219,56 @@ static void BM_ReadColumn(::benchmark::State& state) {
   SetBytesProcessed<nullable, ParquetType>(state);
 }
 
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type);
-
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int64Type);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int64Type);
-
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, DoubleType);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, DoubleType);
-
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, BooleanType);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType);
+BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type)
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 1})
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 10})
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 50});
+
+BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type)
+    ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})

Review comment:
       Can you add a comment explaining why we're varying `first_value_percentage`? (To trigger RLE, perhaps?)

##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -187,17 +219,56 @@ static void BM_ReadColumn(::benchmark::State& state) {
   SetBytesProcessed<nullable, ParquetType>(state);
 }
 
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type);
-
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int64Type);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int64Type);
-
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, DoubleType);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, DoubleType);
-
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, BooleanType);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType);
+BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type)
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 1})
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 10})
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 50});
+
+BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type)
+    ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
+    ->Args({/*null_percentage=*/1, /*first_value_percentage=*/10})
+    ->Args({/*null_percentage=*/10, /*first_value_percentage=*/50})
+    ->Args({/*null_percentage=*/25, /*first_value_percentage=*/25})
+    ->Args({/*null_percentage=*/50, /*first_value_percentage=*/50})
+    ->Args({/*null_percentage=*/50, /*first_value_percentage=*/100})
+    ->Args({/*null_percentage=*/99, /*first_value_percentage=*/50})
+    ->Args({/*null_percentage=*/99, /*first_value_percentage=*/100});
+
+BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int64Type)
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 1})
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 10})
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 50});
+BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int64Type)
+    ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
+    ->Args({/*null_percentage=*/1, /*first_value_percentage=*/10})
+    ->Args({/*null_percentage=*/5, /*first_value_percentage=*/10})
+    ->Args({/*null_percentage=*/10, /*first_value_percentage=*/50})
+    ->Args({/*null_percentage=*/25, /*first_value_percentage=*/25})
+    ->Args({/*null_percentage=*/30, /*first_value_percentage=*/25})
+    ->Args({/*null_percentage=*/35, /*first_value_percentage=*/25})
+    ->Args({/*null_percentage=*/45, /*first_value_percentage=*/25})
+    ->Args({/*null_percentage=*/50, /*first_value_percentage=*/50})
+    ->Args({/*null_percentage=*/50, /*first_value_percentage=*/1})
+    ->Args({/*null_percentage=*/75, /*first_value_percentage=*/1})
+    ->Args({/*null_percentage=*/99, /*first_value_percentage=*/50})
+    ->Args({/*null_percentage=*/99, /*first_value_percentage=*/100});
+
+BENCHMARK_TEMPLATE2(BM_ReadColumn, false, DoubleType)
+    ->Args({kAlternatingOrNa, 0})
+    ->Args({1, 20});

Review comment:
       Why `1`?

##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -95,15 +97,29 @@ void SetBytesProcessed(::benchmark::State& state) {
   state.SetBytesProcessed(bytes_processed);
 }
 
+constexpr int64_t kAlternatingOrNa = -1;
+
 template <typename ParquetType>
 std::shared_ptr<::arrow::Table> TableFromVector(
-    const std::vector<typename ParquetType::c_type>& vec, bool nullable) {
+    const std::vector<typename ParquetType::c_type>& vec, bool nullable,
+    int null_percentage = kAlternatingOrNa) {

Review comment:
       `kAlternatingOrNa` is declared as `int64_t` above, but this parameter is an `int`.

##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -187,17 +219,56 @@ static void BM_ReadColumn(::benchmark::State& state) {
   SetBytesProcessed<nullable, ParquetType>(state);
 }
 
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type);
-
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int64Type);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int64Type);
-
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, DoubleType);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, DoubleType);
-
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, BooleanType);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType);
+BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type)
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 1})
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 10})
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 50});
+
+BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type)
+    ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
+    ->Args({/*null_percentage=*/1, /*first_value_percentage=*/10})
+    ->Args({/*null_percentage=*/10, /*first_value_percentage=*/50})
+    ->Args({/*null_percentage=*/25, /*first_value_percentage=*/25})
+    ->Args({/*null_percentage=*/50, /*first_value_percentage=*/50})
+    ->Args({/*null_percentage=*/50, /*first_value_percentage=*/100})
+    ->Args({/*null_percentage=*/99, /*first_value_percentage=*/50})
+    ->Args({/*null_percentage=*/99, /*first_value_percentage=*/100});
+
+BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int64Type)
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 1})
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 10})
+    ->Args({/*null_percentage=*/kAlternatingOrNa, 50});
+BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int64Type)
+    ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
+    ->Args({/*null_percentage=*/1, /*first_value_percentage=*/10})
+    ->Args({/*null_percentage=*/5, /*first_value_percentage=*/10})
+    ->Args({/*null_percentage=*/10, /*first_value_percentage=*/50})
+    ->Args({/*null_percentage=*/25, /*first_value_percentage=*/25})
+    ->Args({/*null_percentage=*/30, /*first_value_percentage=*/25})
+    ->Args({/*null_percentage=*/35, /*first_value_percentage=*/25})

Review comment:
       Do we need such fine granularity in the `null_percentage` values?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org