You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/05/19 00:21:24 UTC

[GitHub] [arrow] fsaintjacques commented on a change in pull request #7175: ARROW-8794: [C++] Expand performance coverage of parquet to arrow reading

fsaintjacques commented on a change in pull request #7175:
URL: https://github.com/apache/arrow/pull/7175#discussion_r426958606



##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -95,15 +97,37 @@ void SetBytesProcessed(::benchmark::State& state) {
   state.SetBytesProcessed(bytes_processed);
 }
 
+constexpr int64_t kAlternatingOrNa = -1;
+
+template <typename T>
+std::vector<T> RandomVector(int64_t true_percentage, int64_t vector_size,

Review comment:
       It'll need to depend on libarrow_testing.so; I'm not sure whether this is a problem.

##########
File path: cpp/src/parquet/arrow/reader_writer_benchmark.cc
##########
@@ -95,15 +97,37 @@ void SetBytesProcessed(::benchmark::State& state) {
   state.SetBytesProcessed(bytes_processed);
 }
 
+constexpr int64_t kAlternatingOrNa = -1;
+
+template <typename T>
+std::vector<T> RandomVector(int64_t true_percentage, int64_t vector_size,
+                            const std::array<T, 2>& sample_values) {
+  std::vector<T> values(BENCHMARK_SIZE, {});
+  if (true_percentage == kAlternatingOrNa) {
+    int n = {0};
+    std::generate(values.begin(), values.end(), [&n] { return n++ % 2; });
+  } else {
+    std::default_random_engine rng(500);
+    double true_probability = static_cast<double>(true_percentage) / 100.0;
+    std::bernoulli_distribution dist(true_probability);
+    std::generate(values.begin(), values.end(), [&] { return sample_values[dist(rng)]; });
+  }
+  return values;
+}
+
 template <typename ParquetType>
 std::shared_ptr<::arrow::Table> TableFromVector(
-    const std::vector<typename ParquetType::c_type>& vec, bool nullable) {
+    const std::vector<typename ParquetType::c_type>& vec, bool nullable,
+    int64_t null_percentage = kAlternatingOrNa) {
+  if (!nullable) {
+    DCHECK(null_percentage = kAlternatingOrNa);
+  }
   std::shared_ptr<::arrow::DataType> type = std::make_shared<ArrowType<ParquetType>>();
   NumericBuilder<ArrowType<ParquetType>> builder;
   if (nullable) {
-    std::vector<uint8_t> valid_bytes(BENCHMARK_SIZE, 0);
-    int n = {0};
-    std::generate(valid_bytes.begin(), valid_bytes.end(), [&n] { return n++ % 2; });
+    // Note true values select index 1 of sample_values
+    auto valid_bytes = RandomVector<uint8_t>(/*true_percengate=*/null_percentage,
+                                             BENCHMARK_SIZE, /*sample_values=*/{1, 0});

Review comment:
       Good catch: the bitmap only has `0b00000001` and `0b00000000` as possible words, so more-or-less only one bit in every 8 positions can ever be set.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org