You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2020/05/20 13:47:15 UTC
[arrow] branch master updated: ARROW-8794: [C++] Expand performance
coverage of parquet to arrow reading
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new a70b4a0 ARROW-8794: [C++] Expand performance coverage of parquet to arrow reading
a70b4a0 is described below
commit a70b4a06f3cf657f08f80cee83b61f8799828539
Author: Micah Kornfield <em...@gmail.com>
AuthorDate: Wed May 20 15:46:49 2020 +0200
ARROW-8794: [C++] Expand performance coverage of parquet to arrow reading
Closes #7175 from emkornfield/ARROW-8794-benchmark
Lead-authored-by: Micah Kornfield <em...@gmail.com>
Co-authored-by: emkornfield <em...@gmail.com>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
cpp/src/parquet/arrow/reader_writer_benchmark.cc | 127 +++++++++++++++++++----
1 file changed, 104 insertions(+), 23 deletions(-)
diff --git a/cpp/src/parquet/arrow/reader_writer_benchmark.cc b/cpp/src/parquet/arrow/reader_writer_benchmark.cc
index 66bc9e9..bf3a93c 100644
--- a/cpp/src/parquet/arrow/reader_writer_benchmark.cc
+++ b/cpp/src/parquet/arrow/reader_writer_benchmark.cc
@@ -18,6 +18,7 @@
#include "benchmark/benchmark.h"
#include <iostream>
+#include <random>
#include "parquet/arrow/reader.h"
#include "parquet/arrow/writer.h"
@@ -28,6 +29,7 @@
#include "parquet/platform.h"
#include "arrow/api.h"
+#include "arrow/util/logging.h"
using arrow::BooleanBuilder;
using arrow::NumericBuilder;
@@ -95,15 +97,37 @@ void SetBytesProcessed(::benchmark::State& state) {
state.SetBytesProcessed(bytes_processed);
}
+constexpr int64_t kAlternatingOrNa = -1;  // Sentinel percentage: RandomVector emits an alternating pattern instead of random draws; also the required null_percentage when a column is not nullable.
+
+// Builds a vector of `vector_size` elements to use as benchmark input.
+//
+// - true_percentage == kAlternatingOrNa: elements alternate 0, 1, 0, 1, ...
+//   (converted to T); sample_values is not consulted in this mode.
+// - otherwise: every element is drawn independently from a Bernoulli
+//   distribution with probability true_percentage / 100. A "true" draw yields
+//   sample_values[1] and a "false" draw yields sample_values[0]. The engine is
+//   seeded with a constant so a given build produces the same data on every run.
+//   true_percentage is expected to lie in [0, 100].
+template <typename T>
+std::vector<T> RandomVector(int64_t true_percentage, int64_t vector_size,
+                            const std::array<T, 2>& sample_values) {
+  // Size the result from the argument (not a global) so the parameter is honored.
+  std::vector<T> values(static_cast<size_t>(vector_size), T{});
+  if (true_percentage == kAlternatingOrNa) {
+    // 64-bit counter so the parity sequence cannot overflow for large sizes.
+    int64_t n = 0;
+    std::generate(values.begin(), values.end(),
+                  [&n] { return static_cast<T>(n++ % 2); });
+  } else {
+    std::default_random_engine rng(500);
+    const double true_probability = static_cast<double>(true_percentage) / 100.0;
+    std::bernoulli_distribution dist(true_probability);
+    // The bool draw selects the slot explicitly: true -> index 1, false -> index 0.
+    std::generate(values.begin(), values.end(),
+                  [&] { return sample_values[dist(rng) ? 1 : 0]; });
+  }
+  return values;
+}
+
template <typename ParquetType>
std::shared_ptr<::arrow::Table> TableFromVector(
- const std::vector<typename ParquetType::c_type>& vec, bool nullable) {
+ const std::vector<typename ParquetType::c_type>& vec, bool nullable,
+ int64_t null_percentage = kAlternatingOrNa) {
+ if (!nullable) {
+ ARROW_CHECK_EQ(null_percentage, kAlternatingOrNa);
+ }
std::shared_ptr<::arrow::DataType> type = std::make_shared<ArrowType<ParquetType>>();
NumericBuilder<ArrowType<ParquetType>> builder;
if (nullable) {
- std::vector<uint8_t> valid_bytes(BENCHMARK_SIZE, 0);
- int n = {0};
- std::generate(valid_bytes.begin(), valid_bytes.end(), [&n] { return n++ % 2; });
+ // Note true values select index 1 of sample_values
+ auto valid_bytes = RandomVector<uint8_t>(/*true_percentage=*/null_percentage,
+ BENCHMARK_SIZE, /*sample_values=*/{1, 0});
EXIT_NOT_OK(builder.AppendValues(vec.data(), vec.size(), valid_bytes.data()));
} else {
EXIT_NOT_OK(builder.AppendValues(vec.data(), vec.size(), nullptr));
@@ -118,13 +142,12 @@ std::shared_ptr<::arrow::Table> TableFromVector(
template <>
std::shared_ptr<::arrow::Table> TableFromVector<BooleanType>(const std::vector<bool>& vec,
- bool nullable) {
+ bool nullable,
+ int64_t null_percentage) {
BooleanBuilder builder;
if (nullable) {
- std::vector<bool> valid_bytes(BENCHMARK_SIZE, 0);
- int n = {0};
- std::generate(valid_bytes.begin(), valid_bytes.end(),
- [&n] { return (n++ % 2) != 0; });
+ auto valid_bytes = RandomVector<bool>(/*true_percentage=*/null_percentage,
+ BENCHMARK_SIZE, {true, false});
EXIT_NOT_OK(builder.AppendValues(vec, valid_bytes));
} else {
EXIT_NOT_OK(builder.AppendValues(vec));
@@ -141,7 +164,7 @@ std::shared_ptr<::arrow::Table> TableFromVector<BooleanType>(const std::vector<b
template <bool nullable, typename ParquetType>
static void BM_WriteColumn(::benchmark::State& state) {
using T = typename ParquetType::c_type;
- std::vector<T> values(BENCHMARK_SIZE, static_cast<T>(128));
+ std::vector<T> values(BENCHMARK_SIZE, 128);
std::shared_ptr<::arrow::Table> table = TableFromVector<ParquetType>(values, nullable);
while (state.KeepRunning()) {
@@ -164,12 +187,25 @@ BENCHMARK_TEMPLATE2(BM_WriteColumn, true, DoubleType);
BENCHMARK_TEMPLATE2(BM_WriteColumn, false, BooleanType);
BENCHMARK_TEMPLATE2(BM_WriteColumn, true, BooleanType);
+template <typename T>
+struct Examples {  // Supplies the pair of sample values that BM_ReadColumn passes to RandomVector.
+ static constexpr std::array<T, 2> values() { return {127, 128}; }  // generic case: 127 and 128
+};
+
+template <>
+struct Examples<bool> {  // bool specialization: the pair is {false, true}.
+ static constexpr std::array<bool, 2> values() { return {false, true}; }
+};
+
template <bool nullable, typename ParquetType>
static void BM_ReadColumn(::benchmark::State& state) {
using T = typename ParquetType::c_type;
- std::vector<T> values(BENCHMARK_SIZE, static_cast<T>(128));
- std::shared_ptr<::arrow::Table> table = TableFromVector<ParquetType>(values, nullable);
+ auto values = RandomVector<T>(/*percentage=*/state.range(1), BENCHMARK_SIZE,
+ Examples<T>::values());
+
+ std::shared_ptr<::arrow::Table> table =
+ TableFromVector<ParquetType>(values, nullable, state.range(0));
auto output = CreateOutputStream();
EXIT_NOT_OK(WriteTable(*table, ::arrow::default_memory_pool(), output, BENCHMARK_SIZE));
@@ -187,17 +223,62 @@ static void BM_ReadColumn(::benchmark::State& state) {
SetBytesProcessed<nullable, ParquetType>(state);
}
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type);
-
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int64Type);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int64Type);
-
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, DoubleType);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, DoubleType);
-
-BENCHMARK_TEMPLATE2(BM_ReadColumn, false, BooleanType);
-BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType);
+// Each benchmark takes two arguments that shape the generated data:
+// - null_percentage (state.range(0)) sets how often a slot is null, and therefore
+//   how long the runs of nulls are; kAlternatingOrNa means alternating or not used.
+// - first_value_percentage (state.range(1)) sets the mix of the two sample values.
+//   At 0 or 100 every value is identical, so RLE is triggered all the time; in between,
+//   some values are RLE encoded and some literal (RLE is much less likely near 50).
+BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int32Type)
+ ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/1})
+ ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/10})
+ ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/50});
+
+BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int32Type)
+ ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
+ ->Args({/*null_percentage=*/1, /*first_value_percentage=*/1})
+ ->Args({/*null_percentage=*/10, /*first_value_percentage=*/10})
+ ->Args({/*null_percentage=*/25, /*first_value_percentage=*/5})
+ ->Args({/*null_percentage=*/50, /*first_value_percentage=*/50})
+ ->Args({/*null_percentage=*/50, /*first_value_percentage=*/0})
+ ->Args({/*null_percentage=*/99, /*first_value_percentage=*/50})
+ ->Args({/*null_percentage=*/99, /*first_value_percentage=*/0});
+
+BENCHMARK_TEMPLATE2(BM_ReadColumn, false, Int64Type)
+ ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/1})
+ ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/10})
+ ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/50});
+BENCHMARK_TEMPLATE2(BM_ReadColumn, true, Int64Type)
+ ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
+ ->Args({/*null_percentage=*/1, /*first_value_percentage=*/1})
+ ->Args({/*null_percentage=*/5, /*first_value_percentage=*/5})
+ ->Args({/*null_percentage=*/10, /*first_value_percentage=*/5})
+ ->Args({/*null_percentage=*/25, /*first_value_percentage=*/10})
+ ->Args({/*null_percentage=*/30, /*first_value_percentage=*/10})
+ ->Args({/*null_percentage=*/35, /*first_value_percentage=*/10})
+ ->Args({/*null_percentage=*/45, /*first_value_percentage=*/25})
+ ->Args({/*null_percentage=*/50, /*first_value_percentage=*/50})
+ ->Args({/*null_percentage=*/50, /*first_value_percentage=*/1})
+ ->Args({/*null_percentage=*/75, /*first_value_percentage=*/1})
+ ->Args({/*null_percentage=*/99, /*first_value_percentage=*/50})
+ ->Args({/*null_percentage=*/99, /*first_value_percentage=*/0});
+
+BENCHMARK_TEMPLATE2(BM_ReadColumn, false, DoubleType)
+ ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
+ ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/20});
+// Less coverage here: the int64_t cases should already be a pretty good representation
+// of how nullability and repeating values behave.
+BENCHMARK_TEMPLATE2(BM_ReadColumn, true, DoubleType)
+ ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
+ ->Args({/*null_percentage=*/10, /*first_value_percentage=*/50})
+ ->Args({/*null_percentage=*/25, /*first_value_percentage=*/25});
+
+BENCHMARK_TEMPLATE2(BM_ReadColumn, false, BooleanType)
+ ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/0})
+ ->Args({/*null_percentage=*/1, /*first_value_percentage=*/20});  // NOTE(review): null_percentage=1 on a non-nullable column looks unused by the visible BooleanType path -- confirm kAlternatingOrNa was not intended.
+BENCHMARK_TEMPLATE2(BM_ReadColumn, true, BooleanType)
+ ->Args({/*null_percentage=*/kAlternatingOrNa, /*first_value_percentage=*/1})
+ ->Args({/*null_percentage=*/5, /*first_value_percentage=*/10});
static void BM_ReadIndividualRowGroups(::benchmark::State& state) {
std::vector<int64_t> values(BENCHMARK_SIZE, 128);