You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2023/06/27 18:06:11 UTC
[arrow] branch main updated: GH-36297: [C++][Parquet] Benchmark for non-binary dict encoding (#36298)
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 4198aacf8a GH-36297: [C++][Parquet] Benchmark for non-binary dict encoding (#36298)
4198aacf8a is described below
commit 4198aacf8ab369d2ec3b7cb539f96c189f4a86f3
Author: mwish <ma...@gmail.com>
AuthorDate: Wed Jun 28 02:06:00 2023 +0800
GH-36297: [C++][Parquet] Benchmark for non-binary dict encoding (#36298)
### Rationale for this change
Add a benchmark for dictionary encoding of non-binary (Int64) values, complementing the existing Int64 dictionary decoding benchmarks.
### What changes are included in this PR?
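Add the benchmarks `BM_DictEncodingInt64_repeats` and `BM_DictEncodingInt64_literals`, both built on a new `EncodeDict` helper in `encoding_benchmark.cc`.
`DecodeDict` now takes its input by const reference, decodes into a separate output buffer, and reports items processed in addition to bytes processed.
In `encoding.cc`, the failure branch of `encoder.Put(index)` in the dictionary-index write loop is marked with `ARROW_PREDICT_FALSE`.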
### Are these changes tested?
No new tests are needed: the change adds benchmarks and a branch-prediction hint only.
### Are there any user-facing changes?
No.
* Closes: #36297
Authored-by: mwish <ma...@gmail.com>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
cpp/src/parquet/encoding.cc | 2 +-
cpp/src/parquet/encoding_benchmark.cc | 55 +++++++++++++++++++++++++++++++----
2 files changed, 50 insertions(+), 7 deletions(-)
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 134a22f284..dda0e7701b 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -512,7 +512,7 @@ class DictEncoderImpl : public EncoderImpl, virtual public DictEncoder<DType> {
::arrow::util::RleEncoder encoder(buffer, buffer_len, bit_width());
for (int32_t index : buffered_indices_) {
- if (!encoder.Put(index)) return -1;
+ if (ARROW_PREDICT_FALSE(!encoder.Put(index))) return -1;
}
encoder.Flush();
diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc
index 408588cd0c..6726810911 100644
--- a/cpp/src/parquet/encoding_benchmark.cc
+++ b/cpp/src/parquet/encoding_benchmark.cc
@@ -783,7 +783,29 @@ static void BM_RleDecodingSpacedBoolean(benchmark::State& state) {
BENCHMARK(BM_RleDecodingSpacedBoolean)->Apply(BM_SpacedArgs);
template <typename Type>
-static void DecodeDict(std::vector<typename Type::c_type>& values,
+static void EncodeDict(const std::vector<typename Type::c_type>& values,
+ benchmark::State& state) {
+ using T = typename Type::c_type;
+ int num_values = static_cast<int>(values.size());
+
+ MemoryPool* allocator = default_memory_pool();
+ std::shared_ptr<ColumnDescriptor> descr = Int64Schema(Repetition::REQUIRED);
+
+ auto base_encoder = MakeEncoder(Type::type_num, Encoding::RLE_DICTIONARY,
+ /*use_dictionary=*/true, descr.get(), allocator);
+ auto encoder =
+ dynamic_cast<typename EncodingTraits<Type>::Encoder*>(base_encoder.get());
+ for (auto _ : state) {
+ encoder->Put(values.data(), num_values);
+ encoder->FlushValues();
+ }
+
+ state.SetBytesProcessed(state.iterations() * num_values * sizeof(T));
+ state.SetItemsProcessed(state.iterations() * num_values);
+}
+
+template <typename Type>
+static void DecodeDict(const std::vector<typename Type::c_type>& values,
benchmark::State& state) {
typedef typename Type::c_type T;
int num_values = static_cast<int>(values.size());
@@ -810,6 +832,7 @@ static void DecodeDict(std::vector<typename Type::c_type>& values,
PARQUET_THROW_NOT_OK(indices->Resize(actual_bytes));
+ std::vector<T> decoded_values(num_values);
for (auto _ : state) {
auto dict_decoder = MakeTypedDecoder<Type>(Encoding::PLAIN, descr.get());
dict_decoder->SetData(dict_traits->num_entries(), dict_buffer->data(),
@@ -818,10 +841,11 @@ static void DecodeDict(std::vector<typename Type::c_type>& values,
auto decoder = MakeDictDecoder<Type>(descr.get());
decoder->SetDict(dict_decoder.get());
decoder->SetData(num_values, indices->data(), static_cast<int>(indices->size()));
- decoder->Decode(values.data(), num_values);
+ decoder->Decode(decoded_values.data(), num_values);
}
- state.SetBytesProcessed(state.iterations() * state.range(0) * sizeof(T));
+ state.SetBytesProcessed(state.iterations() * num_values * sizeof(T));
+ state.SetItemsProcessed(state.iterations() * num_values);
}
static void BM_DictDecodingInt64_repeats(benchmark::State& state) {
@@ -834,19 +858,38 @@ static void BM_DictDecodingInt64_repeats(benchmark::State& state) {
BENCHMARK(BM_DictDecodingInt64_repeats)->Range(MIN_RANGE, MAX_RANGE);
+static void BM_DictEncodingInt64_repeats(benchmark::State& state) {
+ typedef Int64Type Type;
+ typedef typename Type::c_type T;
+
+ std::vector<T> values(state.range(0), 64);
+ EncodeDict<Type>(values, state);
+}
+
+BENCHMARK(BM_DictEncodingInt64_repeats)->Range(MIN_RANGE, MAX_RANGE);
+
static void BM_DictDecodingInt64_literals(benchmark::State& state) {
typedef Int64Type Type;
typedef typename Type::c_type T;
std::vector<T> values(state.range(0));
- for (size_t i = 0; i < values.size(); ++i) {
- values[i] = i;
- }
+ std::iota(values.begin(), values.end(), 0);
DecodeDict<Type>(values, state);
}
BENCHMARK(BM_DictDecodingInt64_literals)->Range(MIN_RANGE, MAX_RANGE);
+static void BM_DictEncodingInt64_literals(benchmark::State& state) {
+ using Type = Int64Type;
+ using T = typename Type::c_type;
+
+ std::vector<T> values(state.range(0));
+ std::iota(values.begin(), values.end(), 0);
+ EncodeDict<Type>(values, state);
+}
+
+BENCHMARK(BM_DictEncodingInt64_literals)->Range(MIN_RANGE, MAX_RANGE);
+
static void BM_DictDecodingByteArray(benchmark::State& state) {
::arrow::random::RandomArrayGenerator rag(0);
// Using arrow generator to generate random data.