You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2023/06/22 16:55:29 UTC

[arrow] branch main updated: GH-36059: [C++][Compute] Reserve space for hashtable for scalar lookup functions (#36067)

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 2ce4a38fcf GH-36059: [C++][Compute] Reserve space for hashtable for scalar lookup functions (#36067)
2ce4a38fcf is described below

commit 2ce4a38fcf1fbd0d759b6c2cdf657f27ee6f02a9
Author: Jin Shang <sh...@gmail.com>
AuthorDate: Fri Jun 23 00:55:21 2023 +0800

    GH-36059: [C++][Compute] Reserve space for hashtable for scalar lookup functions (#36067)
    
    
    
    ### Rationale for this change
    
    Performance improvement for scalar lookup functions.
    
    ### What changes are included in this PR?
    
    1. Reserve space for hashtable for scalar lookup functions before the insertion process.
    2. A benchmark that demonstrates the improvement. Data is pasted in the comments.
    
    ### Are these changes tested?
    
    By existing unit tests.
    
    ### Are there any user-facing changes?
    
    No.
    * Closes: #36059
    
    Lead-authored-by: Jin Shang <sh...@gmail.com>
    Co-authored-by: Antoine Pitrou <an...@python.org>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 cpp/src/arrow/compute/kernels/scalar_set_lookup.cc | 24 ++++++++----
 .../compute/kernels/scalar_set_lookup_benchmark.cc | 45 +++++++++++++++++-----
 2 files changed, 51 insertions(+), 18 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
index c3d2bc5417..0fbc5b62fe 100644
--- a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
@@ -36,16 +36,23 @@ namespace {
 
 template <typename Type>
 struct SetLookupState : public KernelState {
-  explicit SetLookupState(MemoryPool* pool) : lookup_table(pool, 0) {}
+  explicit SetLookupState(MemoryPool* pool) : memory_pool(pool) {}
 
   Status Init(const SetLookupOptions& options) {
     if (options.value_set.is_array()) {
       const ArrayData& value_set = *options.value_set.array();
       memo_index_to_value_index.reserve(value_set.length);
+      lookup_table =
+          MemoTable(memory_pool,
+                    ::arrow::internal::HashTable<char>::kLoadFactor * value_set.length);
       RETURN_NOT_OK(AddArrayValueSet(options, *options.value_set.array()));
     } else if (options.value_set.kind() == Datum::CHUNKED_ARRAY) {
       const ChunkedArray& value_set = *options.value_set.chunked_array();
       memo_index_to_value_index.reserve(value_set.length());
+      lookup_table =
+          MemoTable(memory_pool,
+                    ::arrow::internal::HashTable<char>::kLoadFactor * value_set.length());
+
       int64_t offset = 0;
       for (const std::shared_ptr<Array>& chunk : value_set.chunks()) {
         RETURN_NOT_OK(AddArrayValueSet(options, *chunk->data(), offset));
@@ -54,8 +61,8 @@ struct SetLookupState : public KernelState {
     } else {
       return Status::Invalid("value_set should be an array or chunked array");
     }
-    if (!options.skip_nulls && lookup_table.GetNull() >= 0) {
-      null_index = memo_index_to_value_index[lookup_table.GetNull()];
+    if (!options.skip_nulls && lookup_table->GetNull() >= 0) {
+      null_index = memo_index_to_value_index[lookup_table->GetNull()];
     }
     return Status::OK();
   }
@@ -75,7 +82,7 @@ struct SetLookupState : public KernelState {
         DCHECK_EQ(memo_index, memo_size);
         memo_index_to_value_index.push_back(index);
       };
-      RETURN_NOT_OK(lookup_table.GetOrInsert(
+      RETURN_NOT_OK(lookup_table->GetOrInsert(
           v, std::move(on_found), std::move(on_not_found), &unused_memo_index));
       ++index;
       return Status::OK();
@@ -89,7 +96,7 @@ struct SetLookupState : public KernelState {
         DCHECK_EQ(memo_index, memo_size);
         memo_index_to_value_index.push_back(index);
       };
-      lookup_table.GetOrInsertNull(std::move(on_found), std::move(on_not_found));
+      lookup_table->GetOrInsertNull(std::move(on_found), std::move(on_not_found));
       ++index;
       return Status::OK();
     };
@@ -98,7 +105,8 @@ struct SetLookupState : public KernelState {
   }
 
   using MemoTable = typename HashTraits<Type>::MemoTableType;
-  MemoTable lookup_table;
+  std::optional<MemoTable> lookup_table;  // use optional for delayed initialization
+  MemoryPool* memory_pool;
   // When there are duplicates in value_set, the MemoTable indices must
   // be mapped back to indices in the value_set.
   std::vector<int32_t> memo_index_to_value_index;
@@ -264,7 +272,7 @@ struct IndexInVisitor {
     VisitArraySpanInline<Type>(
         data,
         [&](T v) {
-          int32_t index = state.lookup_table.Get(v);
+          int32_t index = state.lookup_table->Get(v);
           if (index != -1) {
             bitmap_writer.Set();
 
@@ -358,7 +366,7 @@ struct IsInVisitor {
     VisitArraySpanInline<Type>(
         this->data,
         [&](T v) {
-          if (state.lookup_table.Get(v) != -1) {
+          if (state.lookup_table->Get(v) != -1) {
             writer.Set();
           } else {
             writer.Clear();
diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup_benchmark.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup_benchmark.cc
index c49dd74084..9158c518b4 100644
--- a/cpp/src/arrow/compute/kernels/scalar_set_lookup_benchmark.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup_benchmark.cc
@@ -51,13 +51,14 @@ static void SetLookupBenchmarkString(benchmark::State& state,
   }
   state.SetItemsProcessed(state.iterations() * array_length);
   state.SetBytesProcessed(state.iterations() * values->data()->buffers[2]->size());
+  state.counters["value_set_length"] = static_cast<double>(value_set_length);
 }
 
 template <typename Type>
 static void SetLookupBenchmarkNumeric(benchmark::State& state,
                                       const std::string& func_name,
-                                      const int64_t value_set_length) {
-  const int64_t array_length = 1 << 18;
+                                      const int64_t value_set_length,
+                                      const int64_t array_length) {
   const int64_t value_min = 0;
   const int64_t value_max = std::numeric_limits<typename Type::c_type>::max();
   const double null_probability = 0.1 / value_set_length;
@@ -72,6 +73,7 @@ static void SetLookupBenchmarkNumeric(benchmark::State& state,
   }
   state.SetItemsProcessed(state.iterations() * array_length);
   state.SetBytesProcessed(state.iterations() * values->data()->buffers[1]->size());
+  state.counters["value_set_length"] = static_cast<double>(value_set_length);
 }
 
 static void IndexInStringSmallSet(benchmark::State& state) {
@@ -90,36 +92,57 @@ static void IsInStringLargeSet(benchmark::State& state) {
   SetLookupBenchmarkString(state, "is_in_meta_binary", 1 << 10);
 }
 
+static constexpr int64_t kArrayLengthWithSmallSet = 1 << 18;
+static constexpr int64_t kArrayLengthWithLargeSet = 1000;
+
 static void IndexInInt8SmallSet(benchmark::State& state) {
-  SetLookupBenchmarkNumeric<Int8Type>(state, "index_in_meta_binary", state.range(0));
+  SetLookupBenchmarkNumeric<Int8Type>(state, "index_in_meta_binary", state.range(0),
+                                      kArrayLengthWithSmallSet);
 }
 
 static void IndexInInt16SmallSet(benchmark::State& state) {
-  SetLookupBenchmarkNumeric<Int16Type>(state, "index_in_meta_binary", state.range(0));
+  SetLookupBenchmarkNumeric<Int16Type>(state, "index_in_meta_binary", state.range(0),
+                                       kArrayLengthWithSmallSet);
 }
 
 static void IndexInInt32SmallSet(benchmark::State& state) {
-  SetLookupBenchmarkNumeric<Int32Type>(state, "index_in_meta_binary", state.range(0));
+  SetLookupBenchmarkNumeric<Int32Type>(state, "index_in_meta_binary", state.range(0),
+                                       kArrayLengthWithSmallSet);
 }
 
 static void IndexInInt64SmallSet(benchmark::State& state) {
-  SetLookupBenchmarkNumeric<Int64Type>(state, "index_in_meta_binary", state.range(0));
+  SetLookupBenchmarkNumeric<Int64Type>(state, "index_in_meta_binary", state.range(0),
+                                       kArrayLengthWithSmallSet);
+}
+
+static void IndexInInt32LargeSet(benchmark::State& state) {
+  SetLookupBenchmarkNumeric<Int32Type>(state, "index_in_meta_binary", state.range(0),
+                                       kArrayLengthWithLargeSet);
 }
 
 static void IsInInt8SmallSet(benchmark::State& state) {
-  SetLookupBenchmarkNumeric<Int8Type>(state, "is_in_meta_binary", state.range(0));
+  SetLookupBenchmarkNumeric<Int8Type>(state, "is_in_meta_binary", state.range(0),
+                                      kArrayLengthWithSmallSet);
 }
 
 static void IsInInt16SmallSet(benchmark::State& state) {
-  SetLookupBenchmarkNumeric<Int16Type>(state, "is_in_meta_binary", state.range(0));
+  SetLookupBenchmarkNumeric<Int16Type>(state, "is_in_meta_binary", state.range(0),
+                                       kArrayLengthWithSmallSet);
 }
 
 static void IsInInt32SmallSet(benchmark::State& state) {
-  SetLookupBenchmarkNumeric<Int32Type>(state, "is_in_meta_binary", state.range(0));
+  SetLookupBenchmarkNumeric<Int32Type>(state, "is_in_meta_binary", state.range(0),
+                                       kArrayLengthWithSmallSet);
 }
 
 static void IsInInt64SmallSet(benchmark::State& state) {
-  SetLookupBenchmarkNumeric<Int64Type>(state, "is_in_meta_binary", state.range(0));
+  SetLookupBenchmarkNumeric<Int64Type>(state, "is_in_meta_binary", state.range(0),
+                                       kArrayLengthWithSmallSet);
+}
+
+static void IsInInt32LargeSet(benchmark::State& state) {
+  SetLookupBenchmarkNumeric<Int32Type>(state, "is_in_meta_binary", state.range(0),
+                                       kArrayLengthWithLargeSet);
 }
 
 BENCHMARK(IndexInStringSmallSet)->RangeMultiplier(4)->Range(2, 64);
@@ -134,10 +157,12 @@ BENCHMARK(IndexInInt8SmallSet)->RangeMultiplier(4)->Range(2, 8);
 BENCHMARK(IndexInInt16SmallSet)->RangeMultiplier(4)->Range(2, 64);
 BENCHMARK(IndexInInt32SmallSet)->RangeMultiplier(4)->Range(2, 64);
 BENCHMARK(IndexInInt64SmallSet)->RangeMultiplier(4)->Range(2, 64);
+BENCHMARK(IndexInInt32LargeSet)->RangeMultiplier(100)->Range(1000, 1000000);
 BENCHMARK(IsInInt8SmallSet)->RangeMultiplier(4)->Range(2, 8);
 BENCHMARK(IsInInt16SmallSet)->RangeMultiplier(4)->Range(2, 64);
 BENCHMARK(IsInInt32SmallSet)->RangeMultiplier(4)->Range(2, 64);
 BENCHMARK(IsInInt64SmallSet)->RangeMultiplier(4)->Range(2, 64);
+BENCHMARK(IsInInt32LargeSet)->RangeMultiplier(100)->Range(1000, 1000000);
 
 }  // namespace compute
 }  // namespace arrow