You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2022/10/11 02:42:46 UTC
[impala] 02/03: IMPALA-11504: Specializing DecimalUtil::GetScaleMultiplier().

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 6e7cd605fdf3ce95a5993676a87a0209ae671973
Author: yx91490 <yx...@126.com>
AuthorDate: Wed Aug 17 13:08:06 2022 +0800

    IMPALA-11504: Specializing DecimalUtil::GetScaleMultiplier<int256_t>().
    
    Currently decimal-util.h didn't specialize DecimalUtil
    ::GetScaleMultiplier<int256_t>(), causing more performance loss when
    calculate Decimal16Value division.
    
    This template specialisation results in 5600X speedup approximately.
    
    Testing:
    - Ran existing jobs.
    - Add decimal-util-benchmark.
    
    Change-Id: I969e2977d51313e738f72c8246db003ae43a3782
    Reviewed-on: http://gerrit.cloudera.org:8080/18861
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/benchmarks/CMakeLists.txt            |   1 +
 be/src/benchmarks/decimal-util-benchmark.cc |  92 +++++++++++++++++
 be/src/runtime/decimal-test.cc              |  19 ++++
 be/src/util/decimal-util.h                  | 153 ++++++++++++++++++++++------
 4 files changed, 235 insertions(+), 30 deletions(-)

diff --git a/be/src/benchmarks/CMakeLists.txt b/be/src/benchmarks/CMakeLists.txt
index f436cefee..2af5e9fe2 100644
--- a/be/src/benchmarks/CMakeLists.txt
+++ b/be/src/benchmarks/CMakeLists.txt
@@ -38,6 +38,7 @@ ADD_BE_BENCHMARK(bitmap-benchmark)
 ADD_BE_BENCHMARK(bit-packing-benchmark)
 ADD_BE_BENCHMARK(bloom-filter-benchmark)
 ADD_BE_BENCHMARK(bswap-benchmark)
+ADD_BE_BENCHMARK(decimal-util-benchmark)
 ADD_BE_BENCHMARK(expr-benchmark)
 ADD_BE_BENCHMARK(free-lists-benchmark)
 ADD_BE_BENCHMARK(hash-benchmark)
diff --git a/be/src/benchmarks/decimal-util-benchmark.cc b/be/src/benchmarks/decimal-util-benchmark.cc
new file mode 100644
index 000000000..e1395175a
--- /dev/null
+++ b/be/src/benchmarks/decimal-util-benchmark.cc
@@ -0,0 +1,92 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <iostream>
+#include <vector>
+
+#include "util/benchmark.h"
+#include "util/cpu-info.h"
+#include "util/decimal-util.h"
+
+// Summary result (ran within a Docker container):
+//
+// Machine Info: AMD Ryzen 9 5950X 16-Core Processor
+// ScaleMultiplierBenchmark:  Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+//                                                                          (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+//                                 raw              0.852    0.852    0.852         1X         1X         1X
+//                         specialized           4.71e+03 4.76e+03 4.81e+03  5.53e+03X  5.59e+03X  5.64e+03X
+
+using namespace impala;
+
+namespace int256_scale_multiplier {
+
+void AddTestData(vector<int>* data, int n) {
+  for (int i = 0; i < n; i++) {
+    int m = rand() % DecimalUtil::INT256_SCALE_UPPER_BOUND;
+    data->push_back(m);
+  }
+}
+
+/// Copied from Decimal::GetScaleMultiplier
+static int256_t GetScaleMultiplier(int scale) {
+  int256_t result = 1;
+  for (int i = 0; i < scale; ++i) {
+    result *= 10;
+  }
+  return result;
+}
+
+// A volatile variable to hold the return value of the benchmarked functions.
+// Used to prevent the compiler from optimising out the calls.
+static volatile int64_t volatile_var = 0;
+
+static void TestRawTemplateCall(int batch_size, void* d) {
+  vector<int>* data = reinterpret_cast<vector<int>*>(d);
+  for (int i = 0; i < batch_size; i++) {
+    for (int j = 0; j < data->size(); j++) {
+      auto m = GetScaleMultiplier(j);
+      volatile_var = m.convert_to<int64_t>();
+    }
+  }
+}
+
+static void TestSpecializedTemplateCall(int batch_size, void* d) {
+  vector<int>* data = reinterpret_cast<vector<int>*>(d);
+  for (int i = 0; i < batch_size; i++) {
+    for (int j = 0; j < data->size(); j++) {
+      auto m = DecimalUtil::GetScaleMultiplier<int256_t>(j);
+      volatile_var = m.convert_to<int64_t>();
+    }
+  }
+}
+
+} // namespace int256_scale_multiplier
+
+int main(int argc, char** argv) {
+  CpuInfo::Init();
+  std::cout << Benchmark::GetMachineInfo() << std::endl;
+  vector<int> data;
+  int256_scale_multiplier::AddTestData(&data, 1000);
+  Benchmark int256_scale_multiplier_suite("ScaleMultiplierBenchmark");
+  int256_scale_multiplier_suite.AddBenchmark(
+      "raw", int256_scale_multiplier::TestRawTemplateCall, &data);
+  int256_scale_multiplier_suite.AddBenchmark(
+      "specialized", int256_scale_multiplier::TestSpecializedTemplateCall, &data);
+  std::cout << int256_scale_multiplier_suite.Measure() << std::endl;
+  return 0;
+}
diff --git a/be/src/runtime/decimal-test.cc b/be/src/runtime/decimal-test.cc
index c5aed53c1..34624b134 100644
--- a/be/src/runtime/decimal-test.cc
+++ b/be/src/runtime/decimal-test.cc
@@ -1004,5 +1004,24 @@ TEST(DecimalTest, PrecisionScaleValidation) {
   EXPECT_FALSE(ColumnType::ValidateDecimalParams(15, 16));
 }
 
+template <typename T>
+static void TestGetScaleMultiplier(int scale_upper_bound, T overflow_val) {
+  T expect = 1;
+  for (int scale = 0; scale < scale_upper_bound; scale++) {
+    EXPECT_EQ(expect, DecimalUtil::GetScaleMultiplier<T>(scale));
+    expect *= 10;
+  }
+  // test overflow
+  EXPECT_EQ(overflow_val, DecimalUtil::GetScaleMultiplier<T>(scale_upper_bound));
+}
+
+TEST(DecimalTest, GetScaleMultiplier) {
+  TestGetScaleMultiplier<int32_t>(DecimalUtil::INT32_SCALE_UPPER_BOUND, -1);
+  TestGetScaleMultiplier<int64_t>(DecimalUtil::INT64_SCALE_UPPER_BOUND, -1);
+  TestGetScaleMultiplier<int128_t>(DecimalUtil::INT128_SCALE_UPPER_BOUND, -1);
+  TestGetScaleMultiplier<int256_t>(DecimalUtil::INT256_SCALE_UPPER_BOUND, -1);
+  TestGetScaleMultiplier<double>(DecimalUtil::INT64_SCALE_UPPER_BOUND, 1E19);
+}
+
 }
 
diff --git a/be/src/util/decimal-util.h b/be/src/util/decimal-util.h
index 6c1d94f19..b9bb8bf1e 100644
--- a/be/src/util/decimal-util.h
+++ b/be/src/util/decimal-util.h
@@ -34,6 +34,15 @@ namespace impala {
 
 class DecimalUtil {
  public:
+  // The scale's exclusive upper bound for GetScaleMultiplier<int32_t>()
+  static constexpr int INT32_SCALE_UPPER_BOUND = ColumnType::MAX_DECIMAL4_PRECISION + 1;
+  // The scale's exclusive upper bound for GetScaleMultiplier<int64_t>()
+  static constexpr int INT64_SCALE_UPPER_BOUND = ColumnType::MAX_DECIMAL8_PRECISION + 1;
+  // The scale's exclusive upper bound for GetScaleMultiplier<int128_t>()
+  static constexpr int INT128_SCALE_UPPER_BOUND = ColumnType::MAX_PRECISION + 1;
+  // The scale's exclusive upper bound for GetScaleMultiplier<int256_t>()
+  static constexpr int INT256_SCALE_UPPER_BOUND = 77;
+
   // Helper function that checks for multiplication overflow. We only check for overflow
   // if may_overflow is false.
   template <typename T>
@@ -166,7 +175,7 @@ class DecimalUtil {
 template <>
 inline int32_t DecimalUtil::GetScaleMultiplier<int32_t>(int scale) {
   DCHECK_GE(scale, 0);
-  static const int32_t values[] = {
+  static constexpr int32_t values[INT32_SCALE_UPPER_BOUND] = {
       1,
       10,
       100,
@@ -177,15 +186,14 @@ inline int32_t DecimalUtil::GetScaleMultiplier<int32_t>(int scale) {
       10000000,
       100000000,
       1000000000};
-  DCHECK_GE(sizeof(values) / sizeof(int32_t), ColumnType::MAX_DECIMAL4_PRECISION);
-  if (LIKELY(scale < 10)) return values[scale];
+  if (LIKELY(scale < INT32_SCALE_UPPER_BOUND)) return values[scale];
   return -1;  // Overflow
 }
 
 template <>
 inline int64_t DecimalUtil::GetScaleMultiplier<int64_t>(int scale) {
   DCHECK_GE(scale, 0);
-  static const int64_t values[] = {
+  static constexpr int64_t values[INT64_SCALE_UPPER_BOUND] = {
       1ll,
       10ll,
       100ll,
@@ -205,15 +213,15 @@ inline int64_t DecimalUtil::GetScaleMultiplier<int64_t>(int scale) {
       10000000000000000ll,
       100000000000000000ll,
       1000000000000000000ll};
-  DCHECK_GE(sizeof(values) / sizeof(int64_t), ColumnType::MAX_DECIMAL8_PRECISION);
-  if (LIKELY(scale < 19)) return values[scale];
+  if (LIKELY(scale < INT64_SCALE_UPPER_BOUND)) return values[scale];
   return -1;  // Overflow
 }
 
 template <>
 inline int128_t DecimalUtil::GetScaleMultiplier<int128_t>(int scale) {
   DCHECK_GE(scale, 0);
-  static const int128_t values[] = {
+  static constexpr int128_t i10e18{1000000000000000000ll};
+  static constexpr int128_t values[INT128_SCALE_UPPER_BOUND] = {
       static_cast<int128_t>(1ll),
       static_cast<int128_t>(10ll),
       static_cast<int128_t>(100ll),
@@ -232,29 +240,114 @@ inline int128_t DecimalUtil::GetScaleMultiplier<int128_t>(int scale) {
       static_cast<int128_t>(1000000000000000ll),
       static_cast<int128_t>(10000000000000000ll),
       static_cast<int128_t>(100000000000000000ll),
-      static_cast<int128_t>(1000000000000000000ll),
-      static_cast<int128_t>(1000000000000000000ll) * 10ll,
-      static_cast<int128_t>(1000000000000000000ll) * 100ll,
-      static_cast<int128_t>(1000000000000000000ll) * 1000ll,
-      static_cast<int128_t>(1000000000000000000ll) * 10000ll,
-      static_cast<int128_t>(1000000000000000000ll) * 100000ll,
-      static_cast<int128_t>(1000000000000000000ll) * 1000000ll,
-      static_cast<int128_t>(1000000000000000000ll) * 10000000ll,
-      static_cast<int128_t>(1000000000000000000ll) * 100000000ll,
-      static_cast<int128_t>(1000000000000000000ll) * 1000000000ll,
-      static_cast<int128_t>(1000000000000000000ll) * 10000000000ll,
-      static_cast<int128_t>(1000000000000000000ll) * 100000000000ll,
-      static_cast<int128_t>(1000000000000000000ll) * 1000000000000ll,
-      static_cast<int128_t>(1000000000000000000ll) * 10000000000000ll,
-      static_cast<int128_t>(1000000000000000000ll) * 100000000000000ll,
-      static_cast<int128_t>(1000000000000000000ll) * 1000000000000000ll,
-      static_cast<int128_t>(1000000000000000000ll) * 10000000000000000ll,
-      static_cast<int128_t>(1000000000000000000ll) * 100000000000000000ll,
-      static_cast<int128_t>(1000000000000000000ll) * 100000000000000000ll * 10ll,
-      static_cast<int128_t>(1000000000000000000ll) * 100000000000000000ll * 100ll,
-      static_cast<int128_t>(1000000000000000000ll) * 100000000000000000ll * 1000ll};
-  DCHECK_GE(sizeof(values) / sizeof(int128_t), ColumnType::MAX_PRECISION);
-  if (LIKELY(scale < 39)) return values[scale];
+      i10e18,
+      i10e18 * 10ll,
+      i10e18 * 100ll,
+      i10e18 * 1000ll,
+      i10e18 * 10000ll,
+      i10e18 * 100000ll,
+      i10e18 * 1000000ll,
+      i10e18 * 10000000ll,
+      i10e18 * 100000000ll,
+      i10e18 * 1000000000ll,
+      i10e18 * 10000000000ll,
+      i10e18 * 100000000000ll,
+      i10e18 * 1000000000000ll,
+      i10e18 * 10000000000000ll,
+      i10e18 * 100000000000000ll,
+      i10e18 * 1000000000000000ll,
+      i10e18 * 10000000000000000ll,
+      i10e18 * 100000000000000000ll,
+      i10e18 * i10e18,
+      i10e18 * i10e18 * 10ll,
+      i10e18 * i10e18 * 100ll};
+  if (LIKELY(scale < INT128_SCALE_UPPER_BOUND)) return values[scale];
+  return -1;  // Overflow
+}
+
+template <>
+inline int256_t DecimalUtil::GetScaleMultiplier<int256_t>(int scale) {
+  DCHECK_GE(scale, 0);
+  static constexpr int256_t i10e18{1000000000000000000ll};
+  static constexpr int256_t values[INT256_SCALE_UPPER_BOUND] = {
+      static_cast<int256_t>(1ll),
+      static_cast<int256_t>(10ll),
+      static_cast<int256_t>(100ll),
+      static_cast<int256_t>(1000ll),
+      static_cast<int256_t>(10000ll),
+      static_cast<int256_t>(100000ll),
+      static_cast<int256_t>(1000000ll),
+      static_cast<int256_t>(10000000ll),
+      static_cast<int256_t>(100000000ll),
+      static_cast<int256_t>(1000000000ll),
+      static_cast<int256_t>(10000000000ll),
+      static_cast<int256_t>(100000000000ll),
+      static_cast<int256_t>(1000000000000ll),
+      static_cast<int256_t>(10000000000000ll),
+      static_cast<int256_t>(100000000000000ll),
+      static_cast<int256_t>(1000000000000000ll),
+      static_cast<int256_t>(10000000000000000ll),
+      static_cast<int256_t>(100000000000000000ll),
+      i10e18,
+      i10e18 * 10ll,
+      i10e18 * 100ll,
+      i10e18 * 1000ll,
+      i10e18 * 10000ll,
+      i10e18 * 100000ll,
+      i10e18 * 1000000ll,
+      i10e18 * 10000000ll,
+      i10e18 * 100000000ll,
+      i10e18 * 1000000000ll,
+      i10e18 * 10000000000ll,
+      i10e18 * 100000000000ll,
+      i10e18 * 1000000000000ll,
+      i10e18 * 10000000000000ll,
+      i10e18 * 100000000000000ll,
+      i10e18 * 1000000000000000ll,
+      i10e18 * 10000000000000000ll,
+      i10e18 * 100000000000000000ll,
+      i10e18 * i10e18,
+      i10e18 * i10e18 * 10ll,
+      i10e18 * i10e18 * 100ll,
+      i10e18 * i10e18 * 1000ll,
+      i10e18 * i10e18 * 10000ll,
+      i10e18 * i10e18 * 100000ll,
+      i10e18 * i10e18 * 1000000ll,
+      i10e18 * i10e18 * 10000000ll,
+      i10e18 * i10e18 * 100000000ll,
+      i10e18 * i10e18 * 1000000000ll,
+      i10e18 * i10e18 * 10000000000ll,
+      i10e18 * i10e18 * 100000000000ll,
+      i10e18 * i10e18 * 1000000000000ll,
+      i10e18 * i10e18 * 10000000000000ll,
+      i10e18 * i10e18 * 100000000000000ll,
+      i10e18 * i10e18 * 1000000000000000ll,
+      i10e18 * i10e18 * 10000000000000000ll,
+      i10e18 * i10e18 * 100000000000000000ll,
+      i10e18 * i10e18 * i10e18,
+      i10e18 * i10e18 * i10e18 * 10ll,
+      i10e18 * i10e18 * i10e18 * 100ll,
+      i10e18 * i10e18 * i10e18 * 1000ll,
+      i10e18 * i10e18 * i10e18 * 10000ll,
+      i10e18 * i10e18 * i10e18 * 100000ll,
+      i10e18 * i10e18 * i10e18 * 1000000ll,
+      i10e18 * i10e18 * i10e18 * 10000000ll,
+      i10e18 * i10e18 * i10e18 * 100000000ll,
+      i10e18 * i10e18 * i10e18 * 1000000000ll,
+      i10e18 * i10e18 * i10e18 * 10000000000ll,
+      i10e18 * i10e18 * i10e18 * 100000000000ll,
+      i10e18 * i10e18 * i10e18 * 1000000000000ll,
+      i10e18 * i10e18 * i10e18 * 10000000000000ll,
+      i10e18 * i10e18 * i10e18 * 100000000000000ll,
+      i10e18 * i10e18 * i10e18 * 1000000000000000ll,
+      i10e18 * i10e18 * i10e18 * 10000000000000000ll,
+      i10e18 * i10e18 * i10e18 * 100000000000000000ll,
+      i10e18 * i10e18 * i10e18 * i10e18,
+      i10e18 * i10e18 * i10e18 * i10e18 * 10ll,
+      i10e18 * i10e18 * i10e18 * i10e18 * 100ll,
+      i10e18 * i10e18 * i10e18 * i10e18 * 1000ll,
+      i10e18 * i10e18 * i10e18 * i10e18 * 10000ll};
+  if (LIKELY(scale < INT256_SCALE_UPPER_BOUND)) return values[scale];
   return -1;  // Overflow
 }
 }