You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2017/05/08 21:46:12 UTC
[3/7] incubator-impala git commit: IMPALA-5273: Replace StringCompare
with glibc memcmp
IMPALA-5273: Replace StringCompare with glibc memcmp
glibc's memcmp, which dispatches dynamically based on the instructions
the processor supports, uses sse4.1's ptest, which is faster than our
implementation.
I ran a the benchmark below. The final query sped up by about 5x with
this patch.
create table long_strings (s string) stored as parquet;
insert into long_strings values (repeat("a", 2048));
insert into long_strings select a.s from long_strings a,
long_strings b;
insert into long_strings select a.s from long_strings a,
long_strings b;
insert into long_strings select a.s from long_strings a,
long_strings b;
insert into long_strings select a.s from long_strings a,
long_strings b;
insert into long_strings select a.s from long_strings a,
long_strings b;
insert into long_strings select a.s from long_strings a,
(select * from long_strings limit 10) b;
select count(*) from long_strings where s <= repeat("a", 2048);
Change-Id: Ie4786a4a75fdaffedd6e17cf076b5368ba4b4e3e
Reviewed-on: http://gerrit.cloudera.org:8080/6768
Reviewed-by: Jim Apple <jb...@apache.org>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/5cab97fd
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/5cab97fd
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/5cab97fd
Branch: refs/heads/master
Commit: 5cab97fd7faced41bbcc2370a400cbe92c3e0128
Parents: aa05c64
Author: Jim Apple <jb...@apache.org>
Authored: Sat Apr 29 15:47:40 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Sat May 6 22:05:04 2017 +0000
----------------------------------------------------------------------
be/src/benchmarks/string-compare-benchmark.cc | 181 +++++----------------
be/src/runtime/string-value.inline.h | 28 +---
2 files changed, 49 insertions(+), 160 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5cab97fd/be/src/benchmarks/string-compare-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/string-compare-benchmark.cc b/be/src/benchmarks/string-compare-benchmark.cc
index cddaaa5..30bfcb6 100644
--- a/be/src/benchmarks/string-compare-benchmark.cc
+++ b/be/src/benchmarks/string-compare-benchmark.cc
@@ -23,35 +23,43 @@
#include "util/cpu-info.h"
#include "util/sse-util.h"
+#include "gutil/strings/substitute.h"
+
#include "common/names.h"
using namespace impala;
// Machine Info: Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
//
-// Long strings (10000): Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
+// Length 1: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
+// (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+// StringCompare 2.19e+05 2.32e+05 2.36e+05 1X 1X 1X
+// strncmp 3.68e+05 3.83e+05 3.9e+05 1.68X 1.65X 1.65X
+// memcmp 3.88e+05 4.01e+05 4.05e+05 1.77X 1.73X 1.72X
+//
+// Length 10: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
// (relative) (relative) (relative)
// ---------------------------------------------------------------------------------------------------------
-// Original 85 86.5 86.5 1X 1X 1X
-// Simplified, broken 76.6 78 78 0.901X 0.901X 0.901X
-// Simplified, fixed 95.8 97.5 97.5 1.13X 1.13X 1.13X
+// StringCompare 1.86e+05 1.89e+05 1.92e+05 1X 1X 1X
+// strncmp 2.76e+05 2.78e+05 2.8e+05 1.48X 1.47X 1.46X
+// memcmp 3.24e+05 3.27e+05 3.3e+05 1.75X 1.73X 1.72X
//
-// Med strings (100): Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
+// Length 100: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
// (relative) (relative) (relative)
// ---------------------------------------------------------------------------------------------------------
-// Original 6.55e+03 6.66e+03 6.74e+03 1X 1X 1X
-// Simplified, broken 6.25e+03 6.32e+03 6.38e+03 0.955X 0.949X 0.947X
-// Simplified, fixed 7.38e+03 7.49e+03 7.55e+03 1.13X 1.12X 1.12X
+// StringCompare 4.9e+04 4.95e+04 5e+04 1X 1X 1X
+// strncmp 9.5e+04 9.65e+04 9.77e+04 1.94X 1.95X 1.95X
+// memcmp 1.69e+05 1.72e+05 1.74e+05 3.46X 3.47X 3.47X
//
-// Short strings (10): Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
+// Length 10000: Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
// (relative) (relative) (relative)
// ---------------------------------------------------------------------------------------------------------
-// Original 1.59e+04 1.62e+04 1.63e+04 1X 1X 1X
-// Simplified, broken 2.8e+04 2.85e+04 2.87e+04 1.76X 1.76X 1.76X
-// Simplified, fixed 2.92e+04 2.96e+04 2.99e+04 1.83X 1.83X 1.84X
+// StringCompare 640 642 643 1X 1X 1X
+// strncmp 2.17e+03 2.18e+03 2.2e+03 3.39X 3.4X 3.42X
+// memcmp 4.62e+03 4.64e+03 4.69e+03 7.22X 7.23X 7.29X
-// Original
-int StringCompare1(const char* s1, int n1, const char* s2, int n2, int len) {
+int StringCompare(const char* s1, int n1, const char* s2, int n2, int len) {
DCHECK_EQ(len, std::min(n1, n2));
if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) {
while (len >= SSEUtil::CHARS_PER_128_BIT_REGISTER) {
@@ -66,71 +74,18 @@ int StringCompare1(const char* s1, int n1, const char* s2, int n2, int len) {
s1 += SSEUtil::CHARS_PER_128_BIT_REGISTER;
s2 += SSEUtil::CHARS_PER_128_BIT_REGISTER;
}
- if (len >= SSEUtil::CHARS_PER_64_BIT_REGISTER) {
- // Load 64 bits at a time, the upper 64 bits of the xmm register is set to 0
- __m128i xmm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(s1));
- __m128i xmm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(s2));
- // The upper bits always match (always 0), hence the comparison to
- // CHAR_PER_128_REGISTER
- int chars_match = SSE4_cmpestri<SSEUtil::STRCMP_MODE>(xmm0,
- SSEUtil::CHARS_PER_128_BIT_REGISTER, xmm1, SSEUtil::CHARS_PER_128_BIT_REGISTER);
- if (chars_match != SSEUtil::CHARS_PER_128_BIT_REGISTER) {
- return s1[chars_match] - s2[chars_match];
- }
- len -= SSEUtil::CHARS_PER_64_BIT_REGISTER;
- s1 += SSEUtil::CHARS_PER_64_BIT_REGISTER;
- s2 += SSEUtil::CHARS_PER_64_BIT_REGISTER;
- }
}
- // TODO: for some reason memcmp is way slower than strncmp (2.5x) why?
- int result = strncmp(s1, s2, len);
+ // memcmp has undefined behavior when called on nullptr for either pointer
+ int result = (len == 0) ? 0 : strncmp(s1, s2, len);
if (result != 0) return result;
return n1 - n2;
}
-// Simplified but broken (can't safely load s1 and s2)
-int StringCompare2(const char* s1, int n1, const char* s2, int n2, int len) {
+template<typename T, int COMPARE(const T*, const T*, size_t)>
+int SimpleCompare(const char* s1, int n1, const char* s2, int n2, int len) {
DCHECK_EQ(len, std::min(n1, n2));
- if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) {
- while (len > 0) {
- __m128i xmm0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s1));
- __m128i xmm1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s2));
- int n = std::min(len, 16);
- int chars_match = SSE4_cmpestri<SSEUtil::STRCMP_MODE>(xmm0, n, xmm1, n);
- if (chars_match != SSEUtil::CHARS_PER_128_BIT_REGISTER) {
- return s1[chars_match] - s2[chars_match];
- }
- len -= SSEUtil::CHARS_PER_128_BIT_REGISTER;
- s1 += SSEUtil::CHARS_PER_128_BIT_REGISTER;
- s2 += SSEUtil::CHARS_PER_128_BIT_REGISTER;
- }
- return n1 - n2;
- }
- // TODO: for some reason memcmp is way slower than strncmp (2.5x) why?
- int result = strncmp(s1, s2, len);
- if (result != 0) return result;
- return n1 - n2;
-}
-
-// Simplified and not broken
-int StringCompare3(const char* s1, int n1, const char* s2, int n2, int len) {
- DCHECK_EQ(len, std::min(n1, n2));
- if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) {
- while (len >= SSEUtil::CHARS_PER_128_BIT_REGISTER) {
- __m128i xmm0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s1));
- __m128i xmm1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s2));
- int chars_match = SSE4_cmpestri<SSEUtil::STRCMP_MODE>(xmm0,
- SSEUtil::CHARS_PER_128_BIT_REGISTER, xmm1, SSEUtil::CHARS_PER_128_BIT_REGISTER);
- if (chars_match != SSEUtil::CHARS_PER_128_BIT_REGISTER) {
- return s1[chars_match] - s2[chars_match];
- }
- len -= SSEUtil::CHARS_PER_128_BIT_REGISTER;
- s1 += SSEUtil::CHARS_PER_128_BIT_REGISTER;
- s2 += SSEUtil::CHARS_PER_128_BIT_REGISTER;
- }
- }
- // TODO: for some reason memcmp is way slower than strncmp (2.5x) why?
- int result = strncmp(s1, s2, len);
+ // memcmp has undefined behavior when called on nullptr for either pointer
+ const int result = (len == 0) ? 0 : COMPARE(s1, s2, len);
if (result != 0) return result;
return n1 - n2;
}
@@ -143,51 +98,13 @@ struct TestData {
int result;
};
-void TestStringCompare1(int batch_size, void* d) {
+template<int (*STRING_COMPARE)(const char* s1, int n1, const char* s2, int n2, int len)>
+void TestStringCompare(int batch_size, void* d) {
TestData* data = reinterpret_cast<TestData*>(d);
- int len = std::min(data->n1, data->n2);
+ const int len = std::min(data->n1, data->n2);
+ data->result = 0;
for (int i = 0; i < batch_size; ++i) {
- data->result = StringCompare1(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare1(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare1(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare1(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare1(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare1(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare1(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare1(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare1(data->s1, data->n1, data->s2, data->n2, len);
- }
-}
-
-void TestStringCompare2(int batch_size, void* d) {
- TestData* data = reinterpret_cast<TestData*>(d);
- int len = std::min(data->n1, data->n2);
- for (int i = 0; i < batch_size; ++i) {
- data->result = StringCompare2(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare2(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare2(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare2(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare2(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare2(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare2(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare2(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare2(data->s1, data->n1, data->s2, data->n2, len);
- }
-}
-
-void TestStringCompare3(int batch_size, void* d) {
- TestData* data = reinterpret_cast<TestData*>(d);
- int len = std::min(data->n1, data->n2);
- for (int i = 0; i < batch_size; ++i) {
- data->result = StringCompare3(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare3(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare3(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare3(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare3(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare3(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare3(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare3(data->s1, data->n1, data->s2, data->n2, len);
- data->result = StringCompare3(data->s1, data->n1, data->s2, data->n2, len);
+ data->result += STRING_COMPARE(data->s1, data->n1, data->s2, data->n2, len);
}
}
@@ -202,30 +119,22 @@ TestData InitTestData(int len) {
return data;
}
+void BenchmarkAll(int len) {
+ Benchmark suite(Substitute("Length $0", len));
+ TestData data = InitTestData(len);
+ suite.AddBenchmark("StringCompare", TestStringCompare<StringCompare>, &data);
+ suite.AddBenchmark("strncmp", TestStringCompare<SimpleCompare<char, strncmp>>, &data);
+ suite.AddBenchmark("memcmp", TestStringCompare<SimpleCompare<void, memcmp>>, &data);
+ cout << suite.Measure() << endl;
+}
+
int main(int argc, char **argv) {
CpuInfo::Init();
cout << Benchmark::GetMachineInfo() << endl << endl;
- Benchmark long_suite("Long strings (10000)");
- TestData long_data = InitTestData(10000);
- long_suite.AddBenchmark("Original", TestStringCompare1, &long_data);
- long_suite.AddBenchmark("Simplified, broken", TestStringCompare2, &long_data);
- long_suite.AddBenchmark("Simplified, fixed", TestStringCompare3, &long_data);
- cout << long_suite.Measure() << endl;
-
- Benchmark med_suite("Med strings (100)");
- TestData med_data = InitTestData(100);
- med_suite.AddBenchmark("Original", TestStringCompare1, &med_data);
- med_suite.AddBenchmark("Simplified, broken", TestStringCompare2, &med_data);
- med_suite.AddBenchmark("Simplified, fixed", TestStringCompare3, &med_data);
- cout << med_suite.Measure() << endl;
-
- Benchmark short_suite("Short strings (10)");
- TestData short_data = InitTestData(10);
- short_suite.AddBenchmark("Original", TestStringCompare1, &short_data);
- short_suite.AddBenchmark("Simplified, broken", TestStringCompare2, &short_data);
- short_suite.AddBenchmark("Simplified, fixed", TestStringCompare3, &short_data);
- cout << short_suite.Measure() << endl;
+ for (int len : {1, 10, 100, 10000}) {
+ BenchmarkAll(len);
+ }
return 0;
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5cab97fd/be/src/runtime/string-value.inline.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/string-value.inline.h b/be/src/runtime/string-value.inline.h
index 073d01f..e6db93f 100644
--- a/be/src/runtime/string-value.inline.h
+++ b/be/src/runtime/string-value.inline.h
@@ -27,37 +27,17 @@
namespace impala {
-/// Compare two strings using sse4.2 intrinsics if they are available. This code assumes
-/// that the trivial cases are already handled (i.e. one string is empty).
-/// Returns:
+/// Compare two strings. Returns:
/// < 0 if s1 < s2
/// 0 if s1 == s2
/// > 0 if s1 > s2
-/// The SSE code path is just under 2x faster than the non-sse code path.
+///
/// - s1/n1: ptr/len for the first string
/// - s2/n2: ptr/len for the second string
/// - len: min(n1, n2) - this can be more cheaply passed in by the caller
static inline int StringCompare(const char* s1, int n1, const char* s2, int n2, int len) {
- DCHECK_EQ(len, std::min(n1, n2));
- if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) {
- while (len >= SSEUtil::CHARS_PER_128_BIT_REGISTER) {
- __m128i xmm0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s1));
- __m128i xmm1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s2));
- int chars_match = SSE4_cmpestri<SSEUtil::STRCMP_MODE>(xmm0,
- SSEUtil::CHARS_PER_128_BIT_REGISTER, xmm1,
- SSEUtil::CHARS_PER_128_BIT_REGISTER);
- if (chars_match != SSEUtil::CHARS_PER_128_BIT_REGISTER) {
- // Match strncmp() behavior, which interprets characters as unsigned char.
- return static_cast<unsigned char>(s1[chars_match]) -
- static_cast<unsigned char>(s2[chars_match]);
- }
- len -= SSEUtil::CHARS_PER_128_BIT_REGISTER;
- s1 += SSEUtil::CHARS_PER_128_BIT_REGISTER;
- s2 += SSEUtil::CHARS_PER_128_BIT_REGISTER;
- }
- }
- // TODO: for some reason memcmp is way slower than strncmp (2.5x) why?
- int result = strncmp(s1, s2, len);
+ // memcmp has undefined behavior when called on nullptr for either pointer
+ const int result = (len == 0) ? 0 : memcmp(s1, s2, len);
if (result != 0) return result;
return n1 - n2;
}