You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2017/05/08 21:46:12 UTC

[3/7] incubator-impala git commit: IMPALA-5273: Replace StringCompare with glibc memcmp

IMPALA-5273: Replace StringCompare with glibc memcmp

glibc's memcmp, which dispatches dynamically based on the instructions
the processor supports, uses sse4.1's ptest, which is faster than our
implementation.

I ran a the benchmark below. The final query sped up by about 5x with
this patch.

    create table long_strings (s string) stored as parquet;
    insert into long_strings values (repeat("a", 2048));
    insert into long_strings select a.s from long_strings a,
      long_strings b;
    insert into long_strings select a.s from long_strings a,
      long_strings b;
    insert into long_strings select a.s from long_strings a,
      long_strings b;
    insert into long_strings select a.s from long_strings a,
      long_strings b;
    insert into long_strings select a.s from long_strings a,
      long_strings b;
    insert into long_strings select a.s from long_strings a,
      (select * from long_strings limit 10) b;
    select count(*) from long_strings where s <= repeat("a", 2048);

Change-Id: Ie4786a4a75fdaffedd6e17cf076b5368ba4b4e3e
Reviewed-on: http://gerrit.cloudera.org:8080/6768
Reviewed-by: Jim Apple <jb...@apache.org>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/5cab97fd
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/5cab97fd
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/5cab97fd

Branch: refs/heads/master
Commit: 5cab97fd7faced41bbcc2370a400cbe92c3e0128
Parents: aa05c64
Author: Jim Apple <jb...@apache.org>
Authored: Sat Apr 29 15:47:40 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Sat May 6 22:05:04 2017 +0000

----------------------------------------------------------------------
 be/src/benchmarks/string-compare-benchmark.cc | 181 +++++----------------
 be/src/runtime/string-value.inline.h          |  28 +---
 2 files changed, 49 insertions(+), 160 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5cab97fd/be/src/benchmarks/string-compare-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/string-compare-benchmark.cc b/be/src/benchmarks/string-compare-benchmark.cc
index cddaaa5..30bfcb6 100644
--- a/be/src/benchmarks/string-compare-benchmark.cc
+++ b/be/src/benchmarks/string-compare-benchmark.cc
@@ -23,35 +23,43 @@
 #include "util/cpu-info.h"
 #include "util/sse-util.h"
 
+#include "gutil/strings/substitute.h"
+
 #include "common/names.h"
 
 using namespace impala;
 
 // Machine Info: Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
 //
-// Long strings (10000):      Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+// Length 1:                  Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+//                                                                          (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+//                       StringCompare           2.19e+05 2.32e+05 2.36e+05         1X         1X         1X
+//                             strncmp           3.68e+05 3.83e+05  3.9e+05      1.68X      1.65X      1.65X
+//                              memcmp           3.88e+05 4.01e+05 4.05e+05      1.77X      1.73X      1.72X
+//
+// Length 10:                 Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
 //                                                                          (relative) (relative) (relative)
 // ---------------------------------------------------------------------------------------------------------
-//                            Original                 85     86.5     86.5         1X         1X         1X
-//                  Simplified, broken               76.6       78       78     0.901X     0.901X     0.901X
-//                   Simplified, fixed               95.8     97.5     97.5      1.13X      1.13X      1.13X
+//                       StringCompare           1.86e+05 1.89e+05 1.92e+05         1X         1X         1X
+//                             strncmp           2.76e+05 2.78e+05  2.8e+05      1.48X      1.47X      1.46X
+//                              memcmp           3.24e+05 3.27e+05  3.3e+05      1.75X      1.73X      1.72X
 //
-// Med strings (100):         Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+// Length 100:                Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
 //                                                                          (relative) (relative) (relative)
 // ---------------------------------------------------------------------------------------------------------
-//                            Original           6.55e+03 6.66e+03 6.74e+03         1X         1X         1X
-//                  Simplified, broken           6.25e+03 6.32e+03 6.38e+03     0.955X     0.949X     0.947X
-//                   Simplified, fixed           7.38e+03 7.49e+03 7.55e+03      1.13X      1.12X      1.12X
+//                       StringCompare            4.9e+04 4.95e+04    5e+04         1X         1X         1X
+//                             strncmp            9.5e+04 9.65e+04 9.77e+04      1.94X      1.95X      1.95X
+//                              memcmp           1.69e+05 1.72e+05 1.74e+05      3.46X      3.47X      3.47X
 //
-// Short strings (10):        Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+// Length 10000:              Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
 //                                                                          (relative) (relative) (relative)
 // ---------------------------------------------------------------------------------------------------------
-//                            Original           1.59e+04 1.62e+04 1.63e+04         1X         1X         1X
-//                  Simplified, broken            2.8e+04 2.85e+04 2.87e+04      1.76X      1.76X      1.76X
-//                   Simplified, fixed           2.92e+04 2.96e+04 2.99e+04      1.83X      1.83X      1.84X
+//                       StringCompare                640      642      643         1X         1X         1X
+//                             strncmp           2.17e+03 2.18e+03  2.2e+03      3.39X       3.4X      3.42X
+//                              memcmp           4.62e+03 4.64e+03 4.69e+03      7.22X      7.23X      7.29X
 
-// Original
-int StringCompare1(const char* s1, int n1, const char* s2, int n2, int len) {
+int StringCompare(const char* s1, int n1, const char* s2, int n2, int len) {
   DCHECK_EQ(len, std::min(n1, n2));
   if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) {
     while (len >= SSEUtil::CHARS_PER_128_BIT_REGISTER) {
@@ -66,71 +74,18 @@ int StringCompare1(const char* s1, int n1, const char* s2, int n2, int len) {
       s1 += SSEUtil::CHARS_PER_128_BIT_REGISTER;
       s2 += SSEUtil::CHARS_PER_128_BIT_REGISTER;
     }
-    if (len >= SSEUtil::CHARS_PER_64_BIT_REGISTER) {
-      // Load 64 bits at a time, the upper 64 bits of the xmm register is set to 0
-      __m128i xmm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(s1));
-      __m128i xmm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(s2));
-      // The upper bits always match (always 0), hence the comparison to
-      // CHAR_PER_128_REGISTER
-      int chars_match = SSE4_cmpestri<SSEUtil::STRCMP_MODE>(xmm0,
-          SSEUtil::CHARS_PER_128_BIT_REGISTER, xmm1, SSEUtil::CHARS_PER_128_BIT_REGISTER);
-      if (chars_match != SSEUtil::CHARS_PER_128_BIT_REGISTER) {
-        return s1[chars_match] - s2[chars_match];
-      }
-      len -= SSEUtil::CHARS_PER_64_BIT_REGISTER;
-      s1 += SSEUtil::CHARS_PER_64_BIT_REGISTER;
-      s2 += SSEUtil::CHARS_PER_64_BIT_REGISTER;
-    }
   }
-  // TODO: for some reason memcmp is way slower than strncmp (2.5x)  why?
-  int result = strncmp(s1, s2, len);
+  // memcmp has undefined behavior when called on nullptr for either pointer
+  int result = (len == 0) ? 0 : strncmp(s1, s2, len);
   if (result != 0) return result;
   return n1 - n2;
 }
 
-// Simplified but broken (can't safely load s1 and s2)
-int StringCompare2(const char* s1, int n1, const char* s2, int n2, int len) {
+template<typename T, int COMPARE(const T*, const T*, size_t)>
+int SimpleCompare(const char* s1, int n1, const char* s2, int n2, int len) {
   DCHECK_EQ(len, std::min(n1, n2));
-  if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) {
-    while (len > 0) {
-      __m128i xmm0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s1));
-      __m128i xmm1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s2));
-      int n = std::min(len, 16);
-      int chars_match = SSE4_cmpestri<SSEUtil::STRCMP_MODE>(xmm0, n, xmm1, n);
-      if (chars_match != SSEUtil::CHARS_PER_128_BIT_REGISTER) {
-        return s1[chars_match] - s2[chars_match];
-      }
-      len -= SSEUtil::CHARS_PER_128_BIT_REGISTER;
-      s1 += SSEUtil::CHARS_PER_128_BIT_REGISTER;
-      s2 += SSEUtil::CHARS_PER_128_BIT_REGISTER;
-    }
-    return n1 - n2;
-  }
-  // TODO: for some reason memcmp is way slower than strncmp (2.5x)  why?
-  int result = strncmp(s1, s2, len);
-  if (result != 0) return result;
-  return n1 - n2;
-}
-
-// Simplified and not broken
-int StringCompare3(const char* s1, int n1, const char* s2, int n2, int len) {
-  DCHECK_EQ(len, std::min(n1, n2));
-  if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) {
-    while (len >= SSEUtil::CHARS_PER_128_BIT_REGISTER) {
-      __m128i xmm0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s1));
-      __m128i xmm1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s2));
-      int chars_match = SSE4_cmpestri<SSEUtil::STRCMP_MODE>(xmm0,
-          SSEUtil::CHARS_PER_128_BIT_REGISTER, xmm1, SSEUtil::CHARS_PER_128_BIT_REGISTER);
-      if (chars_match != SSEUtil::CHARS_PER_128_BIT_REGISTER) {
-        return s1[chars_match] - s2[chars_match];
-      }
-      len -= SSEUtil::CHARS_PER_128_BIT_REGISTER;
-      s1 += SSEUtil::CHARS_PER_128_BIT_REGISTER;
-      s2 += SSEUtil::CHARS_PER_128_BIT_REGISTER;
-    }
-  }
-  // TODO: for some reason memcmp is way slower than strncmp (2.5x)  why?
-  int result = strncmp(s1, s2, len);
+  // memcmp has undefined behavior when called on nullptr for either pointer
+  const int result = (len == 0) ? 0 : COMPARE(s1, s2, len);
   if (result != 0) return result;
   return n1 - n2;
 }
@@ -143,51 +98,13 @@ struct TestData {
   int result;
 };
 
-void TestStringCompare1(int batch_size, void* d) {
+template<int (*STRING_COMPARE)(const char* s1, int n1, const char* s2, int n2, int len)>
+void TestStringCompare(int batch_size, void* d) {
   TestData* data = reinterpret_cast<TestData*>(d);
-  int len = std::min(data->n1, data->n2);
+  const int len = std::min(data->n1, data->n2);
+  data->result = 0;
   for (int i = 0; i < batch_size; ++i) {
-    data->result = StringCompare1(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare1(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare1(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare1(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare1(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare1(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare1(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare1(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare1(data->s1, data->n1, data->s2, data->n2, len);
-  }
-}
-
-void TestStringCompare2(int batch_size, void* d) {
-  TestData* data = reinterpret_cast<TestData*>(d);
-  int len = std::min(data->n1, data->n2);
-  for (int i = 0; i < batch_size; ++i) {
-    data->result = StringCompare2(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare2(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare2(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare2(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare2(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare2(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare2(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare2(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare2(data->s1, data->n1, data->s2, data->n2, len);
-  }
-}
-
-void TestStringCompare3(int batch_size, void* d) {
-  TestData* data = reinterpret_cast<TestData*>(d);
-  int len = std::min(data->n1, data->n2);
-  for (int i = 0; i < batch_size; ++i) {
-    data->result = StringCompare3(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare3(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare3(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare3(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare3(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare3(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare3(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare3(data->s1, data->n1, data->s2, data->n2, len);
-    data->result = StringCompare3(data->s1, data->n1, data->s2, data->n2, len);
+    data->result += STRING_COMPARE(data->s1, data->n1, data->s2, data->n2, len);
   }
 }
 
@@ -202,30 +119,22 @@ TestData InitTestData(int len) {
   return data;
 }
 
+void BenchmarkAll(int len) {
+  Benchmark suite(Substitute("Length $0", len));
+  TestData data = InitTestData(len);
+  suite.AddBenchmark("StringCompare", TestStringCompare<StringCompare>, &data);
+  suite.AddBenchmark("strncmp", TestStringCompare<SimpleCompare<char, strncmp>>, &data);
+  suite.AddBenchmark("memcmp", TestStringCompare<SimpleCompare<void, memcmp>>, &data);
+  cout << suite.Measure() << endl;
+}
+
 int main(int argc, char **argv) {
   CpuInfo::Init();
   cout << Benchmark::GetMachineInfo() << endl << endl;
 
-  Benchmark long_suite("Long strings (10000)");
-  TestData long_data = InitTestData(10000);
-  long_suite.AddBenchmark("Original", TestStringCompare1, &long_data);
-  long_suite.AddBenchmark("Simplified, broken", TestStringCompare2, &long_data);
-  long_suite.AddBenchmark("Simplified, fixed", TestStringCompare3, &long_data);
-  cout << long_suite.Measure() << endl;
-
-  Benchmark med_suite("Med strings (100)");
-  TestData med_data = InitTestData(100);
-  med_suite.AddBenchmark("Original", TestStringCompare1, &med_data);
-  med_suite.AddBenchmark("Simplified, broken", TestStringCompare2, &med_data);
-  med_suite.AddBenchmark("Simplified, fixed", TestStringCompare3, &med_data);
-  cout << med_suite.Measure() << endl;
-
-  Benchmark short_suite("Short strings (10)");
-  TestData short_data = InitTestData(10);
-  short_suite.AddBenchmark("Original", TestStringCompare1, &short_data);
-  short_suite.AddBenchmark("Simplified, broken", TestStringCompare2, &short_data);
-  short_suite.AddBenchmark("Simplified, fixed", TestStringCompare3, &short_data);
-  cout << short_suite.Measure() << endl;
+  for (int len : {1, 10, 100, 10000}) {
+    BenchmarkAll(len);
+  }
 
   return 0;
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5cab97fd/be/src/runtime/string-value.inline.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/string-value.inline.h b/be/src/runtime/string-value.inline.h
index 073d01f..e6db93f 100644
--- a/be/src/runtime/string-value.inline.h
+++ b/be/src/runtime/string-value.inline.h
@@ -27,37 +27,17 @@
 
 namespace impala {
 
-/// Compare two strings using sse4.2 intrinsics if they are available. This code assumes
-/// that the trivial cases are already handled (i.e. one string is empty).
-/// Returns:
+/// Compare two strings. Returns:
 ///   < 0 if s1 < s2
 ///   0 if s1 == s2
 ///   > 0 if s1 > s2
-/// The SSE code path is just under 2x faster than the non-sse code path.
+///
 ///   - s1/n1: ptr/len for the first string
 ///   - s2/n2: ptr/len for the second string
 ///   - len: min(n1, n2) - this can be more cheaply passed in by the caller
 static inline int StringCompare(const char* s1, int n1, const char* s2, int n2, int len) {
-  DCHECK_EQ(len, std::min(n1, n2));
-  if (CpuInfo::IsSupported(CpuInfo::SSE4_2)) {
-    while (len >= SSEUtil::CHARS_PER_128_BIT_REGISTER) {
-      __m128i xmm0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s1));
-      __m128i xmm1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s2));
-      int chars_match = SSE4_cmpestri<SSEUtil::STRCMP_MODE>(xmm0,
-          SSEUtil::CHARS_PER_128_BIT_REGISTER, xmm1,
-          SSEUtil::CHARS_PER_128_BIT_REGISTER);
-      if (chars_match != SSEUtil::CHARS_PER_128_BIT_REGISTER) {
-        // Match strncmp() behavior, which interprets characters as unsigned char.
-        return static_cast<unsigned char>(s1[chars_match]) -
-            static_cast<unsigned char>(s2[chars_match]);
-      }
-      len -= SSEUtil::CHARS_PER_128_BIT_REGISTER;
-      s1 += SSEUtil::CHARS_PER_128_BIT_REGISTER;
-      s2 += SSEUtil::CHARS_PER_128_BIT_REGISTER;
-    }
-  }
-  // TODO: for some reason memcmp is way slower than strncmp (2.5x)  why?
-  int result = strncmp(s1, s2, len);
+  // memcmp has undefined behavior when called on nullptr for either pointer
+  const int result = (len == 0) ? 0 : memcmp(s1, s2, len);
   if (result != 0) return result;
   return n1 - n2;
 }