You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2016/08/25 21:40:36 UTC

incubator-impala git commit: IMPALA-2809: Improve ByteSwap with builtin function or SSSE3 or AVX2.

Repository: incubator-impala
Updated Branches:
  refs/heads/master 19a2dcfbe -> f7bf59f5c


IMPALA-2809: Improve ByteSwap with builtin function or SSSE3 or AVX2.

Using SSSE3/AVX2 intrinsic to accelerate the function
"static inline void ByteSwap(void* dst, const void* src, int len)" of BitUtil class,
and a scalar byte-swap routine is added as fallback.
Also the runtime selector for CPUs of different capacity is included,
as well as performance test and data verification.
Brief performance comparison is listed here:
CPU: Intel(R) Core(TM) i5-4460  CPU@3.20GHz
Result:
I0725 20:47:02.402506  2078 bswap-benchmark.cc:117] Machine Info: Intel(R) Core(TM) i5-4460  CPU @ 3.20GHz
ByteSwap benchmark:        Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
                                                                         (relative) (relative) (relative)
---------------------------------------------------------------------------------------------------------
                         FastScalar                675      725      731         1X         1X         1X
                              SSSE3           6.12e+03  6.2e+03 6.23e+03      9.06X      8.55X      8.53X
                               AVX2           1.87e+04 1.88e+04 1.89e+04      27.7X      25.9X      25.9X
                               SIMD           1.82e+04 1.88e+04 1.89e+04        27X      25.9X      25.9X

Change-Id: I392ed5a8d5683f30f161282c228c1aedd7b648c1
Reviewed-on: http://gerrit.cloudera.org:8080/4124
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/f7bf59f5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/f7bf59f5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/f7bf59f5

Branch: refs/heads/master
Commit: f7bf59f5c4db6d7b897fcf159a98913de2da8122
Parents: 19a2dcf
Author: Hayabusa-intel <42...@qq.com>
Authored: Tue Aug 23 09:47:30 2016 +0800
Committer: Internal Jenkins <cl...@gerrit.cloudera.org>
Committed: Thu Aug 25 21:33:00 2016 +0000

----------------------------------------------------------------------
 be/src/benchmarks/CMakeLists.txt     |   1 +
 be/src/benchmarks/bswap-benchmark.cc | 120 +++++++++++++++++++++
 be/src/exprs/string-functions-ir.cc  |   2 +-
 be/src/util/bit-util-test.cc         | 109 +++++++++++++++----
 be/src/util/bit-util.cc              | 170 ++++++++++++++++++++++++------
 be/src/util/bit-util.h               |  27 +++++
 6 files changed, 378 insertions(+), 51 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f7bf59f5/be/src/benchmarks/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/CMakeLists.txt b/be/src/benchmarks/CMakeLists.txt
index 1793230..0a28dce 100644
--- a/be/src/benchmarks/CMakeLists.txt
+++ b/be/src/benchmarks/CMakeLists.txt
@@ -49,5 +49,6 @@ ADD_BE_BENCHMARK(expr-benchmark)
 ADD_BE_BENCHMARK(hash-benchmark)
 ADD_BE_BENCHMARK(in-predicate-benchmark)
 ADD_BE_BENCHMARK(network-perf-benchmark)
+ADD_BE_BENCHMARK(bswap-benchmark)
 
 target_link_libraries(hash-benchmark Experiments)

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f7bf59f5/be/src/benchmarks/bswap-benchmark.cc
----------------------------------------------------------------------
diff --git a/be/src/benchmarks/bswap-benchmark.cc b/be/src/benchmarks/bswap-benchmark.cc
new file mode 100644
index 0000000..a3bda80
--- /dev/null
+++ b/be/src/benchmarks/bswap-benchmark.cc
@@ -0,0 +1,120 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <iostream>
+#include <algorithm>
+#include <stdlib.h>
+#include <immintrin.h>
+
+#include "exec/parquet-common.h"
+#include "runtime/decimal-value.h"
+#include "util/benchmark.h"
+#include "util/cpu-info.h"
+#include "util/bit-util.h"
+
+#include "common/names.h"
+
+using std::numeric_limits;
+using namespace impala;
+
+// This benchmark is to compare the performance for all available byteswap approaches:
+// 1. OldImpala: use the old Impala routine to byte-swap the input array.
+// Corresponding performance is used as the baseline.
+// 2. FastScalar: use the ByteSwapScalar routine in bit-util.inline.h to byte-swap
+// the input array with subdivided byte sizes, which is proposed by Zuo Wang.
+// 3. SSSE3: use the SSSE3 SIMD routine to byte-swap the input array
+// without arch-selector branches;
+// 4. AVX2: use the AVX2 SIMD routine to byte-swap the input array
+// without arch-selector branches;
+// 5. SIMD: use the comprehensive SIMD routine to byte-swap the input array
+// with arch-selector branches;
+// Result:
+// I0725 20:47:02.402506  2078 bswap-benchmark.cc:117] Machine Info: Intel(R) Core(TM) i5-4460  CPU @ 3.20GHz
+// ByteSwap benchmark:        Function  iters/ms   10%ile   50%ile   90%ile     10%ile     50%ile     90%ile
+//                                                                          (relative) (relative) (relative)
+// ---------------------------------------------------------------------------------------------------------
+//                          FastScalar                675      725      731         1X         1X         1X
+//                               SSSE3           6.12e+03  6.2e+03 6.23e+03      9.06X      8.55X      8.53X
+//                                AVX2           1.87e+04 1.88e+04 1.89e+04      27.7X      25.9X      25.9X
+//                                SIMD           1.82e+04 1.88e+04 1.89e+04        27X      25.9X      25.9X
+
+
+// Data structure used in the benchmark;
+struct TestData {
+  int32_t num_values;
+  uint8_t* inbuffer;
+  uint8_t* outbuffer;
+};
+
+// Initialization routine for benchmark data;
+void InitData(uint8_t* input, const int len) {
+  srand(time(NULL));
+  for (int i = 0; i < len; ++i) {
+    input[i] = rand() % 256;
+  }
+}
+
+// Test for the scalar approach;
+void TestFastScalarSwap(int batch_size, void* d) {
+  TestData* data = reinterpret_cast<TestData*>(d);
+  SimdByteSwap::ByteSwapScalar(data->inbuffer, data->num_values, data->outbuffer);
+}
+
+// Test for the SSSE3 subroutine;
+void TestSSSE3Swap(int batch_size, void* d) {
+  TestData* data = reinterpret_cast<TestData*>(d);
+  SimdByteSwap::ByteSwapSimd<16>(data->inbuffer, data->num_values, data->outbuffer);
+}
+
+// Test for the AVX2 subroutine;
+void TestAVX2Swap(int batch_size, void* d) {
+  TestData* data = reinterpret_cast<TestData*>(d);
+  SimdByteSwap::ByteSwapSimd<32>(data->inbuffer, data->num_values, data->outbuffer);
+}
+
+// Test for the SIMD approach in a general way;
+void TestSIMDSwap(int batch_size, void* d) {
+  TestData* data = reinterpret_cast<TestData*>(d);
+  BitUtil::ByteSwap(data->outbuffer, data->inbuffer, data->num_values);
+}
+
+// Benchmark routine for FastScalar/"Pure" SSSE3/"Pure" AVX2/SIMD approaches
+void PerfBenchmark() {
+  const int data_len = 1 << 20;
+  Benchmark suite("ByteSwap benchmark");
+  vector<uint8_t> inbuffer_vector(data_len, 0);
+  vector<uint8_t> outbuffer_vector(data_len, 0);
+  TestData data;
+
+  data.num_values = data_len;
+  data.inbuffer = &inbuffer_vector[0];
+  data.outbuffer = &outbuffer_vector[0];
+  InitData(data.inbuffer, data_len);
+
+  const int baseline = suite.AddBenchmark("FastScalar", TestFastScalarSwap, &data, -1);
+  suite.AddBenchmark("SSSE3", TestSSSE3Swap, &data, baseline);
+  suite.AddBenchmark("AVX2", TestAVX2Swap, &data, baseline);
+  suite.AddBenchmark("SIMD", TestSIMDSwap, &data, baseline);
+  cout << suite.Measure();
+}
+
+int main(int argc, char **argv) {
+  CpuInfo::Init();
+  LOG(INFO) << Benchmark::GetMachineInfo();
+  PerfBenchmark();
+  return 0;
+}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f7bf59f5/be/src/exprs/string-functions-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/string-functions-ir.cc b/be/src/exprs/string-functions-ir.cc
index 82f6b13..06b16af 100644
--- a/be/src/exprs/string-functions-ir.cc
+++ b/be/src/exprs/string-functions-ir.cc
@@ -208,7 +208,7 @@ StringVal StringFunctions::Reverse(FunctionContext* context, const StringVal& st
   if (str.is_null) return StringVal::null();
   StringVal result(context, str.len);
   if (UNLIKELY(result.is_null)) return StringVal::null();
-  std::reverse_copy(str.ptr, str.ptr + str.len, result.ptr);
+  BitUtil::ByteSwap(result.ptr, str.ptr, str.len);
   return result;
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f7bf59f5/be/src/util/bit-util-test.cc
----------------------------------------------------------------------
diff --git a/be/src/util/bit-util-test.cc b/be/src/util/bit-util-test.cc
index 6c419e3..9bfd494 100644
--- a/be/src/util/bit-util-test.cc
+++ b/be/src/util/bit-util-test.cc
@@ -18,6 +18,7 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <iostream>
+#include <algorithm>
 #include <limits.h>
 
 #include <boost/utility.hpp>
@@ -94,6 +95,80 @@ TEST(BitUtil, TrailingBits) {
   EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 64), 1LL << 63);
 }
 
+// Test different SIMD functionality units with an input/output buffer.
+// CpuFlag parameter indicates SIMD routine to be tested:
+//   CpuInfo::SSSE3 for ByteSwapSSE_Unit;
+//   CpuInfo::AVX2 for ByteSwapAVX2_Unit;
+void TestByteSwapSimd_Unit(const int64_t CpuFlag) {
+  void (*bswap_fptr)(const uint8_t* src, uint8_t* dst) = NULL;
+  int buf_size = 0;
+  if (CpuFlag == CpuInfo::SSSE3) {
+    buf_size = 16;
+    bswap_fptr = SimdByteSwap::ByteSwap128;
+  } else {
+    static const int64_t AVX2_MASK = CpuInfo::AVX2;
+    ASSERT_EQ(CpuFlag, AVX2_MASK);
+    buf_size = 32;
+    bswap_fptr = SimdByteSwap::ByteSwap256;
+  }
+
+  DCHECK(bswap_fptr != NULL);
+  uint8_t src_buf[buf_size];
+  uint8_t dst_buf[buf_size];
+  std::iota(src_buf, src_buf + buf_size, 0);
+  bswap_fptr(src_buf, dst_buf);
+
+  // Validate the swap results.
+  for (int j = 0; j < buf_size; ++j) {
+    EXPECT_EQ(dst_buf[j], buf_size - j - 1);
+    EXPECT_EQ(dst_buf[j], src_buf[buf_size - j - 1]);
+  }
+}
+
+// Test the logic of ByteSwapSimd control flow using specified SIMD routine with an
+// input/output buffer.
+// CpuFlag parameter indicates SIMD routine to be tested:
+//   CpuInfo::SSSE3 for SimdByteSwap::ByteSwapSSE_Unit;
+//   CpuInfo::AVX2 for SimdByteSwap::ByteSwapAVX2_Unit;
+//   CpuFlag == 0 for BitUtil::ByteSwap;
+// buf_size parameter indicates the size of input/output buffer.
+void TestByteSwapSimd(const int64_t CpuFlag, const int buf_size) {
+  uint8_t src_buf[buf_size];
+  uint8_t dst_buf[buf_size];
+  std::iota(src_buf, src_buf + buf_size, 0);
+
+  int start_size = 0;
+  if (CpuFlag == CpuInfo::SSSE3) {
+    start_size = 16;
+  } else if (CpuFlag == CpuInfo::AVX2) {
+    start_size = 32;
+  }
+
+  for (int i = start_size; i < buf_size; ++i) {
+    // Initialize dst buffer and swap i bytes.
+    memset(dst_buf, 0, buf_size);
+    if (CpuFlag == CpuInfo::SSSE3) {
+      SimdByteSwap::ByteSwapSimd<16>(src_buf, i, dst_buf);
+    } else if (CpuFlag == CpuInfo::AVX2) {
+      SimdByteSwap::ByteSwapSimd<32>(src_buf, i, dst_buf);
+    } else {
+      // CpuFlag == 0: test the internal logic of BitUtil::ByteSwap
+      ASSERT_EQ(CpuFlag, 0);
+      BitUtil::ByteSwap(dst_buf, src_buf, i);
+    }
+
+    // Validate the swap results.
+    for (int j = 0; j < i; ++j) {
+      EXPECT_EQ(dst_buf[j], i - j - 1);
+      EXPECT_EQ(dst_buf[j], src_buf[i - j - 1]);
+    }
+    // Check that the dst buffer is otherwise unmodified.
+    for (int j = i; j < buf_size; ++j) {
+      EXPECT_EQ(dst_buf[j], 0);
+    }
+  }
+}
+
 TEST(BitUtil, ByteSwap) {
   EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint32_t>(0)), 0);
   EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint32_t>(0x11223344)), 0x44332211);
@@ -115,27 +190,23 @@ TEST(BitUtil, ByteSwap) {
   EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint16_t>(0)), 0);
   EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint16_t>(0x1122)), 0x2211);
 
-  // Test ByteSwap() with an input/output buffer, swapping up to 32 bytes.
-  int buf_size = 32;
-  uint8_t src_buf[buf_size];
-  for (int i = 0; i < buf_size; ++i) {
-    src_buf[i] = i;
+  // Tests for ByteSwap SIMD functions
+  if (CpuInfo::IsSupported(CpuInfo::SSSE3)) {
+    // Test SSSE3 functionality unit
+    TestByteSwapSimd_Unit(CpuInfo::SSSE3);
+    // Test ByteSwapSimd() using SSSE3;
+    TestByteSwapSimd(CpuInfo::SSSE3, 64);
   }
-  uint8_t dst_buf[buf_size];
-  for (int i = 0; i < buf_size; ++i) {
-    // Init dst buffer and swap i bytes.
-    memset(dst_buf, 0, buf_size);
-    BitUtil::ByteSwap(dst_buf, src_buf, i);
-    // Validate the swap results.
-    for (int j = 0; j < i; ++j) {
-      EXPECT_EQ(dst_buf[j], i - j - 1);
-      EXPECT_EQ(dst_buf[j], src_buf[i - j - 1]);
-    }
-    // Check that the dst buffer is otherwise unmodified.
-    for (int j = i; j < buf_size; ++j) {
-      EXPECT_EQ(dst_buf[j], 0);
-    }
+
+  if (CpuInfo::IsSupported(CpuInfo::AVX2)) {
+    // Test AVX2 functionality unit
+    TestByteSwapSimd_Unit(CpuInfo::AVX2);
+    // Test ByteSwapSimd() using AVX2;
+    TestByteSwapSimd(CpuInfo::AVX2, 64);
   }
+
+  // Test BitUtil::ByteSwap(Black Box Testing)
+  for (int i = 0; i <= 32; ++i) TestByteSwapSimd(0, i);
 }
 
 TEST(BitUtil, Log2) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f7bf59f5/be/src/util/bit-util.cc
----------------------------------------------------------------------
diff --git a/be/src/util/bit-util.cc b/be/src/util/bit-util.cc
index 06ed32b..76224c8 100644
--- a/be/src/util/bit-util.cc
+++ b/be/src/util/bit-util.cc
@@ -16,10 +16,21 @@
 // under the License.
 
 #include "util/bit-util.h"
+#include <immintrin.h>
 
-namespace impala {
+namespace {
+// ByteSwapScalarLoop is only used in bit-util.cc, so put it in this anonymous
+// namespace
+inline static void ByteSwapScalarLoop(const void* src, int len, void* dst) {
+  //TODO: improve the performance of following code further using BSWAP intrinsic
+  uint8_t* d = reinterpret_cast<uint8_t*>(dst);
+  const uint8_t* s = reinterpret_cast<const uint8_t*>(src);
+  for (int i = 0; i < len; ++i) d[i] = s[len - i - 1];
+}
+}
 
-void BitUtil::ByteSwap(void* dest, const void* source, int len) {
+namespace impala {
+void SimdByteSwap::ByteSwapScalar(const void* source, int len, void* dest) {
   uint8_t* dst = reinterpret_cast<uint8_t*>(dest);
   const uint8_t* src = reinterpret_cast<const uint8_t*>(source);
   switch (len) {
@@ -28,100 +39,197 @@ void BitUtil::ByteSwap(void* dest, const void* source, int len) {
       return;
     case 2:
       *reinterpret_cast<uint16_t*>(dst) =
-          ByteSwap(*reinterpret_cast<const uint16_t*>(src));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint16_t*>(src));
       return;
     case 3:
       *reinterpret_cast<uint16_t*>(dst + 1) =
-          ByteSwap(*reinterpret_cast<const uint16_t*>(src));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint16_t*>(src));
       *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 2);
       return;
     case 4:
       *reinterpret_cast<uint32_t*>(dst) =
-          ByteSwap(*reinterpret_cast<const uint32_t*>(src));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint32_t*>(src));
       return;
     case 5:
       *reinterpret_cast<uint32_t*>(dst + 1) =
-          ByteSwap(*reinterpret_cast<const uint32_t*>(src));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint32_t*>(src));
       *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 4);
       return;
     case 6:
       *reinterpret_cast<uint32_t*>(dst + 2) =
-          ByteSwap(*reinterpret_cast<const uint32_t*>(src));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint32_t*>(src));
       *reinterpret_cast<uint16_t*>(dst) =
-          ByteSwap(*reinterpret_cast<const uint16_t*>(src + 4));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint16_t*>(src + 4));
       return;
     case 7:
       *reinterpret_cast<uint32_t*>(dst + 3) =
-          ByteSwap(*reinterpret_cast<const uint32_t*>(src));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint32_t*>(src));
       *reinterpret_cast<uint16_t*>(dst + 1) =
-          ByteSwap(*reinterpret_cast<const uint16_t*>(src + 4));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint16_t*>(src + 4));
       *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 6);
       return;
     case 8:
       *reinterpret_cast<uint64_t*>(dst) =
-          ByteSwap(*reinterpret_cast<const uint64_t*>(src));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint64_t*>(src));
       return;
     case 9:
       *reinterpret_cast<uint64_t*>(dst + 1) =
-          ByteSwap(*reinterpret_cast<const uint64_t*>(src));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint64_t*>(src));
       *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 8);
       return;
     case 10:
       *reinterpret_cast<uint64_t*>(dst + 2) =
-          ByteSwap(*reinterpret_cast<const uint64_t*>(src));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint64_t*>(src));
       *reinterpret_cast<uint16_t*>(dst) =
-          ByteSwap(*reinterpret_cast<const uint16_t*>(src + 8));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint16_t*>(src + 8));
       return;
     case 11:
       *reinterpret_cast<uint64_t*>(dst + 3) =
-          ByteSwap(*reinterpret_cast<const uint64_t*>(src));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint64_t*>(src));
       *reinterpret_cast<uint16_t*>(dst + 1) =
-          ByteSwap(*reinterpret_cast<const uint16_t*>(src + 8));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint16_t*>(src + 8));
       *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 10);
       return;
     case 12:
       *reinterpret_cast<uint64_t*>(dst + 4) =
-          ByteSwap(*reinterpret_cast<const uint64_t*>(src));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint64_t*>(src));
       *reinterpret_cast<uint32_t*>(dst) =
-          ByteSwap(*reinterpret_cast<const uint32_t*>(src + 8));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint32_t*>(src + 8));
       return;
     case 13:
       *reinterpret_cast<uint64_t*>(dst + 5) =
-          ByteSwap(*reinterpret_cast<const uint64_t*>(src));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint64_t*>(src));
       *reinterpret_cast<uint32_t*>(dst + 1) =
-          ByteSwap(*reinterpret_cast<const uint32_t*>(src + 8));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint32_t*>(src + 8));
       *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 12);
       return;
     case 14:
       *reinterpret_cast<uint64_t*>(dst + 6) =
-          ByteSwap(*reinterpret_cast<const uint64_t*>(src));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint64_t*>(src));
       *reinterpret_cast<uint32_t*>(dst + 2) =
-          ByteSwap(*reinterpret_cast<const uint32_t*>(src + 8));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint32_t*>(src + 8));
       *reinterpret_cast<uint16_t*>(dst) =
-          ByteSwap(*reinterpret_cast<const uint16_t*>(src + 12));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint16_t*>(src + 12));
       return;
     case 15:
       *reinterpret_cast<uint64_t*>(dst + 7) =
-          ByteSwap(*reinterpret_cast<const uint64_t*>(src));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint64_t*>(src));
       *reinterpret_cast<uint32_t*>(dst + 3) =
-          ByteSwap(*reinterpret_cast<const uint32_t*>(src + 8));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint32_t*>(src + 8));
       *reinterpret_cast<uint16_t*>(dst + 1) =
-          ByteSwap(*reinterpret_cast<const uint16_t*>(src + 12));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint16_t*>(src + 12));
       *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 14);
       return;
     case 16:
       *reinterpret_cast<uint64_t*>(dst + 8) =
-          ByteSwap(*reinterpret_cast<const uint64_t*>(src));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint64_t*>(src));
       *reinterpret_cast<uint64_t*>(dst) =
-          ByteSwap(*reinterpret_cast<const uint64_t*>(src + 8));
+          BitUtil::ByteSwap(*reinterpret_cast<const uint64_t*>(src + 8));
       return;
     default:
       // Revert to slow loop-based swap.
-      for (int i = 0; i < len; ++i) {
-        dst[i] = src[len - i - 1];
-      }
+      ByteSwapScalarLoop(source, len, dest);
       return;
   }
 }
 
+// This constant is concluded from the definition of _mm_set_epi8;
+// Refer this link for more details:
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/
+const __m128i mask128i = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+    13, 14, 15);
+// ByteSwap 16 bytes using SSSE3 instructions.
+__attribute__((target("ssse3")))
+inline void SimdByteSwap::ByteSwap128(const uint8_t* src, uint8_t* dst) {
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), _mm_shuffle_epi8(
+      _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)), mask128i));
+}
+
+// ByteSwap 32 bytes using AVX2 instructions.
+__attribute__((target("avx2")))
+inline void SimdByteSwap::ByteSwap256(const uint8_t* src, uint8_t* dst) {
+  // This constant is concluded from the definition of _mm256_set_epi8;
+  // Refer this link for more details:
+  // https://software.intel.com/sites/landingpage/IntrinsicsGuide/
+  const __m256i mask256i = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+    11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), _mm256_shuffle_epi8(
+      _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src)), mask256i));
+  __m128i part1 = *reinterpret_cast<__m128i*>(dst);
+  __m128i part2 = *reinterpret_cast<__m128i*>(dst + 16);
+  *reinterpret_cast<__m128i*>(dst) = part2;
+  *reinterpret_cast<__m128i*>(dst + 16) = part1;
+  _mm256_zeroupper();
+}
+
+// Internal implementation of ByteSwapSimd
+// TEMPLATE_DATA_WIDTH: 16byte or 32byte, corresponding to SSSE3 or AVX2 routine
+// SIMD_FUNC: function pointer to ByteSwapSSE_Unit(16byte) or ByteSwapAVX_Unit(32byte)
+// dest: the memory address of destination
+// source: the memory address of source
+// len: the number of bytes of input data
+template <int TEMPLATE_DATA_WIDTH>
+inline void SimdByteSwap::ByteSwapSimd(const void* source, const int len, void* dest) {
+  DCHECK(TEMPLATE_DATA_WIDTH == 16 || TEMPLATE_DATA_WIDTH == 32)
+    << "Only 16 or 32 are valid for TEMPLATE_DATA_WIDTH now.";
+  /// Function pointer to SIMD ByteSwap functions
+  void (*bswap_fptr)(const uint8_t* src, uint8_t* dst) = NULL;
+  if (TEMPLATE_DATA_WIDTH == 16) {
+    bswap_fptr = SimdByteSwap::ByteSwap128;
+  } else if (TEMPLATE_DATA_WIDTH == 32) {
+    bswap_fptr = SimdByteSwap::ByteSwap256;
+  }
+
+  const uint8_t* src = reinterpret_cast<const uint8_t*>(source);
+  uint8_t* dst = reinterpret_cast<uint8_t*>(dest);
+  src += len - TEMPLATE_DATA_WIDTH;
+  int i = len - TEMPLATE_DATA_WIDTH;
+  while (true) {
+    bswap_fptr(src, dst);
+    dst += TEMPLATE_DATA_WIDTH;
+    if (i < TEMPLATE_DATA_WIDTH) break;
+    i -= TEMPLATE_DATA_WIDTH;
+    src -= TEMPLATE_DATA_WIDTH;
+  }
+  if (TEMPLATE_DATA_WIDTH > 16 && i >= 16) {
+    src -= 16;
+    SimdByteSwap::ByteSwap128(src, dst);
+    i -= 16;
+    dst += 16;
+  }
+  // Remaining bytes(<16) are dealt with scalar routine
+  // TODO: improve the performance of following code further using pshufb intrinsic
+  src -= i;
+  SimdByteSwap::ByteSwapScalar(src, i, dst);
+}
+
+// Explicit instantiations for ByteSwapSSE_Unit and ByteSwapAVX2_Unit
+template void SimdByteSwap::ByteSwapSimd<16>(const void* source, const int len, void* dest);
+template void SimdByteSwap::ByteSwapSimd<32>(const void* source, const int len, void* dest);
+
+void BitUtil::ByteSwap(void* dest, const void* source, int len) {
+  // Branch selection according to current CPU capacity and input data length
+  if (LIKELY(len < 16)) {
+    SimdByteSwap::ByteSwapScalar(source, len, dest);
+  } else if (len >= 32) {
+    // AVX2 can only be used to process data whose size >= 32byte
+    if (CpuInfo::IsSupported(CpuInfo::AVX2)) {
+      SimdByteSwap::ByteSwapSimd<32>(source, len, dest);
+    } else if (LIKELY(CpuInfo::IsSupported(CpuInfo::SSSE3))) {
+      // SSSE3 support is more popular than AVX2.
+      SimdByteSwap::ByteSwapSimd<16>(source, len, dest);
+    } else {
+      SimdByteSwap::ByteSwapScalar(source, len, dest);
+    }
+  } else {
+    // SSSE3 can only be used to process data whose size >= 16byte
+    // 16 <= len < 32
+    if (LIKELY(CpuInfo::IsSupported(CpuInfo::SSSE3))) {
+      SimdByteSwap::ByteSwapSimd<16>(source, len, dest);
+    } else {
+      SimdByteSwap::ByteSwapScalar(source, len, dest);
+    }
+  }
+}
+
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f7bf59f5/be/src/util/bit-util.h
----------------------------------------------------------------------
diff --git a/be/src/util/bit-util.h b/be/src/util/bit-util.h
index cf3d784..f947a17 100644
--- a/be/src/util/bit-util.h
+++ b/be/src/util/bit-util.h
@@ -243,6 +243,33 @@ class BitUtil {
   }
 };
 
+/// An encapsulation class of SIMD byteswap functions
+class SimdByteSwap {
+ public:
+  /// Byteswap an array in the scalar way with some builtin optimization for arrays of
+  /// length <= 16 bytes.
+  ///   const void* src: the source address of the input array;
+  ///   int len: the length of the input array;
+  ///   void* dst: the destination address of the output array;
+  static void ByteSwapScalar(const void* src, int len, void* dst);
+
+  /// SIMD ByteSwap functions:
+  /// ByteSwap128 is to byteswap an array of 16 bytes(128 bits) using SSSE3 intrinsics;
+  /// ByteSwap256 is to byteswap an array of 32 bytes(256 bits) using AVX2 intrinsics;
+  /// Function parameters have the same meaning as ByteSwapScalar.
+  static void ByteSwap128(const uint8_t* src, uint8_t* dst);
+  static void ByteSwap256(const uint8_t* src, uint8_t* dst);
+
+  /// Template function ByteSwapSimd is the entry point function to byteswap an array
+  /// using SIMD approach.
+  /// Template parameter:
+  ///   int TEMPLATE_DATA_WIDTH: only 16 or 32 are valid now;
+  ///   16 means using ByteSwap128 as the internal SIMD implementation;
+  ///   32 means using ByteSwap256 as the internal SIMD implementation;
+  /// Function parameters have the same meaning as ByteSwapScalar.
+  template <int TEMPLATE_DATA_WIDTH>
+  static void ByteSwapSimd(const void* src, const int len, void* dst);
+};
 }
 
 #endif