You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2016/05/13 22:53:00 UTC
[4/4] incubator-impala git commit: IMPALA-2809: Improve scalar ByteSwap().

IMPALA-2809: Improve scalar ByteSwap().

This patch improves our ByteSwap() function by handling
more byte sizes in the fast path, as opposed to the
loop-based slow path.

ByteSwap() is used heavily in when scanning Parquet decimals.

Before this patch, VTune showed ByteSwap() among the top
three worst cycle offenders when running TPCH-Q6 on my local
setup with a large lineitem table.

After this patch, ByteSwap() shows no significant contribution
to the overall cycles spent.

There was a measurable improvement of a few percent for TPCH-Q6.

Change-Id: I4f462e6bdb022db46b48889a6a7426120a80d9b4
Reviewed-on: http://gerrit.cloudera.org:8080/3033
Reviewed-by: Dan Hecht <dh...@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/0306dd57
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/0306dd57
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/0306dd57

Branch: refs/heads/master
Commit: 0306dd576d05b6773e380d93af8b3dab90ecdd32
Parents: 46c3e43
Author: Youwei Wang <yo...@intel.com>
Authored: Wed May 11 15:27:27 2016 -0700
Committer: Tim Armstrong <ta...@cloudera.com>
Committed: Fri May 13 15:52:53 2016 -0700

----------------------------------------------------------------------
 be/src/util/bit-util-test.cc  |  24 ++++++-
 be/src/util/bit-util.h        |  40 +++---------
 be/src/util/bit-util.inline.h | 130 +++++++++++++++++++++++++++++++++++++
 be/src/util/decimal-util.h    |   2 +-
 4 files changed, 162 insertions(+), 34 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0306dd57/be/src/util/bit-util-test.cc
----------------------------------------------------------------------
diff --git a/be/src/util/bit-util-test.cc b/be/src/util/bit-util-test.cc
index f376737..e2bdd97 100644
--- a/be/src/util/bit-util-test.cc
+++ b/be/src/util/bit-util-test.cc
@@ -21,7 +21,7 @@
 #include <gtest/gtest.h>
 
 #include "gutil/bits.h"
-#include "util/bit-util.h"
+#include "util/bit-util.inline.h"
 #include "util/cpu-info.h"
 
 #include "common/names.h"
@@ -111,6 +111,28 @@ TEST(BitUtil, ByteSwap) {
 
   EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint16_t>(0)), 0);
   EXPECT_EQ(BitUtil::ByteSwap(static_cast<uint16_t>(0x1122)), 0x2211);
+
+  // Test ByteSwap() with an input/output buffer, swapping up to 32 bytes.
+  int buf_size = 32;
+  uint8_t src_buf[buf_size];
+  for (int i = 0; i < buf_size; ++i) {
+    src_buf[i] = i;
+  }
+  uint8_t dst_buf[buf_size];
+  for (int i = 0; i < buf_size; ++i) {
+    // Init dst buffer and swap i bytes.
+    memset(dst_buf, 0, buf_size);
+    BitUtil::ByteSwap(dst_buf, src_buf, i);
+    // Validate the swap results.
+    for (int j = 0; j < i; ++j) {
+      EXPECT_EQ(dst_buf[j], i - j - 1);
+      EXPECT_EQ(dst_buf[j], src_buf[i - j - 1]);
+    }
+    // Check that the dst buffer is otherwise unmodified.
+    for (int j = i; j < buf_size; ++j) {
+      EXPECT_EQ(dst_buf[j], 0);
+    }
+  }
 }
 
 TEST(BitUtil, Log2) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0306dd57/be/src/util/bit-util.h
----------------------------------------------------------------------
diff --git a/be/src/util/bit-util.h b/be/src/util/bit-util.h
index eed6df3..cae8212 100644
--- a/be/src/util/bit-util.h
+++ b/be/src/util/bit-util.h
@@ -148,49 +148,25 @@ class BitUtil {
     return __builtin_bswap64(value);
   }
   static inline uint64_t ByteSwap(uint64_t value) {
-    return static_cast<uint64_t>(__builtin_bswap64(value));
+    return __builtin_bswap64(value);
   }
   static inline int32_t ByteSwap(int32_t value) {
     return __builtin_bswap32(value);
   }
   static inline uint32_t ByteSwap(uint32_t value) {
-    return static_cast<uint32_t>(__builtin_bswap32(value));
+    return __builtin_bswap32(value);
   }
   static inline int16_t ByteSwap(int16_t value) {
-    return (((value >> 8) & 0xff) | ((value & 0xff) << 8));
+    return __builtin_bswap16(value);
   }
   static inline uint16_t ByteSwap(uint16_t value) {
-    return static_cast<uint16_t>(ByteSwap(static_cast<int16_t>(value)));
+    return __builtin_bswap16(value);
   }
 
-  /// Write the swapped bytes into dst. Src and st cannot overlap.
-  static inline void ByteSwap(void* dst, const void* src, int len) {
-    switch (len) {
-      case 1:
-        *reinterpret_cast<int8_t*>(dst) = *reinterpret_cast<const int8_t*>(src);
-        return;
-      case 2:
-        *reinterpret_cast<int16_t*>(dst) =
-            ByteSwap(*reinterpret_cast<const int16_t*>(src));
-        return;
-      case 4:
-        *reinterpret_cast<int32_t*>(dst) =
-            ByteSwap(*reinterpret_cast<const int32_t*>(src));
-        return;
-      case 8:
-        *reinterpret_cast<int64_t*>(dst) =
-            ByteSwap(*reinterpret_cast<const int64_t*>(src));
-        return;
-      default: break;
-    }
-
-    uint8_t* d = reinterpret_cast<uint8_t*>(dst);
-    const uint8_t* s = reinterpret_cast<const uint8_t*>(src);
-    for (int i = 0; i < len; ++i) {
-      d[i] = s[len - i - 1];
-    }
-
-  }
+  /// Write the swapped bytes into dest; source and dest must not overlap.
+  /// This function is optimized for len <= 16. It reverts to a slow loop-based
+  /// swap for len > 16.
+  static inline void ByteSwap(void* dest, const void* source, int len);
 
   /// Converts to big endian format (if not already in big endian) from the
   /// machine's native endian format.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0306dd57/be/src/util/bit-util.inline.h
----------------------------------------------------------------------
diff --git a/be/src/util/bit-util.inline.h b/be/src/util/bit-util.inline.h
new file mode 100644
index 0000000..33b44c6
--- /dev/null
+++ b/be/src/util/bit-util.inline.h
@@ -0,0 +1,130 @@
+// Copyright 2016 Cloudera Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#ifndef IMPALA_BIT_UTIL_INLINE_H
+#define IMPALA_BIT_UTIL_INLINE_H
+
+#include "util/bit-util.h"
+
+namespace impala {
+
+inline void BitUtil::ByteSwap(void* dest, const void* source, int len) {
+  uint8_t* dst = reinterpret_cast<uint8_t*>(dest);
+  const uint8_t* src = reinterpret_cast<const uint8_t*>(source);
+  switch (len) {
+    case 1:
+      *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src);
+      return;
+    case 2:
+      *reinterpret_cast<uint16_t*>(dst) =
+          ByteSwap(*reinterpret_cast<const uint16_t*>(src));
+      return;
+    case 3:
+      *reinterpret_cast<uint16_t*>(dst + 1) =
+          ByteSwap(*reinterpret_cast<const uint16_t*>(src));
+      *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 2);
+      return;
+    case 4:
+      *reinterpret_cast<uint32_t*>(dst) =
+          ByteSwap(*reinterpret_cast<const uint32_t*>(src));
+      return;
+    case 5:
+      *reinterpret_cast<uint32_t*>(dst + 1) =
+          ByteSwap(*reinterpret_cast<const uint32_t*>(src));
+      *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 4);
+      return;
+    case 6:
+      *reinterpret_cast<uint32_t*>(dst + 2) =
+          ByteSwap(*reinterpret_cast<const uint32_t*>(src));
+      *reinterpret_cast<uint16_t*>(dst) =
+          ByteSwap(*reinterpret_cast<const uint16_t*>(src + 4));
+      return;
+    case 7:
+      *reinterpret_cast<uint32_t*>(dst + 3) =
+          ByteSwap(*reinterpret_cast<const uint32_t*>(src));
+      *reinterpret_cast<uint16_t*>(dst + 1) =
+          ByteSwap(*reinterpret_cast<const uint16_t*>(src + 4));
+      *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 6);
+      return;
+    case 8:
+      *reinterpret_cast<uint64_t*>(dst) =
+          ByteSwap(*reinterpret_cast<const uint64_t*>(src));
+      return;
+    case 9:
+      *reinterpret_cast<uint64_t*>(dst + 1) =
+          ByteSwap(*reinterpret_cast<const uint64_t*>(src));
+      *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 8);
+      return;
+    case 10:
+      *reinterpret_cast<uint64_t*>(dst + 2) =
+          ByteSwap(*reinterpret_cast<const uint64_t*>(src));
+      *reinterpret_cast<uint16_t*>(dst) =
+          ByteSwap(*reinterpret_cast<const uint16_t*>(src + 8));
+      return;
+    case 11:
+      *reinterpret_cast<uint64_t*>(dst + 3) =
+          ByteSwap(*reinterpret_cast<const uint64_t*>(src));
+      *reinterpret_cast<uint16_t*>(dst + 1) =
+          ByteSwap(*reinterpret_cast<const uint16_t*>(src + 8));
+      *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 10);
+      return;
+    case 12:
+      *reinterpret_cast<uint64_t*>(dst + 4) =
+          ByteSwap(*reinterpret_cast<const uint64_t*>(src));
+      *reinterpret_cast<uint32_t*>(dst) =
+          ByteSwap(*reinterpret_cast<const uint32_t*>(src + 8));
+      return;
+    case 13:
+      *reinterpret_cast<uint64_t*>(dst + 5) =
+          ByteSwap(*reinterpret_cast<const uint64_t*>(src));
+      *reinterpret_cast<uint32_t*>(dst + 1) =
+          ByteSwap(*reinterpret_cast<const uint32_t*>(src + 8));
+      *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 12);
+      return;
+    case 14:
+      *reinterpret_cast<uint64_t*>(dst + 6) =
+          ByteSwap(*reinterpret_cast<const uint64_t*>(src));
+      *reinterpret_cast<uint32_t*>(dst + 2) =
+          ByteSwap(*reinterpret_cast<const uint32_t*>(src + 8));
+      *reinterpret_cast<uint16_t*>(dst) =
+          ByteSwap(*reinterpret_cast<const uint16_t*>(src + 12));
+      return;
+    case 15:
+      *reinterpret_cast<uint64_t*>(dst + 7) =
+          ByteSwap(*reinterpret_cast<const uint64_t*>(src));
+      *reinterpret_cast<uint32_t*>(dst + 3) =
+          ByteSwap(*reinterpret_cast<const uint32_t*>(src + 8));
+      *reinterpret_cast<uint16_t*>(dst + 1) =
+          ByteSwap(*reinterpret_cast<const uint16_t*>(src + 12));
+      *reinterpret_cast<uint8_t*>(dst) = *reinterpret_cast<const uint8_t*>(src + 14);
+      return;
+    case 16:
+      *reinterpret_cast<uint64_t*>(dst + 8) =
+          ByteSwap(*reinterpret_cast<const uint64_t*>(src));
+      *reinterpret_cast<uint64_t*>(dst) =
+          ByteSwap(*reinterpret_cast<const uint64_t*>(src + 8));
+      return;
+    default:
+      // Revert to slow loop-based swap.
+      for (int i = 0; i < len; ++i) {
+        dst[i] = src[len - i - 1];
+      }
+      return;
+  }
+}
+
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/0306dd57/be/src/util/decimal-util.h
----------------------------------------------------------------------
diff --git a/be/src/util/decimal-util.h b/be/src/util/decimal-util.h
index 79b7130..2ce69bc 100644
--- a/be/src/util/decimal-util.h
+++ b/be/src/util/decimal-util.h
@@ -22,7 +22,7 @@
 
 #include "runtime/multi-precision.h"
 #include "runtime/types.h"
-#include "util/bit-util.h"
+#include "util/bit-util.inline.h"
 
 namespace impala {