You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2018/11/29 16:01:08 UTC

[arrow] branch master updated: ARROW-3849: [C++] Leverage Armv8 crc32 extension instructions to accelerate the hash computation for Arm64

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 05c70b0  ARROW-3849: [C++] Leverage Armv8 crc32 extension instructions to accelerate the hash computation for Arm64
05c70b0 is described below

commit 05c70b0caf35af0f07ee0a58e411a896ee806be6
Author: Yuqi Gu <yu...@arm.com>
AuthorDate: Thu Nov 29 17:00:58 2018 +0100

    ARROW-3849: [C++] Leverage Armv8 crc32 extension instructions to accelerate the hash computation for Arm64
    
    The 'Hash utility' leverages SSE4 to accelerate the Crc32 data hash computation for x86.
    Correspondingly, we will leverage the Arm crc32 extension instructions
    to accelerate the hash computation for Arm64.
    
    1. Add Arm hardware Crc32 support.
    2. Add a hash computing mode for each case:
    - USE_DEFAULT: Murmur2-64
    - USE_SSE42
    - USE_ARMCRC
    3. Modify the CMake configuration to detect at compile time whether the Arm architecture is supported.  The code also performs a Crc32 run-time check (only available on Linux).
    
    Author: Yuqi Gu <yu...@arm.com>
    
    Closes #3010 from guyuqi/ARROW-3849 and squashes the following commits:
    
    6b99d208 <Yuqi Gu> Fix the coding style
    1cf378a4 <Yuqi Gu> Rebase the patch to master
    fcf972e2 <Yuqi Gu> ARROW-3849 Leverage Armv8 crc32 extension instructions to accelerate the hash computation for Arm64
---
 cpp/cmake_modules/SetupCxxFlags.cmake |  6 +++
 cpp/src/arrow/util/CMakeLists.txt     |  1 +
 cpp/src/arrow/util/hash-util.h        | 97 +++++++++++++++++++++++++----------
 cpp/src/arrow/util/neon-util.h        | 89 ++++++++++++++++++++++++++++++++
 4 files changed, 165 insertions(+), 28 deletions(-)

diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake
index d239d69..893ec36 100644
--- a/cpp/cmake_modules/SetupCxxFlags.cmake
+++ b/cpp/cmake_modules/SetupCxxFlags.cmake
@@ -22,6 +22,8 @@ include(CheckCXXCompilerFlag)
 CHECK_CXX_COMPILER_FLAG("-msse4.2" CXX_SUPPORTS_SSE4_2)
 # power compiler flags
 CHECK_CXX_COMPILER_FLAG("-maltivec" CXX_SUPPORTS_ALTIVEC)
+# Arm64 compiler flags
+CHECK_CXX_COMPILER_FLAG("-march=armv8-a+crc" CXX_SUPPORTS_ARMCRC)
 
 # This ensures that things like gnu++11 get passed correctly
 set(CMAKE_CXX_STANDARD 11)
@@ -220,6 +222,10 @@ if (CXX_SUPPORTS_ALTIVEC AND ARROW_ALTIVEC)
   set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -maltivec")
 endif()
 
+if (CXX_SUPPORTS_ARMCRC)  # result of the "-march=armv8-a+crc" compiler-flag probe; NOTE(review): not gated on an ARROW_* option, unlike the ALTIVEC block above
+  set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -march=armv8-a+crc")  # append the Arm CRC flag to the common C++ flags
+endif()
+
 if (ARROW_USE_SIMD)
   add_definitions(-DARROW_USE_SIMD)
 endif()
diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt
index 122c551..d785eee 100644
--- a/cpp/src/arrow/util/CMakeLists.txt
+++ b/cpp/src/arrow/util/CMakeLists.txt
@@ -44,6 +44,7 @@ install(FILES
   logging.h
   macros.h
   memory.h
+  neon-util.h
   parallel.h
   rle-encoding.h
   sse-util.h
diff --git a/cpp/src/arrow/util/hash-util.h b/cpp/src/arrow/util/hash-util.h
index 3f7e404..fd69cb9 100644
--- a/cpp/src/arrow/util/hash-util.h
+++ b/cpp/src/arrow/util/hash-util.h
@@ -25,22 +25,55 @@
 
 #include "arrow/util/logging.h"
 #include "arrow/util/macros.h"
+#include "arrow/util/neon-util.h"
 #include "arrow/util/sse-util.h"
 
+static inline uint32_t HW_crc32_u8(uint32_t crc, uint8_t v) {  // 1-byte stub for builds where neither backend macro below is defined
+  DCHECK(false) << "Hardware CRC support is not enabled";  // always trips the DCHECK (presumably debug-only — confirm in logging.h)
+  return 0;  // inputs are ignored; 0 is returned if execution continues
+}
+
+static inline uint32_t HW_crc32_u16(uint32_t crc, uint16_t v) {  // 2-byte stub for builds where neither backend macro below is defined
+  DCHECK(false) << "Hardware CRC support is not enabled";  // always trips the DCHECK (presumably debug-only — confirm in logging.h)
+  return 0;  // inputs are ignored; 0 is returned if execution continues
+}
+
+static inline uint32_t HW_crc32_u32(uint32_t crc, uint32_t v) {  // 4-byte stub for builds where neither backend macro below is defined
+  DCHECK(false) << "Hardware CRC support is not enabled";  // always trips the DCHECK (presumably debug-only — confirm in logging.h)
+  return 0;  // inputs are ignored; 0 is returned if execution continues
+}
+
+static inline uint32_t HW_crc32_u64(uint32_t crc, uint64_t v) {  // 8-byte stub for builds where neither backend macro below is defined
+  DCHECK(false) << "Hardware CRC support is not enabled";  // always trips the DCHECK (presumably debug-only — confirm in logging.h)
+  return 0;  // inputs are ignored; 0 is returned if execution continues
+}
+
+#ifdef ARROW_HAVE_SSE4_2  // SSE4.2 build: later uses of HW_crc32_* expand to the SSE4_* names
+#define HW_crc32_u8 SSE4_crc32_u8
+#define HW_crc32_u16 SSE4_crc32_u16
+#define HW_crc32_u32 SSE4_crc32_u32
+#define HW_crc32_u64 SSE4_crc32_u64
+#elif defined(ARROW_HAVE_ARM_CRC)  // Arm CRC build: later uses of HW_crc32_* expand to the ARMCE_* names
+#define HW_crc32_u8 ARMCE_crc32_u8
+#define HW_crc32_u16 ARMCE_crc32_u16
+#define HW_crc32_u32 ARMCE_crc32_u32
+#define HW_crc32_u64 ARMCE_crc32_u64
+#endif  // with neither macro defined, HW_crc32_* still names the stub functions above
+
 namespace arrow {
 
 /// Utility class to compute hash values.
 class HashUtil {
  public:
-#ifdef ARROW_HAVE_SSE4_2
+#if defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_ARM_CRC)
   static constexpr bool have_hardware_crc32 = true;
 #else
   static constexpr bool have_hardware_crc32 = false;
 #endif
 
-  /// Compute the Crc32 hash for data using SSE4 instructions.  The input hash
+  /// Compute the Crc32 hash for data using SSE4/ArmCRC instructions.  The input hash
   /// parameter is the current hash/seed value.
-  /// This should only be called if SSE is supported.
+  /// This should only be called if SSE/ArmCRC is supported.
   /// This is ~4x faster than Fnv/Boost Hash.
   /// TODO: crc32 hashes with different seeds do not result in different hash functions.
   /// The resulting hashes are correlated.
@@ -49,15 +82,15 @@ class HashUtil {
     const uint8_t* end = p + nbytes;
 
     while (p <= end - 8) {
-      hash = SSE4_crc32_u64(hash, *reinterpret_cast<const uint64_t*>(p));
+      hash = HW_crc32_u64(hash, *reinterpret_cast<const uint64_t*>(p));
       p += 8;
     }
     while (p <= end - 4) {
-      hash = SSE4_crc32_u32(hash, *reinterpret_cast<const uint32_t*>(p));
+      hash = HW_crc32_u32(hash, *reinterpret_cast<const uint32_t*>(p));
       p += 4;
     }
     while (p < end) {
-      hash = SSE4_crc32_u8(hash, *p);
+      hash = HW_crc32_u8(hash, *p);
       ++p;
     }
 
@@ -81,30 +114,30 @@ class HashUtil {
     uint32_t h2 = static_cast<uint32_t>(hash);
 
     while (nbytes >= 16) {
-      h1 = SSE4_crc32_u64(h1, *reinterpret_cast<const uint64_t*>(p));
-      h2 = SSE4_crc32_u64(h2, *reinterpret_cast<const uint64_t*>(p + 8));
+      h1 = HW_crc32_u64(h1, *reinterpret_cast<const uint64_t*>(p));
+      h2 = HW_crc32_u64(h2, *reinterpret_cast<const uint64_t*>(p + 8));
       nbytes -= 16;
       p += 16;
     }
     if (nbytes >= 8) {
-      h1 = SSE4_crc32_u32(h1, *reinterpret_cast<const uint32_t*>(p));
-      h2 = SSE4_crc32_u32(h2, *reinterpret_cast<const uint32_t*>(p + 4));
+      h1 = HW_crc32_u32(h1, *reinterpret_cast<const uint32_t*>(p));
+      h2 = HW_crc32_u32(h2, *reinterpret_cast<const uint32_t*>(p + 4));
       nbytes -= 8;
       p += 8;
     }
     if (nbytes >= 4) {
-      h1 = SSE4_crc32_u16(h1, *reinterpret_cast<const uint16_t*>(p));
-      h2 = SSE4_crc32_u16(h2, *reinterpret_cast<const uint16_t*>(p + 2));
+      h1 = HW_crc32_u16(h1, *reinterpret_cast<const uint16_t*>(p));
+      h2 = HW_crc32_u16(h2, *reinterpret_cast<const uint16_t*>(p + 2));
       nbytes -= 4;
       p += 4;
     }
     switch (nbytes) {
       case 3:
-        h1 = SSE4_crc32_u8(h1, p[3]);
+        h1 = HW_crc32_u8(h1, p[3]);
       case 2:
-        h2 = SSE4_crc32_u8(h2, p[2]);
+        h2 = HW_crc32_u8(h2, p[2]);
       case 1:
-        h1 = SSE4_crc32_u8(h1, p[1]);
+        h1 = HW_crc32_u8(h1, p[1]);
       case 0:
         break;
       default:
@@ -118,7 +151,7 @@ class HashUtil {
   /// CrcHash() specialized for 1-byte data
   static inline uint32_t CrcHash1(const void* v, uint32_t hash) {
     const uint8_t* s = reinterpret_cast<const uint8_t*>(v);
-    hash = SSE4_crc32_u8(hash, *s);
+    hash = HW_crc32_u8(hash, *s);  // fold the single byte at v into the seed through the selected CRC backend
     hash = (hash << 16) | (hash >> 16);  // swap the upper and lower 16-bit halves
     return hash;
   }
@@ -126,7 +159,7 @@ class HashUtil {
   /// CrcHash() specialized for 2-byte data
   static inline uint32_t CrcHash2(const void* v, uint32_t hash) {
     const uint16_t* s = reinterpret_cast<const uint16_t*>(v);
-    hash = SSE4_crc32_u16(hash, *s);
+    hash = HW_crc32_u16(hash, *s);  // one 16-bit load; NOTE(review): assumes v is readable as uint16_t (alignment) — confirm callers
     hash = (hash << 16) | (hash >> 16);  // swap the upper and lower 16-bit halves
     return hash;
   }
@@ -134,7 +167,7 @@ class HashUtil {
   /// CrcHash() specialized for 4-byte data
   static inline uint32_t CrcHash4(const void* v, uint32_t hash) {
     const uint32_t* p = reinterpret_cast<const uint32_t*>(v);
-    hash = SSE4_crc32_u32(hash, *p);
+    hash = HW_crc32_u32(hash, *p);  // one 32-bit load; NOTE(review): assumes v is readable as uint32_t (alignment) — confirm callers
     hash = (hash << 16) | (hash >> 16);  // swap the upper and lower 16-bit halves
     return hash;
   }
@@ -142,7 +175,7 @@ class HashUtil {
   /// CrcHash() specialized for 8-byte data
   static inline uint32_t CrcHash8(const void* v, uint32_t hash) {
     const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
-    hash = SSE4_crc32_u64(hash, *p);
+    hash = HW_crc32_u64(hash, *p);  // one 64-bit load; NOTE(review): assumes v is readable as uint64_t (alignment) — confirm callers
     hash = (hash << 16) | (hash >> 16);  // swap the upper and lower 16-bit halves
     return hash;
   }
@@ -150,9 +183,9 @@ class HashUtil {
   /// CrcHash() specialized for 12-byte data
   static inline uint32_t CrcHash12(const void* v, uint32_t hash) {
     const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
-    hash = SSE4_crc32_u64(hash, *p);
+    hash = HW_crc32_u64(hash, *p);  // bytes 0..7
     ++p;
-    hash = SSE4_crc32_u32(hash, *reinterpret_cast<const uint32_t*>(p));
+    hash = HW_crc32_u32(hash, *reinterpret_cast<const uint32_t*>(p));  // bytes 8..11, read as one 32-bit word
     hash = (hash << 16) | (hash >> 16);  // swap the upper and lower 16-bit halves
     return hash;
   }
@@ -160,9 +193,9 @@ class HashUtil {
   /// CrcHash() specialized for 16-byte data
   static inline uint32_t CrcHash16(const void* v, uint32_t hash) {
     const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
-    hash = SSE4_crc32_u64(hash, *p);
+    hash = HW_crc32_u64(hash, *p);  // bytes 0..7
     ++p;
-    hash = SSE4_crc32_u64(hash, *p);
+    hash = HW_crc32_u64(hash, *p);  // bytes 8..15
     hash = (hash << 16) | (hash >> 16);  // swap the upper and lower 16-bit halves
     return hash;
   }
@@ -251,8 +284,8 @@ class HashUtil {
     return static_cast<uint32_t>((hash_u64 >> 32) ^ (hash_u64 & 0xFFFFFFFF));
   }
 
-  // With sse4.2
-  template <bool use_sse42 = true>
+  // Hash template
+  template <bool hw>
   static inline int Hash(const void* data, int32_t bytes, uint32_t seed);
 
   /// The magic number (used in hash_combine()) 0x9e3779b9 = 2^32 / (golden ratio).
@@ -288,13 +321,21 @@ class HashUtil {
   }
 };
 
-// With sse4.2
+// HW Hash
 template <>
 inline int HashUtil::Hash<true>(const void* data, int32_t bytes, uint32_t seed) {
-  return static_cast<int>(HashUtil::CrcHash(data, bytes, seed));
+#ifdef ARROW_HAVE_ARM_CRC
+  // Arm needs a run-time check for CRC32 support;
+  // if it is not supported, fall back to Murmur
+  if (!crc32c_runtime_check())  // NOTE(review): the check is re-evaluated on every Hash call
+    return static_cast<int>(HashUtil::MurmurHash2_64(data, bytes, seed));
+  else  // this else binds to the return after the #endif
+#endif
+    // Hardware path: DoubleCrcHash (this commit replaces the previous CrcHash call)
+    return static_cast<int>(HashUtil::DoubleCrcHash(data, bytes, seed));
 }
 
-// Non-sse4 hash
+// Murmur Hash
 template <>
 inline int HashUtil::Hash<false>(const void* data, int32_t bytes, uint32_t seed) {
   return static_cast<int>(HashUtil::MurmurHash2_64(data, bytes, seed));
diff --git a/cpp/src/arrow/util/neon-util.h b/cpp/src/arrow/util/neon-util.h
new file mode 100644
index 0000000..c81bf14
--- /dev/null
+++ b/cpp/src/arrow/util/neon-util.h
@@ -0,0 +1,89 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_UTIL_NEON_UTIL_H
+#define ARROW_UTIL_NEON_UTIL_H
+
+namespace arrow {  // NOTE(review): this header includes nothing that visibly declares uint32_t/uint64_t — confirm includers provide <cstdint>
+
+#if defined(__aarch64__) || defined(__AARCH64__)
+#ifdef __ARM_FEATURE_CRC32
+#define ARROW_HAVE_ARM_CRC  // AArch64 target with the compiler's CRC32 feature macro set
+#include <arm_acle.h>  // NOTE(review): this #include sits inside namespace arrow
+#endif
+#endif
+
+#if defined(__GNUC__) && defined(__linux__) && defined(ARROW_HAVE_ARM_CRC)
+
+#include <asm/hwcap.h>  // NOTE(review): system headers are also included inside namespace arrow
+#include <sys/auxv.h>
+#ifndef HWCAP_CRC32
+#define HWCAP_CRC32 (1 << 7)  // fallback definition when the headers above do not provide HWCAP_CRC32
+#endif
+static inline uint32_t crc32c_runtime_check(void) {  // returns 1 when the HWCAP_CRC32 bit is set in AT_HWCAP, else 0
+  uint64_t auxv = getauxval(AT_HWCAP);  // queried afresh on each call; nothing is cached here
+  return (auxv & HWCAP_CRC32) != 0;
+}
+
+static inline uint32_t ARMCE_crc32_u8(uint32_t crc, uint8_t v) {  // 1-byte step, forwarded to __crc32cb
+  return __crc32cb(crc, v);
+}
+
+static inline uint32_t ARMCE_crc32_u16(uint32_t crc, uint16_t v) {  // 2-byte step, forwarded to __crc32ch
+  return __crc32ch(crc, v);
+}
+
+static inline uint32_t ARMCE_crc32_u32(uint32_t crc, uint32_t v) {  // 4-byte step, forwarded to __crc32cw
+  return __crc32cw(crc, v);
+}
+
+static inline uint32_t ARMCE_crc32_u64(uint32_t crc, uint64_t v) {  // 8-byte step, forwarded to __crc32cd
+  return __crc32cd(crc, v);
+}
+
+#else  // stubs only; NOTE(review): also taken when ARROW_HAVE_ARM_CRC is defined but the build is not GNUC + Linux
+
+static inline uint32_t crc32c_runtime_check(void) {  // stub: reports "not supported" (0) after the DCHECK
+  DCHECK(false) << "Arm crc32 support is not enabled";  // NOTE(review): no header defining DCHECK is included in this file
+  return 0;
+}
+
+static inline uint32_t ARMCE_crc32_u8(uint32_t, uint8_t) {  // stub: arguments are ignored
+  DCHECK(false) << "Arm crc32 support is not enabled";
+  return 0;
+}
+
+static inline uint32_t ARMCE_crc32_u16(uint32_t, uint16_t) {  // stub: arguments are ignored
+  DCHECK(false) << "Arm crc32 is not enabled";  // NOTE(review): message omits "support", unlike the sibling stubs
+  return 0;
+}
+
+static inline uint32_t ARMCE_crc32_u32(uint32_t, uint32_t) {  // stub: arguments are ignored
+  DCHECK(false) << "Arm crc32 support is not enabled";
+  return 0;
+}
+
+static inline uint32_t ARMCE_crc32_u64(uint32_t, uint64_t) {  // stub: arguments are ignored
+  DCHECK(false) << "Arm crc32 support is not enabled";
+  return 0;
+}
+
+#endif  // defined(__GNUC__) && defined(__linux__) && defined(ARROW_HAVE_ARM_CRC)
+
+}  // namespace arrow
+
+#endif  //  ARROW_UTIL_NEON_UTIL_H