You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2020/04/06 14:37:48 UTC

[arrow] branch master updated: ARROW-8227: [C++] Refine SIMD feature definitions

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new b1d4c86  ARROW-8227: [C++] Refine SIMD feature definitions
b1d4c86 is described below

commit b1d4c86eb28267525c52f436c3a096e70b8ef6e0
Author: Yibo Cai <yi...@arm.com>
AuthorDate: Mon Apr 6 16:37:23 2020 +0200

    ARROW-8227: [C++] Refine SIMD feature definitions
    
    This patch moves SIMD feature definitions from source code to cmake,
    and supports more flexible Arm64 CPU feature settings.
    
    Binary building is controlled by two factors: compiler capability and
    build requirement. Compiler capability is detected in cmake by trying
    flags like "-mavx2". Build requirement is passed by cmake command line
    such as "-DARROW_SIMD_LEVEL=AVX2". Combining these two factors, we can
    define SIMD feature macros ARROW_HAVE_AVX2, which controls conditional
    compiling of related SIMD implementations in source code.
    
    Currently we set compiler options(e.g. -msse4.2) in cmake but define
    SIMD features by checking compiler macros in source code like below:
      #if defined(__SSE4_2__)
      #define ARROW_HAVE_SSE4_2 1
      #endif
    Putting them together in cmake eases maintenance.
    
    Closes #6794 from cyb70289/simd
    
    Authored-by: Yibo Cai <yi...@arm.com>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 cpp/cmake_modules/DefineOptions.cmake  |   4 ++
 cpp/cmake_modules/SetupCxxFlags.cmake  | 108 ++++++++++++++++++++++++---------
 cpp/src/arrow/json/rapidjson_defs.h    |   5 +-
 cpp/src/arrow/util/bpacking.h          |   2 +-
 cpp/src/arrow/util/byte_stream_split.h |   2 +-
 cpp/src/arrow/util/cpu_info.cc         |   4 +-
 cpp/src/arrow/util/hash_util.h         |   4 +-
 cpp/src/arrow/util/neon_util.h         |  35 +++--------
 cpp/src/arrow/util/sse_util.h          |  20 +-----
 cpp/src/parquet/encoding.cc            |   6 +-
 cpp/src/parquet/encoding_benchmark.cc  |   2 +-
 11 files changed, 105 insertions(+), 87 deletions(-)

diff --git a/cpp/cmake_modules/DefineOptions.cmake b/cpp/cmake_modules/DefineOptions.cmake
index 9e12ac9..dbcc5b6 100644
--- a/cpp/cmake_modules/DefineOptions.cmake
+++ b/cpp/cmake_modules/DefineOptions.cmake
@@ -106,6 +106,10 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
                        "AVX2"
                        "AVX512")
 
+  # Arm64 architectures and extensions can lead to exploding combinations.
+  # So set it directly through cmake command line.
+  define_option_string(ARROW_ARMV8_ARCH "Arm64 arch and extensions" "armv8-a+crc+crypto")
+
   define_option(ARROW_ALTIVEC "Build with Altivec if compiler has support" ON)
 
   define_option(ARROW_RPATH_ORIGIN "Build Arrow libraries with RATH set to \$ORIGIN" OFF)
diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake
index 046311d..22994e0 100644
--- a/cpp/cmake_modules/SetupCxxFlags.cmake
+++ b/cpp/cmake_modules/SetupCxxFlags.cmake
@@ -18,17 +18,39 @@
 # Check if the target architecture and compiler supports some special
 # instruction sets that would boost performance.
 include(CheckCXXCompilerFlag)
-# x86/amd64 compiler flags
-check_cxx_compiler_flag("-msse4.2" CXX_SUPPORTS_SSE4_2)
-check_cxx_compiler_flag("-mavx2" CXX_SUPPORTS_AVX2)
-check_cxx_compiler_flag("-mavx512f" CXX_SUPPORTS_AVX512)
-# power compiler flags
-check_cxx_compiler_flag("-maltivec" CXX_SUPPORTS_ALTIVEC)
-# Arm64 compiler flags
-set(ARROW_ARMV8_CRC_FLAG "-march=armv8-a+crc")
-check_cxx_compiler_flag(${ARROW_ARMV8_CRC_FLAG} CXX_SUPPORTS_ARMCRC)
-set(ARROW_ARMV8_CRC_CRYPTO_FLAG "-march=armv8-a+crc+crypto")
-check_cxx_compiler_flag(${ARROW_ARMV8_CRC_CRYPTO_FLAG} CXX_SUPPORTS_ARMV8_CRC_CRYPTO)
+# Get cpu architecture
+set(ARROW_CPU_FLAG "x86")
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|ARM64")
+  set(ARROW_CPU_FLAG "arm")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc")
+  set(ARROW_CPU_FLAG "ppc")
+endif()
+# Check architecture specific compiler flags
+if(ARROW_CPU_FLAG STREQUAL "x86")
+  # x86/amd64 compiler flags, msvc/gcc/clang
+  if(MSVC)
+    set(ARROW_SSE4_2_FLAG "")
+    set(ARROW_AVX2_FLAG "/arch:AVX2")
+    set(ARROW_AVX512_FLAG "/arch:AVX512")
+    set(CXX_SUPPORTS_SSE4_2 TRUE)
+  else()
+    set(ARROW_SSE4_2_FLAG "-msse4.2")
+    set(ARROW_AVX2_FLAG "-mavx2")
+    # skylake-avx512 consists of AVX512F,AVX512BW,AVX512VL,AVX512CD,AVX512DQ
+    set(ARROW_AVX512_FLAG "-march=skylake-avx512")
+    check_cxx_compiler_flag(${ARROW_SSE4_2_FLAG} CXX_SUPPORTS_SSE4_2)
+  endif()
+  check_cxx_compiler_flag(${ARROW_AVX2_FLAG} CXX_SUPPORTS_AVX2)
+  check_cxx_compiler_flag(${ARROW_AVX512_FLAG} CXX_SUPPORTS_AVX512)
+elseif(ARROW_CPU_FLAG STREQUAL "ppc")
+  # power compiler flags, gcc/clang only
+  set(ARROW_ALTIVEC_FLAG "-maltivec")
+  check_cxx_compiler_flag(${ARROW_ALTIVEC_FLAG} CXX_SUPPORTS_ALTIVEC)
+elseif(ARROW_CPU_FLAG STREQUAL "arm")
+  # Arm64 compiler flags, gcc/clang only
+  set(ARROW_ARMV8_ARCH_FLAG "-march=${ARROW_ARMV8_ARCH}")
+  check_cxx_compiler_flag(${ARROW_ARMV8_ARCH_FLAG} CXX_SUPPORTS_ARMV8_ARCH)
+endif()
 
 # Support C11
 set(CMAKE_C_STANDARD 11)
@@ -274,29 +296,59 @@ if(BUILD_WARNING_FLAGS)
 endif(BUILD_WARNING_FLAGS)
 
 # Only enable additional instruction sets if they are supported
-if(CXX_SUPPORTS_AVX512 AND ARROW_SIMD_LEVEL STREQUAL "AVX512")
-  # skylake-avx512 consist of AVX512F,AVX512BW,AVX512VL,AVX512CD,AVX512DQ
-  set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -march=skylake-avx512")
-elseif(CXX_SUPPORTS_AVX2 AND ARROW_SIMD_LEVEL STREQUAL "AVX2")
-  set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -mavx2")
-elseif(CXX_SUPPORTS_SSE4_2 AND ARROW_SIMD_LEVEL STREQUAL "SSE4_2")
-  set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -msse4.2")
+if(ARROW_CPU_FLAG STREQUAL "x86" AND ARROW_USE_SIMD)
+  if(ARROW_SIMD_LEVEL STREQUAL "AVX512")
+    if(NOT CXX_SUPPORTS_AVX512)
+      message(FATAL_ERROR "AVX512 required but compiler doesn't support it.")
+    endif()
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_AVX512_FLAG}")
+    add_definitions(-DARROW_HAVE_AVX512 -DARROW_HAVE_AVX2 -DARROW_HAVE_SSE4_2)
+  elseif(ARROW_SIMD_LEVEL STREQUAL "AVX2")
+    if(NOT CXX_SUPPORTS_AVX2)
+      message(FATAL_ERROR "AVX2 required but compiler doesn't support it.")
+    endif()
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_AVX2_FLAG}")
+    add_definitions(-DARROW_HAVE_AVX2 -DARROW_HAVE_SSE4_2)
+  elseif(ARROW_SIMD_LEVEL STREQUAL "SSE4_2")
+    if(NOT CXX_SUPPORTS_SSE4_2)
+      message(FATAL_ERROR "SSE4.2 required but compiler doesn't support it.")
+    endif()
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_SSE4_2_FLAG}")
+    add_definitions(-DARROW_HAVE_SSE4_2)
+  endif()
 endif()
 
-if(CXX_SUPPORTS_ALTIVEC AND ARROW_ALTIVEC)
-  set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -maltivec")
+if(ARROW_CPU_FLAG STREQUAL "ppc" AND ARROW_USE_SIMD)
+  if(CXX_SUPPORTS_ALTIVEC AND ARROW_ALTIVEC)
+    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_ALTIVEC_FLAG}")
+  endif()
 endif()
 
-if(CXX_SUPPORTS_ARMCRC)
-  if(CXX_SUPPORTS_ARMV8_CRC_CRYPTO)
-    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_ARMV8_CRC_CRYPTO_FLAG}")
-  else()
-    set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_ARMV8_CRC_FLAG}")
+if(ARROW_CPU_FLAG STREQUAL "arm")
+  if(NOT CXX_SUPPORTS_ARMV8_ARCH)
+    message(FATAL_ERROR "Unsupported arch flag: ${ARROW_ARMV8_ARCH_FLAG}.")
+  endif()
+  if(ARROW_ARMV8_ARCH_FLAG MATCHES "native")
+    message(FATAL_ERROR "native arch not allowed, please specify arch explicitly.")
+  endif()
+  set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} ${ARROW_ARMV8_ARCH_FLAG}")
+
+  if(ARROW_USE_SIMD)
+    add_definitions(-DARROW_HAVE_NEON)
   endif()
-endif()
 
-if(ARROW_USE_SIMD)
-  add_definitions(-DARROW_USE_SIMD)
+  if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU"
+     AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "5.4")
+    message(WARNING "Disable Armv8 CRC and Crypto as compiler doesn't support them well.")
+  else()
+    if(ARROW_ARMV8_ARCH_FLAG MATCHES "\\+crypto")
+      add_definitions(-DARROW_HAVE_ARMV8_CRYPTO)
+    endif()
+    # armv8.1+ implies crc support
+    if(ARROW_ARMV8_ARCH_FLAG MATCHES "armv8\\.[1-9]|\\+crc")
+      add_definitions(-DARROW_HAVE_ARMV8_CRC)
+    endif()
+  endif()
 endif()
 
 # ----------------------------------------------------------------------
diff --git a/cpp/src/arrow/json/rapidjson_defs.h b/cpp/src/arrow/json/rapidjson_defs.h
index 67ee318..5b52669 100644
--- a/cpp/src/arrow/json/rapidjson_defs.h
+++ b/cpp/src/arrow/json/rapidjson_defs.h
@@ -36,11 +36,8 @@
 #include "arrow/util/sse_util.h"
 
 // enable SIMD whitespace skipping, if available
-#if defined(ARROW_HAVE_SSE2)
-#define RAPIDJSON_SSE2 1
-#endif
-
 #if defined(ARROW_HAVE_SSE4_2)
+#define RAPIDJSON_SSE2 1
 #define RAPIDJSON_SSE42 1
 #endif
 
diff --git a/cpp/src/arrow/util/bpacking.h b/cpp/src/arrow/util/bpacking.h
index 7dab2cf..a7730cb 100644
--- a/cpp/src/arrow/util/bpacking.h
+++ b/cpp/src/arrow/util/bpacking.h
@@ -19,7 +19,7 @@
 
 #include "arrow/util/logging.h"
 #include "arrow/util/ubsan.h"
-#if defined(__AVX512F__)
+#if defined(ARROW_HAVE_AVX512)
 #include "arrow/util/bpacking_avx512_generated.h"
 #else
 #include "arrow/util/bpacking_default.h"
diff --git a/cpp/src/arrow/util/byte_stream_split.h b/cpp/src/arrow/util/byte_stream_split.h
index 08cc11b..8cf89ef 100644
--- a/cpp/src/arrow/util/byte_stream_split.h
+++ b/cpp/src/arrow/util/byte_stream_split.h
@@ -28,7 +28,7 @@ namespace arrow {
 namespace util {
 namespace internal {
 
-#if defined(ARROW_HAVE_SSE2)
+#if defined(ARROW_HAVE_SSE4_2)
 
 template <typename T>
 void ByteStreamSplitDecodeSSE2(const uint8_t* data, int64_t num_values, int64_t stride,
diff --git a/cpp/src/arrow/util/cpu_info.cc b/cpp/src/arrow/util/cpu_info.cc
index 7c1617e..2d77379 100644
--- a/cpp/src/arrow/util/cpu_info.cc
+++ b/cpp/src/arrow/util/cpu_info.cc
@@ -341,7 +341,7 @@ void CpuInfo::VerifyCpuRequirements() {
     DCHECK(false) << "CPU does not support the Supplemental SSE3 instruction set";
   }
 #endif
-#if defined(__aarch64__)
+#if defined(ARROW_HAVE_NEON)
   if (!IsSupported(CpuInfo::ASIMD)) {
     DCHECK(false) << "CPU does not support the Armv8 Neon instruction set";
   }
@@ -349,7 +349,7 @@ void CpuInfo::VerifyCpuRequirements() {
 }
 
 bool CpuInfo::CanUseSSE4_2() const {
-#if defined(ARROW_HAVE_SSE4_2) && defined(ARROW_USE_SIMD)
+#if defined(ARROW_HAVE_SSE4_2)
   return IsSupported(CpuInfo::SSE4_2);
 #else
   return false;
diff --git a/cpp/src/arrow/util/hash_util.h b/cpp/src/arrow/util/hash_util.h
index f102be8..c5b8701 100644
--- a/cpp/src/arrow/util/hash_util.h
+++ b/cpp/src/arrow/util/hash_util.h
@@ -55,7 +55,7 @@ static inline uint32_t HW_crc32_u64(uint32_t crc, uint64_t v) {
 #define HW_crc32_u16 SSE4_crc32_u16
 #define HW_crc32_u32 SSE4_crc32_u32
 #define HW_crc32_u64 SSE4_crc32_u64
-#elif defined(ARROW_HAVE_ARM_CRC)
+#elif defined(ARROW_HAVE_ARMV8_CRC)
 #define HW_crc32_u8 ARMCE_crc32_u8
 #define HW_crc32_u16 ARMCE_crc32_u16
 #define HW_crc32_u32 ARMCE_crc32_u32
@@ -67,7 +67,7 @@ namespace arrow {
 /// Utility class to compute hash values.
 class HashUtil {
  public:
-#if defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_ARM_CRC)
+#if defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_ARMV8_CRC)
   static constexpr bool have_hardware_crc32 = true;
 #else
   static constexpr bool have_hardware_crc32 = false;
diff --git a/cpp/src/arrow/util/neon_util.h b/cpp/src/arrow/util/neon_util.h
index d56e20e..a82c2f65 100644
--- a/cpp/src/arrow/util/neon_util.h
+++ b/cpp/src/arrow/util/neon_util.h
@@ -17,38 +17,17 @@
 
 #pragma once
 
-namespace arrow {
-
-#if defined(__aarch64__) || defined(__AARCH64__)
-
-#ifdef __ARM_NEON
-#define ARROW_HAVE_NEON
+#ifdef ARROW_HAVE_NEON
+#include <arm_neon.h>
 #endif
 
-#ifdef __ARM_FEATURE_CRC32
-#define ARROW_HAVE_ARM_CRC
+#ifdef ARROW_HAVE_ARMV8_CRC
 #include <arm_acle.h>
+#endif
 
-#ifdef __ARM_FEATURE_CRYPTO
-#include <arm_neon.h>
-#define ARROW_HAVE_ARMV8_CRYPTO
-#endif  // __ARM_FEATURE_CRYPTO
-
-#endif  // __ARM_FEATURE_CRC32
-
-#endif  // defined(__aarch64__) || defined(__AARCH64__)
-
-#if defined(__GNUC__) && defined(__linux__) && defined(ARROW_HAVE_ARM_CRC)
+namespace arrow {
 
-#include <asm/hwcap.h>
-#include <sys/auxv.h>
-#ifndef HWCAP_CRC32
-#define HWCAP_CRC32 (1 << 7)
-#endif
-static inline uint32_t crc32c_runtime_check(void) {
-  uint64_t auxv = getauxval(AT_HWCAP);
-  return (auxv & HWCAP_CRC32) != 0;
-}
+#ifdef ARROW_HAVE_ARMV8_CRC
 
 static inline uint32_t ARMCE_crc32_u8(uint32_t crc, uint8_t v) {
   return __crc32cb(crc, v);
@@ -66,6 +45,6 @@ static inline uint32_t ARMCE_crc32_u64(uint32_t crc, uint64_t v) {
   return __crc32cd(crc, v);
 }
 
-#endif  // defined(__GNUC__) && defined(__linux__) && defined(ARROW_HAVE_ARM_CRC)
+#endif  // ARROW_HAVE_ARMV8_CRC
 
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/sse_util.h b/cpp/src/arrow/util/sse_util.h
index 6f451fd..d8b3f22 100644
--- a/cpp/src/arrow/util/sse_util.h
+++ b/cpp/src/arrow/util/sse_util.h
@@ -22,31 +22,17 @@
 
 #include "arrow/util/macros.h"
 
-#ifdef ARROW_USE_SIMD
+#ifdef ARROW_HAVE_SSE4_2
 
 // MSVC x86-64
-
 #if (defined(_M_AMD64) || defined(_M_X64))
-#define ARROW_HAVE_SSE2 1
-#define ARROW_HAVE_SSE4_2 1
 #include <intrin.h>
-#endif
-
+#else
 // gcc/clang (possibly others)
-
-#if defined(__SSE2__)
-#define ARROW_HAVE_SSE2 1
-#include <emmintrin.h>
-#endif
-
-#if defined(__SSE4_2__)
-#define ARROW_HAVE_SSE4_2 1
 #include <nmmintrin.h>
 #endif
 
-#endif  // ARROW_USE_SIMD
-
-// MSVC x86-64
+#endif  // ARROW_HAVE_SSE4_2
 
 namespace arrow {
 
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index d3e5b3c..c70d26a 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -863,7 +863,7 @@ std::shared_ptr<Buffer> ByteStreamSplitEncoder<DType>::FlushValues() {
   uint8_t* output_buffer_raw = output_buffer->mutable_data();
   const size_t num_values = values_.length();
   const uint8_t* raw_values = reinterpret_cast<const uint8_t*>(values_.data());
-#if defined(ARROW_HAVE_SSE2)
+#if defined(ARROW_HAVE_SSE4_2)
   arrow::util::internal::ByteStreamSplitEncodeSSE2<T>(raw_values, num_values,
                                                       output_buffer_raw);
 #else
@@ -2347,7 +2347,7 @@ int ByteStreamSplitDecoder<DType>::Decode(T* buffer, int max_values) {
   const int num_decoded_previously = num_values_in_buffer_ - num_values_;
   const uint8_t* data = data_ + num_decoded_previously;
 
-#if defined(ARROW_HAVE_SSE2)
+#if defined(ARROW_HAVE_SSE4_2)
   arrow::util::internal::ByteStreamSplitDecodeSSE2<T>(data, values_to_decode,
                                                       num_values_in_buffer_, buffer);
 #else
@@ -2375,7 +2375,7 @@ int ByteStreamSplitDecoder<DType>::DecodeArrow(
   const uint8_t* data = data_ + num_decoded_previously;
   int offset = 0;
 
-#if defined(ARROW_HAVE_SSE2)
+#if defined(ARROW_HAVE_SSE4_2)
   // Use fast decoding into intermediate buffer.  This will also decode
   // some null values, but it's fast enough that we don't care.
   T* decode_out = EnsureDecodeBuffer(values_decoded);
diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc
index a4e1176..c272aa7 100644
--- a/cpp/src/parquet/encoding_benchmark.cc
+++ b/cpp/src/parquet/encoding_benchmark.cc
@@ -251,7 +251,7 @@ BENCHMARK(BM_ByteStreamSplitDecode_Double_Scalar)->Range(MIN_RANGE, MAX_RANGE);
 BENCHMARK(BM_ByteStreamSplitEncode_Float_Scalar)->Range(MIN_RANGE, MAX_RANGE);
 BENCHMARK(BM_ByteStreamSplitEncode_Double_Scalar)->Range(MIN_RANGE, MAX_RANGE);
 
-#if defined(ARROW_HAVE_SSE2)
+#if defined(ARROW_HAVE_SSE4_2)
 static void BM_ByteStreamSplitDecode_Float_SSE2(benchmark::State& state) {
   BM_ByteStreamSplitDecode<float>(
       state, arrow::util::internal::ByteStreamSplitDecodeSSE2<float>);