You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/09/17 16:09:38 UTC

[arrow] branch master updated: ARROW-3242: [C++] Make CpuInfo a singleton, use coarser-grained dispatch to SSE4 in Parquet dictionary encoding

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new c698be3  ARROW-3242: [C++] Make CpuInfo a singleton, use coarser-grained dispatch to SSE4 in Parquet dictionary encoding
c698be3 is described below

commit c698be339b96aeb74763d70de1cf4c8789148824
Author: Wes McKinney <we...@apache.org>
AuthorDate: Mon Sep 17 12:09:24 2018 -0400

    ARROW-3242: [C++] Make CpuInfo a singleton, use coarser-grained dispatch to SSE4 in Parquet dictionary encoding
    
    Rather than having a bunch of static state in a header file (which is not a great pattern anyway), this makes `arrow::CpuInfo` a singleton. I added support for vector-level dispatch to SSE4 hashing in the Parquet dictionary encoder.
    
    This solves ARROW-3241 for me.
    
    NB. This is precisely the kind of change that is now radically simpler after the monorepo merge.
    
    Author: Wes McKinney <we...@apache.org>
    
    Closes #2571 from wesm/ARROW-3242 and squashes the following commits:
    
    cc66a4064 <Wes McKinney> Remove cruft from mid-refactor
    c09aaea50 <Wes McKinney> Do not link librt on Apple platform
    abacaf4b8 <Wes McKinney> Make CpuInfo a singleton, use coarser-grained dispatch to SSE4 in Parquet dictionary encoding
---
 cpp/CMakeLists.txt                    |  6 ++-
 cpp/cmake_modules/SetupCxxFlags.cmake |  4 ++
 cpp/src/arrow/builder.cc              | 31 ++++++--------
 cpp/src/arrow/compute/context.cc      |  7 +---
 cpp/src/arrow/compute/context.h       |  6 +++
 cpp/src/arrow/compute/kernels/hash.cc |  9 ++--
 cpp/src/arrow/util/cpu-info.cc        | 79 +++++++++++++++--------------------
 cpp/src/arrow/util/cpu-info.h         | 54 +++++++++++++-----------
 cpp/src/arrow/util/hash-util.h        | 37 +++++++---------
 cpp/src/arrow/util/sse-util.h         |  2 +-
 cpp/src/parquet/encoding-benchmark.cc |  3 +-
 cpp/src/parquet/encoding-internal.h   | 68 ++++++++++++++++++++----------
 12 files changed, 161 insertions(+), 145 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 9eb37d2..0cb52cb 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -690,7 +690,11 @@ if (ARROW_JEMALLOC)
   add_definitions(-DARROW_JEMALLOC)
   add_definitions(-DARROW_JEMALLOC_INCLUDE_DIR=${JEMALLOC_INCLUDE_DIR})
 
-  if (CMAKE_COMPILER_IS_GNUCXX AND PTHREAD_LIBRARY)
+  # If using gcc or clang on Linux, we need to link pthread for older Linuxes,
+  # including distros as new as Ubuntu 14.04
+  if ((CMAKE_COMPILER_IS_GNUCXX OR
+        (NOT APPLE AND CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
+      AND PTHREAD_LIBRARY)
     set(ARROW_JEMALLOC_LINK_LIBS
       jemalloc_static
       # For glibc <2.17 we need to link to librt.
diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake
index aee2654..a707a21 100644
--- a/cpp/cmake_modules/SetupCxxFlags.cmake
+++ b/cpp/cmake_modules/SetupCxxFlags.cmake
@@ -212,6 +212,10 @@ if (CXX_SUPPORTS_ALTIVEC AND ARROW_ALTIVEC)
   set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -maltivec")
 endif()
 
+if (ARROW_USE_SSE)
+  add_definitions(-DARROW_USE_SSE)
+endif()
+
 if (APPLE)
   # Depending on the default OSX_DEPLOYMENT_TARGET (< 10.9), libstdc++ may be
   # the default standard library which does not support C++11. libc++ is the
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 65d1ea7..677f2fd 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -33,12 +33,17 @@
 #include "arrow/type_traits.h"
 #include "arrow/util/bit-util.h"
 #include "arrow/util/checked_cast.h"
-#include "arrow/util/cpu-info.h"
 #include "arrow/util/decimal.h"
 #include "arrow/util/hash-util.h"
 #include "arrow/util/hash.h"
 #include "arrow/util/logging.h"
 
+#ifdef ARROW_USE_SSE
+#define SSE4_FLAG true
+#else
+#define SSE4_FLAG false
+#endif
+
 namespace arrow {
 
 using internal::AdaptiveIntBuilderBase;
@@ -776,7 +781,7 @@ struct DictionaryHashHelper<T, enable_if_has_c_type<T>> {
 
   // Compute the hash of a scalar value
   static int64_t HashValue(const Scalar& value, int byte_width) {
-    return HashUtil::Hash(&value, sizeof(Scalar), 0);
+    return HashUtil::Hash<SSE4_FLAG>(&value, sizeof(Scalar), 0);
   }
 
   // Return whether the dictionary value at the given builder index is unequal to value
@@ -810,7 +815,7 @@ struct DictionaryHashHelper<T, enable_if_binary<T>> {
   }
 
   static int64_t HashValue(const Scalar& value, int byte_width) {
-    return HashUtil::Hash(value.ptr_, value.length_, 0);
+    return HashUtil::Hash<SSE4_FLAG>(value.ptr_, value.length_, 0);
   }
 
   static bool SlotDifferent(const Builder& builder, int64_t index, const Scalar& value) {
@@ -846,7 +851,7 @@ struct DictionaryHashHelper<T, enable_if_fixed_size_binary<T>> {
   }
 
   static int64_t HashValue(const Scalar& value, int byte_width) {
-    return HashUtil::Hash(value, byte_width, 0);
+    return HashUtil::Hash<SSE4_FLAG>(value, byte_width, 0);
   }
 
   static bool SlotDifferent(const Builder& builder, int64_t index, const uint8_t* value) {
@@ -879,19 +884,11 @@ DictionaryBuilder<T>::DictionaryBuilder(const std::shared_ptr<DataType>& type,
       dict_builder_(type, pool),
       overflow_dict_builder_(type, pool),
       values_builder_(pool),
-      byte_width_(-1) {
-  if (!::arrow::CpuInfo::initialized()) {
-    ::arrow::CpuInfo::Init();
-  }
-}
+      byte_width_(-1) {}
 
 DictionaryBuilder<NullType>::DictionaryBuilder(const std::shared_ptr<DataType>& type,
                                                MemoryPool* pool)
-    : ArrayBuilder(type, pool), values_builder_(pool) {
-  if (!::arrow::CpuInfo::initialized()) {
-    ::arrow::CpuInfo::Init();
-  }
-}
+    : ArrayBuilder(type, pool), values_builder_(pool) {}
 
 template <>
 DictionaryBuilder<FixedSizeBinaryType>::DictionaryBuilder(
@@ -901,11 +898,7 @@ DictionaryBuilder<FixedSizeBinaryType>::DictionaryBuilder(
       dict_builder_(type, pool),
       overflow_dict_builder_(type, pool),
       values_builder_(pool),
-      byte_width_(checked_cast<const FixedSizeBinaryType&>(*type).byte_width()) {
-  if (!::arrow::CpuInfo::initialized()) {
-    ::arrow::CpuInfo::Init();
-  }
-}
+      byte_width_(checked_cast<const FixedSizeBinaryType&>(*type).byte_width()) {}
 
 template <typename T>
 void DictionaryBuilder<T>::Reset() {
diff --git a/cpp/src/arrow/compute/context.cc b/cpp/src/arrow/compute/context.cc
index 63aa341..ab2b595 100644
--- a/cpp/src/arrow/compute/context.cc
+++ b/cpp/src/arrow/compute/context.cc
@@ -25,11 +25,8 @@
 namespace arrow {
 namespace compute {
 
-FunctionContext::FunctionContext(MemoryPool* pool) : pool_(pool) {
-  if (!::arrow::CpuInfo::initialized()) {
-    ::arrow::CpuInfo::Init();
-  }
-}
+FunctionContext::FunctionContext(MemoryPool* pool)
+    : pool_(pool), cpu_info_(CpuInfo::GetInstance()) {}
 
 MemoryPool* FunctionContext::memory_pool() const { return pool_; }
 
diff --git a/cpp/src/arrow/compute/context.h b/cpp/src/arrow/compute/context.h
index 0983819..7df61e6 100644
--- a/cpp/src/arrow/compute/context.h
+++ b/cpp/src/arrow/compute/context.h
@@ -26,6 +26,9 @@
 #include "arrow/util/visibility.h"
 
 namespace arrow {
+
+class CpuInfo;
+
 namespace compute {
 
 #define RETURN_IF_ERROR(ctx)                  \
@@ -60,9 +63,12 @@ class ARROW_EXPORT FunctionContext {
   /// \brief Return the current status of the context
   const Status& status() const { return status_; }
 
+  CpuInfo* cpu_info() const { return cpu_info_; }
+
  private:
   Status status_;
   MemoryPool* pool_;
+  CpuInfo* cpu_info_;
 };
 
 }  // namespace compute
diff --git a/cpp/src/arrow/compute/kernels/hash.cc b/cpp/src/arrow/compute/kernels/hash.cc
index 4004f8d..212f788 100644
--- a/cpp/src/arrow/compute/kernels/hash.cc
+++ b/cpp/src/arrow/compute/kernels/hash.cc
@@ -37,6 +37,9 @@
 namespace arrow {
 namespace compute {
 
+// TODO(wesm): Enable top-level dispatch to SSE4 hashing if it is enabled
+#define HASH_USE_SSE false
+
 namespace {
 
 enum class SIMDMode : char { NOSIMD, SSE4, AVX2 };
@@ -298,7 +301,7 @@ class HashTableKernel<
  protected:
   int64_t HashValue(const T& value) const {
     // TODO(wesm): Use faster hash function for C types
-    return HashUtil::Hash(&value, sizeof(T), 0);
+    return HashUtil::Hash<HASH_USE_SSE>(&value, sizeof(T), 0);
   }
 
   Status DoubleTableSize() {
@@ -489,7 +492,7 @@ class HashTableKernel<Type, Action, enable_if_binary<Type>> : public HashTable {
 
  protected:
   int64_t HashValue(const uint8_t* data, int32_t length) const {
-    return HashUtil::Hash(data, length, 0);
+    return HashUtil::Hash<HASH_USE_SSE>(data, length, 0);
   }
 
   Status DoubleTableSize() {
@@ -595,7 +598,7 @@ class HashTableKernel<Type, Action, enable_if_fixed_size_binary<Type>>
 
  protected:
   int64_t HashValue(const uint8_t* data) const {
-    return HashUtil::Hash(data, byte_width_, 0);
+    return HashUtil::Hash<HASH_USE_SSE>(data, byte_width_, 0);
   }
 
   Status DoubleTableSize() {
diff --git a/cpp/src/arrow/util/cpu-info.cc b/cpp/src/arrow/util/cpu-info.cc
index 822fcae..9280ac8 100644
--- a/cpp/src/arrow/util/cpu-info.cc
+++ b/cpp/src/arrow/util/cpu-info.cc
@@ -44,6 +44,7 @@
 #include <algorithm>
 #include <cstdint>
 #include <fstream>
+#include <memory>
 #include <mutex>
 #include <string>
 
@@ -52,21 +53,11 @@
 using boost::algorithm::contains;
 using boost::algorithm::trim;
 using std::max;
-using std::string;
 
 namespace arrow {
 
-bool CpuInfo::initialized_ = false;
-int64_t CpuInfo::hardware_flags_ = 0;
-int64_t CpuInfo::original_hardware_flags_;
-int64_t CpuInfo::cache_sizes_[L3_CACHE + 1];
-int64_t CpuInfo::cycles_per_ms_;
-int CpuInfo::num_cores_ = 1;
-string CpuInfo::model_name_ = "unknown";  // NOLINT
-static std::mutex cpuinfo_mutex;
-
 static struct {
-  string name;
+  std::string name;
   int64_t flag;
 } flag_mappings[] = {
     {"ssse3", CpuInfo::SSSE3},
@@ -82,7 +73,7 @@ namespace {
 // values contains a list of space-seperated flags.  check to see if the flags we
 // care about are present.
 // Returns a bitmap of flags.
-int64_t ParseCPUFlags(const string& values) {
+int64_t ParseCPUFlags(const std::string& values) {
   int64_t flags = 0;
   for (int i = 0; i < num_flags; ++i) {
     if (contains(values, flag_mappings[i].name)) {
@@ -181,16 +172,24 @@ bool RetrieveCPUInfo(int64_t* hardware_flags, std::string* model_name) {
 }
 #endif
 
-void CpuInfo::Init() {
-  std::lock_guard<std::mutex> cpuinfo_lock(cpuinfo_mutex);
+CpuInfo::CpuInfo() : hardware_flags_(0), num_cores_(1), model_name_("unknown") {}
+
+std::unique_ptr<CpuInfo> g_cpu_info;
+static std::mutex cpuinfo_mutex;
 
-  if (initialized()) {
-    return;
+CpuInfo* CpuInfo::GetInstance() {
+  std::lock_guard<std::mutex> lock(cpuinfo_mutex);
+  if (!g_cpu_info) {
+    g_cpu_info.reset(new CpuInfo);
+    g_cpu_info->Init();
   }
+  return g_cpu_info.get();
+}
 
-  string line;
-  string name;
-  string value;
+void CpuInfo::Init() {
+  std::string line;
+  std::string name;
+  std::string value;
 
   float max_mhz = 0;
   int num_cores = 0;
@@ -212,9 +211,9 @@ void CpuInfo::Init() {
   while (cpuinfo) {
     getline(cpuinfo, line);
     size_t colon = line.find(':');
-    if (colon != string::npos) {
+    if (colon != std::string::npos) {
       name = line.substr(0, colon - 1);
-      value = line.substr(colon + 1, string::npos);
+      value = line.substr(colon + 1, std::string::npos);
       trim(name);
       trim(value);
       if (name.compare("flags") == 0) {
@@ -270,18 +269,23 @@ void CpuInfo::Init() {
   } else {
     num_cores_ = 1;
   }
-
-  initialized_ = true;
 }
 
 void CpuInfo::VerifyCpuRequirements() {
-  if (!CpuInfo::IsSupported(CpuInfo::SSSE3)) {
+  if (!IsSupported(CpuInfo::SSSE3)) {
     DCHECK(false) << "CPU does not support the Supplemental SSE3 instruction set";
   }
 }
 
+bool CpuInfo::CanUseSSE4_2() const {
+#ifdef ARROW_USE_SSE
+  return IsSupported(CpuInfo::SSE4_2);
+#else
+  return false;
+#endif
+}
+
 void CpuInfo::EnableFeature(int64_t flag, bool enable) {
-  DCHECK(initialized_);
   if (!enable) {
     hardware_flags_ &= ~flag;
   } else {
@@ -291,30 +295,15 @@ void CpuInfo::EnableFeature(int64_t flag, bool enable) {
   }
 }
 
-int64_t CpuInfo::hardware_flags() {
-  DCHECK(initialized_);
-  return hardware_flags_;
-}
+int64_t CpuInfo::hardware_flags() { return hardware_flags_; }
 
-int64_t CpuInfo::CacheSize(CacheLevel level) {
-  DCHECK(initialized_);
-  return cache_sizes_[level];
-}
+int64_t CpuInfo::CacheSize(CacheLevel level) { return cache_sizes_[level]; }
 
-int64_t CpuInfo::cycles_per_ms() {
-  DCHECK(initialized_);
-  return cycles_per_ms_;
-}
+int64_t CpuInfo::cycles_per_ms() { return cycles_per_ms_; }
 
-int CpuInfo::num_cores() {
-  DCHECK(initialized_);
-  return num_cores_;
-}
+int CpuInfo::num_cores() { return num_cores_; }
 
-std::string CpuInfo::model_name() {
-  DCHECK(initialized_);
-  return model_name_;
-}
+std::string CpuInfo::model_name() { return model_name_; }
 
 void CpuInfo::SetDefaultCacheSize() {
 #ifndef _SC_LEVEL1_DCACHE_SIZE
diff --git a/cpp/src/arrow/util/cpu-info.h b/cpp/src/arrow/util/cpu-info.h
index f4bc8c3..dee6a52 100644
--- a/cpp/src/arrow/util/cpu-info.h
+++ b/cpp/src/arrow/util/cpu-info.h
@@ -34,10 +34,10 @@ namespace arrow {
 /// /sys/devices)
 class ARROW_EXPORT CpuInfo {
  public:
-  static const int64_t SSSE3 = (1 << 1);
-  static const int64_t SSE4_1 = (1 << 2);
-  static const int64_t SSE4_2 = (1 << 3);
-  static const int64_t POPCNT = (1 << 4);
+  static constexpr int64_t SSSE3 = (1 << 1);
+  static constexpr int64_t SSE4_1 = (1 << 2);
+  static constexpr int64_t SSE4_2 = (1 << 3);
+  static constexpr int64_t POPCNT = (1 << 4);
 
   /// Cache enums for L1 (data), L2 and L3
   enum CacheLevel {
@@ -46,48 +46,52 @@ class ARROW_EXPORT CpuInfo {
     L3_CACHE = 2,
   };
 
-  /// Initialize CpuInfo.
-  static void Init();
+  static CpuInfo* GetInstance();
 
   /// Determine if the CPU meets the minimum CPU requirements and if not, issue an error
   /// and terminate.
-  static void VerifyCpuRequirements();
+  void VerifyCpuRequirements();
 
   /// Returns all the flags for this cpu
-  static int64_t hardware_flags();
+  int64_t hardware_flags();
 
   /// Returns whether of not the cpu supports this flag
-  inline static bool IsSupported(int64_t flag) { return (hardware_flags_ & flag) != 0; }
+  bool IsSupported(int64_t flag) const { return (hardware_flags_ & flag) != 0; }
+
+  /// \brief The processor supports SSE4.2 and the Arrow libraries are built
+  /// with support for it
+  bool CanUseSSE4_2() const;
 
   /// Toggle a hardware feature on and off.  It is not valid to turn on a feature
   /// that the underlying hardware cannot support. This is useful for testing.
-  static void EnableFeature(int64_t flag, bool enable);
+  void EnableFeature(int64_t flag, bool enable);
 
   /// Returns the size of the cache in KB at this cache level
-  static int64_t CacheSize(CacheLevel level);
+  int64_t CacheSize(CacheLevel level);
 
   /// Returns the number of cpu cycles per millisecond
-  static int64_t cycles_per_ms();
+  int64_t cycles_per_ms();
 
   /// Returns the number of cores (including hyper-threaded) on this machine.
-  static int num_cores();
+  int num_cores();
 
   /// Returns the model name of the cpu (e.g. Intel i7-2600)
-  static std::string model_name();
-
-  static bool initialized() { return initialized_; }
+  std::string model_name();
 
  private:
+  CpuInfo();
+
+  void Init();
+
   /// Inits CPU cache size variables with default values
-  static void SetDefaultCacheSize();
-
-  static bool initialized_;
-  static int64_t hardware_flags_;
-  static int64_t original_hardware_flags_;
-  static int64_t cache_sizes_[L3_CACHE + 1];
-  static int64_t cycles_per_ms_;
-  static int num_cores_;
-  static std::string model_name_;  // NOLINT
+  void SetDefaultCacheSize();
+
+  int64_t hardware_flags_;
+  int64_t original_hardware_flags_;
+  int64_t cache_sizes_[L3_CACHE + 1];
+  int64_t cycles_per_ms_;
+  int num_cores_;
+  std::string model_name_;
 };
 
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/hash-util.h b/cpp/src/arrow/util/hash-util.h
index 3bba07b..da23b8f 100644
--- a/cpp/src/arrow/util/hash-util.h
+++ b/cpp/src/arrow/util/hash-util.h
@@ -40,7 +40,6 @@ class HashUtil {
   /// The resulting hashes are correlated.
   /// TODO: update this to also use SSE4_crc32_u64 and SSE4_crc32_u16 where appropriate.
   static uint32_t CrcHash(const void* data, int32_t bytes, uint32_t hash) {
-    DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
     uint32_t words = static_cast<uint32_t>(bytes / sizeof(uint32_t));
     bytes = static_cast<int32_t>(bytes % sizeof(uint32_t));
 
@@ -64,7 +63,6 @@ class HashUtil {
 
   /// CrcHash() specialized for 1-byte data
   static inline uint32_t CrcHash1(const void* v, uint32_t hash) {
-    DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
     const uint8_t* s = reinterpret_cast<const uint8_t*>(v);
     hash = SSE4_crc32_u8(hash, *s);
     hash = (hash << 16) | (hash >> 16);
@@ -73,7 +71,6 @@ class HashUtil {
 
   /// CrcHash() specialized for 2-byte data
   static inline uint32_t CrcHash2(const void* v, uint32_t hash) {
-    DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
     const uint16_t* s = reinterpret_cast<const uint16_t*>(v);
     hash = SSE4_crc32_u16(hash, *s);
     hash = (hash << 16) | (hash >> 16);
@@ -82,7 +79,6 @@ class HashUtil {
 
   /// CrcHash() specialized for 4-byte data
   static inline uint32_t CrcHash4(const void* v, uint32_t hash) {
-    DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
     const uint32_t* p = reinterpret_cast<const uint32_t*>(v);
     hash = SSE4_crc32_u32(hash, *p);
     hash = (hash << 16) | (hash >> 16);
@@ -91,7 +87,6 @@ class HashUtil {
 
   /// CrcHash() specialized for 8-byte data
   static inline uint32_t CrcHash8(const void* v, uint32_t hash) {
-    DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
     const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
     hash = SSE4_crc32_u64(hash, *p);
     hash = (hash << 16) | (hash >> 16);
@@ -100,7 +95,6 @@ class HashUtil {
 
   /// CrcHash() specialized for 12-byte data
   static inline uint32_t CrcHash12(const void* v, uint32_t hash) {
-    DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
     const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
     hash = SSE4_crc32_u64(hash, *p);
     ++p;
@@ -111,7 +105,6 @@ class HashUtil {
 
   /// CrcHash() specialized for 16-byte data
   static inline uint32_t CrcHash16(const void* v, uint32_t hash) {
-    DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
     const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
     hash = SSE4_crc32_u64(hash, *p);
     ++p;
@@ -204,21 +197,9 @@ class HashUtil {
     return static_cast<uint32_t>((hash_u64 >> 32) ^ (hash_u64 & 0xFFFFFFFF));
   }
 
-  /// Computes the hash value for data.  Will call either CrcHash or MurmurHash
-  /// depending on hardware capabilities.
-  /// Seed values for different steps of the query execution should use different seeds
-  /// to prevent accidental key collisions. (See IMPALA-219 for more details).
-  static uint32_t Hash(const void* data, int32_t bytes, uint32_t seed) {
-#ifdef ARROW_USE_SSE
-    if (LIKELY(CpuInfo::IsSupported(CpuInfo::SSE4_2))) {
-      return CrcHash(data, bytes, seed);
-    } else {
-      return MurmurHash2_64(data, bytes, seed);
-    }
-#else
-    return static_cast<uint32_t>(MurmurHash2_64(data, bytes, seed));
-#endif
-  }
+  // With sse4.2
+  template <bool use_sse42 = true>
+  static inline int Hash(const void* data, int32_t bytes, uint32_t seed);
 
   /// The magic number (used in hash_combine()) 0x9e3779b9 = 2^32 / (golden ratio).
   static const uint32_t HASH_COMBINE_SEED = 0x9e3779b9;
@@ -253,6 +234,18 @@ class HashUtil {
   }
 };
 
+// With sse4.2
+template <>
+inline int HashUtil::Hash<true>(const void* data, int32_t bytes, uint32_t seed) {
+  return static_cast<int>(HashUtil::CrcHash(data, bytes, seed));
+}
+
+// Non-sse4 hash
+template <>
+inline int HashUtil::Hash<false>(const void* data, int32_t bytes, uint32_t seed) {
+  return static_cast<int>(HashUtil::MurmurHash2_64(data, bytes, seed));
+}
+
 }  // namespace arrow
 
 #endif  // ARROW_UTIL_HASH_UTIL_H
diff --git a/cpp/src/arrow/util/sse-util.h b/cpp/src/arrow/util/sse-util.h
index 32ac43f..50e38d7 100644
--- a/cpp/src/arrow/util/sse-util.h
+++ b/cpp/src/arrow/util/sse-util.h
@@ -123,7 +123,7 @@ static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) {
 static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) {
   uint64_t result = crc;
   __asm__("crc32q %1, %0" : "+r"(result) : "rm"(v));
-  return result;
+  return static_cast<uint32_t>(result);
 }
 
 static inline int64_t POPCNT_popcnt_u64(uint64_t a) {
diff --git a/cpp/src/parquet/encoding-benchmark.cc b/cpp/src/parquet/encoding-benchmark.cc
index 364cdba..e7309db 100644
--- a/cpp/src/parquet/encoding-benchmark.cc
+++ b/cpp/src/parquet/encoding-benchmark.cc
@@ -110,7 +110,8 @@ static void DecodeDict(std::vector<typename Type::c_type>& values,
 
   DictEncoder<Type> encoder(descr.get(), &pool, allocator);
   for (int i = 0; i < num_values; ++i) {
-    encoder.Put(values[i]);
+    // No SSE
+    encoder.template Put<false>(values[i]);
   }
 
   std::shared_ptr<ResizableBuffer> dict_buffer =
diff --git a/cpp/src/parquet/encoding-internal.h b/cpp/src/parquet/encoding-internal.h
index 2dfb9ff..0bfd26f 100644
--- a/cpp/src/parquet/encoding-internal.h
+++ b/cpp/src/parquet/encoding-internal.h
@@ -40,7 +40,6 @@
 namespace parquet {
 
 namespace BitUtil = ::arrow::BitUtil;
-using HashUtil = ::arrow::HashUtil;
 
 class ColumnDescriptor;
 
@@ -469,9 +468,7 @@ class DictEncoder : public Encoder<DType> {
         dict_encoded_size_(0),
         type_length_(desc->type_length()) {
     hash_slots_.Assign(hash_table_size_, HASH_SLOT_EMPTY);
-    if (!::arrow::CpuInfo::initialized()) {
-      ::arrow::CpuInfo::Init();
-    }
+    cpu_info_ = ::arrow::CpuInfo::GetInstance();
   }
 
   ~DictEncoder() override { DCHECK(buffered_indices_.empty()); }
@@ -516,8 +513,12 @@ class DictEncoder : public Encoder<DType> {
 
   /// Encode value. Note that this does not actually write any data, just
   /// buffers the value's index to be written later.
+  template <bool use_sse42>
   void Put(const T& value);
 
+  template <bool use_sse42>
+  int Hash(const T& value);
+
   std::shared_ptr<Buffer> FlushValues() override {
     std::shared_ptr<ResizableBuffer> buffer =
         AllocateBuffer(this->allocator_, EstimatedDataEncodedSize());
@@ -529,20 +530,38 @@ class DictEncoder : public Encoder<DType> {
   }
 
   void Put(const T* values, int num_values) override {
-    for (int i = 0; i < num_values; i++) {
-      Put(values[i]);
+    if (cpu_info_->CanUseSSE4_2()) {
+      for (int i = 0; i < num_values; i++) {
+        Put<true>(values[i]);
+      }
+    } else {
+      for (int i = 0; i < num_values; i++) {
+        Put<false>(values[i]);
+      }
     }
   }
 
+  template <bool use_sse42>
+  void DoubleTableSize();
+
   void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
                  int64_t valid_bits_offset) override {
     ::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset,
                                                       num_values);
-    for (int32_t i = 0; i < num_values; i++) {
-      if (valid_bits_reader.IsSet()) {
-        Put(src[i]);
+    if (cpu_info_->CanUseSSE4_2()) {
+      for (int32_t i = 0; i < num_values; i++) {
+        if (valid_bits_reader.IsSet()) {
+          Put<true>(src[i]);
+        }
+        valid_bits_reader.Next();
+      }
+    } else {
+      for (int32_t i = 0; i < num_values; i++) {
+        if (valid_bits_reader.IsSet()) {
+          Put<false>(src[i]);
+        }
+        valid_bits_reader.Next();
       }
-      valid_bits_reader.Next();
     }
   }
 
@@ -561,6 +580,8 @@ class DictEncoder : public Encoder<DType> {
   // For ByteArray / FixedLenByteArray data. Not owned
   ChunkedAllocator* pool_;
 
+  ::arrow::CpuInfo* cpu_info_;
+
   /// Size of the table. Must be a power of 2.
   int hash_table_size_;
 
@@ -583,37 +604,36 @@ class DictEncoder : public Encoder<DType> {
   std::vector<T> uniques_;
 
   bool SlotDifferent(const T& v, hash_slot_t slot);
-  void DoubleTableSize();
 
   /// Size of each encoded dictionary value. -1 for variable-length types.
   int type_length_;
 
-  /// Hash function for mapping a value to a bucket.
-  inline int Hash(const T& value) const;
-
   /// Adds value to the hash table and updates dict_encoded_size_
   void AddDictKey(const T& value);
 };
 
 template <typename DType>
-inline int DictEncoder<DType>::Hash(const typename DType::c_type& value) const {
-  return HashUtil::Hash(&value, sizeof(value), 0);
+template <bool use_sse42>
+int DictEncoder<DType>::Hash(const typename DType::c_type& value) {
+  return ::arrow::HashUtil::Hash<use_sse42>(&value, sizeof(value), 0);
 }
 
 template <>
-inline int DictEncoder<ByteArrayType>::Hash(const ByteArray& value) const {
+template <bool use_sse42>
+int DictEncoder<ByteArrayType>::Hash(const ByteArray& value) {
   if (value.len > 0) {
     DCHECK_NE(nullptr, value.ptr) << "Value ptr cannot be NULL";
   }
-  return HashUtil::Hash(value.ptr, value.len, 0);
+  return ::arrow::HashUtil::Hash<use_sse42>(value.ptr, value.len, 0);
 }
 
 template <>
-inline int DictEncoder<FLBAType>::Hash(const FixedLenByteArray& value) const {
+template <bool use_sse42>
+int DictEncoder<FLBAType>::Hash(const FixedLenByteArray& value) {
   if (type_length_ > 0) {
     DCHECK_NE(nullptr, value.ptr) << "Value ptr cannot be NULL";
   }
-  return HashUtil::Hash(value.ptr, type_length_, 0);
+  return ::arrow::HashUtil::Hash<use_sse42>(value.ptr, type_length_, 0);
 }
 
 template <typename DType>
@@ -629,8 +649,9 @@ inline bool DictEncoder<FLBAType>::SlotDifferent(const FixedLenByteArray& v,
 }
 
 template <typename DType>
+template <bool use_sse42>
 inline void DictEncoder<DType>::Put(const typename DType::c_type& v) {
-  int j = Hash(v) & mod_bitmask_;
+  int j = Hash<use_sse42>(v) & mod_bitmask_;
   hash_slot_t index = hash_slots_[j];
 
   // Find an empty slot
@@ -649,7 +670,7 @@ inline void DictEncoder<DType>::Put(const typename DType::c_type& v) {
 
     if (ARROW_PREDICT_FALSE(static_cast<int>(uniques_.size()) >
                             hash_table_size_ * MAX_HASH_LOAD)) {
-      DoubleTableSize();
+      DoubleTableSize<use_sse42>();
     }
   }
 
@@ -657,6 +678,7 @@ inline void DictEncoder<DType>::Put(const typename DType::c_type& v) {
 }
 
 template <typename DType>
+template <bool use_sse42>
 inline void DictEncoder<DType>::DoubleTableSize() {
   int new_size = hash_table_size_ * 2;
   Vector<hash_slot_t> new_hash_slots(0, allocator_);
@@ -675,7 +697,7 @@ inline void DictEncoder<DType>::DoubleTableSize() {
     const typename DType::c_type& v = uniques_[index];
 
     // Find an empty slot in the new hash table
-    j = Hash(v) & (new_size - 1);
+    j = Hash<use_sse42>(v) & (new_size - 1);
     slot = new_hash_slots[j];
     while (HASH_SLOT_EMPTY != slot && SlotDifferent(v, slot)) {
       ++j;