You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/09/17 16:09:38 UTC
[arrow] branch master updated: ARROW-3242: [C++] Make CpuInfo a singleton, use coarser-grained dispatch to SSE4 in Parquet dictionary encoding
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new c698be3 ARROW-3242: [C++] Make CpuInfo a singleton, use coarser-grained dispatch to SSE4 in Parquet dictionary encoding
c698be3 is described below
commit c698be339b96aeb74763d70de1cf4c8789148824
Author: Wes McKinney <we...@apache.org>
AuthorDate: Mon Sep 17 12:09:24 2018 -0400
ARROW-3242: [C++] Make CpuInfo a singleton, use coarser-grained dispatch to SSE4 in Parquet dictionary encoding
Rather than having a bunch of static state in a header file (which is not a great pattern anyway), this makes `arrow::CpuInfo` a singleton. I added support for vector-level dispatch to SSE4 hashing in the Parquet dictionary encoder.
This solves ARROW-3241 for me
NB. This is precisely the kind of change that is now radically simpler after the monorepo merge
Author: Wes McKinney <we...@apache.org>
Closes #2571 from wesm/ARROW-3242 and squashes the following commits:
cc66a4064 <Wes McKinney> Remove cruft from mid-refactor
c09aaea50 <Wes McKinney> Do not link librt on Apple platform
abacaf4b8 <Wes McKinney> Make CpuInfo a singleton, use coarser-grained dispatch to SSE4 in Parquet dictionary encoding
---
cpp/CMakeLists.txt | 6 ++-
cpp/cmake_modules/SetupCxxFlags.cmake | 4 ++
cpp/src/arrow/builder.cc | 31 ++++++--------
cpp/src/arrow/compute/context.cc | 7 +---
cpp/src/arrow/compute/context.h | 6 +++
cpp/src/arrow/compute/kernels/hash.cc | 9 ++--
cpp/src/arrow/util/cpu-info.cc | 79 +++++++++++++++--------------------
cpp/src/arrow/util/cpu-info.h | 54 +++++++++++++-----------
cpp/src/arrow/util/hash-util.h | 37 +++++++---------
cpp/src/arrow/util/sse-util.h | 2 +-
cpp/src/parquet/encoding-benchmark.cc | 3 +-
cpp/src/parquet/encoding-internal.h | 68 ++++++++++++++++++++----------
12 files changed, 161 insertions(+), 145 deletions(-)
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 9eb37d2..0cb52cb 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -690,7 +690,11 @@ if (ARROW_JEMALLOC)
add_definitions(-DARROW_JEMALLOC)
add_definitions(-DARROW_JEMALLOC_INCLUDE_DIR=${JEMALLOC_INCLUDE_DIR})
- if (CMAKE_COMPILER_IS_GNUCXX AND PTHREAD_LIBRARY)
+ # If using gcc or clang on Linux, we need to link pthread for older Linuxes,
+ # including distros as new as Ubuntu 14.04
+ if ((CMAKE_COMPILER_IS_GNUCXX OR
+ (NOT APPLE AND CMAKE_CXX_COMPILER_ID MATCHES "Clang"))
+ AND PTHREAD_LIBRARY)
set(ARROW_JEMALLOC_LINK_LIBS
jemalloc_static
# For glibc <2.17 we need to link to librt.
diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake
index aee2654..a707a21 100644
--- a/cpp/cmake_modules/SetupCxxFlags.cmake
+++ b/cpp/cmake_modules/SetupCxxFlags.cmake
@@ -212,6 +212,10 @@ if (CXX_SUPPORTS_ALTIVEC AND ARROW_ALTIVEC)
set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -maltivec")
endif()
+if (ARROW_USE_SSE)
+ add_definitions(-DARROW_USE_SSE)
+endif()
+
if (APPLE)
# Depending on the default OSX_DEPLOYMENT_TARGET (< 10.9), libstdc++ may be
# the default standard library which does not support C++11. libc++ is the
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 65d1ea7..677f2fd 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -33,12 +33,17 @@
#include "arrow/type_traits.h"
#include "arrow/util/bit-util.h"
#include "arrow/util/checked_cast.h"
-#include "arrow/util/cpu-info.h"
#include "arrow/util/decimal.h"
#include "arrow/util/hash-util.h"
#include "arrow/util/hash.h"
#include "arrow/util/logging.h"
+#ifdef ARROW_USE_SSE
+#define SSE4_FLAG true
+#else
+#define SSE4_FLAG false
+#endif
+
namespace arrow {
using internal::AdaptiveIntBuilderBase;
@@ -776,7 +781,7 @@ struct DictionaryHashHelper<T, enable_if_has_c_type<T>> {
// Compute the hash of a scalar value
static int64_t HashValue(const Scalar& value, int byte_width) {
- return HashUtil::Hash(&value, sizeof(Scalar), 0);
+ return HashUtil::Hash<SSE4_FLAG>(&value, sizeof(Scalar), 0);
}
// Return whether the dictionary value at the given builder index is unequal to value
@@ -810,7 +815,7 @@ struct DictionaryHashHelper<T, enable_if_binary<T>> {
}
static int64_t HashValue(const Scalar& value, int byte_width) {
- return HashUtil::Hash(value.ptr_, value.length_, 0);
+ return HashUtil::Hash<SSE4_FLAG>(value.ptr_, value.length_, 0);
}
static bool SlotDifferent(const Builder& builder, int64_t index, const Scalar& value) {
@@ -846,7 +851,7 @@ struct DictionaryHashHelper<T, enable_if_fixed_size_binary<T>> {
}
static int64_t HashValue(const Scalar& value, int byte_width) {
- return HashUtil::Hash(value, byte_width, 0);
+ return HashUtil::Hash<SSE4_FLAG>(value, byte_width, 0);
}
static bool SlotDifferent(const Builder& builder, int64_t index, const uint8_t* value) {
@@ -879,19 +884,11 @@ DictionaryBuilder<T>::DictionaryBuilder(const std::shared_ptr<DataType>& type,
dict_builder_(type, pool),
overflow_dict_builder_(type, pool),
values_builder_(pool),
- byte_width_(-1) {
- if (!::arrow::CpuInfo::initialized()) {
- ::arrow::CpuInfo::Init();
- }
-}
+ byte_width_(-1) {}
DictionaryBuilder<NullType>::DictionaryBuilder(const std::shared_ptr<DataType>& type,
MemoryPool* pool)
- : ArrayBuilder(type, pool), values_builder_(pool) {
- if (!::arrow::CpuInfo::initialized()) {
- ::arrow::CpuInfo::Init();
- }
-}
+ : ArrayBuilder(type, pool), values_builder_(pool) {}
template <>
DictionaryBuilder<FixedSizeBinaryType>::DictionaryBuilder(
@@ -901,11 +898,7 @@ DictionaryBuilder<FixedSizeBinaryType>::DictionaryBuilder(
dict_builder_(type, pool),
overflow_dict_builder_(type, pool),
values_builder_(pool),
- byte_width_(checked_cast<const FixedSizeBinaryType&>(*type).byte_width()) {
- if (!::arrow::CpuInfo::initialized()) {
- ::arrow::CpuInfo::Init();
- }
-}
+ byte_width_(checked_cast<const FixedSizeBinaryType&>(*type).byte_width()) {}
template <typename T>
void DictionaryBuilder<T>::Reset() {
diff --git a/cpp/src/arrow/compute/context.cc b/cpp/src/arrow/compute/context.cc
index 63aa341..ab2b595 100644
--- a/cpp/src/arrow/compute/context.cc
+++ b/cpp/src/arrow/compute/context.cc
@@ -25,11 +25,8 @@
namespace arrow {
namespace compute {
-FunctionContext::FunctionContext(MemoryPool* pool) : pool_(pool) {
- if (!::arrow::CpuInfo::initialized()) {
- ::arrow::CpuInfo::Init();
- }
-}
+FunctionContext::FunctionContext(MemoryPool* pool)
+ : pool_(pool), cpu_info_(CpuInfo::GetInstance()) {}
MemoryPool* FunctionContext::memory_pool() const { return pool_; }
diff --git a/cpp/src/arrow/compute/context.h b/cpp/src/arrow/compute/context.h
index 0983819..7df61e6 100644
--- a/cpp/src/arrow/compute/context.h
+++ b/cpp/src/arrow/compute/context.h
@@ -26,6 +26,9 @@
#include "arrow/util/visibility.h"
namespace arrow {
+
+class CpuInfo;
+
namespace compute {
#define RETURN_IF_ERROR(ctx) \
@@ -60,9 +63,12 @@ class ARROW_EXPORT FunctionContext {
/// \brief Return the current status of the context
const Status& status() const { return status_; }
+ CpuInfo* cpu_info() const { return cpu_info_; }
+
private:
Status status_;
MemoryPool* pool_;
+ CpuInfo* cpu_info_;
};
} // namespace compute
diff --git a/cpp/src/arrow/compute/kernels/hash.cc b/cpp/src/arrow/compute/kernels/hash.cc
index 4004f8d..212f788 100644
--- a/cpp/src/arrow/compute/kernels/hash.cc
+++ b/cpp/src/arrow/compute/kernels/hash.cc
@@ -37,6 +37,9 @@
namespace arrow {
namespace compute {
+// TODO(wesm): Enable top-level dispatch to SSE4 hashing if it is enabled
+#define HASH_USE_SSE false
+
namespace {
enum class SIMDMode : char { NOSIMD, SSE4, AVX2 };
@@ -298,7 +301,7 @@ class HashTableKernel<
protected:
int64_t HashValue(const T& value) const {
// TODO(wesm): Use faster hash function for C types
- return HashUtil::Hash(&value, sizeof(T), 0);
+ return HashUtil::Hash<HASH_USE_SSE>(&value, sizeof(T), 0);
}
Status DoubleTableSize() {
@@ -489,7 +492,7 @@ class HashTableKernel<Type, Action, enable_if_binary<Type>> : public HashTable {
protected:
int64_t HashValue(const uint8_t* data, int32_t length) const {
- return HashUtil::Hash(data, length, 0);
+ return HashUtil::Hash<HASH_USE_SSE>(data, length, 0);
}
Status DoubleTableSize() {
@@ -595,7 +598,7 @@ class HashTableKernel<Type, Action, enable_if_fixed_size_binary<Type>>
protected:
int64_t HashValue(const uint8_t* data) const {
- return HashUtil::Hash(data, byte_width_, 0);
+ return HashUtil::Hash<HASH_USE_SSE>(data, byte_width_, 0);
}
Status DoubleTableSize() {
diff --git a/cpp/src/arrow/util/cpu-info.cc b/cpp/src/arrow/util/cpu-info.cc
index 822fcae..9280ac8 100644
--- a/cpp/src/arrow/util/cpu-info.cc
+++ b/cpp/src/arrow/util/cpu-info.cc
@@ -44,6 +44,7 @@
#include <algorithm>
#include <cstdint>
#include <fstream>
+#include <memory>
#include <mutex>
#include <string>
@@ -52,21 +53,11 @@
using boost::algorithm::contains;
using boost::algorithm::trim;
using std::max;
-using std::string;
namespace arrow {
-bool CpuInfo::initialized_ = false;
-int64_t CpuInfo::hardware_flags_ = 0;
-int64_t CpuInfo::original_hardware_flags_;
-int64_t CpuInfo::cache_sizes_[L3_CACHE + 1];
-int64_t CpuInfo::cycles_per_ms_;
-int CpuInfo::num_cores_ = 1;
-string CpuInfo::model_name_ = "unknown"; // NOLINT
-static std::mutex cpuinfo_mutex;
-
static struct {
- string name;
+ std::string name;
int64_t flag;
} flag_mappings[] = {
{"ssse3", CpuInfo::SSSE3},
@@ -82,7 +73,7 @@ namespace {
// values contains a list of space-seperated flags. check to see if the flags we
// care about are present.
// Returns a bitmap of flags.
-int64_t ParseCPUFlags(const string& values) {
+int64_t ParseCPUFlags(const std::string& values) {
int64_t flags = 0;
for (int i = 0; i < num_flags; ++i) {
if (contains(values, flag_mappings[i].name)) {
@@ -181,16 +172,24 @@ bool RetrieveCPUInfo(int64_t* hardware_flags, std::string* model_name) {
}
#endif
-void CpuInfo::Init() {
- std::lock_guard<std::mutex> cpuinfo_lock(cpuinfo_mutex);
+CpuInfo::CpuInfo() : hardware_flags_(0), num_cores_(1), model_name_("unknown") {}
+
+std::unique_ptr<CpuInfo> g_cpu_info;
+static std::mutex cpuinfo_mutex;
- if (initialized()) {
- return;
+CpuInfo* CpuInfo::GetInstance() {
+ std::lock_guard<std::mutex> lock(cpuinfo_mutex);
+ if (!g_cpu_info) {
+ g_cpu_info.reset(new CpuInfo);
+ g_cpu_info->Init();
}
+ return g_cpu_info.get();
+}
- string line;
- string name;
- string value;
+void CpuInfo::Init() {
+ std::string line;
+ std::string name;
+ std::string value;
float max_mhz = 0;
int num_cores = 0;
@@ -212,9 +211,9 @@ void CpuInfo::Init() {
while (cpuinfo) {
getline(cpuinfo, line);
size_t colon = line.find(':');
- if (colon != string::npos) {
+ if (colon != std::string::npos) {
name = line.substr(0, colon - 1);
- value = line.substr(colon + 1, string::npos);
+ value = line.substr(colon + 1, std::string::npos);
trim(name);
trim(value);
if (name.compare("flags") == 0) {
@@ -270,18 +269,23 @@ void CpuInfo::Init() {
} else {
num_cores_ = 1;
}
-
- initialized_ = true;
}
void CpuInfo::VerifyCpuRequirements() {
- if (!CpuInfo::IsSupported(CpuInfo::SSSE3)) {
+ if (!IsSupported(CpuInfo::SSSE3)) {
DCHECK(false) << "CPU does not support the Supplemental SSE3 instruction set";
}
}
+bool CpuInfo::CanUseSSE4_2() const {
+#ifdef ARROW_USE_SSE
+ return IsSupported(CpuInfo::SSE4_2);
+#else
+ return false;
+#endif
+}
+
void CpuInfo::EnableFeature(int64_t flag, bool enable) {
- DCHECK(initialized_);
if (!enable) {
hardware_flags_ &= ~flag;
} else {
@@ -291,30 +295,15 @@ void CpuInfo::EnableFeature(int64_t flag, bool enable) {
}
}
-int64_t CpuInfo::hardware_flags() {
- DCHECK(initialized_);
- return hardware_flags_;
-}
+int64_t CpuInfo::hardware_flags() { return hardware_flags_; }
-int64_t CpuInfo::CacheSize(CacheLevel level) {
- DCHECK(initialized_);
- return cache_sizes_[level];
-}
+int64_t CpuInfo::CacheSize(CacheLevel level) { return cache_sizes_[level]; }
-int64_t CpuInfo::cycles_per_ms() {
- DCHECK(initialized_);
- return cycles_per_ms_;
-}
+int64_t CpuInfo::cycles_per_ms() { return cycles_per_ms_; }
-int CpuInfo::num_cores() {
- DCHECK(initialized_);
- return num_cores_;
-}
+int CpuInfo::num_cores() { return num_cores_; }
-std::string CpuInfo::model_name() {
- DCHECK(initialized_);
- return model_name_;
-}
+std::string CpuInfo::model_name() { return model_name_; }
void CpuInfo::SetDefaultCacheSize() {
#ifndef _SC_LEVEL1_DCACHE_SIZE
diff --git a/cpp/src/arrow/util/cpu-info.h b/cpp/src/arrow/util/cpu-info.h
index f4bc8c3..dee6a52 100644
--- a/cpp/src/arrow/util/cpu-info.h
+++ b/cpp/src/arrow/util/cpu-info.h
@@ -34,10 +34,10 @@ namespace arrow {
/// /sys/devices)
class ARROW_EXPORT CpuInfo {
public:
- static const int64_t SSSE3 = (1 << 1);
- static const int64_t SSE4_1 = (1 << 2);
- static const int64_t SSE4_2 = (1 << 3);
- static const int64_t POPCNT = (1 << 4);
+ static constexpr int64_t SSSE3 = (1 << 1);
+ static constexpr int64_t SSE4_1 = (1 << 2);
+ static constexpr int64_t SSE4_2 = (1 << 3);
+ static constexpr int64_t POPCNT = (1 << 4);
/// Cache enums for L1 (data), L2 and L3
enum CacheLevel {
@@ -46,48 +46,52 @@ class ARROW_EXPORT CpuInfo {
L3_CACHE = 2,
};
- /// Initialize CpuInfo.
- static void Init();
+ static CpuInfo* GetInstance();
/// Determine if the CPU meets the minimum CPU requirements and if not, issue an error
/// and terminate.
- static void VerifyCpuRequirements();
+ void VerifyCpuRequirements();
/// Returns all the flags for this cpu
- static int64_t hardware_flags();
+ int64_t hardware_flags();
/// Returns whether of not the cpu supports this flag
- inline static bool IsSupported(int64_t flag) { return (hardware_flags_ & flag) != 0; }
+ bool IsSupported(int64_t flag) const { return (hardware_flags_ & flag) != 0; }
+
+ /// \brief The processor supports SSE4.2 and the Arrow libraries are built
+ /// with support for it
+ bool CanUseSSE4_2() const;
/// Toggle a hardware feature on and off. It is not valid to turn on a feature
/// that the underlying hardware cannot support. This is useful for testing.
- static void EnableFeature(int64_t flag, bool enable);
+ void EnableFeature(int64_t flag, bool enable);
/// Returns the size of the cache in KB at this cache level
- static int64_t CacheSize(CacheLevel level);
+ int64_t CacheSize(CacheLevel level);
/// Returns the number of cpu cycles per millisecond
- static int64_t cycles_per_ms();
+ int64_t cycles_per_ms();
/// Returns the number of cores (including hyper-threaded) on this machine.
- static int num_cores();
+ int num_cores();
/// Returns the model name of the cpu (e.g. Intel i7-2600)
- static std::string model_name();
-
- static bool initialized() { return initialized_; }
+ std::string model_name();
private:
+ CpuInfo();
+
+ void Init();
+
/// Inits CPU cache size variables with default values
- static void SetDefaultCacheSize();
-
- static bool initialized_;
- static int64_t hardware_flags_;
- static int64_t original_hardware_flags_;
- static int64_t cache_sizes_[L3_CACHE + 1];
- static int64_t cycles_per_ms_;
- static int num_cores_;
- static std::string model_name_; // NOLINT
+ void SetDefaultCacheSize();
+
+ int64_t hardware_flags_;
+ int64_t original_hardware_flags_;
+ int64_t cache_sizes_[L3_CACHE + 1];
+ int64_t cycles_per_ms_;
+ int num_cores_;
+ std::string model_name_;
};
} // namespace arrow
diff --git a/cpp/src/arrow/util/hash-util.h b/cpp/src/arrow/util/hash-util.h
index 3bba07b..da23b8f 100644
--- a/cpp/src/arrow/util/hash-util.h
+++ b/cpp/src/arrow/util/hash-util.h
@@ -40,7 +40,6 @@ class HashUtil {
/// The resulting hashes are correlated.
/// TODO: update this to also use SSE4_crc32_u64 and SSE4_crc32_u16 where appropriate.
static uint32_t CrcHash(const void* data, int32_t bytes, uint32_t hash) {
- DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
uint32_t words = static_cast<uint32_t>(bytes / sizeof(uint32_t));
bytes = static_cast<int32_t>(bytes % sizeof(uint32_t));
@@ -64,7 +63,6 @@ class HashUtil {
/// CrcHash() specialized for 1-byte data
static inline uint32_t CrcHash1(const void* v, uint32_t hash) {
- DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
const uint8_t* s = reinterpret_cast<const uint8_t*>(v);
hash = SSE4_crc32_u8(hash, *s);
hash = (hash << 16) | (hash >> 16);
@@ -73,7 +71,6 @@ class HashUtil {
/// CrcHash() specialized for 2-byte data
static inline uint32_t CrcHash2(const void* v, uint32_t hash) {
- DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
const uint16_t* s = reinterpret_cast<const uint16_t*>(v);
hash = SSE4_crc32_u16(hash, *s);
hash = (hash << 16) | (hash >> 16);
@@ -82,7 +79,6 @@ class HashUtil {
/// CrcHash() specialized for 4-byte data
static inline uint32_t CrcHash4(const void* v, uint32_t hash) {
- DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
const uint32_t* p = reinterpret_cast<const uint32_t*>(v);
hash = SSE4_crc32_u32(hash, *p);
hash = (hash << 16) | (hash >> 16);
@@ -91,7 +87,6 @@ class HashUtil {
/// CrcHash() specialized for 8-byte data
static inline uint32_t CrcHash8(const void* v, uint32_t hash) {
- DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
hash = SSE4_crc32_u64(hash, *p);
hash = (hash << 16) | (hash >> 16);
@@ -100,7 +95,6 @@ class HashUtil {
/// CrcHash() specialized for 12-byte data
static inline uint32_t CrcHash12(const void* v, uint32_t hash) {
- DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
hash = SSE4_crc32_u64(hash, *p);
++p;
@@ -111,7 +105,6 @@ class HashUtil {
/// CrcHash() specialized for 16-byte data
static inline uint32_t CrcHash16(const void* v, uint32_t hash) {
- DCHECK(CpuInfo::IsSupported(CpuInfo::SSE4_2));
const uint64_t* p = reinterpret_cast<const uint64_t*>(v);
hash = SSE4_crc32_u64(hash, *p);
++p;
@@ -204,21 +197,9 @@ class HashUtil {
return static_cast<uint32_t>((hash_u64 >> 32) ^ (hash_u64 & 0xFFFFFFFF));
}
- /// Computes the hash value for data. Will call either CrcHash or MurmurHash
- /// depending on hardware capabilities.
- /// Seed values for different steps of the query execution should use different seeds
- /// to prevent accidental key collisions. (See IMPALA-219 for more details).
- static uint32_t Hash(const void* data, int32_t bytes, uint32_t seed) {
-#ifdef ARROW_USE_SSE
- if (LIKELY(CpuInfo::IsSupported(CpuInfo::SSE4_2))) {
- return CrcHash(data, bytes, seed);
- } else {
- return MurmurHash2_64(data, bytes, seed);
- }
-#else
- return static_cast<uint32_t>(MurmurHash2_64(data, bytes, seed));
-#endif
- }
+ // With sse4.2
+ template <bool use_sse42 = true>
+ static inline int Hash(const void* data, int32_t bytes, uint32_t seed);
/// The magic number (used in hash_combine()) 0x9e3779b9 = 2^32 / (golden ratio).
static const uint32_t HASH_COMBINE_SEED = 0x9e3779b9;
@@ -253,6 +234,18 @@ class HashUtil {
}
};
+// With sse4.2
+template <>
+inline int HashUtil::Hash<true>(const void* data, int32_t bytes, uint32_t seed) {
+ return static_cast<int>(HashUtil::CrcHash(data, bytes, seed));
+}
+
+// Non-sse4 hash
+template <>
+inline int HashUtil::Hash<false>(const void* data, int32_t bytes, uint32_t seed) {
+ return static_cast<int>(HashUtil::MurmurHash2_64(data, bytes, seed));
+}
+
} // namespace arrow
#endif // ARROW_UTIL_HASH_UTIL_H
diff --git a/cpp/src/arrow/util/sse-util.h b/cpp/src/arrow/util/sse-util.h
index 32ac43f..50e38d7 100644
--- a/cpp/src/arrow/util/sse-util.h
+++ b/cpp/src/arrow/util/sse-util.h
@@ -123,7 +123,7 @@ static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) {
static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) {
uint64_t result = crc;
__asm__("crc32q %1, %0" : "+r"(result) : "rm"(v));
- return result;
+ return static_cast<uint32_t>(result);
}
static inline int64_t POPCNT_popcnt_u64(uint64_t a) {
diff --git a/cpp/src/parquet/encoding-benchmark.cc b/cpp/src/parquet/encoding-benchmark.cc
index 364cdba..e7309db 100644
--- a/cpp/src/parquet/encoding-benchmark.cc
+++ b/cpp/src/parquet/encoding-benchmark.cc
@@ -110,7 +110,8 @@ static void DecodeDict(std::vector<typename Type::c_type>& values,
DictEncoder<Type> encoder(descr.get(), &pool, allocator);
for (int i = 0; i < num_values; ++i) {
- encoder.Put(values[i]);
+ // No SSE
+ encoder.template Put<false>(values[i]);
}
std::shared_ptr<ResizableBuffer> dict_buffer =
diff --git a/cpp/src/parquet/encoding-internal.h b/cpp/src/parquet/encoding-internal.h
index 2dfb9ff..0bfd26f 100644
--- a/cpp/src/parquet/encoding-internal.h
+++ b/cpp/src/parquet/encoding-internal.h
@@ -40,7 +40,6 @@
namespace parquet {
namespace BitUtil = ::arrow::BitUtil;
-using HashUtil = ::arrow::HashUtil;
class ColumnDescriptor;
@@ -469,9 +468,7 @@ class DictEncoder : public Encoder<DType> {
dict_encoded_size_(0),
type_length_(desc->type_length()) {
hash_slots_.Assign(hash_table_size_, HASH_SLOT_EMPTY);
- if (!::arrow::CpuInfo::initialized()) {
- ::arrow::CpuInfo::Init();
- }
+ cpu_info_ = ::arrow::CpuInfo::GetInstance();
}
~DictEncoder() override { DCHECK(buffered_indices_.empty()); }
@@ -516,8 +513,12 @@ class DictEncoder : public Encoder<DType> {
/// Encode value. Note that this does not actually write any data, just
/// buffers the value's index to be written later.
+ template <bool use_sse42>
void Put(const T& value);
+ template <bool use_sse42>
+ int Hash(const T& value);
+
std::shared_ptr<Buffer> FlushValues() override {
std::shared_ptr<ResizableBuffer> buffer =
AllocateBuffer(this->allocator_, EstimatedDataEncodedSize());
@@ -529,20 +530,38 @@ class DictEncoder : public Encoder<DType> {
}
void Put(const T* values, int num_values) override {
- for (int i = 0; i < num_values; i++) {
- Put(values[i]);
+ if (cpu_info_->CanUseSSE4_2()) {
+ for (int i = 0; i < num_values; i++) {
+ Put<true>(values[i]);
+ }
+ } else {
+ for (int i = 0; i < num_values; i++) {
+ Put<false>(values[i]);
+ }
}
}
+ template <bool use_sse42>
+ void DoubleTableSize();
+
void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
int64_t valid_bits_offset) override {
::arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset,
num_values);
- for (int32_t i = 0; i < num_values; i++) {
- if (valid_bits_reader.IsSet()) {
- Put(src[i]);
+ if (cpu_info_->CanUseSSE4_2()) {
+ for (int32_t i = 0; i < num_values; i++) {
+ if (valid_bits_reader.IsSet()) {
+ Put<true>(src[i]);
+ }
+ valid_bits_reader.Next();
+ }
+ } else {
+ for (int32_t i = 0; i < num_values; i++) {
+ if (valid_bits_reader.IsSet()) {
+ Put<false>(src[i]);
+ }
+ valid_bits_reader.Next();
}
- valid_bits_reader.Next();
}
}
@@ -561,6 +580,8 @@ class DictEncoder : public Encoder<DType> {
// For ByteArray / FixedLenByteArray data. Not owned
ChunkedAllocator* pool_;
+ ::arrow::CpuInfo* cpu_info_;
+
/// Size of the table. Must be a power of 2.
int hash_table_size_;
@@ -583,37 +604,36 @@ class DictEncoder : public Encoder<DType> {
std::vector<T> uniques_;
bool SlotDifferent(const T& v, hash_slot_t slot);
- void DoubleTableSize();
/// Size of each encoded dictionary value. -1 for variable-length types.
int type_length_;
- /// Hash function for mapping a value to a bucket.
- inline int Hash(const T& value) const;
-
/// Adds value to the hash table and updates dict_encoded_size_
void AddDictKey(const T& value);
};
template <typename DType>
-inline int DictEncoder<DType>::Hash(const typename DType::c_type& value) const {
- return HashUtil::Hash(&value, sizeof(value), 0);
+template <bool use_sse42>
+int DictEncoder<DType>::Hash(const typename DType::c_type& value) {
+ return ::arrow::HashUtil::Hash<use_sse42>(&value, sizeof(value), 0);
}
template <>
-inline int DictEncoder<ByteArrayType>::Hash(const ByteArray& value) const {
+template <bool use_sse42>
+int DictEncoder<ByteArrayType>::Hash(const ByteArray& value) {
if (value.len > 0) {
DCHECK_NE(nullptr, value.ptr) << "Value ptr cannot be NULL";
}
- return HashUtil::Hash(value.ptr, value.len, 0);
+ return ::arrow::HashUtil::Hash<use_sse42>(value.ptr, value.len, 0);
}
template <>
-inline int DictEncoder<FLBAType>::Hash(const FixedLenByteArray& value) const {
+template <bool use_sse42>
+int DictEncoder<FLBAType>::Hash(const FixedLenByteArray& value) {
if (type_length_ > 0) {
DCHECK_NE(nullptr, value.ptr) << "Value ptr cannot be NULL";
}
- return HashUtil::Hash(value.ptr, type_length_, 0);
+ return ::arrow::HashUtil::Hash<use_sse42>(value.ptr, type_length_, 0);
}
template <typename DType>
@@ -629,8 +649,9 @@ inline bool DictEncoder<FLBAType>::SlotDifferent(const FixedLenByteArray& v,
}
template <typename DType>
+template <bool use_sse42>
inline void DictEncoder<DType>::Put(const typename DType::c_type& v) {
- int j = Hash(v) & mod_bitmask_;
+ int j = Hash<use_sse42>(v) & mod_bitmask_;
hash_slot_t index = hash_slots_[j];
// Find an empty slot
@@ -649,7 +670,7 @@ inline void DictEncoder<DType>::Put(const typename DType::c_type& v) {
if (ARROW_PREDICT_FALSE(static_cast<int>(uniques_.size()) >
hash_table_size_ * MAX_HASH_LOAD)) {
- DoubleTableSize();
+ DoubleTableSize<use_sse42>();
}
}
@@ -657,6 +678,7 @@ inline void DictEncoder<DType>::Put(const typename DType::c_type& v) {
}
template <typename DType>
+template <bool use_sse42>
inline void DictEncoder<DType>::DoubleTableSize() {
int new_size = hash_table_size_ * 2;
Vector<hash_slot_t> new_hash_slots(0, allocator_);
@@ -675,7 +697,7 @@ inline void DictEncoder<DType>::DoubleTableSize() {
const typename DType::c_type& v = uniques_[index];
// Find an empty slot in the new hash table
- j = Hash(v) & (new_size - 1);
+ j = Hash<use_sse42>(v) & (new_size - 1);
slot = new_hash_slots[j];
while (HASH_SLOT_EMPTY != slot && SlotDifferent(v, slot)) {
++j;