You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2017/05/25 15:48:41 UTC
[4/4] incubator-impala git commit: IMPALA-5347: Parquet scanner
microoptimizations
IMPALA-5347: Parquet scanner microoptimizations
A mix of microoptimizations that profiling the parquet scanner revealed.
All lead to some measurable improvement and added up to significant
speedups for various scans.
* Add ALWAYS_INLINE to hot functions that GCC was mistakenly not inlining
in all cases.
* Apply __restrict__ in a few places so the compiler knows that it is
safe to cache values accessed via those pointers
* memset() the whole batch instead of the null indicators is cases where
it is almost certainly cheaper.
* Avoid unnecessary initialization of often-unused 'val' in ReadSlot().
* Shave a few instructions off the (still very expensive) bit unpacking
and dict decoding logic.
Performance:
Some local TPC-H and targeted-perf benchmarks showed average speedups of
~5%.
I did some benchmarks targeted at measuring column materialisation
performance using a version of lineitem with duplicated data to make
it bigger. These queries all got significantly faster.
Dict-encoded DECIMAL: 2.23 -> 1.23s
SELECT count(*) FROM biglineitem WHERE l_quantity > 49
Plain-encoded BIGINT: 2.33s -> 1.62s
SELECT count(*) FROM biglineitem WHERE l_orderkey != 10
Dict-encoded STRING: 2.73s -> 1.72s
SELECT count(*) FROM biglineitem WHERE l_returnflag = 'A'
Plain-encoded STRING: 7.07s -> 6.08s (most time spent in Snappy)
SELECT count(*) FROM biglineitem WHERE length(l_comment) > 37
Multiple columns: 5.15s -> 3.74s
SELECT count(*) FROM biglineitem
WHERE l_quantity > 49 and l_partkey != 199 and l_tax < 100
Change-Id: I49ec523a65542fdbabd53fbcc4a8901d769e5cd5
Reviewed-on: http://gerrit.cloudera.org:8080/6950
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/b2782774
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/b2782774
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/b2782774
Branch: refs/heads/master
Commit: b2782774491b5e280c4b51e39c393810deaa1e22
Parents: 25290a0
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Sun May 21 15:04:21 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Thu May 25 11:38:06 2017 +0000
----------------------------------------------------------------------
be/src/common/compiler-util.h | 10 +++++++
be/src/exec/hdfs-parquet-scanner.cc | 15 ++++-------
be/src/exec/hdfs-scanner.h | 42 ++++++++++++++++++++++++++---
be/src/exec/parquet-column-readers.cc | 43 +++++++++++++++++++-----------
be/src/runtime/tuple.h | 7 +++--
be/src/util/bit-stream-utils.inline.h | 17 +++++++++---
be/src/util/bit-util.h | 7 +++--
be/src/util/dict-encoding.h | 14 ++++++----
be/src/util/rle-encoding.h | 6 +++--
9 files changed, 114 insertions(+), 47 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/common/compiler-util.h
----------------------------------------------------------------------
diff --git a/be/src/common/compiler-util.h b/be/src/common/compiler-util.h
index 7c0379b..5132eb0 100644
--- a/be/src/common/compiler-util.h
+++ b/be/src/common/compiler-util.h
@@ -43,6 +43,16 @@
/// decision, e.g. not inlining a small function on a hot path.
#define ALWAYS_INLINE __attribute__((always_inline))
+/// Clang is pedantic about __restrict__ (e.g. never allows calling a non-__restrict__)
+/// member function from a __restrict__-ed memory function and has some apparent bugs
+/// (e.g. can't convert a __restrict__ reference to a const& __restrict__ reference).
+/// Just disable it.
+#ifdef __clang__
+#define RESTRICT
+#else
+#define RESTRICT __restrict__
+#endif
+
namespace impala {
/// The size of an L1 cache line in bytes on x86-64.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/exec/hdfs-parquet-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc
index 0119efd..4f90cb1 100644
--- a/be/src/exec/hdfs-parquet-scanner.cc
+++ b/be/src/exec/hdfs-parquet-scanner.cc
@@ -930,12 +930,7 @@ Status HdfsParquetScanner::AssembleRows(
while (!column_readers[0]->RowGroupAtEnd()) {
// Start a new scratch batch.
RETURN_IF_ERROR(scratch_batch_->Reset(state_));
- int scratch_capacity = scratch_batch_->capacity;
-
- // Initialize tuple memory.
- for (int i = 0; i < scratch_capacity; ++i) {
- InitTuple(template_tuple_, scratch_batch_->GetTuple(i));
- }
+ InitTupleBuffer(template_tuple_, scratch_batch_->tuple_mem, scratch_batch_->capacity);
// Materialize the top-level slots into the scratch batch column-by-column.
int last_num_tuples = -1;
@@ -944,12 +939,12 @@ Status HdfsParquetScanner::AssembleRows(
for (int c = 0; c < num_col_readers; ++c) {
ParquetColumnReader* col_reader = column_readers[c];
if (col_reader->max_rep_level() > 0) {
- continue_execution = col_reader->ReadValueBatch(
- &scratch_batch_->aux_mem_pool, scratch_capacity,
- tuple_byte_size_, scratch_batch_->tuple_mem, &scratch_batch_->num_tuples);
+ continue_execution = col_reader->ReadValueBatch(&scratch_batch_->aux_mem_pool,
+ scratch_batch_->capacity, tuple_byte_size_, scratch_batch_->tuple_mem,
+ &scratch_batch_->num_tuples);
} else {
continue_execution = col_reader->ReadNonRepeatedValueBatch(
- &scratch_batch_->aux_mem_pool, scratch_capacity, tuple_byte_size_,
+ &scratch_batch_->aux_mem_pool, scratch_batch_->capacity, tuple_byte_size_,
scratch_batch_->tuple_mem, &scratch_batch_->num_tuples);
}
// Check that all column readers populated the same number of values.
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/exec/hdfs-scanner.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scanner.h b/be/src/exec/hdfs-scanner.h
index 24a3c4f..035b41c 100644
--- a/be/src/exec/hdfs-scanner.h
+++ b/be/src/exec/hdfs-scanner.h
@@ -452,17 +452,51 @@ class HdfsScanner {
/// Initialize a tuple.
/// TODO: only copy over non-null slots.
- /// TODO: InitTuple is called frequently, avoid the if, perhaps via templatization.
void InitTuple(const TupleDescriptor* desc, Tuple* template_tuple, Tuple* tuple) {
if (template_tuple != NULL) {
- memcpy(tuple, template_tuple, desc->byte_size());
+ InitTupleFromTemplate(template_tuple, tuple, desc->byte_size());
} else {
tuple->ClearNullBits(*desc);
}
}
- // TODO: replace this function with above once we can inline constants from
- // scan_node_->tuple_desc() via codegen
+ /// Initialize 'tuple' with size 'tuple_byte_size' from 'template_tuple'
+ void InitTupleFromTemplate(Tuple* template_tuple, Tuple* tuple, int tuple_byte_size) {
+ memcpy(tuple, template_tuple, tuple_byte_size);
+ }
+
+ /// Initialize a dense array of 'num_tuples' tuples.
+ /// TODO: we could do better here if we inlined the tuple and null indicator byte
+ /// widths with codegen to eliminate all the branches in memcpy()/memset().
+ void InitTupleBuffer(
+ Tuple* template_tuple, uint8_t* __restrict__ tuple_mem, int64_t num_tuples) {
+ const TupleDescriptor* desc = scan_node_->tuple_desc();
+ const int tuple_byte_size = desc->byte_size();
+ // Handle the different template/non-template cases with different loops to avoid
+ // unnecessary branches inside the loop.
+ if (template_tuple != nullptr) {
+ for (int64_t i = 0; i < num_tuples; ++i) {
+ InitTupleFromTemplate(
+ template_tuple, reinterpret_cast<Tuple*>(tuple_mem), tuple_byte_size);
+ tuple_mem += tuple_byte_size;
+ }
+ } else if (tuple_byte_size <= CACHE_LINE_SIZE) {
+ // If each tuple fits in a cache line, it is quicker to zero the whole memory buffer
+ // instead of just the null indicators. This is because we are fetching the cache
+ // line anyway and zeroing a cache line is cheap (a couple of AVX2 instructions)
+ // compared with the overhead of calling memset() row-by-row.
+ memset(tuple_mem, 0, num_tuples * tuple_byte_size);
+ } else {
+ const int null_bytes_offset = desc->null_bytes_offset();
+ const int num_null_bytes = desc->num_null_bytes();
+ for (int64_t i = 0; i < num_tuples; ++i) {
+ reinterpret_cast<Tuple*>(tuple_mem)->ClearNullBits(
+ null_bytes_offset, num_null_bytes);
+ tuple_mem += tuple_byte_size;
+ }
+ }
+ }
+
void InitTuple(Tuple* template_tuple, Tuple* tuple) {
InitTuple(scan_node_->tuple_desc(), template_tuple, tuple);
}
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/exec/parquet-column-readers.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-column-readers.cc b/be/src/exec/parquet-column-readers.cc
index 3465632..5c7c14d 100644
--- a/be/src/exec/parquet-column-readers.cc
+++ b/be/src/exec/parquet-column-readers.cc
@@ -296,9 +296,14 @@ class ScalarColumnReader : public BaseScalarColumnReader {
/// caching of rep/def levels. Once a data page and cached levels are available,
/// it calls into a more specialized MaterializeValueBatch() for doing the actual
/// value materialization using the level caches.
- template<bool IN_COLLECTION>
- bool ReadValueBatch(MemPool* pool, int max_values, int tuple_size,
- uint8_t* tuple_mem, int* num_values) {
+ /// Use RESTRICT so that the compiler knows that it is safe to cache member
+ /// variables in registers or on the stack (otherwise gcc's alias analysis
+ /// conservatively assumes that buffers like 'tuple_mem', 'num_values' or the
+ /// 'def_levels_' 'rep_levels_' buffers may alias 'this', especially with
+ /// -fno-strict-alias).
+ template <bool IN_COLLECTION>
+ bool ReadValueBatch(MemPool* RESTRICT pool, int max_values, int tuple_size,
+ uint8_t* RESTRICT tuple_mem, int* RESTRICT num_values) RESTRICT {
// Repetition level is only present if this column is nested in a collection type.
if (!IN_COLLECTION) DCHECK_EQ(max_rep_level(), 0) << slot_desc()->DebugString();
if (IN_COLLECTION) DCHECK_GT(max_rep_level(), 0) << slot_desc()->DebugString();
@@ -362,9 +367,14 @@ class ScalarColumnReader : public BaseScalarColumnReader {
/// level caches have been populated.
/// For efficiency, the simple special case of !MATERIALIZED && !IN_COLLECTION is not
/// handled in this function.
- template<bool IN_COLLECTION, bool IS_DICT_ENCODED>
- bool MaterializeValueBatch(MemPool* pool, int max_values, int tuple_size,
- uint8_t* tuple_mem, int* num_values) {
+ /// Use RESTRICT so that the compiler knows that it is safe to cache member
+ /// variables in registers or on the stack (otherwise gcc's alias analysis
+ /// conservatively assumes that buffers like 'tuple_mem', 'num_values' or the
+ /// 'def_levels_' 'rep_levels_' buffers may alias 'this', especially with
+ /// -fno-strict-alias).
+ template <bool IN_COLLECTION, bool IS_DICT_ENCODED>
+ bool MaterializeValueBatch(MemPool* RESTRICT pool, int max_values, int tuple_size,
+ uint8_t* RESTRICT tuple_mem, int* RESTRICT num_values) RESTRICT {
DCHECK(MATERIALIZED || IN_COLLECTION);
DCHECK_GT(num_buffered_values_, 0);
DCHECK(def_levels_.CacheHasNext());
@@ -372,7 +382,7 @@ class ScalarColumnReader : public BaseScalarColumnReader {
uint8_t* curr_tuple = tuple_mem;
int val_count = 0;
- while (def_levels_.CacheHasNext()) {
+ while (def_levels_.CacheHasNext() && val_count < max_values) {
Tuple* tuple = reinterpret_cast<Tuple*>(curr_tuple);
int def_level = def_levels_.CacheGetNext();
@@ -400,10 +410,8 @@ class ScalarColumnReader : public BaseScalarColumnReader {
tuple->SetNull(null_indicator_offset_);
}
}
-
curr_tuple += tuple_size;
++val_count;
- if (UNLIKELY(val_count == max_values)) break;
}
*num_values = val_count;
return true;
@@ -460,11 +468,15 @@ class ScalarColumnReader : public BaseScalarColumnReader {
/// Returns false if execution should be aborted for some reason, e.g. parse_error_ is
/// set, the query is cancelled, or the scan node limit was reached. Otherwise returns
/// true.
- template<bool IS_DICT_ENCODED>
- inline bool ReadSlot(Tuple* tuple, MemPool* pool) {
+ ///
+ /// Force inlining - GCC does not always inline this into hot loops.
+ template <bool IS_DICT_ENCODED>
+ ALWAYS_INLINE bool ReadSlot(Tuple* tuple, MemPool* pool) {
void* slot = tuple->GetSlot(tuple_offset_);
- T val;
- T* val_ptr = NeedsConversionInline() ? &val : reinterpret_cast<T*>(slot);
+ // Use an uninitialized stack allocation for temporary value to avoid running
+ // constructors doing work unnecessarily, e.g. if T == StringValue.
+ alignas(T) uint8_t val_buf[sizeof(T)];
+ T* val_ptr = reinterpret_cast<T*>(NeedsConversionInline() ? val_buf : slot);
if (IS_DICT_ENCODED) {
DCHECK_EQ(page_encoding_, parquet::Encoding::PLAIN_DICTIONARY);
if (UNLIKELY(!dict_decoder_.GetNextValue(val_ptr))) {
@@ -485,9 +497,8 @@ class ScalarColumnReader : public BaseScalarColumnReader {
if (UNLIKELY(NeedsValidationInline() && !ValidateSlot(val_ptr, tuple))) {
return false;
}
- if (UNLIKELY(NeedsConversionInline() &&
- !tuple->IsNull(null_indicator_offset_) &&
- !ConvertSlot(&val, reinterpret_cast<T*>(slot), pool))) {
+ if (UNLIKELY(NeedsConversionInline() && !tuple->IsNull(null_indicator_offset_)
+ && !ConvertSlot(val_ptr, reinterpret_cast<T*>(slot), pool))) {
return false;
}
return true;
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/runtime/tuple.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/tuple.h b/be/src/runtime/tuple.h
index cf278ae..15b90a4 100644
--- a/be/src/runtime/tuple.h
+++ b/be/src/runtime/tuple.h
@@ -72,8 +72,11 @@ class Tuple {
void Init(int size) { memset(this, 0, size); }
void ClearNullBits(const TupleDescriptor& tuple_desc) {
- memset(reinterpret_cast<uint8_t*>(this) + tuple_desc.null_bytes_offset(),
- 0, tuple_desc.num_null_bytes());
+ ClearNullBits(tuple_desc.null_bytes_offset(), tuple_desc.num_null_bytes());
+ }
+
+ void ClearNullBits(int null_bytes_offset, int num_null_bytes) {
+ memset(reinterpret_cast<uint8_t*>(this) + null_bytes_offset, 0, num_null_bytes);
}
/// The total size of all data represented in this tuple (tuple data and referenced
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/util/bit-stream-utils.inline.h
----------------------------------------------------------------------
diff --git a/be/src/util/bit-stream-utils.inline.h b/be/src/util/bit-stream-utils.inline.h
index dee2456..c8744aa 100644
--- a/be/src/util/bit-stream-utils.inline.h
+++ b/be/src/util/bit-stream-utils.inline.h
@@ -15,10 +15,10 @@
// specific language governing permissions and limitations
// under the License.
-
#ifndef IMPALA_UTIL_BIT_STREAM_UTILS_INLINE_H
#define IMPALA_UTIL_BIT_STREAM_UTILS_INLINE_H
+#include "common/compiler-util.h"
#include "util/bit-stream-utils.h"
namespace impala {
@@ -84,14 +84,23 @@ inline bool BitWriter::PutVlqInt(int32_t v) {
return result;
}
-template<typename T>
-inline bool BitReader::GetValue(int num_bits, T* v) {
+/// Force inlining - this is used in perf-critical loops in Parquet and GCC often doesn't
+/// inline it in cases where it's beneficial.
+template <typename T>
+ALWAYS_INLINE inline bool BitReader::GetValue(int num_bits, T* v) {
DCHECK(num_bits == 0 || buffer_ != NULL);
// TODO: revisit this limit if necessary
DCHECK_LE(num_bits, MAX_BITWIDTH);
DCHECK_LE(num_bits, sizeof(T) * 8);
- if (UNLIKELY(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8)) return false;
+ // First do a cheap check to see if we may read past the end of the stream, using
+ // constant upper bounds for 'bit_offset_' and 'num_bits'.
+ if (UNLIKELY(byte_offset_ + sizeof(buffered_values_) + MAX_BITWIDTH / 8 > max_bytes_)) {
+ // Now do the precise check.
+ if (UNLIKELY(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8)) {
+ return false;
+ }
+ }
DCHECK_GE(bit_offset_, 0);
DCHECK_LE(bit_offset_, 64);
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/util/bit-util.h
----------------------------------------------------------------------
diff --git a/be/src/util/bit-util.h b/be/src/util/bit-util.h
index 40a324c..581decc 100644
--- a/be/src/util/bit-util.h
+++ b/be/src/util/bit-util.h
@@ -165,11 +165,10 @@ class BitUtil {
}
/// Returns the 'num_bits' least-significant bits of 'v'.
- static inline uint64_t TrailingBits(uint64_t v, int num_bits) {
- if (UNLIKELY(num_bits == 0)) return 0;
+ /// Force inlining - GCC does not always inline this into hot loops.
+ static ALWAYS_INLINE uint64_t TrailingBits(uint64_t v, int num_bits) {
if (UNLIKELY(num_bits >= 64)) return v;
- int n = 64 - num_bits;
- return (v << n) >> n;
+ return ((1UL << num_bits) - 1) & v;
}
/// Swaps the byte order (i.e. endianess)
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/util/dict-encoding.h
----------------------------------------------------------------------
diff --git a/be/src/util/dict-encoding.h b/be/src/util/dict-encoding.h
index e842495..75e9340 100644
--- a/be/src/util/dict-encoding.h
+++ b/be/src/util/dict-encoding.h
@@ -22,8 +22,9 @@
#include <boost/unordered_map.hpp>
-#include "gutil/strings/substitute.h"
+#include "common/compiler-util.h"
#include "exec/parquet-common.h"
+#include "gutil/strings/substitute.h"
#include "runtime/mem-pool.h"
#include "runtime/string-value.h"
#include "util/bit-util.h"
@@ -285,8 +286,9 @@ inline int DictEncoder<StringValue>::AddToTable(const StringValue& value,
return bytes_added;
}
-template<typename T>
-inline bool DictDecoder<T>::GetNextValue(T* value) {
+// Force inlining - GCC does not always inline this into hot loops in Parquet scanner.
+template <typename T>
+ALWAYS_INLINE inline bool DictDecoder<T>::GetNextValue(T* value) {
int index = -1; // Initialize to avoid compiler warning.
bool result = data_decoder_.Get(&index);
// Use & to avoid branches.
@@ -297,8 +299,10 @@ inline bool DictDecoder<T>::GetNextValue(T* value) {
return false;
}
-template<>
-inline bool DictDecoder<Decimal16Value>::GetNextValue(Decimal16Value* value) {
+// Force inlining - GCC does not always inline this into hot loops in Parquet scanner.
+template <>
+ALWAYS_INLINE inline bool DictDecoder<Decimal16Value>::GetNextValue(
+ Decimal16Value* value) {
int index;
bool result = data_decoder_.Get(&index);
if (!result) return false;
http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/util/rle-encoding.h
----------------------------------------------------------------------
diff --git a/be/src/util/rle-encoding.h b/be/src/util/rle-encoding.h
index 9f39697..2e2dd7f 100644
--- a/be/src/util/rle-encoding.h
+++ b/be/src/util/rle-encoding.h
@@ -245,8 +245,10 @@ class RleEncoder {
uint8_t* literal_indicator_byte_;
};
-template<typename T>
-inline bool RleDecoder::Get(T* val) {
+// Force inlining - this is used in perf-critical loops in Parquet and GCC often
+// doesn't inline it in cases where it's beneficial.
+template <typename T>
+ALWAYS_INLINE inline bool RleDecoder::Get(T* val) {
DCHECK_GE(bit_width_, 0);
// Profiling has shown that the quality and performance of the generated code is very
// sensitive to the exact shape of this check. For example, the version below performs