You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2017/05/25 15:48:41 UTC

[4/4] incubator-impala git commit: IMPALA-5347: Parquet scanner microoptimizations

IMPALA-5347: Parquet scanner microoptimizations

A mix of microoptimizations that profiling the parquet scanner revealed.
All lead to some measurable improvement and added up to significant
speedups for various scans.

* Add ALWAYS_INLINE to hot functions that GCC was mistakenly not inlining
  in all cases.
* Apply __restrict__ in a few places so the compiler knows that it is
  safe to cache values accessed via those pointers
* memset() the whole batch instead of the null indicators is cases where
  it is almost certainly cheaper.
* Avoid unnecessary initialization of often-unused 'val' in ReadSlot().
* Shave a few instructions off the (still very expensive) bit unpacking
  and dict decoding logic.

Performance:

Some local TPC-H and targeted-perf benchmarks showed average speedups of
~5%.

I did some benchmarks targeted at measuring column materialisation
performance using a version of lineitem with duplicated data to make
it bigger. These queries all got significantly faster.

Dict-encoded DECIMAL: 2.23 -> 1.23s

  SELECT count(*) FROM biglineitem WHERE l_quantity > 49

Plain-encoded BIGINT: 2.33s -> 1.62s

  SELECT count(*) FROM biglineitem WHERE l_orderkey != 10

Dict-encoded STRING: 2.73s -> 1.72s

  SELECT count(*) FROM biglineitem WHERE l_returnflag = 'A'

Plain-encoded STRING: 7.07s -> 6.08s (most time spent in Snappy)
  SELECT count(*) FROM biglineitem WHERE length(l_comment) > 37

Multiple columns: 5.15s -> 3.74s

  SELECT count(*) FROM biglineitem
  WHERE l_quantity > 49 and l_partkey != 199 and l_tax < 100

Change-Id: I49ec523a65542fdbabd53fbcc4a8901d769e5cd5
Reviewed-on: http://gerrit.cloudera.org:8080/6950
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/b2782774
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/b2782774
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/b2782774

Branch: refs/heads/master
Commit: b2782774491b5e280c4b51e39c393810deaa1e22
Parents: 25290a0
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Sun May 21 15:04:21 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Thu May 25 11:38:06 2017 +0000

----------------------------------------------------------------------
 be/src/common/compiler-util.h         | 10 +++++++
 be/src/exec/hdfs-parquet-scanner.cc   | 15 ++++-------
 be/src/exec/hdfs-scanner.h            | 42 ++++++++++++++++++++++++++---
 be/src/exec/parquet-column-readers.cc | 43 +++++++++++++++++++-----------
 be/src/runtime/tuple.h                |  7 +++--
 be/src/util/bit-stream-utils.inline.h | 17 +++++++++---
 be/src/util/bit-util.h                |  7 +++--
 be/src/util/dict-encoding.h           | 14 ++++++----
 be/src/util/rle-encoding.h            |  6 +++--
 9 files changed, 114 insertions(+), 47 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/common/compiler-util.h
----------------------------------------------------------------------
diff --git a/be/src/common/compiler-util.h b/be/src/common/compiler-util.h
index 7c0379b..5132eb0 100644
--- a/be/src/common/compiler-util.h
+++ b/be/src/common/compiler-util.h
@@ -43,6 +43,16 @@
 /// decision, e.g. not inlining a small function on a hot path.
 #define ALWAYS_INLINE __attribute__((always_inline))
 
+/// Clang is pedantic about __restrict__ (e.g. never allows calling a non-__restrict__)
+/// member function from a __restrict__-ed memory function and has some apparent bugs
+/// (e.g. can't convert a __restrict__ reference to a const& __restrict__ reference).
+/// Just disable it.
+#ifdef __clang__
+#define RESTRICT
+#else
+#define RESTRICT __restrict__
+#endif
+
 namespace impala {
 
 /// The size of an L1 cache line in bytes on x86-64.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/exec/hdfs-parquet-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc
index 0119efd..4f90cb1 100644
--- a/be/src/exec/hdfs-parquet-scanner.cc
+++ b/be/src/exec/hdfs-parquet-scanner.cc
@@ -930,12 +930,7 @@ Status HdfsParquetScanner::AssembleRows(
   while (!column_readers[0]->RowGroupAtEnd()) {
     // Start a new scratch batch.
     RETURN_IF_ERROR(scratch_batch_->Reset(state_));
-    int scratch_capacity = scratch_batch_->capacity;
-
-    // Initialize tuple memory.
-    for (int i = 0; i < scratch_capacity; ++i) {
-      InitTuple(template_tuple_, scratch_batch_->GetTuple(i));
-    }
+    InitTupleBuffer(template_tuple_, scratch_batch_->tuple_mem, scratch_batch_->capacity);
 
     // Materialize the top-level slots into the scratch batch column-by-column.
     int last_num_tuples = -1;
@@ -944,12 +939,12 @@ Status HdfsParquetScanner::AssembleRows(
     for (int c = 0; c < num_col_readers; ++c) {
       ParquetColumnReader* col_reader = column_readers[c];
       if (col_reader->max_rep_level() > 0) {
-        continue_execution = col_reader->ReadValueBatch(
-            &scratch_batch_->aux_mem_pool, scratch_capacity,
-            tuple_byte_size_, scratch_batch_->tuple_mem, &scratch_batch_->num_tuples);
+        continue_execution = col_reader->ReadValueBatch(&scratch_batch_->aux_mem_pool,
+            scratch_batch_->capacity, tuple_byte_size_, scratch_batch_->tuple_mem,
+            &scratch_batch_->num_tuples);
       } else {
         continue_execution = col_reader->ReadNonRepeatedValueBatch(
-            &scratch_batch_->aux_mem_pool, scratch_capacity, tuple_byte_size_,
+            &scratch_batch_->aux_mem_pool, scratch_batch_->capacity, tuple_byte_size_,
             scratch_batch_->tuple_mem, &scratch_batch_->num_tuples);
       }
       // Check that all column readers populated the same number of values.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/exec/hdfs-scanner.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scanner.h b/be/src/exec/hdfs-scanner.h
index 24a3c4f..035b41c 100644
--- a/be/src/exec/hdfs-scanner.h
+++ b/be/src/exec/hdfs-scanner.h
@@ -452,17 +452,51 @@ class HdfsScanner {
 
   /// Initialize a tuple.
   /// TODO: only copy over non-null slots.
-  /// TODO: InitTuple is called frequently, avoid the if, perhaps via templatization.
   void InitTuple(const TupleDescriptor* desc, Tuple* template_tuple, Tuple* tuple) {
     if (template_tuple != NULL) {
-      memcpy(tuple, template_tuple, desc->byte_size());
+      InitTupleFromTemplate(template_tuple, tuple, desc->byte_size());
     } else {
       tuple->ClearNullBits(*desc);
     }
   }
 
-  // TODO: replace this function with above once we can inline constants from
-  // scan_node_->tuple_desc() via codegen
+  /// Initialize 'tuple' with size 'tuple_byte_size' from 'template_tuple'
+  void InitTupleFromTemplate(Tuple* template_tuple, Tuple* tuple, int tuple_byte_size) {
+    memcpy(tuple, template_tuple, tuple_byte_size);
+  }
+
+  /// Initialize a dense array of 'num_tuples' tuples.
+  /// TODO: we could do better here if we inlined the tuple and null indicator byte
+  /// widths with codegen to eliminate all the branches in memcpy()/memset().
+  void InitTupleBuffer(
+      Tuple* template_tuple, uint8_t* __restrict__ tuple_mem, int64_t num_tuples) {
+    const TupleDescriptor* desc = scan_node_->tuple_desc();
+    const int tuple_byte_size = desc->byte_size();
+    // Handle the different template/non-template cases with different loops to avoid
+    // unnecessary branches inside the loop.
+    if (template_tuple != nullptr) {
+      for (int64_t i = 0; i < num_tuples; ++i) {
+        InitTupleFromTemplate(
+            template_tuple, reinterpret_cast<Tuple*>(tuple_mem), tuple_byte_size);
+        tuple_mem += tuple_byte_size;
+      }
+    } else if (tuple_byte_size <= CACHE_LINE_SIZE) {
+      // If each tuple fits in a cache line, it is quicker to zero the whole memory buffer
+      // instead of just the null indicators. This is because we are fetching the cache
+      // line anyway and zeroing a cache line is cheap (a couple of AVX2 instructions)
+      // compared with the overhead of calling memset() row-by-row.
+      memset(tuple_mem, 0, num_tuples * tuple_byte_size);
+    } else {
+      const int null_bytes_offset = desc->null_bytes_offset();
+      const int num_null_bytes = desc->num_null_bytes();
+      for (int64_t i = 0; i < num_tuples; ++i) {
+        reinterpret_cast<Tuple*>(tuple_mem)->ClearNullBits(
+            null_bytes_offset, num_null_bytes);
+        tuple_mem += tuple_byte_size;
+      }
+    }
+  }
+
   void InitTuple(Tuple* template_tuple, Tuple* tuple) {
     InitTuple(scan_node_->tuple_desc(), template_tuple, tuple);
   }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/exec/parquet-column-readers.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-column-readers.cc b/be/src/exec/parquet-column-readers.cc
index 3465632..5c7c14d 100644
--- a/be/src/exec/parquet-column-readers.cc
+++ b/be/src/exec/parquet-column-readers.cc
@@ -296,9 +296,14 @@ class ScalarColumnReader : public BaseScalarColumnReader {
   /// caching of rep/def levels. Once a data page and cached levels are available,
   /// it calls into a more specialized MaterializeValueBatch() for doing the actual
   /// value materialization using the level caches.
-  template<bool IN_COLLECTION>
-  bool ReadValueBatch(MemPool* pool, int max_values, int tuple_size,
-      uint8_t* tuple_mem, int* num_values) {
+  /// Use RESTRICT so that the compiler knows that it is safe to cache member
+  /// variables in registers or on the stack (otherwise gcc's alias analysis
+  /// conservatively assumes that buffers like 'tuple_mem', 'num_values' or the
+  /// 'def_levels_' 'rep_levels_' buffers may alias 'this', especially with
+  /// -fno-strict-alias).
+  template <bool IN_COLLECTION>
+  bool ReadValueBatch(MemPool* RESTRICT pool, int max_values, int tuple_size,
+      uint8_t* RESTRICT tuple_mem, int* RESTRICT num_values) RESTRICT {
     // Repetition level is only present if this column is nested in a collection type.
     if (!IN_COLLECTION) DCHECK_EQ(max_rep_level(), 0) << slot_desc()->DebugString();
     if (IN_COLLECTION) DCHECK_GT(max_rep_level(), 0) << slot_desc()->DebugString();
@@ -362,9 +367,14 @@ class ScalarColumnReader : public BaseScalarColumnReader {
   /// level caches have been populated.
   /// For efficiency, the simple special case of !MATERIALIZED && !IN_COLLECTION is not
   /// handled in this function.
-  template<bool IN_COLLECTION, bool IS_DICT_ENCODED>
-  bool MaterializeValueBatch(MemPool* pool, int max_values, int tuple_size,
-      uint8_t* tuple_mem, int* num_values) {
+  /// Use RESTRICT so that the compiler knows that it is safe to cache member
+  /// variables in registers or on the stack (otherwise gcc's alias analysis
+  /// conservatively assumes that buffers like 'tuple_mem', 'num_values' or the
+  /// 'def_levels_' 'rep_levels_' buffers may alias 'this', especially with
+  /// -fno-strict-alias).
+  template <bool IN_COLLECTION, bool IS_DICT_ENCODED>
+  bool MaterializeValueBatch(MemPool* RESTRICT pool, int max_values, int tuple_size,
+      uint8_t* RESTRICT tuple_mem, int* RESTRICT num_values) RESTRICT {
     DCHECK(MATERIALIZED || IN_COLLECTION);
     DCHECK_GT(num_buffered_values_, 0);
     DCHECK(def_levels_.CacheHasNext());
@@ -372,7 +382,7 @@ class ScalarColumnReader : public BaseScalarColumnReader {
 
     uint8_t* curr_tuple = tuple_mem;
     int val_count = 0;
-    while (def_levels_.CacheHasNext()) {
+    while (def_levels_.CacheHasNext() && val_count < max_values) {
       Tuple* tuple = reinterpret_cast<Tuple*>(curr_tuple);
       int def_level = def_levels_.CacheGetNext();
 
@@ -400,10 +410,8 @@ class ScalarColumnReader : public BaseScalarColumnReader {
           tuple->SetNull(null_indicator_offset_);
         }
       }
-
       curr_tuple += tuple_size;
       ++val_count;
-      if (UNLIKELY(val_count == max_values)) break;
     }
     *num_values = val_count;
     return true;
@@ -460,11 +468,15 @@ class ScalarColumnReader : public BaseScalarColumnReader {
   /// Returns false if execution should be aborted for some reason, e.g. parse_error_ is
   /// set, the query is cancelled, or the scan node limit was reached. Otherwise returns
   /// true.
-  template<bool IS_DICT_ENCODED>
-  inline bool ReadSlot(Tuple* tuple, MemPool* pool) {
+  ///
+  /// Force inlining - GCC does not always inline this into hot loops.
+  template <bool IS_DICT_ENCODED>
+  ALWAYS_INLINE bool ReadSlot(Tuple* tuple, MemPool* pool) {
     void* slot = tuple->GetSlot(tuple_offset_);
-    T val;
-    T* val_ptr = NeedsConversionInline() ? &val : reinterpret_cast<T*>(slot);
+    // Use an uninitialized stack allocation for temporary value to avoid running
+    // constructors doing work unnecessarily, e.g. if T == StringValue.
+    alignas(T) uint8_t val_buf[sizeof(T)];
+    T* val_ptr = reinterpret_cast<T*>(NeedsConversionInline() ? val_buf : slot);
     if (IS_DICT_ENCODED) {
       DCHECK_EQ(page_encoding_, parquet::Encoding::PLAIN_DICTIONARY);
       if (UNLIKELY(!dict_decoder_.GetNextValue(val_ptr))) {
@@ -485,9 +497,8 @@ class ScalarColumnReader : public BaseScalarColumnReader {
     if (UNLIKELY(NeedsValidationInline() && !ValidateSlot(val_ptr, tuple))) {
       return false;
     }
-    if (UNLIKELY(NeedsConversionInline() &&
-        !tuple->IsNull(null_indicator_offset_) &&
-        !ConvertSlot(&val, reinterpret_cast<T*>(slot), pool))) {
+    if (UNLIKELY(NeedsConversionInline() && !tuple->IsNull(null_indicator_offset_)
+            && !ConvertSlot(val_ptr, reinterpret_cast<T*>(slot), pool))) {
       return false;
     }
     return true;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/runtime/tuple.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/tuple.h b/be/src/runtime/tuple.h
index cf278ae..15b90a4 100644
--- a/be/src/runtime/tuple.h
+++ b/be/src/runtime/tuple.h
@@ -72,8 +72,11 @@ class Tuple {
   void Init(int size) { memset(this, 0, size); }
 
   void ClearNullBits(const TupleDescriptor& tuple_desc) {
-    memset(reinterpret_cast<uint8_t*>(this) + tuple_desc.null_bytes_offset(),
-        0, tuple_desc.num_null_bytes());
+    ClearNullBits(tuple_desc.null_bytes_offset(), tuple_desc.num_null_bytes());
+  }
+
+  void ClearNullBits(int null_bytes_offset, int num_null_bytes) {
+    memset(reinterpret_cast<uint8_t*>(this) + null_bytes_offset, 0, num_null_bytes);
   }
 
   /// The total size of all data represented in this tuple (tuple data and referenced

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/util/bit-stream-utils.inline.h
----------------------------------------------------------------------
diff --git a/be/src/util/bit-stream-utils.inline.h b/be/src/util/bit-stream-utils.inline.h
index dee2456..c8744aa 100644
--- a/be/src/util/bit-stream-utils.inline.h
+++ b/be/src/util/bit-stream-utils.inline.h
@@ -15,10 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
 
-
 #ifndef IMPALA_UTIL_BIT_STREAM_UTILS_INLINE_H
 #define IMPALA_UTIL_BIT_STREAM_UTILS_INLINE_H
 
+#include "common/compiler-util.h"
 #include "util/bit-stream-utils.h"
 
 namespace impala {
@@ -84,14 +84,23 @@ inline bool BitWriter::PutVlqInt(int32_t v) {
   return result;
 }
 
-template<typename T>
-inline bool BitReader::GetValue(int num_bits, T* v) {
+/// Force inlining - this is used in perf-critical loops in Parquet and GCC often doesn't
+/// inline it in cases where it's beneficial.
+template <typename T>
+ALWAYS_INLINE inline bool BitReader::GetValue(int num_bits, T* v) {
   DCHECK(num_bits == 0 || buffer_ != NULL);
   // TODO: revisit this limit if necessary
   DCHECK_LE(num_bits, MAX_BITWIDTH);
   DCHECK_LE(num_bits, sizeof(T) * 8);
 
-  if (UNLIKELY(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8)) return false;
+  // First do a cheap check to see if we may read past the end of the stream, using
+  // constant upper bounds for 'bit_offset_' and 'num_bits'.
+  if (UNLIKELY(byte_offset_ + sizeof(buffered_values_) + MAX_BITWIDTH / 8 > max_bytes_)) {
+    // Now do the precise check.
+    if (UNLIKELY(byte_offset_ * 8 + bit_offset_ + num_bits > max_bytes_ * 8)) {
+      return false;
+    }
+  }
 
   DCHECK_GE(bit_offset_, 0);
   DCHECK_LE(bit_offset_, 64);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/util/bit-util.h
----------------------------------------------------------------------
diff --git a/be/src/util/bit-util.h b/be/src/util/bit-util.h
index 40a324c..581decc 100644
--- a/be/src/util/bit-util.h
+++ b/be/src/util/bit-util.h
@@ -165,11 +165,10 @@ class BitUtil {
   }
 
   /// Returns the 'num_bits' least-significant bits of 'v'.
-  static inline uint64_t TrailingBits(uint64_t v, int num_bits) {
-    if (UNLIKELY(num_bits == 0)) return 0;
+  /// Force inlining - GCC does not always inline this into hot loops.
+  static ALWAYS_INLINE uint64_t TrailingBits(uint64_t v, int num_bits) {
     if (UNLIKELY(num_bits >= 64)) return v;
-    int n = 64 - num_bits;
-    return (v << n) >> n;
+    return ((1UL << num_bits) - 1) & v;
   }
 
   /// Swaps the byte order (i.e. endianess)

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/util/dict-encoding.h
----------------------------------------------------------------------
diff --git a/be/src/util/dict-encoding.h b/be/src/util/dict-encoding.h
index e842495..75e9340 100644
--- a/be/src/util/dict-encoding.h
+++ b/be/src/util/dict-encoding.h
@@ -22,8 +22,9 @@
 
 #include <boost/unordered_map.hpp>
 
-#include "gutil/strings/substitute.h"
+#include "common/compiler-util.h"
 #include "exec/parquet-common.h"
+#include "gutil/strings/substitute.h"
 #include "runtime/mem-pool.h"
 #include "runtime/string-value.h"
 #include "util/bit-util.h"
@@ -285,8 +286,9 @@ inline int DictEncoder<StringValue>::AddToTable(const StringValue& value,
   return bytes_added;
 }
 
-template<typename T>
-inline bool DictDecoder<T>::GetNextValue(T* value) {
+// Force inlining - GCC does not always inline this into hot loops in Parquet scanner.
+template <typename T>
+ALWAYS_INLINE inline bool DictDecoder<T>::GetNextValue(T* value) {
   int index = -1; // Initialize to avoid compiler warning.
   bool result = data_decoder_.Get(&index);
   // Use & to avoid branches.
@@ -297,8 +299,10 @@ inline bool DictDecoder<T>::GetNextValue(T* value) {
   return false;
 }
 
-template<>
-inline bool DictDecoder<Decimal16Value>::GetNextValue(Decimal16Value* value) {
+// Force inlining - GCC does not always inline this into hot loops in Parquet scanner.
+template <>
+ALWAYS_INLINE inline bool DictDecoder<Decimal16Value>::GetNextValue(
+    Decimal16Value* value) {
   int index;
   bool result = data_decoder_.Get(&index);
   if (!result) return false;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b2782774/be/src/util/rle-encoding.h
----------------------------------------------------------------------
diff --git a/be/src/util/rle-encoding.h b/be/src/util/rle-encoding.h
index 9f39697..2e2dd7f 100644
--- a/be/src/util/rle-encoding.h
+++ b/be/src/util/rle-encoding.h
@@ -245,8 +245,10 @@ class RleEncoder {
   uint8_t* literal_indicator_byte_;
 };
 
-template<typename T>
-inline bool RleDecoder::Get(T* val) {
+// Force inlining - this is used in perf-critical loops in Parquet and GCC often
+// doesn't inline it in cases where it's beneficial.
+template <typename T>
+ALWAYS_INLINE inline bool RleDecoder::Get(T* val) {
   DCHECK_GE(bit_width_, 0);
   // Profiling has shown that the quality and performance of the generated code is very
   // sensitive to the exact shape of this check. For example, the version below performs