You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2017/04/21 22:50:07 UTC

[1/4] incubator-impala git commit: IMPALA-4883: Union Codegen

Repository: incubator-impala
Updated Branches:
  refs/heads/master baba8960b -> 58b206ff0


IMPALA-4883: Union Codegen

For each non-passthrough child of the Union node, codegen the loop that
does per row tuple materialization.

Testing:
Ran test_queries.py test locally in exchaustive mode.

Benchmark:
Ran a local benchmark on a local 10 GB TPCDS dataset on an unpartitioned
store_sales table.

SELECT
  COUNT(c),
  COUNT(ss_customer_sk),
  COUNT(ss_cdemo_sk),
  COUNT(ss_hdemo_sk),
  COUNT(ss_addr_sk),
  COUNT(ss_store_sk),
  COUNT(ss_promo_sk),
  COUNT(ss_ticket_number),
  COUNT(ss_quantity),
  COUNT(ss_wholesale_cost),
  COUNT(ss_list_price),
  COUNT(ss_sales_price),
  COUNT(ss_ext_discount_amt),
  COUNT(ss_ext_sales_price),
  COUNT(ss_ext_wholesale_cost),
  COUNT(ss_ext_list_price),
  COUNT(ss_ext_tax),
  COUNT(ss_coupon_amt),
  COUNT(ss_net_paid),
  COUNT(ss_net_paid_inc_tax),
  COUNT(ss_net_profit),
  COUNT(ss_sold_date_sk)
FROM (
  select fnv_hash(ss_sold_time_sk) c, * from tpcds_10_parquet.store_sales_unpartitioned
  union all
  select fnv_hash(ss_sold_time_sk) c, * from tpcds_10_parquet.store_sales_unpartitioned
  union all
  select fnv_hash(ss_sold_time_sk) c, * from tpcds_10_parquet.store_sales_unpartitioned
  union all
  select fnv_hash(ss_sold_time_sk) c, * from tpcds_10_parquet.store_sales_unpartitioned
  union all
  select fnv_hash(ss_sold_time_sk) c, * from tpcds_10_parquet.store_sales_unpartitioned
  union all
  select fnv_hash(ss_sold_time_sk) c, * from tpcds_10_parquet.store_sales_unpartitioned
  union all
  select fnv_hash(ss_sold_time_sk) c, * from tpcds_10_parquet.store_sales_unpartitioned
  union all
  select fnv_hash(ss_sold_time_sk) c, * from tpcds_10_parquet.store_sales_unpartitioned
  union all
  select fnv_hash(ss_sold_time_sk) c, * from tpcds_10_parquet.store_sales_unpartitioned
  union all
  select fnv_hash(ss_sold_time_sk) c, * from tpcds_10_parquet.store_sales_unpartitioned
) t

Before: 39s704ms
Operator          #Hosts   Avg Time   Max Time    #Rows  Est. #Rows   Peak Mem  Est. Peak Mem  Detail
------------------------------------------------------------------------------------------------------------------------------
13:AGGREGATE           1  194.504us  194.504us        1           1   28.00 KB        -1.00 B  FINALIZE
12:EXCHANGE            1   17.284us   17.284us        3           1          0        -1.00 B  UNPARTITIONED
11:AGGREGATE           3    2s202ms    2s934ms        3           1  115.00 KB       10.00 MB
00:UNION               3   32s514ms   34s926ms  288.01M     288.01M    3.08 MB              0
|--02:SCAN HDFS        3  158.373ms  216.085ms   28.80M      28.80M  489.71 MB        1.88 GB  tpcds_10_parquet.store_sales
|--03:SCAN HDFS        3  167.002ms  171.738ms   28.80M      28.80M  489.74 MB        1.88 GB  tpcds_10_parquet.store_sales
|--04:SCAN HDFS        3  125.331ms  145.496ms   28.80M      28.80M  489.57 MB        1.88 GB  tpcds_10_parquet.store_sales
|--05:SCAN HDFS        3  148.478ms  194.311ms   28.80M      28.80M  489.69 MB        1.88 GB  tpcds_10_parquet.store_sales
|--06:SCAN HDFS        3  143.995ms  162.781ms   28.80M      28.80M  489.57 MB        1.88 GB  tpcds_10_parquet.store_sales
|--07:SCAN HDFS        3  169.731ms  250.201ms   28.80M      28.80M  489.58 MB        1.88 GB  tpcds_10_parquet.store_sales
|--08:SCAN HDFS        3  164.110ms  254.374ms   28.80M      28.80M  489.61 MB        1.88 GB  tpcds_10_parquet.store_sales
|--09:SCAN HDFS        3  135.631ms  162.117ms   28.80M      28.80M  489.63 MB        1.88 GB  tpcds_10_parquet.store_sales
|--10:SCAN HDFS        3  138.736ms  167.778ms   28.80M      28.80M  489.67 MB        1.88 GB  tpcds_10_parquet.store_sales
01:SCAN HDFS           3  202.015ms  248.728ms   28.80M      28.80M  489.68 MB        1.88 GB  tpcds_10_parquet.store_sales

After: 20s177ms
Operator          #Hosts   Avg Time   Max Time    #Rows  Est. #Rows   Peak Mem  Est. Peak Mem  Detail
------------------------------------------------------------------------------------------------------------------------------
13:AGGREGATE           1  174.617us  174.617us        1           1   28.00 KB        -1.00 B  FINALIZE
12:EXCHANGE            1   16.693us   16.693us        3           1          0        -1.00 B  UNPARTITIONED
11:AGGREGATE           3    2s830ms    3s615ms        3           1  115.00 KB       10.00 MB
00:UNION               3    4s296ms    5s258ms  288.01M     288.01M    3.08 MB              0
|--02:SCAN HDFS        3    1s212ms    1s340ms   28.80M      28.80M  488.82 MB        1.88 GB  tpcds_10_parquet.store_sales
|--03:SCAN HDFS        3    1s387ms    1s570ms   28.80M      28.80M  489.37 MB        1.88 GB  tpcds_10_parquet.store_sales
|--04:SCAN HDFS        3    1s224ms    1s347ms   28.80M      28.80M  487.22 MB        1.88 GB  tpcds_10_parquet.store_sales
|--05:SCAN HDFS        3    1s245ms    1s321ms   28.80M      28.80M  489.25 MB        1.88 GB  tpcds_10_parquet.store_sales
|--06:SCAN HDFS        3    1s232ms    1s505ms   28.80M      28.80M  484.21 MB        1.88 GB  tpcds_10_parquet.store_sales
|--07:SCAN HDFS        3    1s348ms    1s518ms   28.80M      28.80M  488.20 MB        1.88 GB  tpcds_10_parquet.store_sales
|--08:SCAN HDFS        3    1s231ms    1s335ms   28.80M      28.80M  483.58 MB        1.88 GB  tpcds_10_parquet.store_sales
|--09:SCAN HDFS        3    1s179ms    1s349ms   28.80M      28.80M  482.76 MB        1.88 GB  tpcds_10_parquet.store_sales
|--10:SCAN HDFS        3    1s121ms    1s154ms   28.80M      28.80M  486.59 MB        1.88 GB  tpcds_10_parquet.store_sales
01:SCAN HDFS           3    1s284ms    1s523ms   28.80M      28.80M  486.70 MB        1.88 GB  tpcds_10_parquet.store_sales

Change-Id: Ib4107d27582ff5416172810364a6e76d3d93c439
Reviewed-on: http://gerrit.cloudera.org:8080/6459
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/75553165
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/75553165
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/75553165

Branch: refs/heads/master
Commit: 75553165eed6c5decaa7fd0bfa3ae4d537aeb7ff
Parents: baba896
Author: Taras Bobrovytsky <tb...@cloudera.com>
Authored: Tue Mar 21 18:21:23 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Fri Apr 21 04:53:09 2017 +0000

----------------------------------------------------------------------
 be/src/codegen/gen_ir_descriptions.py           |   4 +-
 be/src/codegen/impala-ir.cc                     |   1 +
 be/src/exec/CMakeLists.txt                      |   1 +
 be/src/exec/union-node-ir.cc                    |  51 +++++++
 be/src/exec/union-node.cc                       | 133 ++++++++++++-------
 be/src/exec/union-node.h                        |  34 ++++-
 .../queries/QueryTest/nested-types-subplan.test |   7 +-
 .../queries/QueryTest/union.test                |  29 ++++
 8 files changed, 203 insertions(+), 57 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/75553165/be/src/codegen/gen_ir_descriptions.py
----------------------------------------------------------------------
diff --git a/be/src/codegen/gen_ir_descriptions.py b/be/src/codegen/gen_ir_descriptions.py
index 4b62dfe..8a82218 100755
--- a/be/src/codegen/gen_ir_descriptions.py
+++ b/be/src/codegen/gen_ir_descriptions.py
@@ -214,7 +214,9 @@ ir_functions = [
   ["MEMPOOL_CHECKED_ALLOCATE",
    "_ZN6impala7MemPool8AllocateILb1EEEPhli"],
   ["RUNTIME_FILTER_EVAL",
-   "_ZNK6impala13RuntimeFilter4EvalEPvRKNS_10ColumnTypeE"]
+   "_ZNK6impala13RuntimeFilter4EvalEPvRKNS_10ColumnTypeE"],
+  ["UNION_MATERIALIZE_BATCH",
+  "_ZN6impala9UnionNode16MaterializeBatchEPNS_8RowBatchEPPh"]
 ]
 
 enums_preamble = '\

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/75553165/be/src/codegen/impala-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/codegen/impala-ir.cc b/be/src/codegen/impala-ir.cc
index 5ff816b..6c8a9d5 100644
--- a/be/src/codegen/impala-ir.cc
+++ b/be/src/codegen/impala-ir.cc
@@ -37,6 +37,7 @@
 #include "exec/partitioned-hash-join-builder-ir.cc"
 #include "exec/partitioned-hash-join-node-ir.cc"
 #include "exec/topn-node-ir.cc"
+#include "exec/union-node-ir.cc"
 #include "exprs/aggregate-functions-ir.cc"
 #include "exprs/bit-byte-functions-ir.cc"
 #include "exprs/cast-functions-ir.cc"

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/75553165/be/src/exec/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/be/src/exec/CMakeLists.txt b/be/src/exec/CMakeLists.txt
index 1193660..8632790 100644
--- a/be/src/exec/CMakeLists.txt
+++ b/be/src/exec/CMakeLists.txt
@@ -98,6 +98,7 @@ add_library(Exec
   topn-node.cc
   topn-node-ir.cc
   union-node.cc
+  union-node-ir.cc
   unnest-node.cc
 )
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/75553165/be/src/exec/union-node-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/union-node-ir.cc b/be/src/exec/union-node-ir.cc
new file mode 100644
index 0000000..38dfc97
--- /dev/null
+++ b/be/src/exec/union-node-ir.cc
@@ -0,0 +1,51 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "exec/union-node.h"
+#include "runtime/tuple-row.h"
+
+using namespace impala;
+
+void IR_ALWAYS_INLINE UnionNode::MaterializeExprs(const std::vector<ExprContext*>& exprs,
+    TupleRow* row, uint8_t* tuple_buf, RowBatch* dst_batch) {
+  DCHECK(!dst_batch->AtCapacity());
+  Tuple* dst_tuple = reinterpret_cast<Tuple*>(tuple_buf);
+  TupleRow* dst_row = dst_batch->GetRow(dst_batch->AddRow());
+  dst_tuple->MaterializeExprs<false, false>(row, *tuple_desc_, exprs, tuple_pool_.get());
+  dst_row->SetTuple(0, dst_tuple);
+  dst_batch->CommitLastRow();
+}
+
+void UnionNode::MaterializeBatch(RowBatch* dst_batch, uint8_t** tuple_buf) {
+  // Take all references to member variables out of the loop to reduce the number of
+  // loads and stores.
+  RowBatch* child_batch = child_batch_.get();
+  int tuple_byte_size = tuple_desc_->byte_size();
+  uint8_t* cur_tuple = *tuple_buf;
+  const std::vector<ExprContext*>& child_exprs = child_expr_lists_[child_idx_];
+
+  int num_rows_to_process = std::min(child_batch->num_rows() - child_row_idx_,
+      dst_batch->capacity() - dst_batch->num_rows());
+  FOREACH_ROW_LIMIT(child_batch, child_row_idx_, num_rows_to_process, batch_iter) {
+    TupleRow* child_row = batch_iter.Get();
+    MaterializeExprs(child_exprs, child_row, cur_tuple, dst_batch);
+    cur_tuple += tuple_byte_size;
+  }
+
+  child_row_idx_ += num_rows_to_process;
+  *tuple_buf = cur_tuple;
+}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/75553165/be/src/exec/union-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/union-node.cc b/be/src/exec/union-node.cc
index bff7435..0da1760 100644
--- a/be/src/exec/union-node.cc
+++ b/be/src/exec/union-node.cc
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include "codegen/llvm-codegen.h"
 #include "exec/union-node.h"
 #include "exprs/expr.h"
 #include "exprs/expr-context.h"
@@ -27,7 +28,8 @@
 
 #include "common/names.h"
 
-namespace impala {
+using namespace llvm;
+using namespace impala;
 
 UnionNode::UnionNode(ObjectPool* pool, const TPlanNode& tnode,
     const DescriptorTbl& descs)
@@ -35,6 +37,7 @@ UnionNode::UnionNode(ObjectPool* pool, const TPlanNode& tnode,
       tuple_id_(tnode.union_node.tuple_id),
       tuple_desc_(nullptr),
       first_materialized_child_idx_(tnode.union_node.first_materialized_child_idx),
+      tuple_pool_(nullptr),
       child_idx_(0),
       child_batch_(nullptr),
       child_row_idx_(0),
@@ -68,6 +71,8 @@ Status UnionNode::Prepare(RuntimeState* state) {
   RETURN_IF_ERROR(ExecNode::Prepare(state));
   tuple_desc_ = state->desc_tbl().GetTupleDescriptor(tuple_id_);
   DCHECK(tuple_desc_ != nullptr);
+  tuple_pool_.reset(new MemPool(mem_tracker()));
+  codegend_union_materialize_batch_fns_.resize(child_expr_lists_.size());
 
   // Prepare const expr lists.
   for (const vector<ExprContext*>& exprs : const_expr_lists_) {
@@ -86,6 +91,50 @@ Status UnionNode::Prepare(RuntimeState* state) {
   return Status::OK();
 }
 
+void UnionNode::Codegen(RuntimeState* state) {
+  DCHECK(state->ShouldCodegen());
+  ExecNode::Codegen(state);
+  if (IsNodeCodegenDisabled()) return;
+
+  LlvmCodeGen* codegen = state->codegen();
+  DCHECK(codegen != nullptr);
+  std::stringstream codegen_message;
+  Status codegen_status;
+  for (int i = 0; i < child_expr_lists_.size(); ++i) {
+    if (IsChildPassthrough(i)) continue;
+
+    llvm::Function* tuple_materialize_exprs_fn;
+    codegen_status = Tuple::CodegenMaterializeExprs(codegen, false, *tuple_desc_,
+        child_expr_lists_[i], tuple_pool_.get(), &tuple_materialize_exprs_fn);
+    if (!codegen_status.ok()) {
+      // Codegen may fail in some corner cases (e.g. we don't handle TYPE_CHAR). If this
+      // happens, abort codegen for this and the remaining children.
+      codegen_message << "Codegen failed for child: " << children_[i]->id();
+      break;
+    }
+
+    // Get a copy of the function. This function will be modified and added to the
+    // vector of functions.
+    Function* union_materialize_batch_fn =
+        codegen->GetFunction(IRFunction::UNION_MATERIALIZE_BATCH, true);
+    DCHECK(union_materialize_batch_fn != nullptr);
+
+    int replaced = codegen->ReplaceCallSites(union_materialize_batch_fn,
+        tuple_materialize_exprs_fn, Tuple::MATERIALIZE_EXPRS_SYMBOL);
+    DCHECK_EQ(replaced, 1) << LlvmCodeGen::Print(union_materialize_batch_fn);
+
+    union_materialize_batch_fn = codegen->FinalizeFunction(
+        union_materialize_batch_fn);
+    DCHECK(union_materialize_batch_fn != nullptr);
+
+    // Add the function to Jit and to the vector of codegened functions.
+    codegen->AddFunctionToJit(union_materialize_batch_fn,
+        reinterpret_cast<void**>(&(codegend_union_materialize_batch_fns_.data()[i])));
+  }
+  runtime_profile()->AddCodegenMsg(
+      codegen_status.ok(), codegen_status, codegen_message.str());
+}
+
 Status UnionNode::Open(RuntimeState* state) {
   SCOPED_TIMER(runtime_profile_->total_time_counter());
   RETURN_IF_ERROR(ExecNode::Open(state));
@@ -114,12 +163,6 @@ Status UnionNode::GetNextPassThrough(RuntimeState* state, RowBatch* row_batch) {
   if (child_eos_) RETURN_IF_ERROR(child(child_idx_)->Open(state));
   DCHECK_EQ(row_batch->num_rows(), 0);
   RETURN_IF_ERROR(child(child_idx_)->GetNext(state, row_batch, &child_eos_));
-  if (limit_ != -1 && num_rows_returned_ + row_batch->num_rows() > limit_) {
-    row_batch->set_num_rows(limit_ - num_rows_returned_);
-  }
-  num_rows_returned_ += row_batch->num_rows();
-  DCHECK(limit_ == -1 || num_rows_returned_ <= limit_);
-  COUNTER_SET(rows_returned_counter_, num_rows_returned_);
   if (child_eos_) {
     // Even though the child is at eos, it's not OK to Close() it here. Once we close
     // the child, the row batches that it produced are invalid. Marking the batch as
@@ -143,10 +186,8 @@ Status UnionNode::GetNextMaterialized(RuntimeState* state, RowBatch* row_batch)
   memset(tuple_buf, 0, tuple_buf_size);
 
   while (HasMoreMaterialized() && !row_batch->AtCapacity()) {
-    // There are only 2 ways of getting out of this loop:
-    // 1. The loop ends normally when we are either done iterating over the children that
-    //    need materialization or the row batch is at capacity.
-    // 2. We return from the function from inside the loop if limit is reached.
+    // The loop runs until we are either done iterating over the children that require
+    // materialization, or the row batch is at capacity.
     DCHECK(!IsChildPassthrough(child_idx_));
     // Child row batch was either never set or we're moving on to a different child.
     if (child_batch_.get() == nullptr) {
@@ -163,12 +204,6 @@ Status UnionNode::GetNextMaterialized(RuntimeState* state, RowBatch* row_batch)
     }
 
     while (!row_batch->AtCapacity()) {
-      // This loop fetches row batches from a single child and materializes each output
-      // row, until one of these conditions:
-      // 1. The loop ends normally if the row batch is at capacity.
-      // 2. We break out of the loop if all the rows were consumed from the current child
-      //    and we are moving on to the next child.
-      // 3. We return from the function from inside the loop if the limit is reached.
       DCHECK(child_batch_.get() != nullptr);
       DCHECK_LE(child_row_idx_, child_batch_->num_rows());
       if (child_row_idx_ == child_batch_->num_rows()) {
@@ -184,21 +219,17 @@ Status UnionNode::GetNextMaterialized(RuntimeState* state, RowBatch* row_batch)
         // try again.
         if (child_batch_->num_rows() == 0) continue;
       }
-      DCHECK_LT(child_row_idx_, child_batch_->num_rows());
-      TupleRow* child_row = child_batch_->GetRow(child_row_idx_);
-      MaterializeExprs(child_expr_lists_[child_idx_], child_row, tuple_buf, row_batch);
-      tuple_buf += tuple_desc_->byte_size();
-      ++child_row_idx_;
-      if (ReachedLimit()) {
-        COUNTER_SET(rows_returned_counter_, num_rows_returned_);
-        // It's OK to close the child here even if we are inside a subplan.
-        child_batch_.reset();
-        child(child_idx_)->Close(state);
-        return Status::OK();
+      DCHECK_EQ(codegend_union_materialize_batch_fns_.size(), children_.size());
+      if (codegend_union_materialize_batch_fns_[child_idx_] == nullptr) {
+        MaterializeBatch(row_batch, &tuple_buf);
+      } else {
+        codegend_union_materialize_batch_fns_[child_idx_](this, row_batch, &tuple_buf);
       }
     }
-
+    // It shouldn't be the case that we reached the limit because we shouldn't have
+    // incremented 'num_rows_returned_' yet.
     DCHECK(!ReachedLimit());
+
     if (child_eos_ && child_row_idx_ == child_batch_->num_rows()) {
       // Unless we are inside a subplan expecting to call Open()/GetNext() on the child
       // again, the child can be closed at this point.
@@ -225,15 +256,14 @@ Status UnionNode::GetNextConst(RuntimeState* state, RowBatch* row_batch) {
   RETURN_IF_ERROR(
       row_batch->ResizeAndAllocateTupleBuffer(state, &tuple_buf_size, &tuple_buf));
   memset(tuple_buf, 0, tuple_buf_size);
-  while (const_expr_list_idx_ < const_expr_lists_.size() &&
-      !row_batch->AtCapacity() && !ReachedLimit()) {
+
+  while (const_expr_list_idx_ < const_expr_lists_.size() && !row_batch->AtCapacity()) {
     MaterializeExprs(
         const_expr_lists_[const_expr_list_idx_], nullptr, tuple_buf, row_batch);
     tuple_buf += tuple_desc_->byte_size();
     ++const_expr_list_idx_;
   }
 
-  COUNTER_SET(rows_returned_counter_, num_rows_returned_);
   return Status::OK();
 }
 
@@ -242,15 +272,22 @@ Status UnionNode::GetNext(RuntimeState* state, RowBatch* row_batch, bool* eos) {
   RETURN_IF_ERROR(ExecDebugAction(TExecNodePhase::GETNEXT, state));
   RETURN_IF_CANCELLED(state);
   RETURN_IF_ERROR(QueryMaintenance(state));
+  // The tuple pool should be empty between GetNext() calls.
+  DCHECK_EQ(tuple_pool_.get()->GetTotalChunkSizes(), 0);
 
   if (to_close_child_idx_ != -1) {
     // The previous child needs to be closed if passthrough was enabled for it. In the non
     // passthrough case, the child was already closed in the previous call to GetNext().
     DCHECK(IsChildPassthrough(to_close_child_idx_));
+    DCHECK(!IsInSubplan());
     child(to_close_child_idx_)->Close(state);
     to_close_child_idx_ = -1;
   }
 
+  // Save the number of rows in case GetNext() is called with a non-empty batch, which can
+  // happen in a subplan.
+  int num_rows_before = row_batch->num_rows();
+
   if (HasMorePassthrough()) {
     RETURN_IF_ERROR(GetNextPassThrough(state, row_batch));
   } else if (HasMoreMaterialized()) {
@@ -259,32 +296,33 @@ Status UnionNode::GetNext(RuntimeState* state, RowBatch* row_batch, bool* eos) {
     RETURN_IF_ERROR(GetNextConst(state, row_batch));
   }
 
+  int num_rows_added = row_batch->num_rows() - num_rows_before;
+  DCHECK_GE(num_rows_added, 0);
+  if (limit_ != -1 && num_rows_returned_ + num_rows_added > limit_) {
+    // Truncate the row batch if we went over the limit.
+    num_rows_added = limit_ - num_rows_returned_;
+    row_batch->set_num_rows(num_rows_before + num_rows_added);
+    DCHECK_GE(num_rows_added, 0);
+  }
+  num_rows_returned_ += num_rows_added;
+
   *eos = ReachedLimit() ||
       (!HasMorePassthrough() && !HasMoreMaterialized() && !HasMoreConst(state));
 
+  // Attach the memory in the tuple pool (if any) to the row batch.
+  row_batch->tuple_data_pool()->AcquireData(tuple_pool_.get(), false);
+  COUNTER_SET(rows_returned_counter_, num_rows_returned_);
   return Status::OK();
 }
 
-void UnionNode::MaterializeExprs(const vector<ExprContext*>& exprs,
-    TupleRow* row, uint8_t* tuple_buf, RowBatch* dst_batch) {
-  DCHECK(!dst_batch->AtCapacity());
-  Tuple* dst_tuple = reinterpret_cast<Tuple*>(tuple_buf);
-  TupleRow* dst_row = dst_batch->GetRow(dst_batch->AddRow());
-  dst_tuple->MaterializeExprs<false, false>(row, *tuple_desc_,
-      exprs, dst_batch->tuple_data_pool());
-  dst_row->SetTuple(0, dst_tuple);
-  dst_batch->CommitLastRow();
-  ++num_rows_returned_;
-}
-
 Status UnionNode::Reset(RuntimeState* state) {
   child_idx_ = 0;
   child_batch_.reset();
   child_row_idx_ = 0;
   child_eos_ = false;
   const_expr_list_idx_ = 0;
-  // Since passthrough is disabled in subplans, verify that there is no passthrough
-  // child that needs to be closed.
+  // Since passthrough is disabled in subplans, verify that there is no passthrough child
+  // that needs to be closed.
   DCHECK_EQ(to_close_child_idx_, -1);
   return ExecNode::Reset(state);
 }
@@ -298,7 +336,6 @@ void UnionNode::Close(RuntimeState* state) {
   for (const vector<ExprContext*>& exprs : child_expr_lists_) {
     Expr::Close(exprs, state);
   }
+  if (tuple_pool_.get() != nullptr) tuple_pool_->FreeAll();
   ExecNode::Close(state);
 }
-
-}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/75553165/be/src/exec/union-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/union-node.h b/be/src/exec/union-node.h
index e1474d0..b1715f9 100644
--- a/be/src/exec/union-node.h
+++ b/be/src/exec/union-node.h
@@ -21,6 +21,7 @@
 
 #include <boost/scoped_ptr.hpp>
 
+#include "codegen/impala-ir.h"
 #include "exec/exec-node.h"
 #include "runtime/row-batch.h"
 #include "runtime/runtime-state.h"
@@ -46,6 +47,7 @@ class UnionNode : public ExecNode {
 
   virtual Status Init(const TPlanNode& tnode, RuntimeState* state);
   virtual Status Prepare(RuntimeState* state);
+  virtual void Codegen(RuntimeState* state);
   virtual Status Open(RuntimeState* state);
   virtual Status GetNext(RuntimeState* state, RowBatch* row_batch, bool* eos);
   virtual Status Reset(RuntimeState* state);
@@ -63,6 +65,13 @@ class UnionNode : public ExecNode {
   /// materialized.
   const int first_materialized_child_idx_;
 
+  /// Used by MaterializeExprs() to materialize var-len slots. The ownership of the memory
+  /// in this pool should be transferred to the row batch at the end of each GetNext()
+  /// call. The memory can't be attached to the row batch in MaterializeExprs() because
+  /// the pointer to the mem pool is hard coded in the codegen'ed MaterializeExprs().
+  /// TODO (IMPALA-5192): Remove this when no longer necessary in the future.
+  boost::scoped_ptr<MemPool> tuple_pool_;
+
   /// Const exprs materialized by this node. These exprs don't refer to any children.
   /// Only materialized by the first fragment instance to avoid duplication.
   std::vector<std::vector<ExprContext*>> const_expr_lists_;
@@ -83,6 +92,13 @@ class UnionNode : public ExecNode {
   /// Index of current row in child_row_batch_.
   int child_row_idx_;
 
+  typedef void (*UnionMaterializeBatchFn)(UnionNode*, RowBatch*, uint8_t**);
+  /// Vector of pointers to codegen'ed MaterializeBatch functions. The vector contains one
+  /// function for each child. The size of the vector should be equal to the number of
+  /// children. If a child is passthrough, there should be a NULL for that child. If
+  /// Codegen is disabled, there should be a NULL for every child.
+  std::vector<UnionMaterializeBatchFn> codegend_union_materialize_batch_fns_;
+
   /// Saved from the last to GetNext() on the current child.
   bool child_eos_;
 
@@ -96,6 +112,9 @@ class UnionNode : public ExecNode {
   /// END: Members that must be Reset()
   /////////////////////////////////////////
 
+  /// The following GetNext* functions don't apply the limit. It must be enforced by the
+  /// caller.
+
   /// GetNext() for the passthrough case. We pass 'row_batch' directly into the GetNext()
   /// call on the child.
   Status GetNextPassThrough(RuntimeState* state, RowBatch* row_batch);
@@ -107,31 +126,36 @@ class UnionNode : public ExecNode {
   /// GetNext() for the constant expression case.
   Status GetNextConst(RuntimeState* state, RowBatch* row_batch);
 
+  /// Evaluates exprs for the current child and materializes the results into 'tuple_buf',
+  /// which is attached to 'dst_batch'. Runs until 'dst_batch' is at capacity, or all rows
+  /// have been consumed from the current child batch. Updates 'child_row_idx_'.
+  void MaterializeBatch(RowBatch* dst_batch, uint8_t** tuple_buf);
+
   /// Evaluates 'exprs' over 'row', materializes the results in 'tuple_buf'.
   /// and appends the new tuple to 'dst_batch'. Increments 'num_rows_returned_'.
-  inline void MaterializeExprs(const std::vector<ExprContext*>& exprs,
+  void MaterializeExprs(const std::vector<ExprContext*>& exprs,
       TupleRow* row, uint8_t* tuple_buf, RowBatch* dst_batch);
 
   /// Returns true if the child at 'child_idx' can be passed through.
-  inline bool IsChildPassthrough(int child_idx) const {
+  bool IsChildPassthrough(int child_idx) const {
     DCHECK_LT(child_idx, children_.size());
     return child_idx < first_materialized_child_idx_;
   }
 
   /// Returns true if there are still rows to be returned from passthrough children.
-  inline bool HasMorePassthrough() const {
+  bool HasMorePassthrough() const {
     return child_idx_ < first_materialized_child_idx_;
   }
 
   /// Returns true if there are still rows to be returned from children that need
   /// materialization.
-  inline bool HasMoreMaterialized() const {
+  bool HasMoreMaterialized() const {
     return first_materialized_child_idx_ != children_.size() &&
         child_idx_ < children_.size();
   }
 
   /// Returns true if there are still rows to be returned from constant expressions.
-  inline bool HasMoreConst(const RuntimeState* state) const {
+  bool HasMoreConst(const RuntimeState* state) const {
     return state->instance_ctx().per_fragment_instance_idx == 0 &&
         const_expr_list_idx_ < const_expr_lists_.size();
   }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/75553165/testdata/workloads/functional-query/queries/QueryTest/nested-types-subplan.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/nested-types-subplan.test b/testdata/workloads/functional-query/queries/QueryTest/nested-types-subplan.test
index fa494ab..90c925d 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/nested-types-subplan.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/nested-types-subplan.test
@@ -580,9 +580,10 @@ BIGINT,BIGINT
 # IMPALA-3678: union in a subplan - passthrough should be disabled.
 select count(c.c_custkey), count(v.tot_price)
 from tpch_nested_parquet.customer c, (
-select sum(o_totalprice) tot_price from c.c_orders
-union
-select sum(o_totalprice) tot_price from c.c_orders) v;
+  select sum(o_totalprice) tot_price from c.c_orders
+  union
+  select sum(o_totalprice) tot_price from c.c_orders
+) v;
 ---- RESULTS
 150000,99996
 ---- TYPES

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/75553165/testdata/workloads/functional-query/queries/QueryTest/union.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/union.test b/testdata/workloads/functional-query/queries/QueryTest/union.test
index ae75c12..5783c24 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/union.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/union.test
@@ -1121,3 +1121,32 @@ select t1.bigint_col from alltypestiny t1 inner join
 ---- TYPES
 bigint
 =====
+---- QUERY
+# IMPALA-4883: The second union operand references a char column, which causes codegen
+# to fail and be disabled for that operand and all operands that follow it. For this query
+# codegen is enabled only for the first operand.
+select count(s) from (
+  select cast(id as string) as s from alltypestiny
+  union all
+  select cast(cl as string) as s from functional.chars_tiny
+  union all
+  select cast(id as string) as s from alltypestiny
+) t
+---- RESULTS
+24
+---- TYPES
+bigint
+=====
+---- QUERY
+# IMPALA-4883: Verify that the union limit is enforced correctly
+select count(b) from (
+  select bigint_col as b from alltypestiny limit 4
+  union all
+  (select bigint_col as b from alltypestiny limit 4)
+  limit 7
+) t
+---- RESULTS
+7
+---- TYPES
+bigint
+=====


[4/4] incubator-impala git commit: IMPALA-5229: huge page-backed buffers with TCMalloc

Posted by ta...@apache.org.
IMPALA-5229: huge page-backed buffers with TCMalloc

This commit enables transparent huge pages when we're
allocating via malloc(), not just mmap(). This gives
us the perf benefits of huge pages, without the
challenge that the mmap() path presented - the overhead
of mapping and unmapping memory and the difficulty in
reasoning about peak virtual memory consumption.

Also sneak in some cleanup - use Rvalue refs for
BufferHandle methods where appropriate.

Testing:
Updated backend tests to ensure this combination is covered.
Ran some end-to-end tests and stress tests on my buffer pool
dev branch and all looks good.

Perf:
Compared to current master, this provides a pretty clear perf
benefit: I ran benchmarks on a single daemon with a reasonably
large TPC-H scale factor. Large aggregations are much faster
and everything else is the same (within variance) or slightly
faster.

Report Generated on 2017-04-18
Run Description: "Base: 68f32e52bc42bef578330a4fe0edc5b292891eea vs Ref: f39d69bcd8bdc7d6d4fb42ef19966a26dea3a29d"

Cluster Name: UNKNOWN
Lab Run Info: UNKNOWN
Impala Version:          impalad version 2.9.0-SNAPSHOT RELEASE ()
Baseline Impala Version: impalad version 2.9.0-SNAPSHOT RELEASE (2017-04-06)

+--------------------+-----------------------+---------+------------+------------+----------------+
| Workload           | File Format           | Avg (s) | Delta(Avg) | GeoMean(s) | Delta(GeoMean) |
+--------------------+-----------------------+---------+------------+------------+----------------+
| TARGETED-PERF(_60) | parquet / none / none | 19.30   | -3.05%     | 4.91       | -0.91%         |
+--------------------+-----------------------+---------+------------+------------+----------------+

+--------------------+--------------------------------------------------------+-----------------------+--------+-------------+------------+------------+----------------+-------------+-------+
| Workload           | Query                                                  | File Format           | Avg(s) | Base Avg(s) | Delta(Avg) | StdDev(%)  | Base StdDev(%) | Num Clients | Iters |
+--------------------+--------------------------------------------------------+-----------------------+--------+-------------+------------+------------+----------------+-------------+-------+
| TARGETED-PERF(_60) | PERF_LIMIT-Q1                                          | parquet / none / none | 0.01   | 0.01        | R +22.95%  |   6.12%    |   2.30%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_topn_bigint                                  | parquet / none / none | 5.14   | 4.66        |   +10.35%  |   5.00%    | * 13.39% *     | 1           | 5     |
| TARGETED-PERF(_60) | primitive_conjunct_ordering_4                          | parquet / none / none | 0.24   | 0.23        |   +8.12%   | * 12.81% * |   1.76%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_broadcast_join_3                             | parquet / none / none | 7.86   | 7.39        |   +6.44%   |   1.49%    |   1.41%        | 1           | 5     |
| TARGETED-PERF(_60) | PERF_STRING-Q6                                         | parquet / none / none | 10.53  | 10.30       |   +2.24%   |   0.61%    |   0.30%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_conjunct_ordering_5                          | parquet / none / none | 17.23  | 16.90       |   +1.90%   |   1.61%    |   1.05%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_conjunct_ordering_3                          | parquet / none / none | 3.19   | 3.13        |   +1.81%   |   1.47%    |   0.45%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_filter_bigint_non_selective                  | parquet / none / none | 1.08   | 1.06        |   +1.60%   |   0.26%    |   2.07%        | 1           | 5     |
| TARGETED-PERF(_60) | PERF_STRING-Q3                                         | parquet / none / none | 3.75   | 3.71        |   +1.14%   |   0.39%    |   0.80%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_broadcast_join_2                             | parquet / none / none | 5.15   | 5.09        |   +1.11%   |   1.18%    |   0.89%        | 1           | 5     |
| TARGETED-PERF(_60) | PERF_STRING-Q2                                         | parquet / none / none | 3.47   | 3.44        |   +1.03%   |   1.27%    |   0.61%        | 1           | 5     |
| TARGETED-PERF(_60) | PERF_AGG-Q1                                            | parquet / none / none | 2.53   | 2.51        |   +1.01%   |   1.75%    |   1.91%        | 1           | 5     |
| TARGETED-PERF(_60) | PERF_STRING-Q7                                         | parquet / none / none | 8.37   | 8.31        |   +0.81%   |   0.49%    |   0.58%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_filter_string_non_selective                  | parquet / none / none | 1.90   | 1.88        |   +0.74%   |   1.81%    |   0.73%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_filter_string_like                           | parquet / none / none | 14.82  | 14.73       |   +0.62%   |   0.17%    |   0.02%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_top-n_all                                    | parquet / none / none | 40.93  | 40.69       |   +0.61%   |   0.09%    |   0.15%        | 1           | 5     |
| TARGETED-PERF(_60) | PERF_STRING-Q4                                         | parquet / none / none | 3.76   | 3.74        |   +0.60%   |   0.79%    |   0.53%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_broadcast_join_1                             | parquet / none / none | 2.54   | 2.53        |   +0.56%   |   1.43%    |   0.88%        | 1           | 5     |
| TARGETED-PERF(_60) | PERF_AGG-Q2                                            | parquet / none / none | 8.90   | 8.85        |   +0.56%   |   0.80%    |   0.81%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_empty_build_join_1                           | parquet / none / none | 23.47  | 23.39       |   +0.35%   |   0.24%    |   0.30%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_filter_bigint_selective                      | parquet / none / none | 0.17   | 0.17        |   +0.35%   |   0.91%    |   1.37%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_orderby_all                                  | parquet / none / none | 32.74  | 32.66       |   +0.24%   |   0.30%    |   0.24%        | 1           | 5     |
| TARGETED-PERF(_60) | PERF_AGG-Q5                                            | parquet / none / none | 0.63   | 0.63        |   +0.02%   |   0.42%    |   0.79%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_filter_string_selective                      | parquet / none / none | 1.94   | 1.94        |   -0.08%   |   2.31%    |   1.13%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_filter_decimal_selective                     | parquet / none / none | 1.63   | 1.63        |   -0.10%   |   0.40%    |   0.64%        | 1           | 5     |
| TARGETED-PERF(_60) | PERF_STRING-Q5                                         | parquet / none / none | 4.63   | 4.64        |   -0.13%   |   0.54%    |   0.67%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_conjunct_ordering_2                          | parquet / none / none | 9.98   | 10.03       |   -0.44%   |   0.41%    |   0.64%        | 1           | 5     |
| TARGETED-PERF(_60) | PERF_AGG-Q7                                            | parquet / none / none | 2.04   | 2.05        |   -0.59%   |   2.70%    |   2.54%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_orderby_bigint                               | parquet / none / none | 5.76   | 5.81        |   -0.75%   |   0.89%    |   0.40%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_filter_in_predicate                          | parquet / none / none | 2.26   | 2.27        |   -0.84%   |   1.72%    |   1.91%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_exchange_broadcast                           | parquet / none / none | 37.21  | 37.57       |   -0.95%   |   1.86%    |   1.55%        | 1           | 5     |
| TARGETED-PERF(_60) | PERF_STRING-Q1                                         | parquet / none / none | 3.15   | 3.20        |   -1.33%   |   0.66%    |   0.41%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_groupby_decimal_lowndv.test                  | parquet / none / none | 3.82   | 3.87        |   -1.34%   |   1.12%    |   1.45%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_shuffle_join_one_to_many_string_with_groupby | parquet / none / none | 262.47 | 266.28      |   -1.43%   |   0.83%    |   0.65%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_conjunct_ordering_1                          | parquet / none / none | 0.07   | 0.07        |   -1.76%   |   0.46%    |   3.42%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_groupby_bigint_highndv                       | parquet / none / none | 32.43  | 33.08       |   -1.97%   |   0.55%    |   0.89%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_groupby_bigint_lowndv                        | parquet / none / none | 3.80   | 3.89        |   -2.25%   |   0.50%    |   0.88%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_groupby_bigint_pk                            | parquet / none / none | 111.05 | 113.86      |   -2.47%   |   0.31%    |   0.25%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_shuffle_join_union_all_with_groupby          | parquet / none / none | 52.19  | 53.59       |   -2.60%   |   0.12%    |   0.39%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_filter_decimal_non_selective                 | parquet / none / none | 1.64   | 1.69        |   -2.77%   |   1.37%    |   3.05%        | 1           | 5     |
| TARGETED-PERF(_60) | PERF_AGG-Q3                                            | parquet / none / none | 12.34  | 12.90       |   -4.32%   |   0.81%    |   1.05%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_exchange_shuffle                             | parquet / none / none | 80.49  | 85.96       |   -6.37%   |   3.41%    |   0.54%        | 1           | 5     |
| TARGETED-PERF(_60) | PERF_AGG-Q6                                            | parquet / none / none | 2.01   | 2.27        |   -11.49%  |   1.05%    | * 15.97% *     | 1           | 5     |
| TARGETED-PERF(_60) | PERF_AGG-Q4                                            | parquet / none / none | 17.69  | 21.46       |   -17.59%  |   0.44%    |   0.10%        | 1           | 5     |
| TARGETED-PERF(_60) | primitive_groupby_decimal_highndv                      | parquet / none / none | 21.50  | 31.83       | I -32.43%  |   2.50%    |   0.56%        | 1           | 5     |
+--------------------+--------------------------------------------------------+-----------------------+--------+-------------+------------+------------+----------------+-------------+-------+

(R) Regression: TARGETED-PERF(_60) PERF_LIMIT-Q1 [parquet / none / none] (0.01s -> 0.01s [+22.95%])
+----------+------------+-----+----------+------------+-----------+-----+----------+------------+--------+-------+-----------+
| Operator | % of Query | Avg | Base Avg | Delta(Avg) | StdDev(%) | Max | Base Max | Delta(Max) | #Hosts | #Rows | Est #Rows |
+----------+------------+-----+----------+------------+-----------+-----+----------+------------+--------+-------+-----------+
+----------+------------+-----+----------+------------+-----------+-----+----------+------------+--------+-------+-----------+

(I) Improvement: TARGETED-PERF(_60) primitive_groupby_decimal_highndv [parquet / none / none] (31.83s -> 21.50s [-32.43%])
+--------------+------------+----------+----------+------------+-----------+----------+----------+------------+--------+---------+-----------+
| Operator     | % of Query | Avg      | Base Avg | Delta(Avg) | StdDev(%) | Max      | Base Max | Delta(Max) | #Hosts | #Rows   | Est #Rows |
+--------------+------------+----------+----------+------------+-----------+----------+----------+------------+--------+---------+-----------+
| 01:AGGREGATE | 94.77%     | 20.36s   | 30.60s   | -33.47%    |   2.53%   | 21.20s   | 30.73s   | -31.02%    | 1      | 3.17M   | 2.98M     |
| 00:SCAN HDFS | 3.65%      | 783.14ms | 841.47ms | -6.93%     |   3.69%   | 832.41ms | 861.74ms | -3.40%     | 1      | 360.01M | 360.01M   |
+--------------+------------+----------+----------+------------+-----------+----------+----------+------------+--------+---------+-----------+

(V) Significant Variability: TARGETED-PERF(_60) primitive_conjunct_ordering_4 [parquet / none / none] (1.76% -> 12.81%)
+--------------+------------+-----------+----------------+------------------+--------+-------+-----------+
| Operator     | % of Query | StdDev(%) | Base StdDev(%) | Delta(StdDev(%)) | #Hosts | #Rows | Est #Rows |
+--------------+------------+-----------+----------------+------------------+--------+-------+-----------+
| 03:AGGREGATE | 10.76%     | 18.81%    | 16.87%         | +11.48%          | 1      | 0     | 1         |
| 01:AGGREGATE | 5.02%      | 38.41%    | 14.69%         | +161.46%         | 1      | 0     | 1         |
+--------------+------------+-----------+----------------+------------------+--------+-------+-----------+

Significant perf change detected

Run Description: "Base: 68f32e52bc42bef578330a4fe0edc5b292891eea vs Ref: f39d69bcd8bdc7d6d4fb42ef19966a26dea3a29d"

Cluster Name: UNKNOWN
Lab Run Info: UNKNOWN
Impala Version:          impalad version 2.9.0-SNAPSHOT RELEASE ()
Baseline Impala Version: impalad version 2.9.0-SNAPSHOT RELEASE (2017-04-06)

+-----------+-----------------------+---------+------------+------------+----------------+
| Workload  | File Format           | Avg (s) | Delta(Avg) | GeoMean(s) | Delta(GeoMean) |
+-----------+-----------------------+---------+------------+------------+----------------+
| TPCH(_60) | parquet / none / none | 17.50   | -1.88%     | 12.18      | -1.70%         |
+-----------+-----------------------+---------+------------+------------+----------------+

+-----------+----------+-----------------------+--------+-------------+------------+-----------+----------------+-------------+-------+
| Workload  | Query    | File Format           | Avg(s) | Base Avg(s) | Delta(Avg) | StdDev(%) | Base StdDev(%) | Num Clients | Iters |
+-----------+----------+-----------------------+--------+-------------+------------+-----------+----------------+-------------+-------+
| TPCH(_60) | TPCH-Q12 | parquet / none / none | 9.63   | 9.31        |   +3.50%   |   4.84%   |   2.62%        | 1           | 5     |
| TPCH(_60) | TPCH-Q14 | parquet / none / none | 8.20   | 7.96        |   +2.99%   |   5.30%   |   0.30%        | 1           | 5     |
| TPCH(_60) | TPCH-Q17 | parquet / none / none | 16.61  | 16.19       |   +2.64%   |   1.10%   |   0.55%        | 1           | 5     |
| TPCH(_60) | TPCH-Q5  | parquet / none / none | 9.77   | 9.70        |   +0.76%   |   0.39%   |   0.71%        | 1           | 5     |
| TPCH(_60) | TPCH-Q1  | parquet / none / none | 28.02  | 27.86       |   +0.59%   |   0.61%   |   0.55%        | 1           | 5     |
| TPCH(_60) | TPCH-Q20 | parquet / none / none | 7.83   | 7.79        |   +0.45%   |   1.61%   |   1.53%        | 1           | 5     |
| TPCH(_60) | TPCH-Q15 | parquet / none / none | 11.02  | 10.97       |   +0.42%   |   0.19%   |   0.53%        | 1           | 5     |
| TPCH(_60) | TPCH-Q6  | parquet / none / none | 4.61   | 4.59        |   +0.40%   |   0.44%   |   0.98%        | 1           | 5     |
| TPCH(_60) | TPCH-Q21 | parquet / none / none | 68.05  | 68.11       |   -0.09%   |   0.08%   |   0.21%        | 1           | 5     |
| TPCH(_60) | TPCH-Q18 | parquet / none / none | 47.03  | 47.10       |   -0.15%   |   1.63%   |   0.28%        | 1           | 5     |
| TPCH(_60) | TPCH-Q11 | parquet / none / none | 2.43   | 2.44        |   -0.58%   |   4.45%   |   2.23%        | 1           | 5     |
| TPCH(_60) | TPCH-Q8  | parquet / none / none | 12.31  | 12.43       |   -0.94%   |   0.79%   |   0.54%        | 1           | 5     |
| TPCH(_60) | TPCH-Q4  | parquet / none / none | 7.94   | 8.04        |   -1.16%   |   0.92%   |   0.52%        | 1           | 5     |
| TPCH(_60) | TPCH-Q19 | parquet / none / none | 11.02  | 11.20       |   -1.67%   |   0.70%   |   0.39%        | 1           | 5     |
| TPCH(_60) | TPCH-Q2  | parquet / none / none | 3.28   | 3.34        |   -1.95%   |   0.86%   |   2.08%        | 1           | 5     |
| TPCH(_60) | TPCH-Q9  | parquet / none / none | 35.53  | 36.37       |   -2.29%   |   0.48%   |   0.23%        | 1           | 5     |
| TPCH(_60) | TPCH-Q3  | parquet / none / none | 12.56  | 13.11       |   -4.19%   |   0.99%   |   0.81%        | 1           | 5     |
| TPCH(_60) | TPCH-Q16 | parquet / none / none | 5.28   | 5.52        |   -4.31%   |   0.69%   |   0.49%        | 1           | 5     |
| TPCH(_60) | TPCH-Q10 | parquet / none / none | 13.85  | 14.55       |   -4.82%   |   0.66%   |   0.81%        | 1           | 5     |
| TPCH(_60) | TPCH-Q7  | parquet / none / none | 41.17  | 44.36       |   -7.20%   |   1.40%   |   0.29%        | 1           | 5     |
| TPCH(_60) | TPCH-Q13 | parquet / none / none | 22.26  | 24.06       |   -7.46%   |   0.55%   |   0.27%        | 1           | 5     |
| TPCH(_60) | TPCH-Q22 | parquet / none / none | 6.57   | 7.35        |   -10.63%  |   0.62%   |   0.50%        | 1           | 5     |
+-----------+----------+-----------------------+--------+-------------+------------+-----------+----------------+-------------+-------+

Change-Id: If84b46a46efed9aee6af41b5f10bf3f4b15889b8
Reviewed-on: http://gerrit.cloudera.org:8080/6687
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/58b206ff
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/58b206ff
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/58b206ff

Branch: refs/heads/master
Commit: 58b206ff0e66e6357aaf81a9e54f660472171c88
Parents: 7fcf1ea
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Mon Apr 17 23:49:31 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Fri Apr 21 21:25:40 2017 +0000

----------------------------------------------------------------------
 .../runtime/bufferpool/buffer-allocator-test.cc |  1 -
 be/src/runtime/bufferpool/buffer-allocator.cc   |  4 +-
 be/src/runtime/bufferpool/system-allocator.cc   | 70 ++++++++++++++++----
 be/src/runtime/bufferpool/system-allocator.h    |  3 +
 be/src/runtime/row-batch.cc                     |  4 +-
 be/src/runtime/row-batch.h                      |  4 +-
 6 files changed, 67 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/58b206ff/be/src/runtime/bufferpool/buffer-allocator-test.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/bufferpool/buffer-allocator-test.cc b/be/src/runtime/bufferpool/buffer-allocator-test.cc
index 9887086..167298d 100644
--- a/be/src/runtime/bufferpool/buffer-allocator-test.cc
+++ b/be/src/runtime/bufferpool/buffer-allocator-test.cc
@@ -182,7 +182,6 @@ int main(int argc, char** argv) {
   int result = 0;
   for (bool mmap : {false, true}) {
     for (bool madvise : {false, true}) {
-      if (madvise && !mmap) continue; // Not an interesting combination.
       std::cerr << "+==================================================" << std::endl
                 << "| Running tests with mmap=" << mmap << " madvise=" << madvise
                 << std::endl

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/58b206ff/be/src/runtime/bufferpool/buffer-allocator.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/bufferpool/buffer-allocator.cc b/be/src/runtime/bufferpool/buffer-allocator.cc
index 0fd88fd..77169ce 100644
--- a/be/src/runtime/bufferpool/buffer-allocator.cc
+++ b/be/src/runtime/bufferpool/buffer-allocator.cc
@@ -43,7 +43,7 @@ class BufferPool::FreeBufferArena : public CacheLineAligned {
 
   /// Add a free buffer to the free lists. May free buffers to the system allocator
   /// if the list becomes full. Caller should not hold 'lock_'
-  void AddFreeBuffer(BufferHandle buffer);
+  void AddFreeBuffer(BufferHandle&& buffer);
 
   /// Try to get a free buffer of 'buffer_len' bytes from this arena. Returns true and
   /// sets 'buffer' if found or false if not found. Caller should not hold 'lock_'.
@@ -406,7 +406,7 @@ BufferPool::FreeBufferArena::~FreeBufferArena() {
   }
 }
 
-void BufferPool::FreeBufferArena::AddFreeBuffer(BufferHandle buffer) {
+void BufferPool::FreeBufferArena::AddFreeBuffer(BufferHandle&& buffer) {
   lock_guard<SpinLock> al(lock_);
   PerSizeLists* lists = GetListsForSize(buffer.len());
   FreeList* list = &lists->free_buffers;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/58b206ff/be/src/runtime/bufferpool/system-allocator.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/bufferpool/system-allocator.cc b/be/src/runtime/bufferpool/system-allocator.cc
index 7598d3c..756170a 100644
--- a/be/src/runtime/bufferpool/system-allocator.cc
+++ b/be/src/runtime/bufferpool/system-allocator.cc
@@ -19,27 +19,40 @@
 
 #include <sys/mman.h>
 
+#include <gperftools/malloc_extension.h>
+
+#include "gutil/strings/substitute.h"
 #include "util/bit-util.h"
 
+#include "common/names.h"
+
 // TODO: IMPALA-5073: this should eventually become the default once we are confident
 // that it is superior to allocating via TCMalloc.
 DEFINE_bool(mmap_buffers, false,
-    "(Advanced) If true, allocate buffers directly from the operating system instead of "
-    "with TCMalloc.");
+    "(Experimental) If true, allocate buffers directly from the operating system "
+    "instead of with TCMalloc.");
 
 DEFINE_bool(madvise_huge_pages, true,
-    "(Advanced) If true and --mmap_buffers is also "
-    "true, advise operating system to back large memory buffers with huge pages");
+    "(Advanced) If true, advise operating system to back large memory buffers with huge "
+    "pages");
 
 namespace impala {
 
-/// This is the huge page size on x86-64. We could parse /proc/meminfo to programmatically
+/// These are the page sizes on x86-64. We could parse /proc/meminfo to programmatically
 /// get this, but it is unlikely to change unless we port to a different architecture.
+static int64_t SMALL_PAGE_SIZE = 4LL * 1024;
 static int64_t HUGE_PAGE_SIZE = 2LL * 1024 * 1024;
 
 SystemAllocator::SystemAllocator(int64_t min_buffer_len)
   : min_buffer_len_(min_buffer_len) {
   DCHECK(BitUtil::IsPowerOf2(min_buffer_len));
+#ifndef ADDRESS_SANITIZER
+  // Free() assumes that aggressive decommit is enabled for TCMalloc.
+  size_t aggressive_decommit_enabled;
+  MallocExtension::instance()->GetNumericProperty(
+      "tcmalloc.aggressive_memory_decommit", &aggressive_decommit_enabled);
+  CHECK_EQ(true, aggressive_decommit_enabled);
+#endif
 }
 
 Status SystemAllocator::Allocate(int64_t len, BufferPool::BufferHandle* buffer) {
@@ -51,13 +64,7 @@ Status SystemAllocator::Allocate(int64_t len, BufferPool::BufferHandle* buffer)
   if (FLAGS_mmap_buffers) {
     RETURN_IF_ERROR(AllocateViaMMap(len, &buffer_mem));
   } else {
-    // AddressSanitizer does not instrument mmap(). Use malloc() to preserve
-    // instrumentation.
-    buffer_mem = reinterpret_cast<uint8_t*>(malloc(len));
-    if (buffer_mem == nullptr) {
-      return Status(
-          TErrorCode::BUFFER_ALLOCATION_FAILED, len, "malloc() failed under asan");
-    }
+    RETURN_IF_ERROR(AllocateViaMalloc(len, &buffer_mem));
   }
   buffer->Open(buffer_mem, len, CpuInfo::GetCurrentCore());
   return Status::OK();
@@ -107,11 +114,50 @@ Status SystemAllocator::AllocateViaMMap(int64_t len, uint8_t** buffer_mem) {
   return Status::OK();
 }
 
+Status SystemAllocator::AllocateViaMalloc(int64_t len, uint8_t** buffer_mem) {
+  bool use_huge_pages = len % HUGE_PAGE_SIZE == 0 && FLAGS_madvise_huge_pages;
+  // Allocate, aligned to the page size that we expect to back the memory range.
+  // This ensures that it can be backed by a whole pages, rather than parts of pages.
+  size_t alignment = use_huge_pages ? HUGE_PAGE_SIZE : SMALL_PAGE_SIZE;
+  int rc = posix_memalign(reinterpret_cast<void**>(buffer_mem), alignment, len);
+  if (rc != 0) {
+    return Status(TErrorCode::BUFFER_ALLOCATION_FAILED, len,
+        Substitute("posix_memalign() failed to allocate buffer: $0", GetStrErrMsg()));
+  }
+  if (use_huge_pages) {
+#ifdef MADV_HUGEPAGE
+    // According to madvise() docs it may return EAGAIN to signal that we should retry.
+    do {
+      rc = madvise(*buffer_mem, len, MADV_HUGEPAGE);
+    } while (rc == -1 && errno == EAGAIN);
+    DCHECK(rc == 0) << "madvise(MADV_HUGEPAGE) shouldn't fail" << errno;
+#endif
+  }
+  return Status::OK();
+}
+
 void SystemAllocator::Free(BufferPool::BufferHandle&& buffer) {
   if (FLAGS_mmap_buffers) {
     int rc = munmap(buffer.data(), buffer.len());
     DCHECK_EQ(rc, 0) << "Unexpected munmap() error: " << errno;
   } else {
+    bool use_huge_pages = buffer.len() % HUGE_PAGE_SIZE == 0 && FLAGS_madvise_huge_pages;
+    if (use_huge_pages) {
+      // Undo the madvise so that is isn't a candidate to be newly backed by huge pages.
+      // We depend on TCMalloc's "aggressive decommit" mode decommitting the physical
+      // huge pages with madvise(DONTNEED) when we call free(). Otherwise, this huge
+      // page region may be divvied up and subsequently decommitted in smaller chunks,
+      // which may not actually release the physical memory, causing Impala physical
+      // memory usage to exceed the process limit.
+#ifdef MADV_NOHUGEPAGE
+      // According to madvise() docs it may return EAGAIN to signal that we should retry.
+      int rc;
+      do {
+        rc = madvise(buffer.data(), buffer.len(), MADV_NOHUGEPAGE);
+      } while (rc == -1 && errno == EAGAIN);
+      DCHECK(rc == 0) << "madvise(MADV_NOHUGEPAGE) shouldn't fail" << errno;
+#endif
+    }
     free(buffer.data());
   }
   buffer.Reset(); // Avoid DCHECK in ~BufferHandle().

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/58b206ff/be/src/runtime/bufferpool/system-allocator.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/bufferpool/system-allocator.h b/be/src/runtime/bufferpool/system-allocator.h
index 33ad525..d57b8df 100644
--- a/be/src/runtime/bufferpool/system-allocator.h
+++ b/be/src/runtime/bufferpool/system-allocator.h
@@ -43,6 +43,9 @@ class SystemAllocator {
   /// Allocate 'len' bytes of memory for a buffer via mmap().
   Status AllocateViaMMap(int64_t len, uint8_t** buffer_mem);
 
+  /// Allocate 'len' bytes of memory for a buffer via our malloc implementation.
+  Status AllocateViaMalloc(int64_t len, uint8_t** buffer_mem);
+
   const int64_t min_buffer_len_;
 };
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/58b206ff/be/src/runtime/row-batch.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/row-batch.cc b/be/src/runtime/row-batch.cc
index 8dfc4ba..8c4ab54 100644
--- a/be/src/runtime/row-batch.cc
+++ b/be/src/runtime/row-batch.cc
@@ -310,8 +310,8 @@ void RowBatch::AddBlock(BufferedBlockMgr::Block* block, FlushMode flush) {
   if (flush == FlushMode::FLUSH_RESOURCES) MarkFlushResources();
 }
 
-void RowBatch::AddBuffer(
-    BufferPool::ClientHandle* client, BufferPool::BufferHandle buffer, FlushMode flush) {
+void RowBatch::AddBuffer(BufferPool::ClientHandle* client,
+    BufferPool::BufferHandle&& buffer, FlushMode flush) {
   auxiliary_mem_usage_ += buffer.len();
   BufferInfo buffer_info;
   buffer_info.client = client;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/58b206ff/be/src/runtime/row-batch.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/row-batch.h b/be/src/runtime/row-batch.h
index 0bb71d8..cc41adb 100644
--- a/be/src/runtime/row-batch.h
+++ b/be/src/runtime/row-batch.h
@@ -230,8 +230,8 @@ class RowBatch {
   /// for further explanation).
   /// TODO: IMPALA-4179: after IMPALA-3200, simplify the ownership transfer model and
   /// make it consistent between buffers and I/O buffers.
-  void AddBuffer(
-      BufferPool::ClientHandle* client, BufferPool::BufferHandle buffer, FlushMode flush);
+  void AddBuffer(BufferPool::ClientHandle* client, BufferPool::BufferHandle&& buffer,
+      FlushMode flush);
 
   /// Used by an operator to indicate that it cannot produce more rows until the
   /// resources that it has attached to the row batch are freed or acquired by an


[2/4] incubator-impala git commit: IMPALA-4943: Speed up block md loading for add/recover partition calls.

Posted by ta...@apache.org.
IMPALA-4943: Speed up block md loading for add/recover partition calls.

This change makes alter table add/recover partitions calls use the
per directory block metadata loading routines instead of doing it
per file. This is done since these calls always load the entire
partition directory from scratch and there is no advantage in
loading them incrementally on a per-file basis.

Tests: Ran core tests and the metadata benchmark tests.

(I) Improvement: METADATA-BENCHMARKS()
100K-PARTITIONS-1M-FILES-03-RECOVER [text / none / none] (718.62s ->
549.91s [-23.48%])

(I) Improvement: METADATA-BENCHMARKS()
100K-PARTITIONS-1M-FILES-08-ADD-PARTITION [text / none / none] (46.92s
-> 26.20s [-44.15%])

Change-Id: I331f1f090518f317bcd7df069e480edbd8f039f1
Reviewed-on: http://gerrit.cloudera.org:8080/6651
Reviewed-by: Bharath Vissapragada <bh...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/8bd854df
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/8bd854df
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/8bd854df

Branch: refs/heads/master
Commit: 8bd854dfa6f40bd32e8fcd6f284c15b045b4f1ee
Parents: 7555316
Author: Bharath Vissapragada <bh...@cloudera.com>
Authored: Fri Apr 14 12:42:45 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Fri Apr 21 20:53:26 2017 +0000

----------------------------------------------------------------------
 .../org/apache/impala/catalog/HdfsTable.java    | 25 ++++++++++++++++----
 1 file changed, 20 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/8bd854df/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
index 30241b0..143e2b1 100644
--- a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
@@ -835,6 +835,19 @@ public class HdfsTable extends Table {
   }
 
   /**
+   * Helper method to load the partition file metadata from scratch. This method is
+   * optimized for loading newly added partitions. For refreshing existing partitions
+   * use refreshFileMetadata(HdfsPartition).
+   */
+  private void loadFileMetadataFromScratch(HdfsPartition partition) {
+    Path partitionDirPath = partition.getLocationPath();
+    Set<Path> dirsToLoad = Sets.newHashSet(partitionDirPath);
+    HashMap<Path, List<HdfsPartition>> partsByPath = Maps.newHashMap();
+    partsByPath.put(partitionDirPath, Lists.newArrayList(partition));
+    loadMetadataAndDiskIds(dirsToLoad, partsByPath);
+  }
+
+  /**
    * Helper method to load the block locations from each directory in 'locations'
    * and filtering only the paths from 'partsByPath'. Also loads the disk IDs
    * corresponding to these block locations.
@@ -903,7 +916,7 @@ public class HdfsTable extends Table {
       org.apache.hadoop.hive.metastore.api.Partition msPartition)
       throws CatalogException {
     HdfsPartition hdfsPartition = createPartition(storageDescriptor, msPartition);
-    refreshFileMetadata(hdfsPartition);
+    loadFileMetadataFromScratch(hdfsPartition);
     return hdfsPartition;
   }
 
@@ -1513,7 +1526,9 @@ public class HdfsTable extends Table {
   }
 
   /**
-   * Loads the file descriptors and block metadata of a list of partitions.
+   * Loads the file descriptors and block metadata of a list of partitions. This function
+   * is optimized for incremental loading of the partition file metadata. To load it from
+   * scratch, use loadFileMetadataFromScratch(HdfsPartition).
    */
   private void loadPartitionFileMetadata(List<HdfsPartition> partitions)
       throws Exception {
@@ -1548,8 +1563,7 @@ public class HdfsTable extends Table {
   /**
    * Loads the file descriptors and block metadata of a partition from its
    * StorageDescriptor. If 'partition' does not have an entry in the Hive Metastore,
-   * 'storageDescriptor' is the StorageDescriptor of the associated table. Populates
-   * 'perFsFileBlocks' with file block info and updates table metadata.
+   * 'storageDescriptor' is the StorageDescriptor of the associated table.
    */
   private void loadPartitionFileMetadata(StorageDescriptor storageDescriptor,
       HdfsPartition partition) throws Exception {
@@ -1994,8 +2008,9 @@ public class HdfsTable extends Table {
    */
   public void reloadPartition(HdfsPartition oldPartition, Partition hmsPartition)
       throws CatalogException {
-    HdfsPartition refreshedPartition = createAndLoadPartition(
+    HdfsPartition refreshedPartition = createPartition(
         hmsPartition.getSd(), hmsPartition);
+    refreshFileMetadata(refreshedPartition);
     Preconditions.checkArgument(oldPartition == null
         || oldPartition.compareTo(refreshedPartition) == 0);
     dropPartition(oldPartition);


[3/4] incubator-impala git commit: IMPALA-5222: don't call Bits::Log2*() functions

Posted by ta...@apache.org.
IMPALA-5222: don't call Bits::Log2*() functions

Some of gutil's functions are fairly inefficient. We accidentally
regressed this when switching to the new Bits::, and some of the
BufferPool code that was in flight didn't switch over.

This commit switches to calling BitUtil::Log2*() everywhere and makes
sure that those functions are all implemented in an efficient way.

Change-Id: I46471590ae7cf5ccd3e44d5c31f0b06108a2a01c
Reviewed-on: http://gerrit.cloudera.org:8080/6675
Reviewed-by: Henry Robinson <he...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/7fcf1ea4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/7fcf1ea4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/7fcf1ea4

Branch: refs/heads/master
Commit: 7fcf1ea4c36e41c299e36078664522ef629bb5de
Parents: 8bd854d
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Tue Apr 18 11:55:09 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Fri Apr 21 21:03:43 2017 +0000

----------------------------------------------------------------------
 be/src/exec/parquet-column-readers.cc         |  4 ++--
 be/src/runtime/bufferpool/buffer-allocator.cc | 10 ++++-----
 be/src/runtime/bufferpool/buffer-pool.cc      |  1 -
 be/src/runtime/disk-io-mgr.cc                 |  8 +++----
 be/src/runtime/free-pool.h                    |  4 ++--
 be/src/runtime/runtime-filter-bank.cc         |  9 ++++----
 be/src/runtime/tmp-file-mgr.cc                |  5 ++---
 be/src/util/bit-util-test.cc                  |  1 -
 be/src/util/bit-util.h                        | 25 +++++++++++++++++-----
 be/src/util/dict-encoding.h                   |  4 ++--
 be/src/util/hdr-histogram.cc                  |  6 +++---
 11 files changed, 45 insertions(+), 32 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/7fcf1ea4/be/src/exec/parquet-column-readers.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-column-readers.cc b/be/src/exec/parquet-column-readers.cc
index ac74d72..f92de48 100644
--- a/be/src/exec/parquet-column-readers.cc
+++ b/be/src/exec/parquet-column-readers.cc
@@ -27,13 +27,13 @@
 #include "exec/parquet-metadata-utils.h"
 #include "exec/parquet-scratch-tuple-batch.h"
 #include "exec/read-write-util.h"
-#include "gutil/bits.h"
 #include "rpc/thrift-util.h"
 #include "runtime/collection-value-builder.h"
 #include "runtime/tuple-row.h"
 #include "runtime/tuple.h"
 #include "runtime/runtime-state.h"
 #include "runtime/mem-pool.h"
+#include "util/bit-util.h"
 #include "util/codec.h"
 #include "util/debug-util.h"
 #include "util/dict-encoding.h"
@@ -89,7 +89,7 @@ Status ParquetLevelDecoder::Init(const string& filename,
       if (num_bytes < 0) {
         return Status(TErrorCode::PARQUET_CORRUPT_RLE_BYTES, filename, num_bytes);
       }
-      int bit_width = Bits::Log2Ceiling64(max_level + 1);
+      int bit_width = BitUtil::Log2Ceiling64(max_level + 1);
       Reset(*data, num_bytes, bit_width);
       break;
     }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/7fcf1ea4/be/src/runtime/bufferpool/buffer-allocator.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/bufferpool/buffer-allocator.cc b/be/src/runtime/bufferpool/buffer-allocator.cc
index e3c6e60..0fd88fd 100644
--- a/be/src/runtime/bufferpool/buffer-allocator.cc
+++ b/be/src/runtime/bufferpool/buffer-allocator.cc
@@ -22,8 +22,8 @@
 #include <boost/bind.hpp>
 
 #include "common/atomic.h"
-#include "gutil/bits.h"
 #include "runtime/bufferpool/system-allocator.h"
+#include "util/bit-util.h"
 #include "util/cpu-info.h"
 #include "util/pretty-printer.h"
 #include "util/runtime-profile-counters.h"
@@ -122,7 +122,7 @@ class BufferPool::FreeBufferArena : public CacheLineAligned {
   /// Return the lists of buffers for buffers of the given length.
   PerSizeLists* GetListsForSize(int64_t buffer_len) {
     DCHECK(BitUtil::IsPowerOf2(buffer_len));
-    int idx = Bits::Log2Ceiling64(buffer_len) - parent_->log_min_buffer_len_;
+    int idx = BitUtil::Log2Ceiling64(buffer_len) - parent_->log_min_buffer_len_;
     DCHECK_LT(idx, NumBufferSizes());
     return &buffer_sizes_[idx];
   }
@@ -142,7 +142,7 @@ int64_t BufferPool::BufferAllocator::CalcMaxBufferLen(
     int64_t min_buffer_len, int64_t system_bytes_limit) {
   // Find largest power of 2 smaller than 'system_bytes_limit'.
   int64_t upper_bound = system_bytes_limit == 0 ? 1L : 1L
-          << Bits::Log2Floor64(system_bytes_limit);
+          << BitUtil::Log2Floor64(system_bytes_limit);
   upper_bound = min(MAX_BUFFER_BYTES, upper_bound);
   return max(min_buffer_len, upper_bound); // Can't be < min_buffer_len.
 }
@@ -153,8 +153,8 @@ BufferPool::BufferAllocator::BufferAllocator(
     system_allocator_(new SystemAllocator(min_buffer_len)),
     min_buffer_len_(min_buffer_len),
     max_buffer_len_(CalcMaxBufferLen(min_buffer_len, system_bytes_limit)),
-    log_min_buffer_len_(Bits::Log2Ceiling64(min_buffer_len_)),
-    log_max_buffer_len_(Bits::Log2Ceiling64(max_buffer_len_)),
+    log_min_buffer_len_(BitUtil::Log2Ceiling64(min_buffer_len_)),
+    log_max_buffer_len_(BitUtil::Log2Ceiling64(max_buffer_len_)),
     system_bytes_limit_(system_bytes_limit),
     system_bytes_remaining_(system_bytes_limit),
     per_core_arenas_(CpuInfo::GetMaxNumCores()),

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/7fcf1ea4/be/src/runtime/bufferpool/buffer-pool.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/bufferpool/buffer-pool.cc b/be/src/runtime/bufferpool/buffer-pool.cc
index a71a7c7..1d2f1d3 100644
--- a/be/src/runtime/bufferpool/buffer-pool.cc
+++ b/be/src/runtime/bufferpool/buffer-pool.cc
@@ -22,7 +22,6 @@
 #include <boost/bind.hpp>
 
 #include "common/names.h"
-#include "gutil/bits.h"
 #include "gutil/strings/substitute.h"
 #include "runtime/bufferpool/buffer-allocator.h"
 #include "util/bit-util.h"

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/7fcf1ea4/be/src/runtime/disk-io-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/disk-io-mgr.cc b/be/src/runtime/disk-io-mgr.cc
index b973a84..1f28b1a 100644
--- a/be/src/runtime/disk-io-mgr.cc
+++ b/be/src/runtime/disk-io-mgr.cc
@@ -20,8 +20,8 @@
 
 #include <boost/algorithm/string.hpp>
 
-#include "gutil/bits.h"
 #include "gutil/strings/substitute.h"
+#include "util/bit-util.h"
 #include "util/hdfs-util.h"
 #include "util/time.h"
 
@@ -304,7 +304,7 @@ DiskIoMgr::DiskIoMgr() :
         FileSystemUtil::MaxNumFileHandles()),
         &HdfsCachedFileHandle::Release) {
   int64_t max_buffer_size_scaled = BitUtil::Ceil(max_buffer_size_, min_buffer_size_);
-  free_buffers_.resize(Bits::Log2Ceiling64(max_buffer_size_scaled) + 1);
+  free_buffers_.resize(BitUtil::Log2Ceiling64(max_buffer_size_scaled) + 1);
   int num_local_disks = FLAGS_num_disks == 0 ? DiskInfo::num_disks() : FLAGS_num_disks;
   disk_queues_.resize(num_local_disks + REMOTE_NUM_DISKS);
   CheckSseSupport();
@@ -322,7 +322,7 @@ DiskIoMgr::DiskIoMgr(int num_local_disks, int threads_per_disk, int min_buffer_s
     file_handle_cache_(min(FLAGS_max_cached_file_handles,
             FileSystemUtil::MaxNumFileHandles()), &HdfsCachedFileHandle::Release) {
   int64_t max_buffer_size_scaled = BitUtil::Ceil(max_buffer_size_, min_buffer_size_);
-  free_buffers_.resize(Bits::Log2Ceiling64(max_buffer_size_scaled) + 1);
+  free_buffers_.resize(BitUtil::Log2Ceiling64(max_buffer_size_scaled) + 1);
   if (num_local_disks == 0) num_local_disks = DiskInfo::num_disks();
   disk_queues_.resize(num_local_disks + REMOTE_NUM_DISKS);
   CheckSseSupport();
@@ -1223,7 +1223,7 @@ Status DiskIoMgr::WriteRangeHelper(FILE* file_handle, WriteRange* write_range) {
 
 int DiskIoMgr::free_buffers_idx(int64_t buffer_size) {
   int64_t buffer_size_scaled = BitUtil::Ceil(buffer_size, min_buffer_size_);
-  int idx = Bits::Log2Ceiling64(buffer_size_scaled);
+  int idx = BitUtil::Log2Ceiling64(buffer_size_scaled);
   DCHECK_GE(idx, 0);
   DCHECK_LT(idx, free_buffers_.size());
   return idx;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/7fcf1ea4/be/src/runtime/free-pool.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/free-pool.h b/be/src/runtime/free-pool.h
index 98b1ebe..55774e8 100644
--- a/be/src/runtime/free-pool.h
+++ b/be/src/runtime/free-pool.h
@@ -23,9 +23,9 @@
 #include <string.h>
 #include <string>
 #include <sstream>
+
 #include "common/atomic.h"
 #include "common/logging.h"
-#include "gutil/bits.h"
 #include "runtime/mem-pool.h"
 #include "util/bit-util.h"
 
@@ -73,7 +73,7 @@ class FreePool {
     /// MemPool allocations are 8-byte aligned, so making allocations < 8 bytes
     /// doesn't save memory and eliminates opportunities to recycle allocations.
     size = std::max<int64_t>(8, size);
-    int free_list_idx = Bits::Log2Ceiling64(size);
+    int free_list_idx = BitUtil::Log2Ceiling64(size);
     DCHECK_LT(free_list_idx, NUM_LISTS);
     FreeListNode* allocation = lists_[free_list_idx].next;
     if (allocation == NULL) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/7fcf1ea4/be/src/runtime/runtime-filter-bank.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/runtime-filter-bank.cc b/be/src/runtime/runtime-filter-bank.cc
index 759b505..3e2dc6d 100644
--- a/be/src/runtime/runtime-filter-bank.cc
+++ b/be/src/runtime/runtime-filter-bank.cc
@@ -17,9 +17,7 @@
 
 #include "runtime/runtime-filter-bank.h"
 
-#include "common/names.h"
 #include "gen-cpp/ImpalaInternalService_types.h"
-#include "gutil/bits.h"
 #include "gutil/strings/substitute.h"
 #include "runtime/client-cache.h"
 #include "runtime/exec-env.h"
@@ -27,8 +25,11 @@
 #include "runtime/mem-tracker.h"
 #include "runtime/runtime-filter.inline.h"
 #include "service/impala-server.h"
+#include "util/bit-util.h"
 #include "util/bloom-filter.h"
 
+#include "common/names.h"
+
 using namespace impala;
 using namespace boost;
 using namespace strings;
@@ -197,7 +198,7 @@ BloomFilter* RuntimeFilterBank::AllocateScratchBloomFilter(int32_t filter_id) {
   DCHECK(it != produced_filters_.end()) << "Filter ID " << filter_id << " not registered";
 
   // Track required space
-  int64_t log_filter_size = Bits::Log2Ceiling64(it->second->filter_size());
+  int64_t log_filter_size = BitUtil::Log2Ceiling64(it->second->filter_size());
   int64_t required_space = BloomFilter::GetExpectedHeapSpaceUsed(log_filter_size);
   if (!filter_mem_tracker_->TryConsume(required_space)) return NULL;
   BloomFilter* bloom_filter = obj_pool_.Add(new BloomFilter(log_filter_size));
@@ -217,7 +218,7 @@ int64_t RuntimeFilterBank::GetFilterSizeForNdv(int64_t ndv) {
 
 bool RuntimeFilterBank::FpRateTooHigh(int64_t filter_size, int64_t observed_ndv) {
   double fpp =
-      BloomFilter::FalsePositiveProb(observed_ndv, Bits::Log2Ceiling64(filter_size));
+      BloomFilter::FalsePositiveProb(observed_ndv, BitUtil::Log2Ceiling64(filter_size));
   return fpp > FLAGS_max_filter_error_rate;
 }
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/7fcf1ea4/be/src/runtime/tmp-file-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/tmp-file-mgr.cc b/be/src/runtime/tmp-file-mgr.cc
index 37a05cf..293a898 100644
--- a/be/src/runtime/tmp-file-mgr.cc
+++ b/be/src/runtime/tmp-file-mgr.cc
@@ -26,7 +26,6 @@
 #include <gutil/strings/join.h>
 #include <gutil/strings/substitute.h>
 
-#include "gutil/bits.h"
 #include "runtime/runtime-state.h"
 #include "runtime/tmp-file-mgr-internal.h"
 #include "util/bit-util.h"
@@ -299,7 +298,7 @@ Status TmpFileMgr::FileGroup::AllocateSpace(
     int64_t num_bytes, File** tmp_file, int64_t* file_offset) {
   lock_guard<SpinLock> lock(lock_);
   int64_t scratch_range_bytes = max<int64_t>(1L, BitUtil::RoundUpToPowerOfTwo(num_bytes));
-  int free_ranges_idx = Bits::Log2Ceiling64(scratch_range_bytes);
+  int free_ranges_idx = BitUtil::Log2Ceiling64(scratch_range_bytes);
   if (!free_ranges_[free_ranges_idx].empty()) {
     *tmp_file = free_ranges_[free_ranges_idx].back().first;
     *file_offset = free_ranges_[free_ranges_idx].back().second;
@@ -342,7 +341,7 @@ Status TmpFileMgr::FileGroup::AllocateSpace(
 void TmpFileMgr::FileGroup::RecycleFileRange(unique_ptr<WriteHandle> handle) {
   int64_t scratch_range_bytes =
       max<int64_t>(1L, BitUtil::RoundUpToPowerOfTwo(handle->len()));
-  int free_ranges_idx = Bits::Log2Ceiling64(scratch_range_bytes);
+  int free_ranges_idx = BitUtil::Log2Ceiling64(scratch_range_bytes);
   lock_guard<SpinLock> lock(lock_);
   free_ranges_[free_ranges_idx].emplace_back(
       handle->file_, handle->write_range_->offset());

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/7fcf1ea4/be/src/util/bit-util-test.cc
----------------------------------------------------------------------
diff --git a/be/src/util/bit-util-test.cc b/be/src/util/bit-util-test.cc
index 948e5cf..2c40e569 100644
--- a/be/src/util/bit-util-test.cc
+++ b/be/src/util/bit-util-test.cc
@@ -23,7 +23,6 @@
 
 #include <boost/utility.hpp>
 
-#include "gutil/bits.h"
 #include "testutil/gtest-util.h"
 #include "util/bit-util.h"
 #include "util/cpu-info.h"

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/7fcf1ea4/be/src/util/bit-util.h
----------------------------------------------------------------------
diff --git a/be/src/util/bit-util.h b/be/src/util/bit-util.h
index 25a8c96..40a324c 100644
--- a/be/src/util/bit-util.h
+++ b/be/src/util/bit-util.h
@@ -287,9 +287,24 @@ class BitUtil {
     return __builtin_ctzll(v);
   }
 
+  // Wrap the gutil/ version for convenience.
+  static inline int Log2Floor(uint32_t n) {
+    return Bits::Log2Floor(n);
+  }
+
+  // Wrap the gutil/ version for convenience.
+  static inline int Log2Floor64(uint64_t n) {
+    return Bits::Log2Floor64(n);
+  }
+
+  // Wrap the gutil/ version for convenience.
+  static inline int Log2FloorNonZero64(uint64_t n) {
+    return Bits::Log2FloorNonZero64(n);
+  }
+
   /// More efficient version of similar functions found in gutil/
   static inline int Log2Ceiling(uint32 n) {
-    int floor = Bits::Log2Floor(n);
+    int floor = Log2Floor(n);
     // Check if zero or a power of two. This pattern is recognised by gcc and optimised
     // into branch-free code.
     if (0 == (n & (n - 1))) {
@@ -299,8 +314,8 @@ class BitUtil {
     }
   }
 
-  static inline int Log2Ceiling64(uint64 n) {
-    int floor = Bits::Log2Floor64(n);
+  static inline int Log2Ceiling64(uint64_t n) {
+    int floor = Log2Floor64(n);
     // Check if zero or a power of two. This pattern is recognised by gcc and optimised
     // into branch-free code.
     if (0 == (n & (n - 1))) {
@@ -310,8 +325,8 @@ class BitUtil {
     }
   }
 
-  static inline int Log2CeilingNonZero64(uint64 n) {
-    int floor = Bits::Log2FloorNonZero64(n);
+  static inline int Log2CeilingNonZero64(uint64_t n) {
+    int floor = Log2FloorNonZero64(n);
     // Check if zero or a power of two. This pattern is recognised by gcc and optimised
     // into branch-free code.
     if (0 == (n & (n - 1))) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/7fcf1ea4/be/src/util/dict-encoding.h
----------------------------------------------------------------------
diff --git a/be/src/util/dict-encoding.h b/be/src/util/dict-encoding.h
index e791d95..0b74468 100644
--- a/be/src/util/dict-encoding.h
+++ b/be/src/util/dict-encoding.h
@@ -22,11 +22,11 @@
 
 #include <boost/unordered_map.hpp>
 
-#include "gutil/bits.h"
 #include "gutil/strings/substitute.h"
 #include "exec/parquet-common.h"
 #include "runtime/mem-pool.h"
 #include "runtime/string-value.h"
+#include "util/bit-util.h"
 #include "util/rle-encoding.h"
 
 namespace impala {
@@ -73,7 +73,7 @@ class DictEncoderBase {
   int bit_width() const {
     if (UNLIKELY(num_entries() == 0)) return 0;
     if (UNLIKELY(num_entries() == 1)) return 1;
-    return Bits::Log2Ceiling64(num_entries());
+    return BitUtil::Log2Ceiling64(num_entries());
   }
 
   /// Writes out any buffered indices to buffer preceded by the bit width of this data.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/7fcf1ea4/be/src/util/hdr-histogram.cc
----------------------------------------------------------------------
diff --git a/be/src/util/hdr-histogram.cc b/be/src/util/hdr-histogram.cc
index 25b8835..1427aac 100644
--- a/be/src/util/hdr-histogram.cc
+++ b/be/src/util/hdr-histogram.cc
@@ -22,9 +22,9 @@
 #include <limits>
 #include <gutil/strings/substitute.h>
 #include <gutil/atomicops.h>
-#include <gutil/bits.h>
 
 #include "common/status.h"
+#include "util/bit-util.h"
 
 #include "common/names.h"
 
@@ -113,7 +113,7 @@ void HdrHistogram::Init() {
   // Each sub-bucket is sized to have enough bits for the requested
   // 10^precision accuracy.
   int sub_bucket_count_magnitude =
-      Bits::Log2Ceiling(largest_value_with_single_unit_resolution);
+      BitUtil::Log2Ceiling(largest_value_with_single_unit_resolution);
   sub_bucket_half_count_magnitude_ =
       (sub_bucket_count_magnitude >= 1) ? sub_bucket_count_magnitude - 1 : 0;
 
@@ -196,7 +196,7 @@ int HdrHistogram::BucketIndex(uint64_t value) const {
   // Here we are calculating the power-of-2 magnitude of the value with a
   // correction for precision in the first bucket.
   // Smallest power of 2 containing value.
-  int pow2ceiling = Bits::Log2Ceiling64(value | sub_bucket_mask_);
+  int pow2ceiling = BitUtil::Log2Ceiling64(value | sub_bucket_mask_);
   return pow2ceiling - (sub_bucket_half_count_magnitude_ + 1);
 }