You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@quickstep.apache.org by ji...@apache.org on 2017/01/08 02:27:13 UTC

incubator-quickstep git commit: Profiling

Repository: incubator-quickstep
Updated Branches:
  refs/heads/output-attr-order 65b92b47c -> 4b15e6191


Profiling


Project: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/commit/4b15e619
Tree: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/tree/4b15e619
Diff: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/diff/4b15e619

Branch: refs/heads/output-attr-order
Commit: 4b15e6191342463ff8e84a0e587bd58125b2572c
Parents: 65b92b4
Author: Jianqiao Zhu <ji...@cs.wisc.edu>
Authored: Sat Jan 7 20:26:59 2017 -0600
Committer: Jianqiao Zhu <ji...@cs.wisc.edu>
Committed: Sat Jan 7 20:26:59 2017 -0600

----------------------------------------------------------------------
 cli/QuickstepCli.cpp                      |  12 ++
 query_optimizer/rules/CMakeLists.txt      |   3 +
 query_optimizer/rules/ReorderColumns.cpp  | 118 +++++++++++-
 relational_operators/CMakeLists.txt       |   1 +
 relational_operators/HashJoinOperator.cpp | 241 +++++++++++++++++--------
 storage/InsertContext.hpp                 |  64 +++++++
 utility/CMakeLists.txt                    |   5 +
 utility/EventProfiler.cpp                 |  30 +++
 utility/EventProfiler.hpp                 | 190 +++++++++++++++++++
 utility/PlanVisualizer.cpp                |   5 +-
 10 files changed, 591 insertions(+), 78 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/4b15e619/cli/QuickstepCli.cpp
----------------------------------------------------------------------
diff --git a/cli/QuickstepCli.cpp b/cli/QuickstepCli.cpp
index 9db7577..71414bc 100644
--- a/cli/QuickstepCli.cpp
+++ b/cli/QuickstepCli.cpp
@@ -23,6 +23,7 @@
 #include <cstddef>
 #include <cstdio>
 #include <exception>
+#include <fstream>
 #include <memory>
 #include <string>
 #include <utility>
@@ -68,6 +69,7 @@ typedef quickstep::LineReaderDumb LineReaderImpl;
 #include "storage/StorageConstants.hpp"
 #include "storage/StorageManager.hpp"
 #include "threading/ThreadIDBasedMap.hpp"
+#include "utility/EventProfiler.hpp"
 #include "utility/ExecutionDAGVisualizer.hpp"
 #include "utility/Macros.hpp"
 #include "utility/PtrVector.hpp"
@@ -155,6 +157,8 @@ DEFINE_string(profile_file_name, "",
               // To put things in perspective, the first run is, in my experiments, about 5-10
               // times more expensive than the average run. That means the query needs to be
               // run at least a hundred times to make the impact of the first run small (< 5 %).
+DEFINE_string(profile_output, "",
+              "Output file name for dumping the profiled events.");
 
 DECLARE_bool(profile_and_report_workorder_perf);
 DECLARE_bool(visualize_execution_dag);
@@ -359,6 +363,7 @@ int main(int argc, char* argv[]) {
 
           query_result_relation = query_handle->getQueryResultRelation();
 
+          quickstep::simple_profiler.clear();
           start = std::chrono::steady_clock::now();
           QueryExecutionUtil::ConstructAndSendAdmitRequestMessage(
               main_thread_client_id,
@@ -405,6 +410,13 @@ int main(int argc, char* argv[]) {
             dag_visualizer->bindProfilingStats(profiling_stats);
             std::cerr << "\n" << dag_visualizer->toDOT() << "\n";
           }
+          if (!quickstep::FLAGS_profile_output.empty()) {
+            std::ofstream ofs(
+                quickstep::FLAGS_profile_output + std::to_string(query_processor->query_id()),
+                std::ios::out);
+            quickstep::simple_profiler.writeToStream(ofs);
+            ofs.close();
+          }
         } catch (const std::exception &e) {
           fprintf(stderr, "QUERY EXECUTION ERROR: %s\n", e.what());
           break;

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/4b15e619/query_optimizer/rules/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/query_optimizer/rules/CMakeLists.txt b/query_optimizer/rules/CMakeLists.txt
index 0b76725..0ffa061 100644
--- a/query_optimizer/rules/CMakeLists.txt
+++ b/query_optimizer/rules/CMakeLists.txt
@@ -122,8 +122,11 @@ target_link_libraries(quickstep_queryoptimizer_rules_PushDownSemiAntiJoin
 target_link_libraries(quickstep_queryoptimizer_rules_ReorderColumns
                       quickstep_queryoptimizer_expressions_AttributeReference
                       quickstep_queryoptimizer_expressions_ExprId
+                      quickstep_queryoptimizer_expressions_NamedExpression
+                      quickstep_queryoptimizer_physical_HashJoin
                       quickstep_queryoptimizer_physical_Physical
                       quickstep_queryoptimizer_physical_PhysicalType
+                      quickstep_queryoptimizer_physical_Selection
                       quickstep_queryoptimizer_rules_Rule
                       quickstep_utility_Macros)
 target_link_libraries(quickstep_queryoptimizer_rules_Rule

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/4b15e619/query_optimizer/rules/ReorderColumns.cpp
----------------------------------------------------------------------
diff --git a/query_optimizer/rules/ReorderColumns.cpp b/query_optimizer/rules/ReorderColumns.cpp
index f289655..c4f2aa1 100644
--- a/query_optimizer/rules/ReorderColumns.cpp
+++ b/query_optimizer/rules/ReorderColumns.cpp
@@ -19,11 +19,17 @@
 
 #include "query_optimizer/rules/ReorderColumns.hpp"
 
+#include <algorithm>
+#include <limits>
 #include <vector>
 
+#include "query_optimizer/expressions/AttributeReference.hpp"
 #include "query_optimizer/expressions/ExprId.hpp"
+#include "query_optimizer/expressions/NamedExpression.hpp"
+#include "query_optimizer/physical/HashJoin.hpp"
 #include "query_optimizer/physical/Physical.hpp"
 #include "query_optimizer/physical/PhysicalType.hpp"
+#include "query_optimizer/physical/Selection.hpp"
 
 #include "glog/logging.h"
 
@@ -66,15 +72,119 @@ P::PhysicalPtr ReorderColumns::applyInternal(const P::PhysicalPtr &input) {
   std::vector<P::PhysicalPtr> nodes;
   std::unordered_map<E::ExprId, std::size_t> gen;
   std::unordered_map<E::ExprId, std::size_t> kill;
+  std::unordered_map<E::ExprId, std::size_t> base;
 
-  for (P::PhysicalPtr node = input; IsApplicable(node); node = node->children()[0]) {
+  for (P::PhysicalPtr node = input; IsApplicable(node); node = node->children().front()) {
     nodes.emplace_back(node);
   }
+  std::reverse(nodes.begin(), nodes.end());
 
-  std::cout << nodes.size() << "\n";
-  exit(0);
+  const std::vector<E::AttributeReferencePtr> base_attrs =
+      nodes.front()->children().front()->getOutputAttributes();
+  for (std::size_t i = 0; i < base_attrs.size(); ++i) {
+    base.emplace(base_attrs[i]->id(), i);
+  }
+
+  for (std::size_t i = 0; i < nodes.size(); ++i) {
+    for (const auto &attr : nodes[i]->getOutputAttributes()) {
+      const E::ExprId attr_id = attr->id();
+      if (gen.find(attr_id) == gen.end()) {
+        gen.emplace(attr_id, i);
+      }
+      kill[attr_id] = i;
+    }
+  }
+
+//  std::cout << "gen: \n";
+//  for (const auto &pair : gen) {
+//    std::cout << pair.first << ": " << pair.second << "\n";
+//  }
+//
+//  std::cout << "kill: \n";
+//  for (const auto &pair : kill) {
+//    std::cout << pair.first << ": " << pair.second << "\n";
+//  }
+
+  const auto comparator = [&gen, &kill, &base](const E::NamedExpressionPtr &lhs,
+                                               const E::NamedExpressionPtr &rhs) -> bool {
+    const E::ExprId lhs_id = lhs->id();
+    const E::ExprId rhs_id = rhs->id();
+
+    const std::size_t lhs_gen = gen.at(lhs_id);
+    const std::size_t rhs_gen = gen.at(rhs_id);
+    if (lhs_gen != rhs_gen) {
+      return lhs_gen < rhs_gen;
+    }
+
+    const std::size_t lhs_kill = kill.at(lhs_id);
+    const std::size_t rhs_kill = kill.at(rhs_id);
+    if (lhs_kill != rhs_kill) {
+      return lhs_kill < rhs_kill;
+    }
+
+    const auto lhs_base_it = base.find(lhs_id);
+    const auto rhs_base_it = base.find(rhs_id);
+    const std::size_t lhs_base =
+        lhs_base_it == base.end() ? std::numeric_limits<std::size_t>::max()
+                                  : lhs_base_it->second;
+    const std::size_t rhs_base =
+        rhs_base_it == base.end() ? std::numeric_limits<std::size_t>::max()
+                                  : rhs_base_it->second;
+    if (lhs_base != rhs_base) {
+      return lhs_base < rhs_base;
+    }
+
+    return lhs_id < rhs_id;
+  };
+
+  P::PhysicalPtr output = nodes.front()->children().front();
+
+  for (const auto &node : nodes) {
+    std::vector<E::NamedExpressionPtr> project_expressions;
+    switch (node->getPhysicalType()) {
+      case P::PhysicalType::kHashJoin: {
+        project_expressions =
+            std::static_pointer_cast<const P::HashJoin>(node)->project_expressions();
+        break;
+      }
+      case P::PhysicalType::kSelection: {
+        project_expressions =
+            std::static_pointer_cast<const P::Selection>(node)->project_expressions();
+        break;
+      }
+      default:
+        LOG(FATAL) << "Unsupported physical type";
+    }
+
+    std::sort(project_expressions.begin(), project_expressions.end(), comparator);
+
+    switch (node->getPhysicalType()) {
+      case P::PhysicalType::kHashJoin: {
+        const P::HashJoinPtr old_node =
+            std::static_pointer_cast<const P::HashJoin>(node);
+        output = P::HashJoin::Create(output,
+                                     applyInternal(old_node->right()),
+                                     old_node->left_join_attributes(),
+                                     old_node->right_join_attributes(),
+                                     old_node->residual_predicate(),
+                                     project_expressions,
+                                     old_node->join_type());
+        break;
+      }
+      case P::PhysicalType::kSelection: {
+        const P::SelectionPtr old_node =
+            std::static_pointer_cast<const P::Selection>(node);
+        output = P::Selection::Create(output,
+                                      project_expressions,
+                                      old_node->filter_predicate());
+        break;
+      }
+      default:
+        LOG(FATAL) << "Unsupported physical type";
+    }
+  }
 
-  return nullptr;
+  return output;
 }
 
 }  // namespace optimizer

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/4b15e619/relational_operators/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/relational_operators/CMakeLists.txt b/relational_operators/CMakeLists.txt
index 5e173a4..b792da3 100644
--- a/relational_operators/CMakeLists.txt
+++ b/relational_operators/CMakeLists.txt
@@ -228,6 +228,7 @@ target_link_libraries(quickstep_relationaloperators_HashJoinOperator
                       quickstep_types_TypedValue
                       quickstep_types_containers_ColumnVector
                       quickstep_types_containers_ColumnVectorsValueAccessor
+                      quickstep_utility_EventProfiler
                       quickstep_utility_Macros
                       quickstep_utility_lipfilter_LIPFilterAdaptiveProber
                       quickstep_utility_lipfilter_LIPFilterUtil

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/4b15e619/relational_operators/HashJoinOperator.cpp
----------------------------------------------------------------------
diff --git a/relational_operators/HashJoinOperator.cpp b/relational_operators/HashJoinOperator.cpp
index dae2ba7..8b5c511 100644
--- a/relational_operators/HashJoinOperator.cpp
+++ b/relational_operators/HashJoinOperator.cpp
@@ -51,6 +51,7 @@
 #include "types/TypedValue.hpp"
 #include "types/containers/ColumnVector.hpp"
 #include "types/containers/ColumnVectorsValueAccessor.hpp"
+#include "utility/EventProfiler.hpp"
 #include "utility/lip_filter/LIPFilterAdaptiveProber.hpp"
 #include "utility/lip_filter/LIPFilterUtil.hpp"
 
@@ -466,7 +467,146 @@ void HashInnerJoinWorkOrder::execute() {
         base_accessor->createSharedTupleIdSequenceAdapterVirtual(*existence_map));
   }
 
-  PairsOfVectorsJoinedTuplesCollector collector;
+//  PairsOfVectorsJoinedTuplesCollector collector;
+//  if (join_key_attributes_.size() == 1) {
+//    hash_table_.getAllFromValueAccessor(
+//        probe_accessor.get(),
+//        join_key_attributes_.front(),
+//        any_join_key_attributes_nullable_,
+//        &collector);
+//  } else {
+//    hash_table_.getAllFromValueAccessorCompositeKey(
+//        probe_accessor.get(),
+//        join_key_attributes_,
+//        any_join_key_attributes_nullable_,
+//        &collector);
+//  }
+//
+//  const relation_id build_relation_id = build_relation_.getID();
+//  const relation_id probe_relation_id = probe_relation_.getID();
+//
+//  std::map<attribute_id, attribute_id> build_attribute_map;
+//  std::map<attribute_id, attribute_id> probe_attribute_map;
+//  std::map<attribute_id, attribute_id> non_trivial_attribute_map;
+//  std::vector<const Scalar *> non_trivial_expressions;
+//  for (std::size_t i = 0; i < selection_.size(); ++i) {
+//    const Scalar *scalar = selection_[i].get();
+//    if (scalar->getDataSource() == Scalar::ScalarDataSource::kAttribute) {
+//      const ScalarAttribute *scalar_attr =
+//          static_cast<const ScalarAttribute *>(scalar);
+//      const relation_id scalar_attr_relation_id =
+//          scalar_attr->getRelationIdForValueAccessor();
+//      const attribute_id scalar_attr_id =
+//          scalar_attr->getAttributeIdForValueAccessor();
+//
+//      if (scalar_attr_relation_id == build_relation_id) {
+//        build_attribute_map.emplace(scalar_attr_id, i);
+//      } else {
+//        DCHECK_EQ(probe_relation_id, scalar_attr->getRelationIdForValueAccessor());
+//        probe_attribute_map.emplace(scalar_attr_id, i);
+//      }
+//    } else {
+//      non_trivial_attribute_map.emplace(non_trivial_expressions.size(), i);
+//      non_trivial_expressions.emplace_back(scalar);
+//    }
+//  }
+//
+//  std::unique_ptr<InsertContext> insert_context(
+//      new InsertContext(output_destination_->getRelation()));
+//  insert_context->addSource(build_attribute_map);
+//  insert_context->addSource(probe_attribute_map);
+//  insert_context->addSource(non_trivial_attribute_map);
+//
+//  MutableBlockReference output_block;
+//  for (std::pair<const block_id, PairOfVectors>
+//           &build_block_entry : *collector.getJoinedTuples()) {
+//    BlockReference build_block =
+//        storage_manager_->getBlock(build_block_entry.first, build_relation_);
+//    const TupleStorageSubBlock &build_store = build_block->getTupleStorageSubBlock();
+//    std::unique_ptr<ValueAccessor> build_accessor(build_store.createValueAccessor());
+//
+//    const std::vector<tuple_id> &build_tids = build_block_entry.second.first;
+//    const std::vector<tuple_id> &probe_tids = build_block_entry.second.second;
+//
+//    // Evaluate '*residual_predicate_', if any.
+//    //
+//    // TODO(chasseur): We might consider implementing true vectorized
+//    // evaluation for join predicates that are not equijoins (although in
+//    // general that would require evaluating and materializing some expressions
+//    // over the cross-product of all tuples in a pair of blocks in order to
+//    // evaluate the predicate). We could use a heuristic where we only do the
+//    // vectorized materialization and evaluation if the set of matches from the
+//    // hash join is below a reasonable threshold so that we don't blow up
+//    // temporary memory requirements to an unreasonable degree.
+//    if (residual_predicate_ != nullptr) {
+//      PairOfVectors filtered_matches;
+//
+//      for (std::size_t i = 0; i < build_tids.size(); ++i) {
+//        const tuple_id build_tid = build_tids[i];
+//        const tuple_id probe_tid = probe_tids[i];
+//        if (residual_predicate_->matchesForJoinedTuples(*build_accessor,
+//                                                        build_relation_id,
+//                                                        build_tid,
+//                                                        *probe_accessor,
+//                                                        probe_relation_id,
+//                                                        probe_tid)) {
+//          filtered_matches.first.emplace_back(build_tid);
+//          filtered_matches.second.emplace_back(probe_tid);
+//        }
+//      }
+//
+//      build_block_entry.second = std::move(filtered_matches);
+//    }
+//
+//    // TODO(chasseur): If all the output expressions are ScalarAttributes,
+//    // we could implement a similar fast-path to StorageBlock::selectSimple()
+//    // that avoids a copy.
+//    //
+//    // TODO(chasseur): See TODO in NestedLoopsJoinOperator.cpp about limiting
+//    // the size of materialized temporary results. In common usage, this
+//    // probably won't be an issue for hash-joins, but in the worst case a hash
+//    // join can still devolve into a cross-product.
+//    //
+//    // NOTE(chasseur): We could also create one big ColumnVectorsValueAccessor
+//    // and accumulate all the results across multiple block pairs into it
+//    // before inserting anything into output blocks, but this would require
+//    // some significant API extensions to the expressions system for a dubious
+//    // benefit (probably only a real performance win when there are very few
+//    // matching tuples in each individual inner block but very many inner
+//    // blocks with at least one match).
+//    ColumnVectorsValueAccessor temp_result;
+//    if (non_trivial_expressions.size() > 0) {
+//      VectorOfPairs zipped_joined_tuple_ids;
+//      zipped_joined_tuple_ids.reserve(build_tids.size());
+//      for (std::size_t i = 0; i < build_tids.size(); ++i) {
+//        zipped_joined_tuple_ids.emplace_back(build_tids[i], probe_tids[i]);
+//      }
+//
+//      for (auto selection_cit = non_trivial_expressions.begin();
+//           selection_cit != non_trivial_expressions.end();
+//           ++selection_cit) {
+//        temp_result.addColumn((*selection_cit)->getAllValuesForJoin(build_relation_id,
+//                                                                    build_accessor.get(),
+//                                                                    probe_relation_id,
+//                                                                    probe_accessor.get(),
+//                                                                    zipped_joined_tuple_ids));
+//      }
+//    }
+//
+//    std::unique_ptr<ValueAccessor> ordered_build_accessor(
+//        build_accessor->createSharedOrderedTupleIdSequenceAdapterVirtual(build_tids));
+//    std::unique_ptr<ValueAccessor> ordered_probe_accessor(
+//        probe_accessor->createSharedOrderedTupleIdSequenceAdapterVirtual(probe_tids));
+//
+//    output_destination_->bulkInsertTuples(
+//        { ordered_build_accessor.get(), ordered_probe_accessor.get(), &temp_result },
+//        insert_context.get(),
+//        &output_block);
+//  }
+//
+//  output_destination_->returnBlock(&output_block);
+
+  VectorsOfPairsJoinedTuplesCollector collector;
   if (join_key_attributes_.size() == 1) {
     hash_table_.getAllFromValueAccessor(
         probe_accessor.get(),
@@ -484,49 +624,17 @@ void HashInnerJoinWorkOrder::execute() {
   const relation_id build_relation_id = build_relation_.getID();
   const relation_id probe_relation_id = probe_relation_.getID();
 
-  std::map<attribute_id, attribute_id> build_attribute_map;
-  std::map<attribute_id, attribute_id> probe_attribute_map;
-  std::map<attribute_id, attribute_id> non_trivial_attribute_map;
-  std::vector<const Scalar *> non_trivial_expressions;
-  for (std::size_t i = 0; i < selection_.size(); ++i) {
-    const Scalar *scalar = selection_[i].get();
-    if (scalar->getDataSource() == Scalar::ScalarDataSource::kAttribute) {
-      const ScalarAttribute *scalar_attr =
-          static_cast<const ScalarAttribute *>(scalar);
-      const relation_id scalar_attr_relation_id =
-          scalar_attr->getRelationIdForValueAccessor();
-      const attribute_id scalar_attr_id =
-          scalar_attr->getAttributeIdForValueAccessor();
-
-      if (scalar_attr_relation_id == build_relation_id) {
-        build_attribute_map.emplace(scalar_attr_id, i);
-      } else {
-        DCHECK_EQ(probe_relation_id, scalar_attr->getRelationIdForValueAccessor());
-        probe_attribute_map.emplace(scalar_attr_id, i);
-      }
-    } else {
-      non_trivial_attribute_map.emplace(non_trivial_expressions.size(), i);
-      non_trivial_expressions.emplace_back(scalar);
-    }
-  }
-
-  std::unique_ptr<InsertContext> insert_context(
-      new InsertContext(output_destination_->getRelation()));
-  insert_context->addSource(build_attribute_map);
-  insert_context->addSource(probe_attribute_map);
-  insert_context->addSource(non_trivial_attribute_map);
-
-  MutableBlockReference output_block;
-  for (std::pair<const block_id, PairOfVectors>
+  auto *container = simple_profiler.getContainer();
+  auto *overall_line = container->getEventLine("overall");
+  auto *bulk_insert_line = container->getEventLine("bulk_insert");
+  overall_line->emplace_back();
+  for (std::pair<const block_id, std::vector<std::pair<tuple_id, tuple_id>>>
            &build_block_entry : *collector.getJoinedTuples()) {
     BlockReference build_block =
         storage_manager_->getBlock(build_block_entry.first, build_relation_);
     const TupleStorageSubBlock &build_store = build_block->getTupleStorageSubBlock();
     std::unique_ptr<ValueAccessor> build_accessor(build_store.createValueAccessor());
 
-    const std::vector<tuple_id> &build_tids = build_block_entry.second.first;
-    const std::vector<tuple_id> &probe_tids = build_block_entry.second.second;
-
     // Evaluate '*residual_predicate_', if any.
     //
     // TODO(chasseur): We might consider implementing true vectorized
@@ -538,19 +646,17 @@ void HashInnerJoinWorkOrder::execute() {
     // hash join is below a reasonable threshold so that we don't blow up
     // temporary memory requirements to an unreasonable degree.
     if (residual_predicate_ != nullptr) {
-      PairOfVectors filtered_matches;
+      std::vector<std::pair<tuple_id, tuple_id>> filtered_matches;
 
-      for (std::size_t i = 0; i < build_tids.size(); ++i) {
-        const tuple_id build_tid = build_tids[i];
-        const tuple_id probe_tid = probe_tids[i];
+      for (const std::pair<tuple_id, tuple_id> &hash_match
+           : build_block_entry.second) {
         if (residual_predicate_->matchesForJoinedTuples(*build_accessor,
                                                         build_relation_id,
-                                                        build_tid,
+                                                        hash_match.first,
                                                         *probe_accessor,
                                                         probe_relation_id,
-                                                        probe_tid)) {
-          filtered_matches.first.emplace_back(build_tid);
-          filtered_matches.second.emplace_back(probe_tid);
+                                                        hash_match.second)) {
+          filtered_matches.emplace_back(hash_match);
         }
       }
 
@@ -574,36 +680,25 @@ void HashInnerJoinWorkOrder::execute() {
     // matching tuples in each individual inner block but very many inner
     // blocks with at least one match).
     ColumnVectorsValueAccessor temp_result;
-    if (non_trivial_expressions.size() > 0) {
-      VectorOfPairs zipped_joined_tuple_ids;
-      zipped_joined_tuple_ids.reserve(build_tids.size());
-      for (std::size_t i = 0; i < build_tids.size(); ++i) {
-        zipped_joined_tuple_ids.emplace_back(build_tids[i], probe_tids[i]);
-      }
-
-      for (auto selection_cit = non_trivial_expressions.begin();
-           selection_cit != non_trivial_expressions.end();
-           ++selection_cit) {
-        temp_result.addColumn((*selection_cit)->getAllValuesForJoin(build_relation_id,
-                                                                    build_accessor.get(),
-                                                                    probe_relation_id,
-                                                                    probe_accessor.get(),
-                                                                    zipped_joined_tuple_ids));
-      }
+    for (vector<unique_ptr<const Scalar>>::const_iterator selection_cit = selection_.begin();
+         selection_cit != selection_.end();
+         ++selection_cit) {
+      temp_result.addColumn((*selection_cit)->getAllValuesForJoin(build_relation_id,
+                                                                  build_accessor.get(),
+                                                                  probe_relation_id,
+                                                                  probe_accessor.get(),
+                                                                  build_block_entry.second));
     }
 
-    std::unique_ptr<ValueAccessor> ordered_build_accessor(
-        build_accessor->createSharedOrderedTupleIdSequenceAdapterVirtual(build_tids));
-    std::unique_ptr<ValueAccessor> ordered_probe_accessor(
-        probe_accessor->createSharedOrderedTupleIdSequenceAdapterVirtual(probe_tids));
-
-    output_destination_->bulkInsertTuples(
-        { ordered_build_accessor.get(), ordered_probe_accessor.get(), &temp_result },
-        insert_context.get(),
-        &output_block);
+    // NOTE(chasseur): calling the bulk-insert method of InsertDestination once
+    // for each pair of joined blocks incurs some extra overhead that could be
+    // avoided by keeping checked-out MutableBlockReferences across iterations
+    // of this loop, but that would get messy when combined with partitioning.
+    bulk_insert_line->emplace_back();
+    output_destination_->bulkInsertTuples(&temp_result);
+    bulk_insert_line->back().endEvent();
   }
-
-  output_destination_->returnBlock(&output_block);
+  overall_line->back().endEvent();
 }
 
 void HashSemiJoinWorkOrder::execute() {

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/4b15e619/storage/InsertContext.hpp
----------------------------------------------------------------------
diff --git a/storage/InsertContext.hpp b/storage/InsertContext.hpp
index b8d7178..643294d 100644
--- a/storage/InsertContext.hpp
+++ b/storage/InsertContext.hpp
@@ -159,6 +159,70 @@ class CopyList {
         return CreateStrideCopyFunctor<31>(std::forward<ArgTypes>(args)...);
       case 32:
         return CreateStrideCopyFunctor<32>(std::forward<ArgTypes>(args)...);
+      case 33:
+        return CreateStrideCopyFunctor<33>(std::forward<ArgTypes>(args)...);
+      case 34:
+        return CreateStrideCopyFunctor<34>(std::forward<ArgTypes>(args)...);
+      case 35:
+        return CreateStrideCopyFunctor<35>(std::forward<ArgTypes>(args)...);
+      case 36:
+        return CreateStrideCopyFunctor<36>(std::forward<ArgTypes>(args)...);
+      case 37:
+        return CreateStrideCopyFunctor<37>(std::forward<ArgTypes>(args)...);
+      case 38:
+        return CreateStrideCopyFunctor<38>(std::forward<ArgTypes>(args)...);
+      case 39:
+        return CreateStrideCopyFunctor<39>(std::forward<ArgTypes>(args)...);
+      case 40:
+        return CreateStrideCopyFunctor<40>(std::forward<ArgTypes>(args)...);
+      case 41:
+        return CreateStrideCopyFunctor<41>(std::forward<ArgTypes>(args)...);
+      case 42:
+        return CreateStrideCopyFunctor<42>(std::forward<ArgTypes>(args)...);
+      case 43:
+        return CreateStrideCopyFunctor<43>(std::forward<ArgTypes>(args)...);
+      case 44:
+        return CreateStrideCopyFunctor<44>(std::forward<ArgTypes>(args)...);
+      case 45:
+        return CreateStrideCopyFunctor<45>(std::forward<ArgTypes>(args)...);
+      case 46:
+        return CreateStrideCopyFunctor<46>(std::forward<ArgTypes>(args)...);
+      case 47:
+        return CreateStrideCopyFunctor<47>(std::forward<ArgTypes>(args)...);
+      case 48:
+        return CreateStrideCopyFunctor<48>(std::forward<ArgTypes>(args)...);
+      case 49:
+        return CreateStrideCopyFunctor<49>(std::forward<ArgTypes>(args)...);
+      case 50:
+        return CreateStrideCopyFunctor<50>(std::forward<ArgTypes>(args)...);
+      case 51:
+        return CreateStrideCopyFunctor<51>(std::forward<ArgTypes>(args)...);
+      case 52:
+        return CreateStrideCopyFunctor<52>(std::forward<ArgTypes>(args)...);
+      case 53:
+        return CreateStrideCopyFunctor<53>(std::forward<ArgTypes>(args)...);
+      case 54:
+        return CreateStrideCopyFunctor<54>(std::forward<ArgTypes>(args)...);
+      case 55:
+        return CreateStrideCopyFunctor<55>(std::forward<ArgTypes>(args)...);
+      case 56:
+        return CreateStrideCopyFunctor<56>(std::forward<ArgTypes>(args)...);
+      case 57:
+        return CreateStrideCopyFunctor<57>(std::forward<ArgTypes>(args)...);
+      case 58:
+        return CreateStrideCopyFunctor<58>(std::forward<ArgTypes>(args)...);
+      case 59:
+        return CreateStrideCopyFunctor<59>(std::forward<ArgTypes>(args)...);
+      case 60:
+        return CreateStrideCopyFunctor<60>(std::forward<ArgTypes>(args)...);
+      case 61:
+        return CreateStrideCopyFunctor<61>(std::forward<ArgTypes>(args)...);
+      case 62:
+        return CreateStrideCopyFunctor<62>(std::forward<ArgTypes>(args)...);
+      case 63:
+        return CreateStrideCopyFunctor<63>(std::forward<ArgTypes>(args)...);
+      case 64:
+        return CreateStrideCopyFunctor<64>(std::forward<ArgTypes>(args)...);
       default:
         return CreateStrideCopyFunctor(bytes_to_copy, std::forward<ArgTypes>(args)...);
     }

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/4b15e619/utility/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/utility/CMakeLists.txt b/utility/CMakeLists.txt
index 6a656fb..2e397f3 100644
--- a/utility/CMakeLists.txt
+++ b/utility/CMakeLists.txt
@@ -179,6 +179,7 @@ add_library(quickstep_utility_CompositeHash ../empty_src.cpp CompositeHash.hpp)
 add_library(quickstep_utility_DAG ../empty_src.cpp DAG.hpp)
 add_library(quickstep_utility_DisjointTreeForest ../empty_src.cpp DisjointTreeForest.hpp)
 add_library(quickstep_utility_EqualsAnyConstant ../empty_src.cpp EqualsAnyConstant.hpp)
+add_library(quickstep_utility_EventProfiler EventProfiler.cpp EventProfiler.hpp)
 add_library(quickstep_utility_ExecutionDAGVisualizer
             ExecutionDAGVisualizer.cpp
             ExecutionDAGVisualizer.hpp)
@@ -247,6 +248,8 @@ target_link_libraries(quickstep_utility_DAG
                       quickstep_utility_Macros)
 target_link_libraries(quickstep_utility_DisjointTreeForest
                       glog)
+target_link_libraries(quickstep_utility_EventProfiler
+                      quickstep_threading_Mutex)
 target_link_libraries(quickstep_utility_ExecutionDAGVisualizer
                       quickstep_catalog_CatalogRelationSchema
                       quickstep_queryexecution_QueryExecutionTypedefs
@@ -280,6 +283,7 @@ target_link_libraries(quickstep_utility_PlanVisualizer
                       quickstep_queryoptimizer_physical_PhysicalType
                       quickstep_queryoptimizer_physical_TableReference
                       quickstep_queryoptimizer_physical_TopLevelPlan
+                      quickstep_types_Type
                       quickstep_utility_Macros
                       quickstep_utility_StringUtil)
 target_link_libraries(quickstep_utility_PtrList
@@ -344,6 +348,7 @@ target_link_libraries(quickstep_utility
                       quickstep_utility_DAG
                       quickstep_utility_DisjointTreeForest
                       quickstep_utility_EqualsAnyConstant
+                      quickstep_utility_EventProfiler
                       quickstep_utility_ExecutionDAGVisualizer
                       quickstep_utility_Glob
                       quickstep_utility_HashPair

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/4b15e619/utility/EventProfiler.cpp
----------------------------------------------------------------------
diff --git a/utility/EventProfiler.cpp b/utility/EventProfiler.cpp
new file mode 100644
index 0000000..cf89cb9
--- /dev/null
+++ b/utility/EventProfiler.cpp
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ **/
+
+#include "utility/EventProfiler.hpp"
+
+#include <cstddef>
+#include <string>
+#include <vector>
+
+namespace quickstep {
+
+// Definition of the global profiler instance that is declared `extern` in
+// utility/EventProfiler.hpp. Events are tagged with std::string names and
+// carry no payload fields.
+EventProfiler<std::string> simple_profiler;
+
+}  // namespace quickstep

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/4b15e619/utility/EventProfiler.hpp
----------------------------------------------------------------------
diff --git a/utility/EventProfiler.hpp b/utility/EventProfiler.hpp
new file mode 100644
index 0000000..c28f49b
--- /dev/null
+++ b/utility/EventProfiler.hpp
@@ -0,0 +1,190 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ **/
+
+#ifndef QUICKSTEP_UTILITY_EVENT_PROFILER_HPP_
+#define QUICKSTEP_UTILITY_EVENT_PROFILER_HPP_
+
+#include <chrono>
+#include <cstddef>
+#include <cstring>
+#include <ctime>
+#include <iomanip>
+#include <map>
+#include <ostream>
+#include <string>
+#include <thread>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "threading/Mutex.hpp"
+
+#include "glog/logging.h"
+
+namespace quickstep {
+
+/** \addtogroup Utility
+ *  @{
+ */
+
+// Clock used for every timestamp recorded by EventProfiler. Note that inside
+// namespace quickstep this alias hides the C library function ::clock()
+// declared by <ctime>.
+using clock = std::chrono::steady_clock;
+
+/**
+ * @brief Records timed events per thread and dumps them as CSV rows.
+ *
+ * Each calling thread obtains its own EventContainer via getContainer() and
+ * records events into it, grouped by tag. Timestamps are taken from
+ * quickstep::clock and reported relative to the profiler's "zero time" (the
+ * moment of construction or of the last clear()).
+ *
+ * @note mutex_ guards only the thread-to-container map. The pointer returned
+ *       by getContainer() is used without the lock and is invalidated by
+ *       clear(), so clear() must not run while events are being recorded.
+ *
+ * @tparam TagT Key type that identifies a group of events. It must be usable
+ *         as a std::map key and streamable to std::ostream.
+ * @tparam PayloadT Types of the optional per-event payload fields; each must
+ *         be streamable to std::ostream.
+ **/
+template <typename TagT, typename ...PayloadT>
+class EventProfiler {
+ public:
+  EventProfiler()
+      : zero_time_(clock::now()) {
+  }
+
+  /**
+   * @brief A single timed event together with its payload.
+   **/
+  struct EventInfo {
+    clock::time_point start_time;
+    clock::time_point end_time;
+    bool is_finished;
+    std::tuple<PayloadT...> payload;
+
+    /**
+     * @brief Create an unfinished event that started at the given time.
+     **/
+    explicit EventInfo(const clock::time_point &start_time_in)
+        : start_time(start_time_in),
+          is_finished(false) {
+    }
+
+    /**
+     * @brief Create an unfinished event that starts now.
+     **/
+    EventInfo()
+        : start_time(clock::now()),
+          is_finished(false) {
+    }
+
+    /**
+     * @brief Attach payload values to this event.
+     *
+     * @param in_payload The payload values. They are taken as rvalues and
+     *        forwarded into the stored tuple rather than copied.
+     **/
+    inline void setPayload(PayloadT &&...in_payload) {
+      payload = std::tuple<PayloadT...>(std::forward<PayloadT>(in_payload)...);
+    }
+
+    /**
+     * @brief Stamp the end time and mark this event as finished.
+     **/
+    inline void endEvent() {
+      end_time = clock::now();
+      is_finished = true;
+    }
+  };
+
+  /**
+   * @brief The events recorded by one thread, grouped by tag, plus an integer
+   *        context value that callers may set and read back.
+   **/
+  struct EventContainer {
+    EventContainer()
+        : context(0) {}
+
+    /**
+     * @brief Append a new event, starting now, to the line of the given tag.
+     **/
+    inline void startEvent(const TagT &tag) {
+      events[tag].emplace_back(clock::now());
+    }
+
+    /**
+     * @brief Finish the most recently started event of the given tag.
+     *
+     * @note std::map::at() throws std::out_of_range if the tag was never
+     *       seen. The tag's line must also be non-empty; a line can exist but
+     *       be empty if it was created through getEventLine().
+     **/
+    inline void endEvent(const TagT &tag) {
+      std::vector<EventInfo> &event_line = events.at(tag);
+      DCHECK(!event_line.empty())
+          << "endEvent() called for a tag with no started event";
+      event_line.back().endEvent();
+    }
+
+    /**
+     * @brief Get the (possibly newly created, empty) event line of a tag.
+     **/
+    inline std::vector<EventInfo> *getEventLine(const TagT &tag) {
+      return &events[tag];
+    }
+
+    inline void setContext(int context_in) {
+      context = context_in;
+    }
+
+    inline int getContext() const {
+      return context;
+    }
+
+    std::map<TagT, std::vector<EventInfo>> events;
+    int context;
+  };
+
+  /**
+   * @brief Get the calling thread's container, creating it on first use.
+   *
+   * @return A pointer into the internal map; it remains valid until clear().
+   **/
+  EventContainer *getContainer() {
+    MutexLock lock(mutex_);
+    return &thread_map_[std::this_thread::get_id()];
+  }
+
+  /**
+   * @brief Write every recorded event as one CSV row of the form
+   *        <wall-clock time of this call>,<thread index>,<tag>,
+   *        <payload fields...>,<start>,<end>
+   *        where start and end are seconds since the profiler's zero time.
+   *
+   * The thread index is the ordinal position of the thread in the internal
+   * map, not an OS thread id. The stream's floating-point precision is raised
+   * to 12 while writing and restored afterwards.
+   *
+   * @warning CHECK-fails if any recorded event has not been finished.
+   *
+   * @param os The stream to write to.
+   **/
+  void writeToStream(std::ostream &os) const {
+    // Keep getContainer() from inserting into the map while it is iterated.
+    MutexLock lock(mutex_);
+
+    std::time_t rawtime;
+    std::time(&rawtime);
+    char event_id[32];
+    std::strftime(event_id, sizeof event_id, "%Y-%m-%d %H:%M:%S", std::localtime(&rawtime));
+
+    // Set the precision once and remember the caller's setting so that it is
+    // not left modified after this call.
+    const std::streamsize saved_precision = os.precision(12);
+
+    int thread_id = 0;
+    for (const auto &thread_ctx : thread_map_) {
+      for (const auto &event_group : thread_ctx.second.events) {
+        for (const auto &event_info : event_group.second) {
+          CHECK(event_info.is_finished)
+              << "Unfinished profiling event at thread " << thread_id
+              << ": " << event_group.first;
+
+          os << event_id << ","
+             << thread_id << "," << event_group.first << ",";
+
+          PrintTuple(os, event_info.payload, ",");
+
+          os << std::chrono::duration<double>(event_info.start_time - zero_time_).count()
+             << ","
+             << std::chrono::duration<double>(event_info.end_time - zero_time_).count()
+             << "\n";
+        }
+      }
+      ++thread_id;
+    }
+
+    os.precision(saved_precision);
+  }
+
+  /**
+   * @brief Drop all recorded events and reset the zero time to now.
+   *
+   * @warning Invalidates every pointer previously returned by getContainer().
+   **/
+  void clear() {
+    MutexLock lock(mutex_);
+    zero_time_ = clock::now();
+    thread_map_.clear();
+  }
+
+  /**
+   * @brief Direct, unsynchronized read access to the per-thread containers.
+   **/
+  const std::map<std::thread::id, EventContainer> &containers() const {
+    return thread_map_;
+  }
+
+  /**
+   * @brief The time point that reported start/end times are relative to.
+   **/
+  const clock::time_point &zero_time() const {
+    return zero_time_;
+  }
+
+ private:
+  // Recursively prints the first N elements of a tuple, each followed by the
+  // separator (so a trailing separator is emitted after the last element).
+  template<class Tuple, std::size_t N>
+  struct TuplePrinter {
+    static void Print(std::ostream &os, const Tuple &t, const std::string &sep) {
+      TuplePrinter<Tuple, N-1>::Print(os, t, sep);
+      os << std::get<N-1>(t) << sep;
+    }
+  };
+
+  // Recursion base case: nothing to print for zero elements.
+  template<class Tuple>
+  struct TuplePrinter<Tuple, 0> {
+    static void Print(std::ostream &, const Tuple &, const std::string &) {
+    }
+  };
+
+  template<class... Args>
+  static void PrintTuple(std::ostream &os, const std::tuple<Args...>& t, const std::string &sep) {
+    TuplePrinter<std::tuple<Args...>, sizeof...(Args)>::Print(os, t, sep);
+  }
+
+  clock::time_point zero_time_;
+  std::map<std::thread::id, EventContainer> thread_map_;
+  // Mutable so that the const method writeToStream() can lock it.
+  mutable Mutex mutex_;
+};
+
+// Global profiler instance, defined in utility/EventProfiler.cpp. Events are
+// tagged with std::string names and carry no payload fields.
+extern EventProfiler<std::string> simple_profiler;
+
+/** @} */
+
+}  // namespace quickstep
+
+#endif  // QUICKSTEP_UTILITY_EVENT_PROFILER_HPP_

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/4b15e619/utility/PlanVisualizer.cpp
----------------------------------------------------------------------
diff --git a/utility/PlanVisualizer.cpp b/utility/PlanVisualizer.cpp
index df7a20c..ff0fb24 100644
--- a/utility/PlanVisualizer.cpp
+++ b/utility/PlanVisualizer.cpp
@@ -37,6 +37,7 @@
 #include "query_optimizer/physical/PhysicalType.hpp"
 #include "query_optimizer/physical/TableReference.hpp"
 #include "query_optimizer/physical/TopLevelPlan.hpp"
+#include "types/Type.hpp"
 #include "utility/StringUtil.hpp"
 
 #include "glog/logging.h"
@@ -133,7 +134,9 @@ void PlanVisualizer::visit(const P::PhysicalPtr &input) {
 
     for (const auto &attr : child->getOutputAttributes()) {
       if (referenced_ids.find(attr->id()) != referenced_ids.end()) {
-        edge_info.labels.emplace_back(attr->attribute_alias());
+        const std::size_t attr_size = attr->getValueType().maximumByteLength();
+        edge_info.labels.emplace_back(
+            attr->attribute_alias() + " : " + std::to_string(attr_size));
       }
     }
   }