You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@quickstep.apache.org by zu...@apache.org on 2016/05/27 03:23:03 UTC

[12/50] [abbrv] incubator-quickstep git commit: Fixed the incorrect distinctify hash table problem related to DISTINCT aggregation

Fixed the incorrect distinctify hash table problem related to DISTINCT aggregation


Project: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/commit/70fcdb5d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/tree/70fcdb5d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/diff/70fcdb5d

Branch: refs/heads/change-aggregation-hashtable
Commit: 70fcdb5d6c0741db6382c42ee546f418e2a2cee6
Parents: 41cea5d
Author: Jianqiao Zhu <ji...@cs.wisc.edu>
Authored: Wed May 4 22:20:41 2016 -0500
Committer: Jianqiao Zhu <ji...@cs.wisc.edu>
Committed: Wed May 4 22:25:11 2016 -0500

----------------------------------------------------------------------
 query_optimizer/ExecutionGenerator.cpp          | 67 ++++++++++++--------
 .../tests/execution_generator/Select.test       | 23 +++----
 storage/AggregationOperationState.cpp           | 28 +++++++-
 storage/AggregationOperationState.hpp           |  5 ++
 storage/AggregationOperationState.proto         |  5 ++
 5 files changed, 89 insertions(+), 39 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/70fcdb5d/query_optimizer/ExecutionGenerator.cpp
----------------------------------------------------------------------
diff --git a/query_optimizer/ExecutionGenerator.cpp b/query_optimizer/ExecutionGenerator.cpp
index 3698701..077d35d 100644
--- a/query_optimizer/ExecutionGenerator.cpp
+++ b/query_optimizer/ExecutionGenerator.cpp
@@ -1285,6 +1285,35 @@ void ExecutionGenerator::convertAggregate(
       findRelationInfoOutputByPhysical(physical_plan->input());
   aggr_state_proto->set_relation_id(input_relation_info->relation->getID());
 
+  std::vector<const Type*> group_by_types;
+  for (const E::NamedExpressionPtr &grouping_expression : physical_plan->grouping_expressions()) {
+    unique_ptr<const Scalar> execution_group_by_expression;
+    E::AliasPtr alias;
+    if (E::SomeAlias::MatchesWithConditionalCast(grouping_expression, &alias)) {
+      E::ScalarPtr scalar;
+      // NOTE(zuyu): For aggregate expressions, all child expressions of an
+      // Alias should be a Scalar.
+      CHECK(E::SomeScalar::MatchesWithConditionalCast(alias->expression(), &scalar))
+          << alias->toString();
+      execution_group_by_expression.reset(scalar->concretize(attribute_substitution_map_));
+    } else {
+      execution_group_by_expression.reset(
+          grouping_expression->concretize(attribute_substitution_map_));
+    }
+    aggr_state_proto->add_group_by_expressions()->CopyFrom(execution_group_by_expression->getProto());
+    group_by_types.push_back(&execution_group_by_expression->getType());
+  }
+
+  if (!group_by_types.empty()) {
+    // SimplifyHashTableImplTypeProto() switches the hash table implementation
+    // from SeparateChaining to SimpleScalarSeparateChaining when there is a
+    // single scalar key type with a reversible hash function.
+    aggr_state_proto->set_hash_table_impl_type(
+        SimplifyHashTableImplTypeProto(
+            HashTableImplTypeProtoFromString(FLAGS_aggregate_hashtable_type),
+            group_by_types));
+  }
+
   for (const E::AliasPtr &named_aggregate_expression : physical_plan->aggregate_expressions()) {
     const E::AggregateFunctionPtr unnamed_aggregate_expression =
         std::static_pointer_cast<const E::AggregateFunction>(named_aggregate_expression->expression());
@@ -1304,25 +1333,19 @@ void ExecutionGenerator::convertAggregate(
 
     // Set whether it is a DISTINCT aggregation.
     aggr_proto->set_is_distinct(unnamed_aggregate_expression->is_distinct());
-  }
 
-  std::vector<const Type*> group_by_types;
-  for (const E::NamedExpressionPtr &grouping_expression : physical_plan->grouping_expressions()) {
-    unique_ptr<const Scalar> execution_group_by_expression;
-    E::AliasPtr alias;
-    if (E::SomeAlias::MatchesWithConditionalCast(grouping_expression, &alias)) {
-      E::ScalarPtr scalar;
-      // NOTE(zuyu): For aggregate expressions, all child expressions of an
-      // Alias should be a Scalar.
-      CHECK(E::SomeScalar::MatchesWithConditionalCast(alias->expression(), &scalar))
-          << alias->toString();
-      execution_group_by_expression.reset(scalar->concretize(attribute_substitution_map_));
-    } else {
-      execution_group_by_expression.reset(
-          grouping_expression->concretize(attribute_substitution_map_));
+    // Add distinctify hash table impl type if it is a DISTINCT aggregation.
+    if (unnamed_aggregate_expression->is_distinct()) {
+      if (group_by_types.empty()) {
+        aggr_state_proto->add_distinctify_hash_table_impl_types(
+            SimplifyHashTableImplTypeProto(
+                HashTableImplTypeProtoFromString(FLAGS_aggregate_hashtable_type),
+                {&unnamed_aggregate_expression->getValueType()}));
+      } else {
+        aggr_state_proto->add_distinctify_hash_table_impl_types(
+            HashTableImplTypeProtoFromString(FLAGS_aggregate_hashtable_type));
+      }
     }
-    aggr_state_proto->add_group_by_expressions()->CopyFrom(execution_group_by_expression->getProto());
-    group_by_types.push_back(&execution_group_by_expression->getType());
   }
 
   if (physical_plan->filter_predicate() != nullptr) {
@@ -1332,16 +1355,6 @@ void ExecutionGenerator::convertAggregate(
 
   aggr_state_proto->set_estimated_num_entries(cost_model_->estimateCardinality(physical_plan));
 
-  if (!group_by_types.empty()) {
-    // SimplifyHashTableImplTypeProto() switches the hash table implementation
-    // from SeparateChaining to SimpleScalarSeparateChaining when there is a
-    // single scalar key type with a reversible hash function.
-    aggr_state_proto->set_hash_table_impl_type(
-        SimplifyHashTableImplTypeProto(
-            HashTableImplTypeProtoFromString(FLAGS_aggregate_hashtable_type),
-            group_by_types));
-  }
-
   const QueryPlan::DAGNodeIndex aggregation_operator_index =
       execution_plan_->addRelationalOperator(
           new AggregationOperator(

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/70fcdb5d/query_optimizer/tests/execution_generator/Select.test
----------------------------------------------------------------------
diff --git a/query_optimizer/tests/execution_generator/Select.test b/query_optimizer/tests/execution_generator/Select.test
index a08b012..390b7b6 100644
--- a/query_optimizer/tests/execution_generator/Select.test
+++ b/query_optimizer/tests/execution_generator/Select.test
@@ -535,21 +535,12 @@ WHERE int_col = 2;
 
 SELECT int_col
 FROM test
-GROUP BY int_col;
+GROUP BY int_col
+ORDER BY int_col;
 --
 +-----------+
 |int_col    |
 +-----------+
-|          2|
-|          4|
-|          6|
-|          8|
-|         12|
-|         14|
-|         16|
-|         18|
-|         22|
-|         24|
 |        -23|
 |        -21|
 |        -19|
@@ -562,6 +553,16 @@ GROUP BY int_col;
 |         -5|
 |         -3|
 |         -1|
+|          2|
+|          4|
+|          6|
+|          8|
+|         12|
+|         14|
+|         16|
+|         18|
+|         22|
+|         24|
 +-----------+
 ==
 

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/70fcdb5d/storage/AggregationOperationState.cpp
----------------------------------------------------------------------
diff --git a/storage/AggregationOperationState.cpp b/storage/AggregationOperationState.cpp
index a3a669c..d209ceb 100644
--- a/storage/AggregationOperationState.cpp
+++ b/storage/AggregationOperationState.cpp
@@ -1,6 +1,8 @@
 /**
  *   Copyright 2011-2015 Quickstep Technologies LLC.
  *   Copyright 2015-2016 Pivotal Software, Inc.
+ *   Copyright 2016, Quickstep Research Group, Computer Sciences Department,
+ *     University of Wisconsin\u2014Madison.
  *
  *   Licensed under the Apache License, Version 2.0 (the "License");
  *   you may not use this file except in compliance with the License.
@@ -64,6 +66,7 @@ AggregationOperationState::AggregationOperationState(
     const Predicate *predicate,
     const std::size_t estimated_num_entries,
     const HashTableImplType hash_table_impl_type,
+    const std::vector<HashTableImplType> &distinctify_hash_table_impl_types,
     StorageManager *storage_manager)
     : input_relation_(input_relation),
       predicate_(predicate),
@@ -101,6 +104,8 @@ AggregationOperationState::AggregationOperationState(
     std::vector<std::vector<std::unique_ptr<const Scalar>>>::const_iterator args_it
         = arguments_.begin();
     std::vector<bool>::const_iterator is_distinct_it = is_distinct_.begin();
+    std::vector<HashTableImplType>::const_iterator distinctify_hash_table_impl_types_it
+        = distinctify_hash_table_impl_types.begin();
     for (; agg_func_it != aggregate_functions.end(); ++agg_func_it, ++args_it, ++is_distinct_it) {
       // Get the Types of this aggregate's arguments so that we can create an
       // AggregationHandle.
@@ -161,10 +166,11 @@ AggregationOperationState::AggregationOperationState(
         // query optimization, if it worths.
         distinctify_hashtables_.emplace_back(
             handles_.back()->createDistinctifyHashTable(
-                hash_table_impl_type,
+                *distinctify_hash_table_impl_types_it,
                 key_types,
                 estimated_num_entries,
                 storage_manager));
+        ++distinctify_hash_table_impl_types_it;
       } else {
         distinctify_hashtables_.emplace_back(nullptr);
       }
@@ -182,6 +188,8 @@ AggregationOperationState* AggregationOperationState::ReconstructFromProto(
   std::vector<const AggregateFunction*> aggregate_functions;
   std::vector<std::vector<std::unique_ptr<const Scalar>>> arguments;
   std::vector<bool> is_distinct;
+  std::vector<HashTableImplType> distinctify_hash_table_impl_types;
+  std::size_t distinctify_hash_table_impl_type_index = 0;
   for (int agg_idx = 0; agg_idx < proto.aggregates_size(); ++agg_idx) {
     const serialization::Aggregate &agg_proto = proto.aggregates(agg_idx);
 
@@ -197,6 +205,13 @@ AggregationOperationState* AggregationOperationState::ReconstructFromProto(
     }
 
     is_distinct.emplace_back(agg_proto.is_distinct());
+
+    if (agg_proto.is_distinct()) {
+      distinctify_hash_table_impl_types.emplace_back(
+          HashTableImplTypeFromProto(
+              proto.distinctify_hash_table_impl_types(distinctify_hash_table_impl_type_index)));
+      ++distinctify_hash_table_impl_type_index;
+    }
   }
 
   std::vector<std::unique_ptr<const Scalar>> group_by_expressions;
@@ -223,6 +238,7 @@ AggregationOperationState* AggregationOperationState::ReconstructFromProto(
                                        predicate.release(),
                                        proto.estimated_num_entries(),
                                        HashTableImplTypeFromProto(proto.hash_table_impl_type()),
+                                       distinctify_hash_table_impl_types,
                                        storage_manager);
 }
 
@@ -234,6 +250,8 @@ bool AggregationOperationState::ProtoIsValid(const serialization::AggregationOpe
     return false;
   }
 
+  std::size_t num_distinctify_hash_tables = proto.distinctify_hash_table_impl_types_size();
+  std::size_t distinctify_hash_table_impl_type_index = 0;
   for (int i = 0; i < proto.aggregates_size(); ++i) {
     if (!AggregateFunctionFactory::ProtoIsValid(proto.aggregates(i).function())) {
       return false;
@@ -251,6 +269,14 @@ bool AggregationOperationState::ProtoIsValid(const serialization::AggregationOpe
         return false;
       }
     }
+
+    if (proto.aggregates(i).is_distinct()) {
+      if (distinctify_hash_table_impl_type_index >= num_distinctify_hash_tables ||
+          !serialization::HashTableImplType_IsValid(
+              proto.distinctify_hash_table_impl_types(distinctify_hash_table_impl_type_index))) {
+        return false;
+      }
+    }
   }
 
   for (int i = 0; i < proto.group_by_expressions_size(); ++i) {

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/70fcdb5d/storage/AggregationOperationState.hpp
----------------------------------------------------------------------
diff --git a/storage/AggregationOperationState.hpp b/storage/AggregationOperationState.hpp
index b883ed1..c3a1278 100644
--- a/storage/AggregationOperationState.hpp
+++ b/storage/AggregationOperationState.hpp
@@ -1,6 +1,8 @@
 /**
  *   Copyright 2011-2015 Quickstep Technologies LLC.
  *   Copyright 2015-2016 Pivotal Software, Inc.
+ *   Copyright 2016, Quickstep Research Group, Computer Sciences Department,
+ *     University of Wisconsin\u2014Madison.
  *
  *   Licensed under the Apache License, Version 2.0 (the "License");
  *   you may not use this file except in compliance with the License.
@@ -93,6 +95,8 @@ class AggregationOperationState {
    *        in the input relation.
    * @param hash_table_impl_type The HashTable implementation to use for
    *        GROUP BY. Ignored if group_by is empty.
+   * @param distinctify_hash_table_impl_type The HashTable implementation to use
+   *        for the distinctify phase of each DISTINCT aggregation.
    * @param storage_manager The StorageManager to use for allocating hash
    *        tables. Single aggregation state (when GROUP BY list is not
    *        specified) is not allocated using memory from storage manager.
@@ -105,6 +109,7 @@ class AggregationOperationState {
                             const Predicate *predicate,
                             const std::size_t estimated_num_entries,
                             const HashTableImplType hash_table_impl_type,
+                            const std::vector<HashTableImplType> &distinctify_hash_table_impl_types,
                             StorageManager *storage_manager);
 
   ~AggregationOperationState() {}

http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/70fcdb5d/storage/AggregationOperationState.proto
----------------------------------------------------------------------
diff --git a/storage/AggregationOperationState.proto b/storage/AggregationOperationState.proto
index 031f782..bf78e3a 100644
--- a/storage/AggregationOperationState.proto
+++ b/storage/AggregationOperationState.proto
@@ -1,5 +1,7 @@
 //   Copyright 2011-2015 Quickstep Technologies LLC.
 //   Copyright 2015-2016 Pivotal Software, Inc.
+//   Copyright 2016, Quickstep Research Group, Computer Sciences Department,
+//     University of Wisconsin\u2014Madison.
 //
 //   Licensed under the Apache License, Version 2.0 (the "License");
 //   you may not use this file except in compliance with the License.
@@ -37,4 +39,7 @@ message AggregationOperationState {
   // NOTE(chasseur): 'hash_table_impl_type' is marked optional, but it is
   // needed if 'group_by_expressions' is non-empty, and ignored otherwise.
   optional HashTableImplType hash_table_impl_type = 6;
+
+  // Each DISTINCT aggregation has its distinctify hash table impl type.
+  repeated HashTableImplType distinctify_hash_table_impl_types = 7;
 }