You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@quickstep.apache.org by hb...@apache.org on 2016/10/06 19:21:27 UTC
[1/2] incubator-quickstep git commit: Improve
StarSchemaSimpleCostModel to provide closer estimation of number of groups
for hash aggregation. [Forced Update!]
Repository: incubator-quickstep
Updated Branches:
refs/heads/fix-fasthash-resize 3e8f6c9a3 -> 2e0233333 (forced update)
Improve StarSchemaSimpleCostModel to provide closer estimation of number of groups for hash aggregation.
Project: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/commit/ad1f8c40
Tree: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/tree/ad1f8c40
Diff: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/diff/ad1f8c40
Branch: refs/heads/fix-fasthash-resize
Commit: ad1f8c406954778d332f02b680e08e0118727436
Parents: 4126c4f
Author: Jianqiao Zhu <ji...@cs.wisc.edu>
Authored: Mon Oct 3 23:58:30 2016 -0500
Committer: Jianqiao Zhu <ji...@cs.wisc.edu>
Committed: Tue Oct 4 17:06:04 2016 -0500
----------------------------------------------------------------------
query_optimizer/CMakeLists.txt | 2 +-
query_optimizer/ExecutionGenerator.cpp | 8 +-
query_optimizer/ExecutionGenerator.hpp | 1 -
query_optimizer/cost_model/CMakeLists.txt | 3 +
query_optimizer/cost_model/CostModel.hpp | 11 +
.../cost_model/StarSchemaSimpleCostModel.cpp | 243 ++++++++++++++-----
.../cost_model/StarSchemaSimpleCostModel.hpp | 70 ++++--
query_optimizer/expressions/ExpressionUtil.hpp | 19 ++
.../StarSchemaHashJoinOrderOptimization.hpp | 2 +-
9 files changed, 280 insertions(+), 79 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/ad1f8c40/query_optimizer/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/query_optimizer/CMakeLists.txt b/query_optimizer/CMakeLists.txt
index 32f7885..988ffd8 100644
--- a/query_optimizer/CMakeLists.txt
+++ b/query_optimizer/CMakeLists.txt
@@ -78,7 +78,7 @@ target_link_libraries(quickstep_queryoptimizer_ExecutionGenerator
quickstep_queryoptimizer_QueryHandle
quickstep_queryoptimizer_QueryPlan
quickstep_queryoptimizer_costmodel_CostModel
- quickstep_queryoptimizer_costmodel_SimpleCostModel
+ quickstep_queryoptimizer_costmodel_StarSchemaSimpleCostModel
quickstep_queryoptimizer_expressions_AggregateFunction
quickstep_queryoptimizer_expressions_Alias
quickstep_queryoptimizer_expressions_AttributeReference
http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/ad1f8c40/query_optimizer/ExecutionGenerator.cpp
----------------------------------------------------------------------
diff --git a/query_optimizer/ExecutionGenerator.cpp b/query_optimizer/ExecutionGenerator.cpp
index 968314e..9347c9c 100644
--- a/query_optimizer/ExecutionGenerator.cpp
+++ b/query_optimizer/ExecutionGenerator.cpp
@@ -58,7 +58,7 @@
#include "query_optimizer/OptimizerContext.hpp"
#include "query_optimizer/QueryHandle.hpp"
#include "query_optimizer/QueryPlan.hpp"
-#include "query_optimizer/cost_model/SimpleCostModel.hpp"
+#include "query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp"
#include "query_optimizer/expressions/AggregateFunction.hpp"
#include "query_optimizer/expressions/Alias.hpp"
#include "query_optimizer/expressions/AttributeReference.hpp"
@@ -167,7 +167,7 @@ void ExecutionGenerator::generatePlan(const P::PhysicalPtr &physical_plan) {
<< "The physical plan must be rooted by a TopLevelPlan";
cost_model_.reset(
- new cost::SimpleCostModel(top_level_physical_plan_->shared_subplans()));
+ new cost::StarSchemaSimpleCostModel(top_level_physical_plan_->shared_subplans()));
const CatalogRelation *result_relation = nullptr;
@@ -1411,7 +1411,9 @@ void ExecutionGenerator::convertAggregate(
aggr_state_proto->mutable_predicate()->CopyFrom(predicate->getProto());
}
- aggr_state_proto->set_estimated_num_entries(cost_model_->estimateCardinality(physical_plan));
+ const std::size_t estimated_num_groups =
+ cost_model_->estimateNumGroupsForAggregate(physical_plan);
+ aggr_state_proto->set_estimated_num_entries(std::max(16uL, estimated_num_groups));
const QueryPlan::DAGNodeIndex aggregation_operator_index =
execution_plan_->addRelationalOperator(
http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/ad1f8c40/query_optimizer/ExecutionGenerator.hpp
----------------------------------------------------------------------
diff --git a/query_optimizer/ExecutionGenerator.hpp b/query_optimizer/ExecutionGenerator.hpp
index 6017aa5..2aaf5ab 100644
--- a/query_optimizer/ExecutionGenerator.hpp
+++ b/query_optimizer/ExecutionGenerator.hpp
@@ -37,7 +37,6 @@
#include "query_optimizer/QueryHandle.hpp"
#include "query_optimizer/QueryPlan.hpp"
#include "query_optimizer/cost_model/CostModel.hpp"
-#include "query_optimizer/cost_model/SimpleCostModel.hpp"
#include "query_optimizer/expressions/ExprId.hpp"
#include "query_optimizer/expressions/NamedExpression.hpp"
#include "query_optimizer/expressions/Predicate.hpp"
http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/ad1f8c40/query_optimizer/cost_model/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/query_optimizer/cost_model/CMakeLists.txt b/query_optimizer/cost_model/CMakeLists.txt
index abbc3da..ba99de3 100644
--- a/query_optimizer/cost_model/CMakeLists.txt
+++ b/query_optimizer/cost_model/CMakeLists.txt
@@ -24,6 +24,7 @@ add_library(quickstep_queryoptimizer_costmodel_StarSchemaSimpleCostModel
# Link dependencies:
target_link_libraries(quickstep_queryoptimizer_costmodel_CostModel
+ quickstep_queryoptimizer_physical_Aggregate
quickstep_queryoptimizer_physical_Physical
quickstep_utility_Macros)
target_link_libraries(quickstep_queryoptimizer_costmodel_SimpleCostModel
@@ -50,6 +51,7 @@ target_link_libraries(quickstep_queryoptimizer_costmodel_StarSchemaSimpleCostMod
quickstep_queryoptimizer_expressions_ComparisonExpression
quickstep_queryoptimizer_expressions_ExprId
quickstep_queryoptimizer_expressions_ExpressionType
+ quickstep_queryoptimizer_expressions_ExpressionUtil
quickstep_queryoptimizer_expressions_LogicalAnd
quickstep_queryoptimizer_expressions_LogicalOr
quickstep_queryoptimizer_expressions_PatternMatcher
@@ -57,6 +59,7 @@ target_link_libraries(quickstep_queryoptimizer_costmodel_StarSchemaSimpleCostMod
quickstep_queryoptimizer_physical_Aggregate
quickstep_queryoptimizer_physical_HashJoin
quickstep_queryoptimizer_physical_NestedLoopsJoin
+ quickstep_queryoptimizer_physical_PatternMatcher
quickstep_queryoptimizer_physical_Physical
quickstep_queryoptimizer_physical_PhysicalType
quickstep_queryoptimizer_physical_Selection
http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/ad1f8c40/query_optimizer/cost_model/CostModel.hpp
----------------------------------------------------------------------
diff --git a/query_optimizer/cost_model/CostModel.hpp b/query_optimizer/cost_model/CostModel.hpp
index ca49733..20bc208 100644
--- a/query_optimizer/cost_model/CostModel.hpp
+++ b/query_optimizer/cost_model/CostModel.hpp
@@ -22,6 +22,7 @@
#include <cstddef>
+#include "query_optimizer/physical/Aggregate.hpp"
#include "query_optimizer/physical/Physical.hpp"
#include "utility/Macros.hpp"
@@ -53,6 +54,16 @@ class CostModel {
virtual std::size_t estimateCardinality(
const physical::PhysicalPtr &physical_plan) = 0;
+ /**
+ * @brief Estimate the number of groups of an aggregation.
+ *
+ * @param aggregate The physical plan of the aggregation.
+ * @return The estimated number of groups.
+ */
+ virtual std::size_t estimateNumGroupsForAggregate(const physical::AggregatePtr &aggregate) {
+ return estimateCardinality(aggregate);
+ }
+
protected:
/**
* @brief Constructor.
http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/ad1f8c40/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp
----------------------------------------------------------------------
diff --git a/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp b/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp
index 911a765..8d254fa 100644
--- a/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp
+++ b/query_optimizer/cost_model/StarSchemaSimpleCostModel.cpp
@@ -29,6 +29,7 @@
#include "query_optimizer/expressions/ComparisonExpression.hpp"
#include "query_optimizer/expressions/ExprId.hpp"
#include "query_optimizer/expressions/ExpressionType.hpp"
+#include "query_optimizer/expressions/ExpressionUtil.hpp"
#include "query_optimizer/expressions/LogicalAnd.hpp"
#include "query_optimizer/expressions/LogicalOr.hpp"
#include "query_optimizer/expressions/Predicate.hpp"
@@ -36,6 +37,7 @@
#include "query_optimizer/physical/Aggregate.hpp"
#include "query_optimizer/physical/NestedLoopsJoin.hpp"
#include "query_optimizer/physical/HashJoin.hpp"
+#include "query_optimizer/physical/PatternMatcher.hpp"
#include "query_optimizer/physical/Physical.hpp"
#include "query_optimizer/physical/PhysicalType.hpp"
#include "query_optimizer/physical/Selection.hpp"
@@ -85,8 +87,8 @@ std::size_t StarSchemaSimpleCostModel::estimateCardinality(
shared_subplans_[shared_subplan_reference->subplan_id()]);
}
case P::PhysicalType::kSort:
- return estimateCardinality(
- std::static_pointer_cast<const P::Sort>(physical_plan)->input());
+ return estimateCardinalityForSort(
+ std::static_pointer_cast<const P::Sort>(physical_plan));
case P::PhysicalType::kWindowAggregate:
return estimateCardinalityForWindowAggregate(
std::static_pointer_cast<const P::WindowAggregate>(physical_plan));
@@ -111,9 +113,17 @@ std::size_t StarSchemaSimpleCostModel::estimateCardinalityForTableReference(
std::size_t StarSchemaSimpleCostModel::estimateCardinalityForSelection(
const P::SelectionPtr &physical_plan) {
- double selectivity = estimateSelectivityForSelection(physical_plan);
- return std::max(static_cast<std::size_t>(estimateCardinality(physical_plan->input()) * selectivity),
- static_cast<std::size_t>(1));
+ double selectivity =
+ estimateSelectivityForPredicate(physical_plan->filter_predicate(), physical_plan);
+ return static_cast<std::size_t>(
+ estimateCardinality(physical_plan->input()) * selectivity);
+}
+
+std::size_t StarSchemaSimpleCostModel::estimateCardinalityForSort(
+ const physical::SortPtr &physical_plan) {
+ std::size_t cardinality = estimateCardinality(
+ std::static_pointer_cast<const P::Sort>(physical_plan)->input());
+ return std::min(cardinality, static_cast<std::size_t>(physical_plan->limit()));
}
std::size_t StarSchemaSimpleCostModel::estimateCardinalityForTableGenerator(
@@ -127,8 +137,8 @@ std::size_t StarSchemaSimpleCostModel::estimateCardinalityForHashJoin(
std::size_t right_cardinality = estimateCardinality(physical_plan->right());
double left_selectivity = estimateSelectivity(physical_plan->left());
double right_selectivity = estimateSelectivity(physical_plan->right());
- return std::max(static_cast<std::size_t>(left_cardinality * right_selectivity) + 1,
- static_cast<std::size_t>(right_cardinality * left_selectivity) + 1);
+ return std::max(static_cast<std::size_t>(left_cardinality * right_selectivity + 0.5),
+ static_cast<std::size_t>(right_cardinality * left_selectivity + 0.5));
}
std::size_t StarSchemaSimpleCostModel::estimateCardinalityForNestedLoopsJoin(
@@ -139,11 +149,10 @@ std::size_t StarSchemaSimpleCostModel::estimateCardinalityForNestedLoopsJoin(
std::size_t StarSchemaSimpleCostModel::estimateCardinalityForAggregate(
const P::AggregatePtr &physical_plan) {
- if (physical_plan->grouping_expressions().empty()) {
- return 1;
- }
- return std::max(static_cast<std::size_t>(1),
- estimateCardinality(physical_plan->input()) / 10);
+ double filter_selectivity =
+ estimateSelectivityForPredicate(physical_plan->filter_predicate(), physical_plan);
+ return static_cast<std::size_t>(
+ estimateNumGroupsForAggregate(physical_plan) * filter_selectivity);
}
std::size_t StarSchemaSimpleCostModel::estimateCardinalityForWindowAggregate(
@@ -151,24 +160,115 @@ std::size_t StarSchemaSimpleCostModel::estimateCardinalityForWindowAggregate(
return estimateCardinality(physical_plan->input());
}
+
+std::size_t StarSchemaSimpleCostModel::estimateNumGroupsForAggregate(
+ const physical::AggregatePtr &aggregate) {
+ if (aggregate->grouping_expressions().empty()) {
+ return 1uL;
+ }
+
+ std::size_t estimated_child_cardinality = estimateCardinality(aggregate->input());
+ std::size_t estimated_num_groups = 1;
+ std::size_t max_attr_num_distinct_values = 0;
+ for (const auto &expr : aggregate->grouping_expressions()) {
+ E::AttributeReferencePtr attr;
+ if (E::SomeAttributeReference::MatchesWithConditionalCast(expr, &attr)) {
+ std::size_t attr_num_distinct_values =
+ estimateNumDistinctValues(attr->id(), aggregate->input());
+ estimated_num_groups *= std::max(1uL, attr_num_distinct_values);
+ max_attr_num_distinct_values =
+ std::max(max_attr_num_distinct_values, attr_num_distinct_values);
+ } else {
+ // TODO(jianqiao): implement estimateNumDistinctValues() for expressions.
+ estimated_num_groups *= 64uL;
+ }
+ }
+ estimated_num_groups = std::max(
+ std::min(estimated_num_groups, estimated_child_cardinality / 10),
+ max_attr_num_distinct_values);
+ return estimated_num_groups;
+}
+
+
+std::size_t StarSchemaSimpleCostModel::estimateNumDistinctValues(
+ const expressions::ExprId attribute_id,
+ const physical::PhysicalPtr &physical_plan) {
+ DCHECK(E::ContainsExprId(physical_plan->getOutputAttributes(), attribute_id));
+
+ P::TableReferencePtr table_reference;
+ if (P::SomeTableReference::MatchesWithConditionalCast(physical_plan, &table_reference)) {
+ return getNumDistinctValues(attribute_id, table_reference);
+ }
+
+ double filter_selectivity = estimateSelectivityForFilterPredicate(physical_plan);
+ switch (physical_plan->getPhysicalType()) {
+ case P::PhysicalType::kSelection: // Fall through
+ case P::PhysicalType::kAggregate: {
+ const P::PhysicalPtr &child = physical_plan->children()[0];
+ if (E::ContainsExprId(child->getOutputAttributes(), attribute_id)) {
+ std::size_t child_num_distinct_values =
+ estimateNumDistinctValues(attribute_id, child);
+ return static_cast<std::size_t>(
+ child_num_distinct_values * filter_selectivity + 0.5);
+ }
+ break;
+ }
+ case P::PhysicalType::kHashJoin: {
+ const P::HashJoinPtr &hash_join =
+ std::static_pointer_cast<const P::HashJoin>(physical_plan);
+ if (E::ContainsExprId(hash_join->left()->getOutputAttributes(), attribute_id)) {
+ std::size_t left_child_num_distinct_values =
+ estimateNumDistinctValues(attribute_id, hash_join->left());
+ double right_child_selectivity =
+ estimateSelectivity(hash_join->right());
+ return static_cast<std::size_t>(
+ left_child_num_distinct_values * right_child_selectivity * filter_selectivity + 0.5);
+ }
+ if (E::ContainsExprId(hash_join->right()->getOutputAttributes(), attribute_id)) {
+ std::size_t right_child_num_distinct_values =
+ estimateNumDistinctValues(attribute_id, hash_join->right());
+ double left_child_selectivity =
+ estimateSelectivity(hash_join->left());
+ return static_cast<std::size_t>(
+ right_child_num_distinct_values * left_child_selectivity * filter_selectivity + 0.5);
+ }
+ }
+ default:
+ break;
+ }
+
+ return 16uL;
+}
+
double StarSchemaSimpleCostModel::estimateSelectivity(
const physical::PhysicalPtr &physical_plan) {
switch (physical_plan->getPhysicalType()) {
case P::PhysicalType::kSelection: {
- return estimateSelectivityForSelection(
- std::static_pointer_cast<const P::Selection>(physical_plan));
+ const P::SelectionPtr &selection =
+ std::static_pointer_cast<const P::Selection>(physical_plan);
+ double filter_selectivity =
+ estimateSelectivityForPredicate(selection->filter_predicate(), selection);
+ double child_selectivity = estimateSelectivity(selection->input());
+ return filter_selectivity * child_selectivity;
}
case P::PhysicalType::kHashJoin: {
const P::HashJoinPtr &hash_join =
std::static_pointer_cast<const P::HashJoin>(physical_plan);
- return std::min(estimateSelectivity(hash_join->left()),
- estimateSelectivity(hash_join->right()));
+ double filter_selectivity =
+ estimateSelectivityForPredicate(hash_join->residual_predicate(), hash_join);
+ double child_selectivity =
+ estimateSelectivity(hash_join->left()) * estimateSelectivity(hash_join->right());
+ return filter_selectivity * child_selectivity;
}
case P::PhysicalType::kNestedLoopsJoin: {
const P::NestedLoopsJoinPtr &nested_loop_join =
std::static_pointer_cast<const P::NestedLoopsJoin>(physical_plan);
- return std::min(estimateSelectivity(nested_loop_join->left()),
- estimateSelectivity(nested_loop_join->right()));
+ double filter_selectivity =
+ estimateSelectivityForPredicate(nested_loop_join->join_predicate(), nested_loop_join);
+ double child_selectivity = std::min(
+ estimateSelectivity(nested_loop_join->left()),
+ estimateSelectivity(nested_loop_join->right()));
+ return filter_selectivity * child_selectivity;
}
case P::PhysicalType::kSharedSubplanReference: {
const P::SharedSubplanReferencePtr shared_subplan_reference =
@@ -177,36 +277,51 @@ double StarSchemaSimpleCostModel::estimateSelectivity(
shared_subplans_[shared_subplan_reference->subplan_id()]);
}
default:
- return 1.0;
+ break;
}
+
+ if (physical_plan->getNumChildren() == 1) {
+ return estimateSelectivity(physical_plan->children()[0]);
+ }
+
+ return 1.0;
}
-double StarSchemaSimpleCostModel::estimateSelectivityForSelection(
- const physical::SelectionPtr &physical_plan) {
- const E::PredicatePtr &filter_predicate = physical_plan->filter_predicate();
-
- // If the subplan is a table reference, gather the number of distinct values
- // statistics for each column (attribute).
- std::unordered_map<E::ExprId, std::size_t> num_distinct_values_map;
- if (physical_plan->input()->getPhysicalType() == P::PhysicalType::kTableReference) {
- const P::TableReferencePtr &table_reference =
- std::static_pointer_cast<const P::TableReference>(physical_plan->input());
- const CatalogRelation &relation = *table_reference->relation();
- const std::vector<E::AttributeReferencePtr> &attributes = table_reference->attribute_list();
- for (std::size_t i = 0; i < attributes.size(); ++i) {
- std::size_t num_distinct_values = relation.getStatistics().getNumDistinctValues(i);
- if (num_distinct_values > 0) {
- num_distinct_values_map[attributes[i]->id()] = num_distinct_values;
- }
- }
+double StarSchemaSimpleCostModel::estimateSelectivityForFilterPredicate(
+ const physical::PhysicalPtr &physical_plan) {
+ E::PredicatePtr filter_predicate = nullptr;
+ switch (physical_plan->getPhysicalType()) {
+ case P::PhysicalType::kSelection:
+ filter_predicate =
+ std::static_pointer_cast<const P::Selection>(physical_plan)->filter_predicate();
+ break;
+ case P::PhysicalType::kAggregate:
+ filter_predicate =
+ std::static_pointer_cast<const P::Aggregate>(physical_plan)->filter_predicate();
+ break;
+ case P::PhysicalType::kHashJoin:
+ filter_predicate =
+ std::static_pointer_cast<const P::HashJoin>(physical_plan)->residual_predicate();
+ break;
+ case P::PhysicalType::kNestedLoopsJoin:
+ filter_predicate =
+ std::static_pointer_cast<const P::NestedLoopsJoin>(physical_plan)->join_predicate();
+ break;
+ default:
+ break;
}
- return estimateSelectivityForPredicate(num_distinct_values_map, filter_predicate);
+ if (filter_predicate == nullptr) {
+ return 1.0;
+ } else {
+ return estimateSelectivityForPredicate(filter_predicate, physical_plan);
+ }
}
+
double StarSchemaSimpleCostModel::estimateSelectivityForPredicate(
- const std::unordered_map<expressions::ExprId, std::size_t> &num_distinct_values_map,
- const expressions::PredicatePtr &filter_predicate) {
+ const expressions::PredicatePtr &filter_predicate,
+ const P::PhysicalPtr &physical_plan) {
if (filter_predicate == nullptr) {
return 1.0;
}
@@ -215,10 +330,8 @@ double StarSchemaSimpleCostModel::estimateSelectivityForPredicate(
case E::ExpressionType::kComparisonExpression: {
// Case 1 - Number of distinct values statistics available
// Case 1.1 - Equality comparison: 1.0 / num_distinct_values
- // Case 1.2 - Otherwise: 5.0 / num_distinct_values
- // Case 2 - Number of distinct values statistics not available
- // Case 2.1 - Equality comparison: 0.1
- // Case 2.2 - Otherwise: 0.5
+ // Case 1.2 - Otherwise: 0.1
+ // Case 2 - Otherwise: 0.5
const E::ComparisonExpressionPtr &comparison_expression =
std::static_pointer_cast<const E::ComparisonExpression>(filter_predicate);
E::AttributeReferencePtr attr;
@@ -226,25 +339,26 @@ double StarSchemaSimpleCostModel::estimateSelectivityForPredicate(
E::SomeScalarLiteral::Matches(comparison_expression->right())) ||
(E::SomeAttributeReference::MatchesWithConditionalCast(comparison_expression->right(), &attr) &&
E::SomeScalarLiteral::Matches(comparison_expression->left()))) {
- const auto it = num_distinct_values_map.find(attr->id());
- if (it != num_distinct_values_map.end() && it->second > 0) {
- double unit_selectivity = 1.0 / it->second;
- return comparison_expression->isEqualityComparisonPredicate()
- ? unit_selectivity
- : std::min(0.5, unit_selectivity * 5.0);
+ for (const auto &child : physical_plan->children()) {
+ if (E::ContainsExprId(child->getOutputAttributes(), attr->id())) {
+ const std::size_t child_num_distinct_values = estimateNumDistinctValues(attr->id(), child);
+ if (comparison_expression->isEqualityComparisonPredicate()) {
+ return 1.0 / child_num_distinct_values;
+ } else {
+ return 1.0 / std::max(std::min(child_num_distinct_values / 100.0, 10.0), 2.0);
+ }
+ }
}
+ return 0.1;
}
-
- return comparison_expression->isEqualityComparisonPredicate() ? 0.1 : 0.5;
+ return 0.5;
}
case E::ExpressionType::kLogicalAnd: {
const E::LogicalAndPtr &logical_and =
std::static_pointer_cast<const E::LogicalAnd>(filter_predicate);
double selectivity = 1.0;
for (const auto &predicate : logical_and->operands()) {
- selectivity =
- std::min(selectivity,
- estimateSelectivityForPredicate(num_distinct_values_map, predicate));
+ selectivity = selectivity * estimateSelectivityForPredicate(predicate, physical_plan);
}
return selectivity;
}
@@ -253,7 +367,7 @@ double StarSchemaSimpleCostModel::estimateSelectivityForPredicate(
std::static_pointer_cast<const E::LogicalOr>(filter_predicate);
double selectivity = 0;
for (const auto &predicate : logical_or->operands()) {
- selectivity += estimateSelectivityForPredicate(num_distinct_values_map, predicate);
+ selectivity += estimateSelectivityForPredicate(predicate, physical_plan);
}
return std::min(selectivity, 1.0);
}
@@ -263,6 +377,23 @@ double StarSchemaSimpleCostModel::estimateSelectivityForPredicate(
return 1.0;
}
+std::size_t StarSchemaSimpleCostModel::getNumDistinctValues(
+ const E::ExprId attribute_id,
+ const P::TableReferencePtr &table_reference) {
+ const CatalogRelation &relation = *table_reference->relation();
+ const std::vector<E::AttributeReferencePtr> &attributes = table_reference->attribute_list();
+ for (std::size_t i = 0; i < attributes.size(); ++i) {
+ if (attributes[i]->id() == attribute_id) {
+ std::size_t num_distinct_values = relation.getStatistics().getNumDistinctValues(i);
+ if (num_distinct_values > 0) {
+ return num_distinct_values;
+ }
+ break;
+ }
+ }
+ return estimateCardinalityForTableReference(table_reference);
+}
+
} // namespace cost
} // namespace optimizer
} // namespace quickstep
http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/ad1f8c40/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp
----------------------------------------------------------------------
diff --git a/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp b/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp
index 4314b92..6f6aa29 100644
--- a/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp
+++ b/query_optimizer/cost_model/StarSchemaSimpleCostModel.hpp
@@ -21,7 +21,6 @@
#define QUERY_OPTIMIZER_COST_MODEL_STAR_SCHEMA_SIMPLE_COST_MODEL_HPP_
#include <cstddef>
-#include <unordered_map>
#include <vector>
#include "query_optimizer/cost_model/CostModel.hpp"
@@ -32,6 +31,7 @@
#include "query_optimizer/physical/HashJoin.hpp"
#include "query_optimizer/physical/Physical.hpp"
#include "query_optimizer/physical/Selection.hpp"
+#include "query_optimizer/physical/Sort.hpp"
#include "query_optimizer/physical/TableGenerator.hpp"
#include "query_optimizer/physical/TableReference.hpp"
#include "query_optimizer/physical/TopLevelPlan.hpp"
@@ -67,6 +67,25 @@ class StarSchemaSimpleCostModel : public CostModel {
const physical::PhysicalPtr &physical_plan) override;
/**
+ * @brief Estimate the number of groups in an aggregation.
+ *
+ * @param aggregate The physical plan of the aggregation.
+ * @return The estimated number of groups.
+ */
+ std::size_t estimateNumGroupsForAggregate(
+ const physical::AggregatePtr &aggregate) override;
+
+ /**
+ * @brief Estimate the number of distinct values of an attribute in a relation.
+ *
+ * @param attribute_id The expression id of the target attribute.
+ * @param physical_plan The physical plan of the attribute's relation.
+ * @return The estimated number of distinct values for the attribute.
+ */
+ std::size_t estimateNumDistinctValues(const expressions::ExprId attribute_id,
+ const physical::PhysicalPtr &physical_plan);
+
+ /**
* @brief Estimate the "selectivity" of a physical plan under the assumption
* that it acts as a filtered dimension table in a hash join.
*
@@ -75,40 +94,57 @@ class StarSchemaSimpleCostModel : public CostModel {
*/
double estimateSelectivity(const physical::PhysicalPtr &physical_plan);
+ /**
+ * @brief Estimate the filter predicate's selectivity if it is present in
+ * the input plan's root node.
+ *
+ * @param physical_plan The input physical plan.
+ * @return The estimated selectivity of the filter predicate if physical_plan
+ * has such a filter predicate; 1.0 otherwise.
+ */
+ double estimateSelectivityForFilterPredicate(
+ const physical::PhysicalPtr &physical_plan);
+
private:
- std::size_t estimateCardinalityForTopLevelPlan(
- const physical::TopLevelPlanPtr &physical_plan);
+ std::size_t estimateCardinalityForAggregate(
+ const physical::AggregatePtr &physical_plan);
- std::size_t estimateCardinalityForTableReference(
- const physical::TableReferencePtr &physical_plan);
+ std::size_t estimateCardinalityForHashJoin(
+ const physical::HashJoinPtr &physical_plan);
+
+ std::size_t estimateCardinalityForNestedLoopsJoin(
+ const physical::NestedLoopsJoinPtr &physical_plan);
std::size_t estimateCardinalityForSelection(
const physical::SelectionPtr &physical_plan);
+ std::size_t estimateCardinalityForSort(
+ const physical::SortPtr &physical_plan);
+
std::size_t estimateCardinalityForTableGenerator(
const physical::TableGeneratorPtr &physical_plan);
- std::size_t estimateCardinalityForHashJoin(
- const physical::HashJoinPtr &physical_plan);
-
- std::size_t estimateCardinalityForNestedLoopsJoin(
- const physical::NestedLoopsJoinPtr &physical_plan);
+ std::size_t estimateCardinalityForTableReference(
+ const physical::TableReferencePtr &physical_plan);
- std::size_t estimateCardinalityForAggregate(
- const physical::AggregatePtr &physical_plan);
+ std::size_t estimateCardinalityForTopLevelPlan(
+ const physical::TopLevelPlanPtr &physical_plan);
std::size_t estimateCardinalityForWindowAggregate(
const physical::WindowAggregatePtr &physical_plan);
- double estimateSelectivityForSelection(
- const physical::SelectionPtr &physical_plan);
-
double estimateSelectivityForPredicate(
- const std::unordered_map<expressions::ExprId, std::size_t> &num_distinct_values_map,
- const expressions::PredicatePtr &filter_predicate);
+ const expressions::PredicatePtr &filter_predicate,
+ const physical::PhysicalPtr &physical_plan);
const std::vector<physical::PhysicalPtr> &shared_subplans_;
+ // Get the number of distinct values of an attribute in the table reference.
+ // If the stat is not avaiable, simply returns the table's cardinality.
+ std::size_t getNumDistinctValues(const expressions::ExprId attribute_id,
+ const physical::TableReferencePtr &table_reference);
+
+
DISALLOW_COPY_AND_ASSIGN(StarSchemaSimpleCostModel);
};
http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/ad1f8c40/query_optimizer/expressions/ExpressionUtil.hpp
----------------------------------------------------------------------
diff --git a/query_optimizer/expressions/ExpressionUtil.hpp b/query_optimizer/expressions/ExpressionUtil.hpp
index e9a4067..422d5ab 100644
--- a/query_optimizer/expressions/ExpressionUtil.hpp
+++ b/query_optimizer/expressions/ExpressionUtil.hpp
@@ -95,6 +95,25 @@ bool ContainsExpression(
}
/**
+ * @brief Checks whether expr_id equals any expression's id in \p expressions.
+ *
+ * @param expressions A list of expressions to be checked with.
+ * @param expr_id The ExprId that is to be checked if it is in \p expressions.
+ * @return True if \p expr_id is contained by \p expressions.
+ */
+template <class NamedExpressionType>
+bool ContainsExprId(
+ const std::vector<std::shared_ptr<const NamedExpressionType>> &expressions,
+ const ExprId expr_id) {
+ for (const std::shared_ptr<const NamedExpressionType> &expression : expressions) {
+ if (expression->id() == expr_id) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/**
* @brief Checks whether \p left is a subset of \p right.
*
* @param left The left operand of the subset operator (i.e. the one that may be
http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/ad1f8c40/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp
----------------------------------------------------------------------
diff --git a/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp b/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp
index 4d6765c..c1a7bae 100644
--- a/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp
+++ b/query_optimizer/rules/StarSchemaHashJoinOrderOptimization.hpp
@@ -103,7 +103,7 @@ class StarSchemaHashJoinOrderOptimization : public Rule<physical::Physical> {
if (lhs->estimated_selectivity < rhs->estimated_selectivity) {
return !swapped;
- } else if (lhs->estimated_cardinality < 1000u &&
+ } else if (lhs->estimated_cardinality < 100u &&
rhs->estimated_cardinality > 10000u &&
lhs->estimated_selectivity < rhs->estimated_selectivity * 1.5) {
return !swapped;
[2/2] incubator-quickstep git commit: Remove unnecessary code from
resize()
Posted by hb...@apache.org.
Remove unnecessary code from resize()
- In resize, all the values are moved from original hash table are
copied to the new hash table using memcpy().
- Removed the for loop where values are constructed using 'new' operator, as
all the values are trivially constructible in the specialized
aggregation hash table implementation.
Project: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/commit/2e023333
Tree: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/tree/2e023333
Diff: http://git-wip-us.apache.org/repos/asf/incubator-quickstep/diff/2e023333
Branch: refs/heads/fix-fasthash-resize
Commit: 2e0233333eb41d8f21f9e3f19029d08be8a01fdc
Parents: ad1f8c4
Author: Harshad Deshmukh <hb...@apache.org>
Authored: Wed Oct 5 16:05:05 2016 -0500
Committer: Harshad Deshmukh <hb...@apache.org>
Committed: Thu Oct 6 14:21:07 2016 -0500
----------------------------------------------------------------------
storage/FastSeparateChainingHashTable.hpp | 22 +++-------------------
1 file changed, 3 insertions(+), 19 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-quickstep/blob/2e023333/storage/FastSeparateChainingHashTable.hpp
----------------------------------------------------------------------
diff --git a/storage/FastSeparateChainingHashTable.hpp b/storage/FastSeparateChainingHashTable.hpp
index 886a8ca..a41535c 100644
--- a/storage/FastSeparateChainingHashTable.hpp
+++ b/storage/FastSeparateChainingHashTable.hpp
@@ -1450,27 +1450,11 @@ void FastSeparateChainingHashTable<
// d. Relative pointers are not used with resizable hash tables.
// 4. If values are not trivially copyable, then we invoke ValueT's copy
// or move constructor with placement new.
+ // NOTE(harshad) - Regarding point 4 above, as this is a specialized
+ // hash table implemented for aggregation, the values are trivially copyable,
+ // therefore we don't need to invoke payload values' copy/move constructors.
std::memcpy(resized_buckets, buckets_, original_buckets_used * bucket_size_);
- // TODO(chasseur): std::is_trivially_copyable is not yet implemented in
- // GCC 4.8.3, so we assume we need to invoke ValueT's copy or move
- // constructor, even though the plain memcpy above could suffice for many
- // possible ValueTs.
- void *current_value_original = static_cast<char *>(buckets_) + kValueOffset;
- void *current_value_resized =
- static_cast<char *>(resized_buckets) + kValueOffset;
- for (std::size_t bucket_num = 0; bucket_num < original_buckets_used;
- ++bucket_num) {
- // Use a move constructor if available to avoid a deep-copy, since resizes
- // always succeed.
- new (current_value_resized) std::uint8_t(
- std::move(*static_cast<std::uint8_t *>(current_value_original)));
- current_value_original =
- static_cast<char *>(current_value_original) + bucket_size_;
- current_value_resized =
- static_cast<char *>(current_value_resized) + bucket_size_;
- }
-
// Copy over variable-length key components, if any.
if (original_variable_storage_used > 0) {
DEBUG_ASSERT(original_variable_storage_used ==