You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2019/09/04 17:57:11 UTC
[impala] 02/02: IMPALA-8913: Add query option to disable hbase row
estimation
This is an automated email from the ASF dual-hosted git repository.
joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit f36ea767c8eac608d37dfc954e426a7da2a07251
Author: stiga-huang <hu...@gmail.com>
AuthorDate: Sat Aug 31 20:09:44 2019 -0700
IMPALA-8913: Add query option to disable hbase row estimation
IMPALA-8058 added a flag, disable_hbase_row_est, in the QueryCtx for
tests to bypass the codes of estimating HBase row stats from HBase RPCs
and fall back to HMS row count stats. This patch exposes it as a query
option.
Tests:
- Replace the usage of disable_hbase_row_est in QueryCtx by the new
query option in tests. Verified relative tests pass as before.
Change-Id: I768ea1f560c5faab74d8772bc8aa9ddefdcf2e10
Reviewed-on: http://gerrit.cloudera.org:8080/14169
Reviewed-by: Quanlong Huang <hu...@gmail.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
be/src/service/query-options.cc | 4 ++++
be/src/service/query-options.h | 6 ++++--
common/thrift/ImpalaInternalService.thrift | 5 ++++-
common/thrift/ImpalaService.thrift | 4 ++++
.../java/org/apache/impala/planner/HBaseScanNode.java | 18 ++++++++++++++----
.../java/org/apache/impala/planner/PlannerTest.java | 5 +++--
.../org/apache/impala/planner/PlannerTestBase.java | 7 -------
7 files changed, 33 insertions(+), 16 deletions(-)
diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc
index 9b6b91d..ea73b80 100644
--- a/be/src/service/query-options.cc
+++ b/be/src/service/query-options.cc
@@ -861,6 +861,10 @@ Status impala::SetQueryOption(const string& key, const string& value,
query_options->__set_disable_data_cache(IsTrue(value));
break;
}
+ case TImpalaQueryOptions::DISABLE_HBASE_NUM_ROWS_ESTIMATE: {
+ query_options->__set_disable_hbase_num_rows_estimate(IsTrue(value));
+ break;
+ }
default:
if (IsRemovedQueryOption(key)) {
LOG(WARNING) << "Ignoring attempt to set removed query option '" << key << "'";
diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h
index 72c9f81..ce509a7 100644
--- a/be/src/service/query-options.h
+++ b/be/src/service/query-options.h
@@ -47,7 +47,7 @@ typedef std::unordered_map<string, beeswax::TQueryOptionLevel::type>
// time we add or remove a query option to/from the enum TImpalaQueryOptions.
#define QUERY_OPTS_TABLE\
DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(),\
- TImpalaQueryOptions::MAX_SPILLED_RESULT_SPOOLING_MEM + 1);\
+ TImpalaQueryOptions::DISABLE_HBASE_NUM_ROWS_ESTIMATE + 1);\
REMOVED_QUERY_OPT_FN(abort_on_default_limit_exceeded, ABORT_ON_DEFAULT_LIMIT_EXCEEDED)\
QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR, TQueryOptionLevel::REGULAR)\
REMOVED_QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS)\
@@ -184,7 +184,9 @@ typedef std::unordered_map<string, beeswax::TQueryOptionLevel::type>
QUERY_OPT_FN(max_result_spooling_mem, MAX_RESULT_SPOOLING_MEM,\
TQueryOptionLevel::DEVELOPMENT)\
QUERY_OPT_FN(max_spilled_result_spooling_mem, MAX_SPILLED_RESULT_SPOOLING_MEM,\
- TQueryOptionLevel::DEVELOPMENT)
+ TQueryOptionLevel::DEVELOPMENT)\
+ QUERY_OPT_FN(disable_hbase_num_rows_estimate, DISABLE_HBASE_NUM_ROWS_ESTIMATE,\
+ TQueryOptionLevel::ADVANCED)
;
/// Enforce practical limits on some query options to avoid undesired query state.
diff --git a/common/thrift/ImpalaInternalService.thrift b/common/thrift/ImpalaInternalService.thrift
index 3dbc2c8..7231d4f 100644
--- a/common/thrift/ImpalaInternalService.thrift
+++ b/common/thrift/ImpalaInternalService.thrift
@@ -388,6 +388,9 @@ struct TQueryOptions {
// See comment in ImpalaService.thrift
92: optional i64 max_spilled_result_spooling_mem = 1073741824;
+
+ // See comment in ImpalaService.thrift
+ 93: optional bool disable_hbase_num_rows_estimate = false;
}
// Impala currently has two types of sessions: Beeswax and HiveServer2
@@ -540,7 +543,7 @@ struct TQueryCtx {
// When disabled, scan cardinality is estimated from HMS table row count
// stats and key column predicate selectivity. Generally only disabled
// for testing.
- 20: optional bool disable_hbase_row_est = false;
+ 20: optional bool disable_hbase_num_rows_estimate = false;
// Flag to enable tracing of resource usage consumption for all fragment instances of a
// query. Set in ImpalaServer::PrepareQueryContext().
diff --git a/common/thrift/ImpalaService.thrift b/common/thrift/ImpalaService.thrift
index f5c979f..d6feb8f 100644
--- a/common/thrift/ImpalaService.thrift
+++ b/common/thrift/ImpalaService.thrift
@@ -448,6 +448,10 @@ enum TImpalaQueryOptions {
// SPOOL_QUERY_RESULTS is true. Setting this to 0 or -1 means the memory is unbounded.
// Cannot be set to values below -1.
MAX_SPILLED_RESULT_SPOOLING_MEM = 91
+
+ // Disable the normal key sampling of HBase tables in row count and row size estimation.
+ // Set this to true will force the use of HMS table stats.
+ DISABLE_HBASE_NUM_ROWS_ESTIMATE = 92
}
// The summary of a DML statement.
diff --git a/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java b/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java
index 36f7b63..d0b5b1d 100644
--- a/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java
@@ -311,7 +311,7 @@ public class HBaseScanNode extends ScanNode {
}
} else {
Pair<Long, Long> estimate;
- if (analyzer.getQueryCtx().isDisable_hbase_row_est()) {
+ if (analyzer.getQueryOptions().isDisable_hbase_num_rows_estimate()) {
estimate = new Pair<>(-1L, -1L);
} else {
// Set maxCaching so that each fetch from hbase won't return a batch of more than
@@ -324,10 +324,13 @@ public class HBaseScanNode extends ScanNode {
// This works only if HBase stats are available in HMS. This is true
// for the Impala tests, and may be true for some applications.
cardinality_ = tbl.getTTableStats().getNum_rows();
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("Fallback to use table stats in HMS: num_rows=" + cardinality_);
+ }
// TODO: What do do if neither HBase nor HMS provide a row count estimate?
// Is there some third, ulitimate fallback?
// Apply estimated key range selectivity from original key conjuncts
- if (cardinality_ != -1 && keyConjuncts_ != null) {
+ if (cardinality_ > 0 && keyConjuncts_ != null) {
cardinality_ *= computeCombinedSelectivity(keyConjuncts_);
}
} else {
@@ -343,8 +346,15 @@ public class HBaseScanNode extends ScanNode {
}
inputCardinality_ = cardinality_;
- cardinality_ *= computeSelectivity();
- cardinality_ = Math.max(1, cardinality_);
+ if (cardinality_ > 0) {
+ cardinality_ = Math.round(cardinality_ * computeSelectivity());
+ // IMPALA-2165: Avoid setting the cardinality to 0 after rounding.
+ cardinality_ = Math.max(1, cardinality_);
+ } else {
+ // Safe guard for cardinality_ < -1, e.g. when hbase sampling fails and numRows
+ // in HMS is abnormally set to be < -1.
+ cardinality_ = Math.max(-1, cardinality_);
+ }
cardinality_ = capCardinalityAtLimit(cardinality_);
if (LOG.isTraceEnabled()) {
LOG.trace("computeStats HbaseScan: cardinality=" + cardinality_);
diff --git a/fe/src/test/java/org/apache/impala/planner/PlannerTest.java b/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
index 3d7782a..cf6e802 100644
--- a/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
+++ b/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
@@ -191,8 +191,9 @@ public class PlannerTest extends PlannerTestBase {
*/
@Test
public void testHbaseNoKeyEstimate() {
- runPlannerTestFile("hbase-no-key-est",
- ImmutableSet.of(PlannerTestOption.DISABLE_HBASE_KEY_ESTIMATE));
+ TQueryOptions options = defaultQueryOptions();
+ options.setDisable_hbase_num_rows_estimate(true);
+ runPlannerTestFile("hbase-no-key-est", options);
}
@Test
diff --git a/fe/src/test/java/org/apache/impala/planner/PlannerTestBase.java b/fe/src/test/java/org/apache/impala/planner/PlannerTestBase.java
index c550771..2dc9f81 100644
--- a/fe/src/test/java/org/apache/impala/planner/PlannerTestBase.java
+++ b/fe/src/test/java/org/apache/impala/planner/PlannerTestBase.java
@@ -419,8 +419,6 @@ public class PlannerTestBase extends FrontendTestBase {
TQueryCtx queryCtx = TestUtils.createQueryContext(
dbName, System.getProperty("user.name"));
queryCtx.client_request.query_options = testCase.getOptions();
- queryCtx.setDisable_hbase_row_est(
- testOptions.contains(PlannerTestOption.DISABLE_HBASE_KEY_ESTIMATE));
// Test single node plan, scan range locations, and column lineage.
TExecRequest singleNodeExecRequest = testPlan(testCase, Section.PLAN, queryCtx.deepCopy(),
testOptions, errorLog, actualOutput);
@@ -826,11 +824,6 @@ public class PlannerTestBase extends FrontendTestBase {
// on for test that validate cardinality calculations: joins, scan
// cardinality, etc.
VALIDATE_CARDINALITY,
- // If set, disables the normal HBase key estimate scan in favor of using
- // HMS table stats and key predicate selectivity. Enable this to test
- // the case when HBase key stats are unavailable (such as due to overly
- // restrictive key predicates).
- DISABLE_HBASE_KEY_ESTIMATE,
// Validate the filesystem schemes shown in the scan node plan. An example of a scan
// node profile that contains fs specific information is:
//