You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2019/09/04 17:57:11 UTC

[impala] 02/02: IMPALA-8913: Add query option to disable hbase row estimation

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit f36ea767c8eac608d37dfc954e426a7da2a07251
Author: stiga-huang <hu...@gmail.com>
AuthorDate: Sat Aug 31 20:09:44 2019 -0700

    IMPALA-8913: Add query option to disable hbase row estimation
    
    IMPALA-8058 added a flag, disable_hbase_row_est, in the QueryCtx for
    tests to bypass the codes of estimating HBase row stats from HBase RPCs
    and fall back to HMS row count stats. This patch exposes it as a query
    option.
    
    Tests:
     - Replace the usage of disable_hbase_row_est in QueryCtx by the new
     query option in tests. Verified relative tests pass as before.
    
    Change-Id: I768ea1f560c5faab74d8772bc8aa9ddefdcf2e10
    Reviewed-on: http://gerrit.cloudera.org:8080/14169
    Reviewed-by: Quanlong Huang <hu...@gmail.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/service/query-options.cc                        |  4 ++++
 be/src/service/query-options.h                         |  6 ++++--
 common/thrift/ImpalaInternalService.thrift             |  5 ++++-
 common/thrift/ImpalaService.thrift                     |  4 ++++
 .../java/org/apache/impala/planner/HBaseScanNode.java  | 18 ++++++++++++++----
 .../java/org/apache/impala/planner/PlannerTest.java    |  5 +++--
 .../org/apache/impala/planner/PlannerTestBase.java     |  7 -------
 7 files changed, 33 insertions(+), 16 deletions(-)

diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc
index 9b6b91d..ea73b80 100644
--- a/be/src/service/query-options.cc
+++ b/be/src/service/query-options.cc
@@ -861,6 +861,10 @@ Status impala::SetQueryOption(const string& key, const string& value,
         query_options->__set_disable_data_cache(IsTrue(value));
         break;
       }
+      case TImpalaQueryOptions::DISABLE_HBASE_NUM_ROWS_ESTIMATE: {
+        query_options->__set_disable_hbase_num_rows_estimate(IsTrue(value));
+        break;
+      }
       default:
         if (IsRemovedQueryOption(key)) {
           LOG(WARNING) << "Ignoring attempt to set removed query option '" << key << "'";
diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h
index 72c9f81..ce509a7 100644
--- a/be/src/service/query-options.h
+++ b/be/src/service/query-options.h
@@ -47,7 +47,7 @@ typedef std::unordered_map<string, beeswax::TQueryOptionLevel::type>
 // time we add or remove a query option to/from the enum TImpalaQueryOptions.
 #define QUERY_OPTS_TABLE\
   DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(),\
-      TImpalaQueryOptions::MAX_SPILLED_RESULT_SPOOLING_MEM + 1);\
+      TImpalaQueryOptions::DISABLE_HBASE_NUM_ROWS_ESTIMATE + 1);\
   REMOVED_QUERY_OPT_FN(abort_on_default_limit_exceeded, ABORT_ON_DEFAULT_LIMIT_EXCEEDED)\
   QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR, TQueryOptionLevel::REGULAR)\
   REMOVED_QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS)\
@@ -184,7 +184,9 @@ typedef std::unordered_map<string, beeswax::TQueryOptionLevel::type>
   QUERY_OPT_FN(max_result_spooling_mem, MAX_RESULT_SPOOLING_MEM,\
       TQueryOptionLevel::DEVELOPMENT)\
   QUERY_OPT_FN(max_spilled_result_spooling_mem, MAX_SPILLED_RESULT_SPOOLING_MEM,\
-      TQueryOptionLevel::DEVELOPMENT)
+      TQueryOptionLevel::DEVELOPMENT)\
+  QUERY_OPT_FN(disable_hbase_num_rows_estimate, DISABLE_HBASE_NUM_ROWS_ESTIMATE,\
+      TQueryOptionLevel::ADVANCED)
   ;
 
 /// Enforce practical limits on some query options to avoid undesired query state.
diff --git a/common/thrift/ImpalaInternalService.thrift b/common/thrift/ImpalaInternalService.thrift
index 3dbc2c8..7231d4f 100644
--- a/common/thrift/ImpalaInternalService.thrift
+++ b/common/thrift/ImpalaInternalService.thrift
@@ -388,6 +388,9 @@ struct TQueryOptions {
 
   // See comment in ImpalaService.thrift
   92: optional i64 max_spilled_result_spooling_mem = 1073741824;
+
+  // See comment in ImpalaService.thrift
+  93: optional bool disable_hbase_num_rows_estimate = false;
 }
 
 // Impala currently has two types of sessions: Beeswax and HiveServer2
@@ -540,7 +543,7 @@ struct TQueryCtx {
   // When disabled, scan cardinality is estimated from HMS table row count
   // stats and key column predicate selectivity. Generally only disabled
   // for testing.
-  20: optional bool disable_hbase_row_est = false;
+  20: optional bool disable_hbase_num_rows_estimate = false;
 
   // Flag to enable tracing of resource usage consumption for all fragment instances of a
   // query. Set in ImpalaServer::PrepareQueryContext().
diff --git a/common/thrift/ImpalaService.thrift b/common/thrift/ImpalaService.thrift
index f5c979f..d6feb8f 100644
--- a/common/thrift/ImpalaService.thrift
+++ b/common/thrift/ImpalaService.thrift
@@ -448,6 +448,10 @@ enum TImpalaQueryOptions {
   // SPOOL_QUERY_RESULTS is true. Setting this to 0 or -1 means the memory is unbounded.
   // Cannot be set to values below -1.
   MAX_SPILLED_RESULT_SPOOLING_MEM = 91
+
+  // Disable the normal key sampling of HBase tables in row count and row size estimation.
+  // Set this to true will force the use of HMS table stats.
+  DISABLE_HBASE_NUM_ROWS_ESTIMATE = 92
 }
 
 // The summary of a DML statement.
diff --git a/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java b/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java
index 36f7b63..d0b5b1d 100644
--- a/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HBaseScanNode.java
@@ -311,7 +311,7 @@ public class HBaseScanNode extends ScanNode {
       }
     } else {
       Pair<Long, Long> estimate;
-      if (analyzer.getQueryCtx().isDisable_hbase_row_est()) {
+      if (analyzer.getQueryOptions().isDisable_hbase_num_rows_estimate()) {
         estimate = new Pair<>(-1L, -1L);
       } else {
         // Set maxCaching so that each fetch from hbase won't return a batch of more than
@@ -324,10 +324,13 @@ public class HBaseScanNode extends ScanNode {
         // This works only if HBase stats are available in HMS. This is true
         // for the Impala tests, and may be true for some applications.
         cardinality_ = tbl.getTTableStats().getNum_rows();
+        if (LOG.isTraceEnabled()) {
+          LOG.trace("Fallback to use table stats in HMS: num_rows=" + cardinality_);
+        }
         // TODO: What do do if neither HBase nor HMS provide a row count estimate?
         // Is there some third, ulitimate fallback?
         // Apply estimated key range selectivity from original key conjuncts
-        if (cardinality_ != -1 && keyConjuncts_ != null) {
+        if (cardinality_ > 0 && keyConjuncts_ != null) {
           cardinality_ *= computeCombinedSelectivity(keyConjuncts_);
         }
       } else {
@@ -343,8 +346,15 @@ public class HBaseScanNode extends ScanNode {
     }
     inputCardinality_ = cardinality_;
 
-    cardinality_ *= computeSelectivity();
-    cardinality_ = Math.max(1, cardinality_);
+    if (cardinality_ > 0) {
+      cardinality_ = Math.round(cardinality_ * computeSelectivity());
+      // IMPALA-2165: Avoid setting the cardinality to 0 after rounding.
+      cardinality_ = Math.max(1, cardinality_);
+    } else {
+      // Safe guard for cardinality_ < -1, e.g. when hbase sampling fails and numRows
+      // in HMS is abnormally set to be < -1.
+      cardinality_ = Math.max(-1, cardinality_);
+    }
     cardinality_ = capCardinalityAtLimit(cardinality_);
     if (LOG.isTraceEnabled()) {
       LOG.trace("computeStats HbaseScan: cardinality=" + cardinality_);
diff --git a/fe/src/test/java/org/apache/impala/planner/PlannerTest.java b/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
index 3d7782a..cf6e802 100644
--- a/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
+++ b/fe/src/test/java/org/apache/impala/planner/PlannerTest.java
@@ -191,8 +191,9 @@ public class PlannerTest extends PlannerTestBase {
    */
   @Test
   public void testHbaseNoKeyEstimate() {
-    runPlannerTestFile("hbase-no-key-est",
-        ImmutableSet.of(PlannerTestOption.DISABLE_HBASE_KEY_ESTIMATE));
+    TQueryOptions options = defaultQueryOptions();
+    options.setDisable_hbase_num_rows_estimate(true);
+    runPlannerTestFile("hbase-no-key-est", options);
   }
 
   @Test
diff --git a/fe/src/test/java/org/apache/impala/planner/PlannerTestBase.java b/fe/src/test/java/org/apache/impala/planner/PlannerTestBase.java
index c550771..2dc9f81 100644
--- a/fe/src/test/java/org/apache/impala/planner/PlannerTestBase.java
+++ b/fe/src/test/java/org/apache/impala/planner/PlannerTestBase.java
@@ -419,8 +419,6 @@ public class PlannerTestBase extends FrontendTestBase {
     TQueryCtx queryCtx = TestUtils.createQueryContext(
         dbName, System.getProperty("user.name"));
     queryCtx.client_request.query_options = testCase.getOptions();
-    queryCtx.setDisable_hbase_row_est(
-        testOptions.contains(PlannerTestOption.DISABLE_HBASE_KEY_ESTIMATE));
     // Test single node plan, scan range locations, and column lineage.
     TExecRequest singleNodeExecRequest = testPlan(testCase, Section.PLAN, queryCtx.deepCopy(),
         testOptions, errorLog, actualOutput);
@@ -826,11 +824,6 @@ public class PlannerTestBase extends FrontendTestBase {
     // on for test that validate cardinality calculations: joins, scan
     // cardinality, etc.
     VALIDATE_CARDINALITY,
-    // If set, disables the normal HBase key estimate scan in favor of using
-    // HMS table stats and key predicate selectivity. Enable this to test
-    // the case when HBase key stats are unavailable (such as due to overly
-    // restrictive key predicates).
-    DISABLE_HBASE_KEY_ESTIMATE,
     // Validate the filesystem schemes shown in the scan node plan. An example of a scan
     // node profile that contains fs specific information is:
     //