Posted to commits@impala.apache.org by lv...@apache.org on 2018/02/04 18:46:26 UTC

[1/6] impala git commit: IMPALA-6113: Skip row groups with predicates on NULL columns

Repository: impala
Updated Branches:
  refs/heads/master a018038df -> c856b30e3


IMPALA-6113: Skip row groups with predicates on NULL columns

Based on the existing Parquet column-chunk-level null_count statistics,
Impala's Parquet scanner is enhanced to skip an entire row group when the
statistics indicate that every value in the predicated column is NULL,
since such a row group cannot produce any result rows anyway.
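
For illustration only (not part of the commit; the table and column names
below are hypothetical and mirror the regression test added further down):

  -- All values of 'j' in the single row group are NULL, so null_count
  -- equals num_values for that column chunk and the scanner skips the
  -- row group from metadata alone; the query returns 0 without reading
  -- any data pages.
  create table all_null_j (i int, j int) stored as parquet;
  insert into all_null_j values (1, NULL), (2, NULL), (3, NULL);
  select count(*) from all_null_j where j < 3;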

Change-Id: I141317af0e0df30da8f220b29b0bfba364f40ddf
Reviewed-on: http://gerrit.cloudera.org:8080/9140
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/097f2f3f
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/097f2f3f
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/097f2f3f

Branch: refs/heads/master
Commit: 097f2f3f3b25a0f148bb8f5465debe7d690b07fe
Parents: a018038
Author: Gabor Kaszab <ga...@cloudera.com>
Authored: Wed Jan 24 17:01:34 2018 +0100
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Sat Feb 3 03:24:37 2018 +0000

----------------------------------------------------------------------
 be/src/exec/hdfs-parquet-scanner.cc             |  7 +++++
 be/src/exec/parquet-column-stats.cc             | 13 ++++++++
 be/src/exec/parquet-column-stats.h              |  6 ++++
 .../queries/QueryTest/parquet-stats.test        | 33 ++++++++++++++++++++
 4 files changed, 59 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/097f2f3f/be/src/exec/hdfs-parquet-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc
index c14edd7..7a10f3c 100644
--- a/be/src/exec/hdfs-parquet-scanner.cc
+++ b/be/src/exec/hdfs-parquet-scanner.cc
@@ -584,6 +584,13 @@ Status HdfsParquetScanner::EvaluateStatsConjuncts(
       DCHECK(false) << "Unsupported function name for statistics evaluation: " << fn_name;
     }
 
+    int64_t null_count = 0;
+    bool null_count_result = ColumnStatsBase::ReadNullCountStat(col_chunk, &null_count);
+    if (null_count_result && null_count == col_chunk.meta_data.num_values) {
+      *skip_row_group = true;
+      break;
+    }
+
     if (stats_read) {
       TupleRow row;
       row.SetTuple(0, min_max_tuple_);

http://git-wip-us.apache.org/repos/asf/impala/blob/097f2f3f/be/src/exec/parquet-column-stats.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-column-stats.cc b/be/src/exec/parquet-column-stats.cc
index 4443309..a1d1155 100644
--- a/be/src/exec/parquet-column-stats.cc
+++ b/be/src/exec/parquet-column-stats.cc
@@ -129,6 +129,19 @@ bool ColumnStatsBase::ReadFromThrift(const parquet::ColumnChunk& col_chunk,
   return false;
 }
 
+bool ColumnStatsBase::ReadNullCountStat(const parquet::ColumnChunk& col_chunk,
+    int64_t* null_count) {
+  if (!(col_chunk.__isset.meta_data && col_chunk.meta_data.__isset.statistics)) {
+    return false;
+  }
+  const parquet::Statistics& stats = col_chunk.meta_data.statistics;
+  if (stats.__isset.null_count) {
+    *null_count = stats.null_count;
+    return true;
+  }
+  return false;
+}
+
 Status ColumnStatsBase::CopyToBuffer(StringBuffer* buffer, StringValue* value) {
   if (value->ptr == buffer->buffer()) return Status::OK();
   buffer->Clear();

http://git-wip-us.apache.org/repos/asf/impala/blob/097f2f3f/be/src/exec/parquet-column-stats.h
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-column-stats.h b/be/src/exec/parquet-column-stats.h
index 0ff277c..e9cf801 100644
--- a/be/src/exec/parquet-column-stats.h
+++ b/be/src/exec/parquet-column-stats.h
@@ -73,6 +73,12 @@ class ColumnStatsBase {
       const ColumnType& col_type, const parquet::ColumnOrder* col_order,
       StatsField stats_field, void* slot);
 
+  // Gets the null_count statistics from the given column chunk's metadata and returns
+  // it via an output parameter.
+  // Returns true if the null_count stats were read successfully, false otherwise.
+  static bool ReadNullCountStat(const parquet::ColumnChunk& col_chunk,
+      int64_t* null_count);
+
   /// Merges this statistics object with values from 'other'. If other has not been
   /// initialized, then this object will not be changed.
   virtual void Merge(const ColumnStatsBase& other) = 0;

http://git-wip-us.apache.org/repos/asf/impala/blob/097f2f3f/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
index d03b4c9..70b5f27 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
@@ -458,3 +458,36 @@ select count(*) from functional_parquet.alltypes where id < 0;
 aggregation(SUM, NumRowGroups): 24
 aggregation(SUM, NumStatsFilteredRowGroups): 0
 ====
+---- QUERY
+# Check that all the row groups are skipped using null_count stat
+create table table_for_null_count_test (i int, j int) stored as parquet;
+insert into table_for_null_count_test values (1, NULL), (2, NULL), (3, NULL);
+select count(*) from table_for_null_count_test where j < 3;
+---- RESULTS
+0
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+aggregation(SUM, NumStatsFilteredRowGroups): 1
+====
+---- QUERY
+# Insert another row group where not all the 'j' values are NULL
+insert into table_for_null_count_test values (4, 1), (5, NULL);
+select i from table_for_null_count_test where j < 3;
+---- RESULTS
+4
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 1
+====
+---- QUERY
+# Turning off parquet stats and verifying that no row groups are skipped
+set PARQUET_READ_STATISTICS=0;
+create table table_for_null_count_test2 (i int, j int) stored as parquet;
+insert into table_for_null_count_test2 values (1, NULL), (2, NULL), (3, NULL);
+select count(*) from table_for_null_count_test2 where j < 3;
+---- RESULTS
+0
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+aggregation(SUM, NumStatsFilteredRowGroups): 0
+====


[4/6] impala git commit: IMPALA-5293: Turn insert clustering on by default

Posted by lv...@apache.org.
IMPALA-5293: Turn insert clustering on by default

This change enables clustering by default. IMPALA-2521 introduced the
'clustered' hint, which adds a local sort on the partitioning columns to
the query plan. The hint is only effective for HDFS and Kudu tables.

As before, the 'noclustered' hint prevents clustering. If a table has
ordering columns defined, the 'noclustered' hint is ignored and we issue
a warning.
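
For reference (illustrative only; statements of this form appear in the
planner tests below, using the functional test schema):

  -- Clustering is now the default: a sort on (year, month) is added
  -- automatically before the table sink.
  insert into functional.alltypes partition(year, month)
  select * from functional.alltypes;

  -- The noclustered hint suppresses the added sort node.
  insert into functional.alltypes partition(year, month) /* +noclustered */
  select * from functional.alltypes;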

This change removes some tests that were added specifically to test
that clustering can be enabled using the 'clustered' hint. It changes
some tests to use the 'noclustered' hint to make sure that clustering
can be disabled. It also adds tests to make sure that we cover the
'noclustered' case properly.

Cherry-picks: not for 2.x.

Change-Id: Idbf2368cf4415e6ecfa65058daf6ff87ef62f9d9
Reviewed-on: http://gerrit.cloudera.org:8080/9153
Reviewed-by: Lars Volker <lv...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/fc529b7f
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/fc529b7f
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/fc529b7f

Branch: refs/heads/master
Commit: fc529b7f9fce9d1bc1f6da2952380bad3dea0b11
Parents: a493a01
Author: Lars Volker <lv...@cloudera.com>
Authored: Mon Jan 29 11:29:01 2018 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Sat Feb 3 05:58:50 2018 +0000

----------------------------------------------------------------------
 .../org/apache/impala/analysis/InsertStmt.java  |  14 ++-
 .../java/org/apache/impala/planner/Planner.java |   5 +-
 .../impala/analysis/AnalyzeStmtsTest.java       |   1 +
 .../queries/PlannerTest/constant-folding.test   |   7 +-
 .../queries/PlannerTest/empty.test              |  15 ++-
 .../queries/PlannerTest/insert.test             | 121 ++++++++++++++++---
 .../queries/PlannerTest/kudu.test               |  15 +--
 .../queries/PlannerTest/order.test              |  25 ++++
 .../PlannerTest/resource-requirements.test      |  33 +++--
 .../queries/PlannerTest/with-clause.test        |  47 ++++++-
 .../queries/QueryTest/insert.test               |  28 ++---
 .../queries/QueryTest/stats-extrapolation.test  |   3 +-
 12 files changed, 248 insertions(+), 66 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/fc529b7f/fe/src/main/java/org/apache/impala/analysis/InsertStmt.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/InsertStmt.java b/fe/src/main/java/org/apache/impala/analysis/InsertStmt.java
index 462728a..1d361d6 100644
--- a/fe/src/main/java/org/apache/impala/analysis/InsertStmt.java
+++ b/fe/src/main/java/org/apache/impala/analysis/InsertStmt.java
@@ -141,9 +141,8 @@ public class InsertStmt extends StatementBase {
   // Indicates whether this insert stmt has a clustered or noclustered hint. Only one of
   // them may be true, not both. If clustering is requested, we add a clustering phase
   // before the data sink, so that partitions can be written sequentially. The default
-  // behavior is to not perform an additional clustering step.
-  // TODO: hasClusteredHint_ can be removed once we enable clustering by default
-  // (IMPALA-5293).
+  // behavior is to not perform an additional clustering step. Both are required to detect
+  // conflicting hints.
   private boolean hasClusteredHint_ = false;
   private boolean hasNoClusteredHint_ = false;
 
@@ -877,6 +876,13 @@ public class InsertStmt extends StatementBase {
   public ArrayList<Expr> getPrimaryKeyExprs() { return primaryKeyExprs_; }
   public List<Expr> getSortExprs() { return sortExprs_; }
 
+  // Clustering is enabled by default. If the table has a 'sort.columns' property and the
+  // query has a 'noclustered' hint, we issue a warning during analysis and ignore the
+  // 'noclustered' hint.
+  public boolean requiresClustering() {
+    return !hasNoClusteredHint_ || !sortExprs_.isEmpty();
+  }
+
   public List<String> getMentionedColumns() {
     List<String> result = Lists.newArrayList();
     List<Column> columns = table_.getColumns();
@@ -888,7 +894,7 @@ public class InsertStmt extends StatementBase {
     // analyze() must have been called before.
     Preconditions.checkState(table_ != null);
     return TableSink.create(table_, isUpsert_ ? TableSink.Op.UPSERT : TableSink.Op.INSERT,
-        partitionKeyExprs_, mentionedColumns_, overwrite_, hasClusteredHint_,
+        partitionKeyExprs_, mentionedColumns_, overwrite_, requiresClustering(),
         sortColumns_);
   }
 

http://git-wip-us.apache.org/repos/asf/impala/blob/fc529b7f/fe/src/main/java/org/apache/impala/planner/Planner.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/Planner.java b/fe/src/main/java/org/apache/impala/planner/Planner.java
index 71182f5..fdae919 100644
--- a/fe/src/main/java/org/apache/impala/planner/Planner.java
+++ b/fe/src/main/java/org/apache/impala/planner/Planner.java
@@ -622,10 +622,7 @@ public class Planner {
         orderingExprs.addAll(insertStmt.getPrimaryKeyExprs());
         partialSort = true;
       }
-    } else if (insertStmt.hasClusteredHint() || !insertStmt.getSortExprs().isEmpty()) {
-      // NOTE: If the table has a 'sort.columns' property and the query has a
-      // 'noclustered' hint, we issue a warning during analysis and ignore the
-      // 'noclustered' hint.
+    } else if (insertStmt.requiresClustering()) {
       orderingExprs.addAll(insertStmt.getPartitionKeyExprs());
     }
     orderingExprs.addAll(insertStmt.getSortExprs());

http://git-wip-us.apache.org/repos/asf/impala/blob/fc529b7f/fe/src/test/java/org/apache/impala/analysis/AnalyzeStmtsTest.java
----------------------------------------------------------------------
diff --git a/fe/src/test/java/org/apache/impala/analysis/AnalyzeStmtsTest.java b/fe/src/test/java/org/apache/impala/analysis/AnalyzeStmtsTest.java
index 65639b2..b4c2e43 100644
--- a/fe/src/test/java/org/apache/impala/analysis/AnalyzeStmtsTest.java
+++ b/fe/src/test/java/org/apache/impala/analysis/AnalyzeStmtsTest.java
@@ -1897,6 +1897,7 @@ public class AnalyzeStmtsTest extends AnalyzerTest {
       AnalyzesOk(String.format(
           "insert into table functional.alltypesnopart %sclustered%s " +
           "select * from functional.alltypesnopart", prefix, suffix));
+      // Test that noclustered is accepted.
       AnalyzesOk(String.format(
           "insert into table functional.alltypesnopart %snoclustered%s " +
           "select * from functional.alltypesnopart", prefix, suffix));

http://git-wip-us.apache.org/repos/asf/impala/blob/fc529b7f/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test b/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test
index 37b3163..59980b5 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test
@@ -341,11 +341,16 @@ select id, int_col, cast(1 + 1 + 1 + year as int), cast(month - (1 - 1 - 1) as i
 from functional.alltypessmall
 ---- PLAN
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=32.00MB mem-reservation=0B
+|  Per-Host Resources: mem-estimate=38.00MB mem-reservation=6.00MB
 WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(CAST(3 + year AS INT),CAST(month - -1 AS INT))]
 |  partitions=4
 |  mem-estimate=1.56KB mem-reservation=0B
 |
+01:SORT
+|  order by: CAST(3 + year AS INT) ASC NULLS LAST, CAST(month - -1 AS INT) ASC NULLS LAST
+|  mem-estimate=6.00MB mem-reservation=6.00MB spill-buffer=2.00MB
+|  tuple-ids=1 row-size=16B cardinality=100
+|
 00:SCAN HDFS [functional.alltypessmall]
    partitions=4/4 files=4 size=6.32KB
    stored statistics:

http://git-wip-us.apache.org/repos/asf/impala/blob/fc529b7f/testdata/workloads/functional-planner/queries/PlannerTest/empty.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/empty.test b/testdata/workloads/functional-planner/queries/PlannerTest/empty.test
index c85d8ec..43d1fcf 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/empty.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/empty.test
@@ -302,9 +302,12 @@ PLAN-ROOT SINK
 insert into functional.alltypes partition(year, month)
 select * from functional.alltypes where 1 = 0
 ---- PLAN
-WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(functional.alltypes.year,functional.alltypes.month)]
+WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)]
 |  partitions=24
 |
+01:SORT
+|  order by: year ASC NULLS LAST, month ASC NULLS LAST
+|
 00:EMPTYSET
 ====
 # IMPALA-1860: INSERT/CTAS should evaluate and apply constant predicates.
@@ -312,9 +315,12 @@ with t as (select * from functional.alltypes where coalesce(NULL) > 10)
 insert into functional.alltypes partition(year, month)
 select * from t
 ---- PLAN
-WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(functional.alltypes.year,functional.alltypes.month)]
+WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)]
 |  partitions=24
 |
+01:SORT
+|  order by: year ASC NULLS LAST, month ASC NULLS LAST
+|
 00:EMPTYSET
 ====
 # IMPALA-1860: INSERT/CTAS should evaluate and apply constant predicates.
@@ -499,7 +505,7 @@ PLAN-ROOT SINK
 |  03:EMPTYSET
 |
 00:SCAN HDFS [tpch_nested_parquet.customer c]
-   partitions=1/1 files=4 size=577.87MB
+   partitions=1/1 files=4 size=292.36MB
    predicates: c_custkey < 10
 ====
 # IMPALA-2539: Test empty union operands containing relative table refs.
@@ -533,7 +539,7 @@ PLAN-ROOT SINK
 |  04:UNNEST [c.c_orders o1]
 |
 00:SCAN HDFS [tpch_nested_parquet.customer c]
-   partitions=1/1 files=4 size=577.87MB
+   partitions=1/1 files=4 size=292.36MB
    predicates: c_custkey = 1
 ====
 # IMPALA-2215: Having clause without aggregation.
@@ -550,7 +556,6 @@ PLAN-ROOT SINK
 |
 00:EMPTYSET
 ====
----- QUERY
 # IMPALA-5812: Test that a cross join with a constant select that returns an empty result
 # set translates into an EMPTYSET in the final plan
 select count(*) from functional.alltypes x cross join (select 1 as j) y where j is null

http://git-wip-us.apache.org/repos/asf/impala/blob/fc529b7f/testdata/workloads/functional-planner/queries/PlannerTest/insert.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/insert.test b/testdata/workloads/functional-planner/queries/PlannerTest/insert.test
index 522b058..ea0ec2a 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/insert.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/insert.test
@@ -77,6 +77,38 @@ where year=2009 and month>10
 WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(year,month)]
 |  partitions=24
 |
+01:SORT
+|  order by: year ASC NULLS LAST, month ASC NULLS LAST
+|
+00:SCAN HDFS [functional.alltypes]
+   partitions=2/24 files=2 size=40.07KB
+---- SCANRANGELOCATIONS
+NODE 0:
+  HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypes/year=2009/month=11/091101.txt 0:20179
+  HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypes/year=2009/month=12/091201.txt 0:20853
+---- DISTRIBUTEDPLAN
+WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(year,month)]
+|  partitions=24
+|
+02:SORT
+|  order by: year ASC NULLS LAST, month ASC NULLS LAST
+|
+01:EXCHANGE [HASH(year,month)]
+|
+00:SCAN HDFS [functional.alltypes]
+   partitions=2/24 files=2 size=40.07KB
+====
+# IMPALA-5293: noclustered hint prevents adding sort node
+insert into table functional.alltypessmall
+partition (year, month) /* +noclustered */
+select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
+float_col, double_col, date_string_col, string_col, timestamp_col, year, month
+from functional.alltypes
+where year=2009 and month>10
+---- PLAN
+WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(year,month)]
+|  partitions=24
+|
 00:SCAN HDFS [functional.alltypes]
    partitions=2/24 files=2 size=40.07KB
 ---- SCANRANGELOCATIONS
@@ -104,12 +136,18 @@ where year=2009 and month>10
 WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(int_col,int_col)]
 |  partitions=unavailable
 |
+01:SORT
+|  order by: int_col ASC NULLS LAST, int_col ASC NULLS LAST
+|
 00:SCAN HDFS [functional_seq_snap.alltypes]
    partitions=2/24 files=2 size=11.34KB
 ---- DISTRIBUTEDPLAN
 WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(int_col,int_col)]
 |  partitions=unavailable
 |
+02:SORT
+|  order by: int_col ASC NULLS LAST, int_col ASC NULLS LAST
+|
 01:EXCHANGE [HASH(int_col,int_col)]
 |
 00:SCAN HDFS [functional_seq_snap.alltypes]
@@ -129,6 +167,9 @@ group by year, month
 WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(year,month)]
 |  partitions=24
 |
+02:SORT
+|  order by: year ASC NULLS LAST, month ASC NULLS LAST
+|
 01:AGGREGATE [FINALIZE]
 |  output: min(id), min(bool_col), min(tinyint_col), min(smallint_col), min(int_col), min(bigint_col), min(float_col), min(double_col), min(date_string_col), min(string_col), min(timestamp_col)
 |  group by: year, month
@@ -143,6 +184,9 @@ NODE 0:
 WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(year,month)]
 |  partitions=24
 |
+04:SORT
+|  order by: year ASC NULLS LAST, month ASC NULLS LAST
+|
 03:AGGREGATE [FINALIZE]
 |  output: min:merge(id), min:merge(bool_col), min:merge(tinyint_col), min:merge(smallint_col), min:merge(int_col), min:merge(bigint_col), min:merge(float_col), min:merge(double_col), min:merge(date_string_col), min:merge(string_col), min:merge(timestamp_col)
 |  group by: year, month
@@ -167,6 +211,9 @@ where year=2009 and month>10
 WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(2009,month)]
 |  partitions=12
 |
+01:SORT
+|  order by: month ASC NULLS LAST
+|
 00:SCAN HDFS [functional.alltypes]
    partitions=2/24 files=2 size=40.07KB
 ---- SCANRANGELOCATIONS
@@ -177,6 +224,9 @@ NODE 0:
 WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(2009,month)]
 |  partitions=12
 |
+02:SORT
+|  order by: month ASC NULLS LAST
+|
 01:EXCHANGE [HASH(month)]
 |
 00:SCAN HDFS [functional.alltypes]
@@ -196,6 +246,9 @@ group by month
 WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(2009,month)]
 |  partitions=12
 |
+02:SORT
+|  order by: month ASC NULLS LAST
+|
 01:AGGREGATE [FINALIZE]
 |  output: min(id), min(bool_col), min(tinyint_col), min(smallint_col), min(int_col), min(bigint_col), min(float_col), min(double_col), min(date_string_col), min(string_col), min(timestamp_col)
 |  group by: month
@@ -210,6 +263,9 @@ NODE 0:
 WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(2009,month)]
 |  partitions=12
 |
+04:SORT
+|  order by: month ASC NULLS LAST
+|
 03:AGGREGATE [FINALIZE]
 |  output: min:merge(id), min:merge(bool_col), min:merge(tinyint_col), min:merge(smallint_col), min:merge(int_col), min:merge(bigint_col), min:merge(float_col), min:merge(double_col), min:merge(date_string_col), min:merge(string_col), min:merge(timestamp_col)
 |  group by: month
@@ -234,6 +290,9 @@ where year>2009 and month=4
 WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(year,4)]
 |  partitions=2
 |
+01:SORT
+|  order by: year ASC NULLS LAST
+|
 00:SCAN HDFS [functional.alltypes]
    partitions=1/24 files=1 size=19.71KB
 ---- SCANRANGELOCATIONS
@@ -243,6 +302,9 @@ NODE 0:
 WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(year,4)]
 |  partitions=2
 |
+01:SORT
+|  order by: year ASC NULLS LAST
+|
 00:SCAN HDFS [functional.alltypes]
    partitions=1/24 files=1 size=19.71KB
 ====
@@ -337,12 +399,18 @@ partition (year, month) values
 WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(2010,4)]
 |  partitions=9
 |
+01:SORT
+|  order by: 2010 ASC NULLS LAST, 4 ASC NULLS LAST
+|
 00:UNION
    constant-operands=3
 ---- DISTRIBUTEDPLAN
 WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(2010,4)]
 |  partitions=9
 |
+01:SORT
+|  order by: 2010 ASC NULLS LAST, 4 ASC NULLS LAST
+|
 00:UNION
    constant-operands=3
 ====
@@ -429,9 +497,12 @@ WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(2010,10)]
 insert into table functional.alltypes partition(year, month)
 select * from functional.alltypes
 ---- DISTRIBUTEDPLAN
-WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(functional.alltypes.year,functional.alltypes.month)]
+WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)]
 |  partitions=24
 |
+02:SORT
+|  order by: year ASC NULLS LAST, month ASC NULLS LAST
+|
 01:EXCHANGE [HASH(functional.alltypes.year,functional.alltypes.month)]
 |
 00:SCAN HDFS [functional.alltypes]
@@ -441,9 +512,12 @@ WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(functional.
 insert into table functional.alltypes partition(year, month) [noshuffle]
 select * from functional.alltypes
 ---- DISTRIBUTEDPLAN
-WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(functional.alltypes.year,functional.alltypes.month)]
+WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)]
 |  partitions=24
 |
+01:SORT
+|  order by: year ASC NULLS LAST, month ASC NULLS LAST
+|
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
 ====
@@ -451,9 +525,12 @@ WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(functional.
 insert into table functional.alltypes partition(year, month) /* +noshuffle */
 select * from functional.alltypes
 ---- DISTRIBUTEDPLAN
-WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(functional.alltypes.year,functional.alltypes.month)]
+WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)]
 |  partitions=24
 |
+01:SORT
+|  order by: year ASC NULLS LAST, month ASC NULLS LAST
+|
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
 ====
@@ -461,9 +538,12 @@ WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(functional.
 insert /* +noshuffle */ into table functional.alltypes partition(year, month)
 select * from functional.alltypes
 ---- DISTRIBUTEDPLAN
-WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(functional.alltypes.year,functional.alltypes.month)]
+WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)]
 |  partitions=24
 |
+01:SORT
+|  order by: year ASC NULLS LAST, month ASC NULLS LAST
+|
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
 ====
@@ -472,9 +552,12 @@ insert into table functional.alltypes partition(year, month)
 -- +noshuffle
 select * from functional.alltypes
 ---- DISTRIBUTEDPLAN
-WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(functional.alltypes.year,functional.alltypes.month)]
+WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)]
 |  partitions=24
 |
+01:SORT
+|  order by: year ASC NULLS LAST, month ASC NULLS LAST
+|
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
 ====
@@ -489,6 +572,9 @@ from functional.alltypes
 WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,1)]
 |  partitions=2
 |
+01:SORT
+|  order by: year ASC NULLS LAST
+|
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
 ====
@@ -501,6 +587,9 @@ from functional.alltypes
 WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,1)]
 |  partitions=2
 |
+02:SORT
+|  order by: year ASC NULLS LAST
+|
 01:EXCHANGE [HASH(year)]
 |
 00:SCAN HDFS [functional.alltypes]
@@ -586,8 +675,8 @@ WRITE TO HDFS [functional.alltypesnopart, OVERWRITE=false]
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
 ====
-# IMPALA-2521: clustered insert into partitioned table adds sort node.
-insert into table functional.alltypes partition(year, month) /*+ clustered */
+# IMPALA-5293: ensure insert into partitioned table adds sort node without clustered hint.
+insert into table functional.alltypes partition(year, month)
 select * from functional.alltypes
 ---- PLAN
 WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)]
@@ -610,8 +699,8 @@ WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
 ====
-# IMPALA-2521: clustered insert into partitioned table adds sort node.
-insert into table functional.alltypes partition(year, month) /*+ clustered,noshuffle */
+# IMPALA-5293: ensure insert into partitioned table adds sort node without clustered hint.
+insert into table functional.alltypes partition(year, month) /*+ noshuffle */
 select * from functional.alltypes
 ---- PLAN
 WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)]
@@ -632,9 +721,9 @@ WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)
 00:SCAN HDFS [functional.alltypes]
    partitions=24/24 files=24 size=478.45KB
 ====
-# IMPALA-2521: clustered insert into partitioned table adds sort node. Subquery in
-# WHERE-clause exercises the reset() + analyze() path during rewrite.
-insert into table functional.alltypes partition(year, month) /*+ clustered */
+# IMPALA-5293: ensure insert into partitioned table adds sort node without clustered hint.
+# Subquery in WHERE-clause exercises the reset() + analyze() path during rewrite.
+insert into table functional.alltypes partition(year, month)
 select * from functional.alltypes
 where int_col = (select max(int_col) from functional.alltypes)
 ---- PLAN
@@ -687,8 +776,8 @@ WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)
    partitions=24/24 files=24 size=478.45KB
    runtime filters: RF000 -> int_col
 ====
-# IMPALA-2521: clustered insert into non-partitioned table does not add sort node.
-insert into table functional.alltypesnopart /*+ clustered */
+# IMPALA-5293: ensure insert into non-partitioned table does not add sort node.
+insert into table functional.alltypesnopart
 select * from functional.alltypesnopart
 ---- PLAN
 WRITE TO HDFS [functional.alltypesnopart, OVERWRITE=false]
@@ -703,8 +792,8 @@ WRITE TO HDFS [functional.alltypesnopart, OVERWRITE=false]
 00:SCAN HDFS [functional.alltypesnopart]
    partitions=1/1 files=0 size=0B
 ====
-# IMPALA-2521: clustered insert into non-partitioned table does not add sort node.
-insert into table functional.alltypesnopart /*+ clustered,shuffle */
+# IMPALA-5293: ensure insert into non-partitioned table does not add sort node.
+insert into table functional.alltypesnopart /*+ shuffle */
 select * from functional.alltypesnopart
 ---- PLAN
 WRITE TO HDFS [functional.alltypesnopart, OVERWRITE=false]

http://git-wip-us.apache.org/repos/asf/impala/blob/fc529b7f/testdata/workloads/functional-planner/queries/PlannerTest/kudu.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/kudu.test b/testdata/workloads/functional-planner/queries/PlannerTest/kudu.test
index 835c41b..0e1b2c6 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/kudu.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/kudu.test
@@ -473,7 +473,7 @@ INSERT INTO KUDU [tpch_kudu.nation]
 00:SCAN HDFS [tpch_parquet.nation]
    partitions=1/1 files=1 size=2.74KB
 ====
-# Unpartitioned table, still has a sort due to clustered hint.
+# Unpartitioned table, clustered hint forces sort node.
 insert into tpch_kudu.nation /* +clustered */
 select * from tpch_parquet.nation
 ---- DISTRIBUTEDPLAN
@@ -483,21 +483,16 @@ INSERT INTO KUDU [tpch_kudu.nation]
 |  order by: n_nationkey ASC NULLS LAST
 |
 00:SCAN HDFS [tpch_parquet.nation]
-   partitions=1/1 files=1 size=2.74KB
+   partitions=1/1 files=1 size=2.75KB
 ====
-# Unpartitioned table, both hints.
-insert into tpch_kudu.nation /* +shuffle,clustered */
+# Unpartitioned table, no sort node without clustered hint.
+insert into tpch_kudu.nation
 select * from tpch_parquet.nation
 ---- DISTRIBUTEDPLAN
 INSERT INTO KUDU [tpch_kudu.nation]
 |
-02:PARTIAL SORT
-|  order by: n_nationkey ASC NULLS LAST
-|
-01:EXCHANGE [UNPARTITIONED]
-|
 00:SCAN HDFS [tpch_parquet.nation]
-   partitions=1/1 files=1 size=2.74KB
+   partitions=1/1 files=1 size=2.75KB
 ====
 # Partition and primary key exprs are all constant, so don't partition/sort.
 insert into functional_kudu.alltypes (id)

http://git-wip-us.apache.org/repos/asf/impala/blob/fc529b7f/testdata/workloads/functional-planner/queries/PlannerTest/order.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/order.test b/testdata/workloads/functional-planner/queries/PlannerTest/order.test
index c64be15..cd60a3a 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/order.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/order.test
@@ -930,6 +930,31 @@ PLAN-ROOT SINK
 insert into functional.alltypes partition(year, month)
 select * from functional.alltypes order by int_col
 ---- PLAN
+WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)]
+|  partitions=24
+|
+01:SORT
+|  order by: year ASC NULLS LAST, month ASC NULLS LAST
+|
+00:SCAN HDFS [functional.alltypes]
+   partitions=24/24 files=24 size=478.45KB
+---- DISTRIBUTEDPLAN
+WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)]
+|  partitions=24
+|
+02:SORT
+|  order by: year ASC NULLS LAST, month ASC NULLS LAST
+|
+01:EXCHANGE [HASH(functional.alltypes.year,functional.alltypes.month)]
+|
+00:SCAN HDFS [functional.alltypes]
+   partitions=24/24 files=24 size=478.45KB
+====
+# IMPALA-5293: test that noclustered hint prevents sort node when order by is used without
+# limit in insert.
+insert into functional.alltypes partition(year, month) /* +noclustered */
+select * from functional.alltypes order by int_col
+---- PLAN
 WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(functional.alltypes.year,functional.alltypes.month)]
 |  partitions=24
 |

http://git-wip-us.apache.org/repos/asf/impala/blob/fc529b7f/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test b/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test
index 3c1fd44..e82d841 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/resource-requirements.test
@@ -2408,15 +2408,20 @@ create table dummy_insert
 partitioned by (l_partkey) as
 select l_comment, l_partkey from tpch.lineitem
 ---- PLAN
-Max Per-Host Resource Reservation: Memory=0B
-Per-Host Resource Estimates: Memory=376.99MB
+Max Per-Host Resource Reservation: Memory=12.00MB
+Per-Host Resource Estimates: Memory=306.99MB
 
 F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1
-|  Per-Host Resources: mem-estimate=376.99MB mem-reservation=0B
+|  Per-Host Resources: mem-estimate=306.99MB mem-reservation=12.00MB
 WRITE TO HDFS [default.dummy_insert, OVERWRITE=false, PARTITION-KEYS=(l_partkey)]
 |  partitions=200516
 |  mem-estimate=288.99MB mem-reservation=0B
 |
+01:SORT
+|  order by: l_partkey ASC NULLS LAST
+|  mem-estimate=18.00MB mem-reservation=12.00MB spill-buffer=2.00MB
+|  tuple-ids=1 row-size=50B cardinality=6001215
+|
 00:SCAN HDFS [tpch.lineitem]
    partitions=1/1 files=1 size=718.94MB
    stored statistics:
@@ -2426,15 +2431,20 @@ WRITE TO HDFS [default.dummy_insert, OVERWRITE=false, PARTITION-KEYS=(l_partkey)
    mem-estimate=88.00MB mem-reservation=0B
    tuple-ids=0 row-size=50B cardinality=6001215
 ---- DISTRIBUTEDPLAN
-Max Per-Host Resource Reservation: Memory=0B
-Per-Host Resource Estimates: Memory=184.33MB
+Max Per-Host Resource Reservation: Memory=12.00MB
+Per-Host Resource Estimates: Memory=202.33MB
 
 F01:PLAN FRAGMENT [HASH(l_partkey)] hosts=3 instances=3
-|  Per-Host Resources: mem-estimate=96.33MB mem-reservation=0B
+|  Per-Host Resources: mem-estimate=114.33MB mem-reservation=12.00MB
 WRITE TO HDFS [default.dummy_insert, OVERWRITE=false, PARTITION-KEYS=(l_partkey)]
 |  partitions=200516
 |  mem-estimate=96.33MB mem-reservation=0B
 |
+02:SORT
+|  order by: l_partkey ASC NULLS LAST
+|  mem-estimate=18.00MB mem-reservation=12.00MB spill-buffer=2.00MB
+|  tuple-ids=1 row-size=50B cardinality=6001215
+|
 01:EXCHANGE [HASH(l_partkey)]
 |  mem-estimate=0B mem-reservation=0B
 |  tuple-ids=0 row-size=50B cardinality=6001215
@@ -2450,15 +2460,20 @@ Per-Host Resources: mem-estimate=88.00MB mem-reservation=0B
    mem-estimate=88.00MB mem-reservation=0B
    tuple-ids=0 row-size=50B cardinality=6001215
 ---- PARALLELPLANS
-Max Per-Host Resource Reservation: Memory=0B
-Per-Host Resource Estimates: Memory=272.33MB
+Max Per-Host Resource Reservation: Memory=24.00MB
+Per-Host Resource Estimates: Memory=308.33MB
 
 F01:PLAN FRAGMENT [HASH(l_partkey)] hosts=3 instances=6
-|  Per-Host Resources: mem-estimate=96.33MB mem-reservation=0B
+|  Per-Host Resources: mem-estimate=132.33MB mem-reservation=24.00MB
 WRITE TO HDFS [default.dummy_insert, OVERWRITE=false, PARTITION-KEYS=(l_partkey)]
 |  partitions=200516
 |  mem-estimate=48.16MB mem-reservation=0B
 |
+02:SORT
+|  order by: l_partkey ASC NULLS LAST
+|  mem-estimate=18.00MB mem-reservation=12.00MB spill-buffer=2.00MB
+|  tuple-ids=1 row-size=50B cardinality=6001215
+|
 01:EXCHANGE [HASH(l_partkey)]
 |  mem-estimate=0B mem-reservation=0B
 |  tuple-ids=0 row-size=50B cardinality=6001215

http://git-wip-us.apache.org/repos/asf/impala/blob/fc529b7f/testdata/workloads/functional-planner/queries/PlannerTest/with-clause.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/with-clause.test b/testdata/workloads/functional-planner/queries/PlannerTest/with-clause.test
index 0a966c4..9c5f577 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/with-clause.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/with-clause.test
@@ -490,15 +490,21 @@ PLAN-ROOT SINK
 with t1 as (select * from functional.alltypestiny)
 insert into functional.alltypesinsert partition(year, month) select * from t1
 ---- PLAN
-WRITE TO HDFS [functional.alltypesinsert, OVERWRITE=false, PARTITION-KEYS=(functional.alltypestiny.year,functional.alltypestiny.month)]
+WRITE TO HDFS [functional.alltypesinsert, OVERWRITE=false, PARTITION-KEYS=(year,month)]
 |  partitions=4
 |
+01:SORT
+|  order by: year ASC NULLS LAST, month ASC NULLS LAST
+|
 00:SCAN HDFS [functional.alltypestiny]
    partitions=4/4 files=4 size=460B
 ---- DISTRIBUTEDPLAN
-WRITE TO HDFS [functional.alltypesinsert, OVERWRITE=false, PARTITION-KEYS=(functional.alltypestiny.year,functional.alltypestiny.month)]
+WRITE TO HDFS [functional.alltypesinsert, OVERWRITE=false, PARTITION-KEYS=(year,month)]
 |  partitions=4
 |
+02:SORT
+|  order by: year ASC NULLS LAST, month ASC NULLS LAST
+|
 01:EXCHANGE [HASH(functional.alltypestiny.year,functional.alltypestiny.month)]
 |
 00:SCAN HDFS [functional.alltypestiny]
@@ -513,6 +519,43 @@ select * from t1 union all select * from t2
 WRITE TO HDFS [functional.alltypesinsert, OVERWRITE=false, PARTITION-KEYS=(year,month)]
 |  partitions=16
 |
+03:SORT
+|  order by: year ASC NULLS LAST, month ASC NULLS LAST
+|
+00:UNION
+|
+|--02:SCAN HDFS [functional.alltypestiny]
+|     partitions=4/4 files=4 size=460B
+|
+01:SCAN HDFS [functional.alltypestiny]
+   partitions=4/4 files=4 size=460B
+---- DISTRIBUTEDPLAN
+WRITE TO HDFS [functional.alltypesinsert, OVERWRITE=false, PARTITION-KEYS=(year,month)]
+|  partitions=16
+|
+04:SORT
+|  order by: year ASC NULLS LAST, month ASC NULLS LAST
+|
+03:EXCHANGE [HASH(year,month)]
+|
+00:UNION
+|
+|--02:SCAN HDFS [functional.alltypestiny]
+|     partitions=4/4 files=4 size=460B
+|
+01:SCAN HDFS [functional.alltypestiny]
+   partitions=4/4 files=4 size=460B
+====
+# IMPALA-5293: Test with clause in an insert statement and in its query statement. Make
+# sure that noclustered hint prevents addition of a sort node before writing to HDFS.
+with t1 as (select * from functional.alltypestiny)
+insert into functional.alltypesinsert partition(year, month) /* +noclustered */
+with t2 as (select * from functional.alltypestiny)
+select * from t1 union all select * from t2
+---- PLAN
+WRITE TO HDFS [functional.alltypesinsert, OVERWRITE=false, PARTITION-KEYS=(year,month)]
+|  partitions=16
+|
 00:UNION
 |
 |--02:SCAN HDFS [functional.alltypestiny]

http://git-wip-us.apache.org/repos/asf/impala/blob/fc529b7f/testdata/workloads/functional-query/queries/QueryTest/insert.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/insert.test b/testdata/workloads/functional-query/queries/QueryTest/insert.test
index 07a665a..f4c2ff2 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/insert.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/insert.test
@@ -848,7 +848,7 @@ bigint
 # Check that hdfs writers respects mem_limit.
 set mem_limit=64m;
 insert into table alltypesinsert
-partition (year, month)
+partition (year, month) /* +noclustered */
 select at1.id, at1.bool_col, at1.tinyint_col, at1.smallint_col, at1.int_col, at1.bigint_col,
   at1.float_col, at1.double_col, at1.date_string_col, at1.string_col, at1.timestamp_col,
   at1.year, at2.id as month
@@ -859,9 +859,9 @@ DROP PARTITIONS alltypesinsert
 Memory limit exceeded
 ====
 ---- QUERY
-# IMPALA-2521: clustered insert into table
+# IMPALA-5293: noclustered insert into table
 insert into table alltypesinsert
-partition (year, month) /*+ clustered,shuffle */
+partition (year, month) /*+ noclustered,shuffle */
 select * from alltypes;
 ---- SETUP
 DROP PARTITIONS alltypesinsert
@@ -893,9 +893,9 @@ year=2010/month=8/: 310
 year=2010/month=9/: 300
 ====
 ---- QUERY
-# IMPALA-2521: clustered insert into table
+# IMPALA-5293: noclustered insert into table
 insert into table alltypesinsert
-partition (year, month) /*+ clustered,shuffle */
+partition (year, month) /*+ noclustered,shuffle */
 select * from alltypestiny;
 ---- SETUP
 DROP PARTITIONS alltypesinsert
@@ -907,9 +907,9 @@ year=2009/month=3/: 2
 year=2009/month=4/: 2
 ====
 ---- QUERY
-# IMPALA-2521: clustered insert into table
+# IMPALA-5293: noclustered insert into table
 insert into table alltypesinsert
-partition (year, month) /*+ clustered,noshuffle */
+partition (year, month) /*+ noclustered,noshuffle */
 select * from alltypestiny;
 ---- SETUP
 DROP PARTITIONS alltypesinsert
@@ -921,9 +921,9 @@ year=2009/month=3/: 2
 year=2009/month=4/: 2
 ====
 ---- QUERY
-# IMPALA-2521: clustered insert into table
+# IMPALA-5293: noclustered insert into table
 insert into table alltypesinsert
-partition (year, month) /*+ clustered,shuffle */
+partition (year, month) /*+ noclustered,shuffle */
 select * from alltypestiny where int_col = 0;
 ---- SETUP
 DROP PARTITIONS alltypesinsert
@@ -935,9 +935,9 @@ year=2009/month=3/: 1
 year=2009/month=4/: 1
 ====
 ---- QUERY
-# IMPALA-2521: clustered, unpartitioned insert into table
+# IMPALA-5293: noclustered, unpartitioned insert into table
 insert into table alltypesnopart_insert
- /*+ clustered,shuffle */
+ /*+ noclustered,shuffle */
 select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col,
 double_col, date_string_col, string_col, timestamp_col from alltypessmall;
 ---- SETUP
@@ -946,13 +946,13 @@ RESET alltypesnopart_insert
 : 100
 ====
 ---- QUERY
-# IMPALA-6280: clustered with outer join, inline view, and TupleisNullPredicate
+# IMPALA-6280: clustered (default) with outer join, inline view, and TupleisNullPredicate
 insert into table alltypesinsert (int_col)
-partition (year, month) /*+ clustered,shuffle */
+partition (year, month) /*+ shuffle */
 select v.id, t1.id, t1.month from
 (select coalesce(id, 10) id from functional.alltypessmall) v
 right outer join functional.alltypestiny t1 on t1.id = v.id
 where v.id = 0
 ---- RESULTS
 year=0/month=1/: 1
-====
\ No newline at end of file
+====

http://git-wip-us.apache.org/repos/asf/impala/blob/fc529b7f/testdata/workloads/functional-query/queries/QueryTest/stats-extrapolation.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/stats-extrapolation.test b/testdata/workloads/functional-query/queries/QueryTest/stats-extrapolation.test
index 814a605..b8081ee 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/stats-extrapolation.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/stats-extrapolation.test
@@ -1,6 +1,7 @@
 ====
 ---- QUERY
-create table alltypes like functional_parquet.alltypes;
+# This test relies on a deterministic row order so we use "sort by (id)".
+create table alltypes sort by (id) like functional_parquet.alltypes;
 insert into alltypes partition(year, month)
 select * from functional_parquet.alltypes where year = 2009;
 ====


[6/6] impala git commit: IMPALA-6475: Enable running TPCH on Kudu

Posted by lv...@apache.org.
IMPALA-6475: Enable running TPCH on Kudu

Change-Id: I88b66f5db105694b3bcf33360887265996f9059c
Reviewed-on: http://gerrit.cloudera.org:8080/9206
Reviewed-by: Michael Brown <mi...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/c856b30e
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/c856b30e
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/c856b30e

Branch: refs/heads/master
Commit: c856b30e36e7bdf5160e7a71b8e84a82d8258934
Parents: 1a1927b
Author: Taras Bobrovytsky <tb...@cloudera.com>
Authored: Fri Feb 2 16:27:17 2018 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Sat Feb 3 23:06:13 2018 +0000

----------------------------------------------------------------------
 testdata/workloads/tpch/tpch_core.csv       | 1 +
 testdata/workloads/tpch/tpch_exhaustive.csv | 1 +
 testdata/workloads/tpch/tpch_pairwise.csv   | 1 +
 3 files changed, 3 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/c856b30e/testdata/workloads/tpch/tpch_core.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpch/tpch_core.csv b/testdata/workloads/tpch/tpch_core.csv
index 04d7896..86804ac 100644
--- a/testdata/workloads/tpch/tpch_core.csv
+++ b/testdata/workloads/tpch/tpch_core.csv
@@ -7,3 +7,4 @@ file_format:rc, dataset:tpch, compression_codec:none, compression_type:none
 file_format:avro, dataset:tpch, compression_codec: none, compression_type: none
 file_format:avro, dataset:tpch, compression_codec: snap, compression_type: block
 file_format:parquet, dataset:tpch, compression_codec: none, compression_type: none
+file_format:kudu, dataset:tpch, compression_codec: none, compression_type: none

http://git-wip-us.apache.org/repos/asf/impala/blob/c856b30e/testdata/workloads/tpch/tpch_exhaustive.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpch/tpch_exhaustive.csv b/testdata/workloads/tpch/tpch_exhaustive.csv
index b9cb980..32085bf 100644
--- a/testdata/workloads/tpch/tpch_exhaustive.csv
+++ b/testdata/workloads/tpch/tpch_exhaustive.csv
@@ -22,3 +22,4 @@ file_format: avro, dataset: tpch, compression_codec: snap, compression_type: blo
 file_format: parquet, dataset: tpch, compression_codec: none, compression_type: none
 file_format: parquet, dataset: tpch, compression_codec: def, compression_type: block
 file_format: parquet, dataset: tpch, compression_codec: snap, compression_type: block
+file_format: kudu, dataset:tpch, compression_codec: none, compression_type: none

http://git-wip-us.apache.org/repos/asf/impala/blob/c856b30e/testdata/workloads/tpch/tpch_pairwise.csv
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpch/tpch_pairwise.csv b/testdata/workloads/tpch/tpch_pairwise.csv
index 46f085c..0744cf5 100644
--- a/testdata/workloads/tpch/tpch_pairwise.csv
+++ b/testdata/workloads/tpch/tpch_pairwise.csv
@@ -13,3 +13,4 @@ file_format: rc, dataset: tpch, compression_codec: def, compression_type: block
 file_format: avro, dataset: tpch, compression_codec: none, compression_type: none
 file_format: parquet, dataset: tpch, compression_codec: none, compression_type: none
 file_format: rc, dataset: tpch, compression_codec: none, compression_type: none
+file_format: kudu, dataset:tpch, compression_codec: none, compression_type: none


[2/6] impala git commit: IMPALA-6476: Skip krpc mem test that uses stress options

Posted by lv...@apache.org.
IMPALA-6476: Skip krpc mem test that uses stress options

Some of the tests added in IMPALA-6193 rely on flags that are only compiled
into debug binaries. This change marks those tests as debug-only so that
they do not break the release tests.

Change-Id: I89ae25ee8c1aca3833c2d98e902ddaad2dd01aad
Reviewed-on: http://gerrit.cloudera.org:8080/9207
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/c201753b
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/c201753b
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/c201753b

Branch: refs/heads/master
Commit: c201753b2e5e50a6afbc01b734583138bf964302
Parents: 097f2f3
Author: Lars Volker <lv...@cloudera.com>
Authored: Fri Feb 2 17:24:00 2018 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Sat Feb 3 05:10:59 2018 +0000

----------------------------------------------------------------------
 tests/custom_cluster/test_krpc_mem_usage.py | 2 ++
 1 file changed, 2 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/c201753b/tests/custom_cluster/test_krpc_mem_usage.py
----------------------------------------------------------------------
diff --git a/tests/custom_cluster/test_krpc_mem_usage.py b/tests/custom_cluster/test_krpc_mem_usage.py
index ed7b056..07efe2f 100644
--- a/tests/custom_cluster/test_krpc_mem_usage.py
+++ b/tests/custom_cluster/test_krpc_mem_usage.py
@@ -71,6 +71,7 @@ class TestKrpcMemUsage(CustomClusterTestSuite):
     # show up in time.
     self.execute_query_verify_mem_usage(self.TEST_QUERY, [DATA_STREAM_SVC_METRIC])
 
+  @SkipIfBuildType.not_dev_build
   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args("--use_krpc --stress_datastream_recvr_delay_ms=1000")
   def test_krpc_deferred_memory_usage(self, vector):
@@ -79,6 +80,7 @@ class TestKrpcMemUsage(CustomClusterTestSuite):
     """
     self.execute_query_verify_mem_usage(self.TEST_QUERY, ALL_METRICS)
 
+  @SkipIfBuildType.not_dev_build
   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args("--use_krpc --stress_datastream_recvr_delay_ms=1000")
   def test_krpc_deferred_memory_cancellation(self, vector):


[5/6] impala git commit: IMPALA-6228: Control stats extrapolation via tbl prop.

Posted by lv...@apache.org.
IMPALA-6228: Control stats extrapolation via tbl prop.

Introduces a new TBLPROPERTY for controlling stats
extrapolation on a per-table basis:

impala.enable.stats.extrapolation=true/false

The property key was chosen to be consistent with the impalad startup
flag --enable_stats_extrapolation and to indicate that the property is
set and used by Impala.

Behavior:
- If the property is not set, then the extrapolation
  behavior is determined by the impalad startup flag.
- If the property is set, it overrides the impalad
  startup flag, i.e., extrapolation can be explicitly
  enabled or disabled regardless of the startup flag.
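
A hypothetical example of toggling the property (the table name is chosen
for illustration only):

  -- Enable extrapolation for this table regardless of the startup flag.
  alter table mydb.sales set tblproperties
    ('impala.enable.stats.extrapolation'='true');

  -- Setting it to false disables extrapolation even if the flag is on.
  alter table mydb.sales set tblproperties
    ('impala.enable.stats.extrapolation'='false');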

Testing:
- added new unit tests
- code/hdfs run passed

Change-Id: Ie49597bf1b93b7572106abc620d91f199cba0cfd
Reviewed-on: http://gerrit.cloudera.org:8080/9139
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/1a1927b0
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/1a1927b0
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/1a1927b0

Branch: refs/heads/master
Commit: 1a1927b07de816826e73231d93560ba95a8e2cf6
Parents: fc529b7
Author: Alex Behm <al...@cloudera.com>
Authored: Wed Jan 24 11:58:53 2018 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Sat Feb 3 22:56:13 2018 +0000

----------------------------------------------------------------------
 .../impala/analysis/ComputeStatsStmt.java       |  26 ++-
 .../org/apache/impala/catalog/HdfsTable.java    |  22 ++-
 .../org/apache/impala/planner/HdfsScanNode.java |   2 +-
 .../apache/impala/service/BackendConfig.java    |   2 +-
 .../apache/impala/analysis/AnalyzeDDLTest.java  |  80 ++++++---
 .../impala/planner/StatsExtrapolationTest.java  |  91 +++++++---
 .../queries/QueryTest/stats-extrapolation.test  |   1 +
 .../custom_cluster/test_stats_extrapolation.py  | 135 ++------------
 tests/metadata/test_stats_extrapolation.py      | 175 +++++++++++++++++++
 9 files changed, 359 insertions(+), 175 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/1a1927b0/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java b/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
index 6ca8dc9..54daf7f 100644
--- a/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
+++ b/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
@@ -52,7 +52,10 @@ import com.google.common.collect.Sets;
  * Represents the following statements for statistics collection. Which statistics
  * are computed and stored depends on the statement type (incremental or not), the
  * clauses used (sampling, partition spec), as well as whether stats extrapolation
- * is enabled or not (--enable_stats_extrapolation).
+ * is enabled or not.
+ * Stats extrapolation can be configured:
+ * - at the impalad level with --enable_stats_extrapolation
+ * - at the table level HdfsTable.TBL_PROP_ENABLE_STATS_EXTRAPOLATION
  *
  * 1. COMPUTE STATS <table> [(col_list)] [TABLESAMPLE SYSTEM(<perc>) [REPEATABLE(<seed>)]]
  * - Stats extrapolation enabled:
@@ -481,8 +484,10 @@ public class ComputeStatsStmt extends StatementBase {
       }
     } else {
       // Not computing incremental stats.
-      expectAllPartitions_ = !(table_ instanceof HdfsTable) ||
-          !BackendConfig.INSTANCE.enableStatsExtrapolation();
+      expectAllPartitions_ = true;
+      if (table_ instanceof HdfsTable) {
+        expectAllPartitions_ = !((HdfsTable) table_).isStatsExtrapolationEnabled();
+      }
     }
 
     if (filterPreds.size() > MAX_INCREMENTAL_PARTITIONS) {
@@ -576,10 +581,14 @@ public class ComputeStatsStmt extends StatementBase {
     if (!(table_ instanceof HdfsTable)) {
       throw new AnalysisException("TABLESAMPLE is only supported on HDFS tables.");
     }
-    if (!BackendConfig.INSTANCE.enableStatsExtrapolation()) {
-      throw new AnalysisException(
-          "COMPUTE STATS TABLESAMPLE requires --enable_stats_extrapolation=true. " +
-          "Stats extrapolation is currently disabled.");
+    HdfsTable hdfsTable = (HdfsTable) table_;
+    if (!hdfsTable.isStatsExtrapolationEnabled()) {
+      throw new AnalysisException(String.format(
+          "COMPUTE STATS TABLESAMPLE requires stats extrapolation which is disabled.\n" +
+          "Stats extrapolation can be enabled service-wide with %s=true or by altering " +
+          "the table to have tblproperty %s=true",
+          "--enable_stats_extrapolation",
+          HdfsTable.TBL_PROP_ENABLE_STATS_EXTRAPOLATION));
     }
     sampleParams_.analyze(analyzer);
     long sampleSeed;
@@ -592,7 +601,6 @@ public class ComputeStatsStmt extends StatementBase {
     // Compute the sample of files and set 'sampleFileBytes_'.
     long minSampleBytes = analyzer.getQueryOptions().compute_stats_min_sample_size;
     long samplePerc = sampleParams_.getPercentBytes();
-    HdfsTable hdfsTable = (HdfsTable) table_;
     Map<Long, List<FileDescriptor>> sample = hdfsTable.getFilesSample(
         hdfsTable.getPartitions(), samplePerc, minSampleBytes, sampleSeed);
     long sampleFileBytes = 0;
@@ -696,7 +704,7 @@ public class ComputeStatsStmt extends StatementBase {
    */
   private boolean updateTableStatsOnly() {
     if (!(table_ instanceof HdfsTable)) return true;
-    return !isIncremental_ && BackendConfig.INSTANCE.enableStatsExtrapolation();
+    return !isIncremental_ && ((HdfsTable) table_).isStatsExtrapolationEnabled();
   }
 
   /**
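
The analysis change above means COMPUTE STATS TABLESAMPLE is accepted whenever
stats extrapolation is enabled either service-wide or just for the target table.
A minimal sketch of the per-table route, assuming the impyla Python client, a
coordinator at localhost:21050, and a hypothetical table my_db.sales; this is
illustrative only and not part of the patch:

  from impala.dbapi import connect  # assumes the impyla package is installed

  conn = connect(host="localhost", port=21050)  # hypothetical coordinator address
  cur = conn.cursor()
  # Opt a single table into stats extrapolation without restarting impalad.
  cur.execute("alter table my_db.sales set "
              "tblproperties('impala.enable.stats.extrapolation'='true')")
  # TABLESAMPLE now passes analysis even when --enable_stats_extrapolation=false.
  cur.execute("compute stats my_db.sales tablesample system (10) repeatable (1)")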

http://git-wip-us.apache.org/repos/asf/impala/blob/1a1927b0/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
index adc6aef..0f782be 100644
--- a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
@@ -126,6 +126,12 @@ public class HdfsTable extends Table {
   // Table property key for skip.header.line.count
   public static final String TBL_PROP_SKIP_HEADER_LINE_COUNT = "skip.header.line.count";
 
+  // Table property key for overriding the Impalad-wide --enable_stats_extrapolation
+  // setting for a specific table. By default, tables do not have the property set and
+  // rely on the Impalad-wide --enable_stats_extrapolation flag.
+  public static final String TBL_PROP_ENABLE_STATS_EXTRAPOLATION =
+      "impala.enable.stats.extrapolation";
+
   // Average memory requirements (in bytes) for storing the metadata of a partition.
   private static final long PER_PARTITION_MEM_USAGE_BYTES = 2048;
 
@@ -1951,7 +1957,7 @@ public class HdfsTable extends Table {
    * Otherwise, returns a value >= 1.
    */
   public long getExtrapolatedNumRows(long fileBytes) {
-    if (!BackendConfig.INSTANCE.enableStatsExtrapolation()) return -1;
+    if (!isStatsExtrapolationEnabled()) return -1;
     if (fileBytes == 0) return 0;
     if (fileBytes < 0) return -1;
     if (tableStats_.num_rows < 0 || tableStats_.total_file_bytes <= 0) return -1;
@@ -1962,6 +1968,18 @@ public class HdfsTable extends Table {
   }
 
   /**
+   * Returns true if stats extrapolation is enabled for this table, false otherwise.
+   * Reconciles the Impalad-wide --enable_stats_extrapolation flag and the
+   * TBL_PROP_ENABLE_STATS_EXTRAPOLATION table property.
+   */
+  public boolean isStatsExtrapolationEnabled() {
+    org.apache.hadoop.hive.metastore.api.Table msTbl = getMetaStoreTable();
+    String propVal = msTbl.getParameters().get(TBL_PROP_ENABLE_STATS_EXTRAPOLATION);
+    if (propVal == null) return BackendConfig.INSTANCE.isStatsExtrapolationEnabled();
+    return Boolean.parseBoolean(propVal);
+  }
+
+  /**
    * Returns statistics on this table as a tabular result set. Used for the
    * SHOW TABLE STATS statement. The schema of the returned TResultSet is set
    * inside this method.
@@ -1978,7 +1996,7 @@ public class HdfsTable extends Table {
       resultSchema.addToColumns(colDesc);
     }
 
-    boolean statsExtrap = BackendConfig.INSTANCE.enableStatsExtrapolation();
+    boolean statsExtrap = isStatsExtrapolationEnabled();
 
     resultSchema.addToColumns(new TColumn("#Rows", Type.BIGINT.toThrift()));
     if (statsExtrap) {
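
The new isStatsExtrapolationEnabled() gives an explicit table property precedence
over the impalad-wide flag. A minimal Python sketch of that reconciliation and of
the combinations the new tests exercise (illustrative only, not Impala source):

  TBL_PROP = "impala.enable.stats.extrapolation"

  def is_stats_extrapolation_enabled(tbl_properties, backend_flag_enabled):
      # The table property wins when set; otherwise fall back to the startup flag.
      prop_val = tbl_properties.get(TBL_PROP)
      if prop_val is None:
          return backend_flag_enabled
      return prop_val.lower() == "true"  # mirrors Boolean.parseBoolean()

  assert is_stats_extrapolation_enabled({}, False) is False
  assert is_stats_extrapolation_enabled({}, True) is True
  assert is_stats_extrapolation_enabled({TBL_PROP: "false"}, True) is False
  assert is_stats_extrapolation_enabled({TBL_PROP: "true"}, False) is True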

http://git-wip-us.apache.org/repos/asf/impala/blob/1a1927b0/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
index 4bcf112..45ad8d6 100644
--- a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
@@ -1067,7 +1067,7 @@ public class HdfsScanNode extends ScanNode {
       output.append(getStatsExplainString(detailPrefix));
       output.append("\n");
       String extrapRows = String.valueOf(extrapolatedNumRows_);
-      if (!BackendConfig.INSTANCE.enableStatsExtrapolation()) {
+      if (!tbl_.isStatsExtrapolationEnabled()) {
         extrapRows = "disabled";
       } else if (extrapolatedNumRows_ == -1) {
         extrapRows = "unavailable";

http://git-wip-us.apache.org/repos/asf/impala/blob/1a1927b0/fe/src/main/java/org/apache/impala/service/BackendConfig.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/service/BackendConfig.java b/fe/src/main/java/org/apache/impala/service/BackendConfig.java
index 659e717..48d417a 100644
--- a/fe/src/main/java/org/apache/impala/service/BackendConfig.java
+++ b/fe/src/main/java/org/apache/impala/service/BackendConfig.java
@@ -54,7 +54,7 @@ public class BackendConfig {
     return !Strings.isNullOrEmpty(backendCfg_.lineage_event_log_dir);
   }
   public long getIncStatsMaxSize() { return backendCfg_.inc_stats_size_limit_bytes; }
-  public boolean enableStatsExtrapolation() {
+  public boolean isStatsExtrapolationEnabled() {
     return backendCfg_.enable_stats_extrapolation;
   }
   public boolean isAuthToLocalEnabled() {

http://git-wip-us.apache.org/repos/asf/impala/blob/1a1927b0/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java
----------------------------------------------------------------------
diff --git a/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java b/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java
index 1c02306..80c6916 100644
--- a/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java
+++ b/fe/src/test/java/org/apache/impala/analysis/AnalyzeDDLTest.java
@@ -1279,7 +1279,67 @@ public class AnalyzeDDLTest extends FrontendTestBase {
     TBackendGflags gflags = BackendConfig.INSTANCE.getBackendCfg();
     boolean origEnableStatsExtrapolation = gflags.isEnable_stats_extrapolation();
     try {
+      // Setup for testing combinations of extrapolation config options.
+      addTestDb("extrap_config", null);
+      addTestTable("create table extrap_config.tbl_prop_unset (i int)");
+      addTestTable("create table extrap_config.tbl_prop_false (i int) " +
+          "tblproperties('impala.enable.stats.extrapolation'='false')");
+      addTestTable("create table extrap_config.tbl_prop_true (i int) " +
+          "tblproperties('impala.enable.stats.extrapolation'='true')");
+      String stmt = "compute stats %s tablesample system (10)";
+      String err = "COMPUTE STATS TABLESAMPLE requires stats extrapolation";
+
+      // Test --enable_stats_extrapolation=false
+      gflags.setEnable_stats_extrapolation(false);
+      // Table property unset --> Extrapolation disabled
+      AnalysisError(String.format(stmt, "extrap_config.tbl_prop_unset"), err);
+      // Table property false --> Extrapolation disabled
+      AnalysisError(String.format(stmt, "extrap_config.tbl_prop_false"), err);
+      // Table property true --> Extrapolation enabled
+      AnalyzesOk(String.format(stmt, "extrap_config.tbl_prop_true"));
+
+      // Test --enable_stats_extrapolation=true
+      gflags.setEnable_stats_extrapolation(true);
+      // Table property unset --> Extrapolation enabled
+      AnalyzesOk(String.format(stmt, "extrap_config.tbl_prop_unset"));
+      // Table property false --> Extrapolation disabled
+      AnalysisError(String.format(stmt, "extrap_config.tbl_prop_false"), err);
+      // Table property true --> Extrapolation enabled
+      AnalyzesOk(String.format(stmt, "extrap_config.tbl_prop_true"));
+
+      // Test file formats.
       gflags.setEnable_stats_extrapolation(true);
+      checkComputeStatsStmt("compute stats functional.alltypes tablesample system (10)");
+      checkComputeStatsStmt(
+          "compute stats functional.alltypes tablesample system (55) repeatable(1)");
+      AnalysisError("compute stats functional.alltypes tablesample system (101)",
+          "Invalid percent of bytes value '101'. " +
+          "The percent of bytes to sample must be between 0 and 100.");
+      AnalysisError("compute stats functional_kudu.alltypes tablesample system (1)",
+          "TABLESAMPLE is only supported on HDFS tables.");
+      AnalysisError("compute stats functional_hbase.alltypes tablesample system (2)",
+          "TABLESAMPLE is only supported on HDFS tables.");
+      AnalysisError(
+          "compute stats functional.alltypes_datasource tablesample system (3)",
+          "TABLESAMPLE is only supported on HDFS tables.");
+
+      // Test file formats with columns whitelist.
+      gflags.setEnable_stats_extrapolation(true);
+      checkComputeStatsStmt(
+          "compute stats functional.alltypes (int_col, double_col) tablesample " +
+          "system (55) repeatable(1)",
+          Lists.newArrayList("int_col", "double_col"));
+      AnalysisError("compute stats functional.alltypes tablesample system (101)",
+          "Invalid percent of bytes value '101'. " +
+          "The percent of bytes to sample must be between 0 and 100.");
+      AnalysisError("compute stats functional_kudu.alltypes tablesample system (1)",
+          "TABLESAMPLE is only supported on HDFS tables.");
+      AnalysisError("compute stats functional_hbase.alltypes tablesample system (2)",
+          "TABLESAMPLE is only supported on HDFS tables.");
+      AnalysisError(
+          "compute stats functional.alltypes_datasource tablesample system (3)",
+          "TABLESAMPLE is only supported on HDFS tables.");
+
       // Test different COMPUTE_STATS_MIN_SAMPLE_BYTES.
       TQueryOptions queryOpts = new TQueryOptions();
 
@@ -1328,26 +1388,6 @@ public class AnalyzeDDLTest extends FrontendTestBase {
       // changes. Expect a sample between 4 and 6 of the 24 total files.
       Assert.assertTrue(adjustedStmt.getEffectiveSamplingPerc() >= 4.0 / 24);
       Assert.assertTrue(adjustedStmt.getEffectiveSamplingPerc() <= 6.0 / 24);
-      // Checks that whitelisted columns works with tablesample.
-      checkComputeStatsStmt(
-          "compute stats functional.alltypes (int_col, double_col) tablesample " +
-          "system (55) repeatable(1)",
-          Lists.newArrayList("int_col", "double_col"));
-      AnalysisError("compute stats functional.alltypes tablesample system (101)",
-          "Invalid percent of bytes value '101'. " +
-          "The percent of bytes to sample must be between 0 and 100.");
-      AnalysisError("compute stats functional_kudu.alltypes tablesample system (1)",
-          "TABLESAMPLE is only supported on HDFS tables.");
-      AnalysisError("compute stats functional_hbase.alltypes tablesample system (2)",
-          "TABLESAMPLE is only supported on HDFS tables.");
-      AnalysisError(
-          "compute stats functional.alltypes_datasource tablesample system (3)",
-          "TABLESAMPLE is only supported on HDFS tables.");
-
-      gflags.setEnable_stats_extrapolation(false);
-      AnalysisError("compute stats functional.alltypes tablesample system (10)",
-          "COMPUTE STATS TABLESAMPLE requires --enable_stats_extrapolation=true. " +
-          "Stats extrapolation is currently disabled.");
     } finally {
       gflags.setEnable_stats_extrapolation(origEnableStatsExtrapolation);
     }

http://git-wip-us.apache.org/repos/asf/impala/blob/1a1927b0/fe/src/test/java/org/apache/impala/planner/StatsExtrapolationTest.java
----------------------------------------------------------------------
diff --git a/fe/src/test/java/org/apache/impala/planner/StatsExtrapolationTest.java b/fe/src/test/java/org/apache/impala/planner/StatsExtrapolationTest.java
index bee6a32..a0a8566 100644
--- a/fe/src/test/java/org/apache/impala/planner/StatsExtrapolationTest.java
+++ b/fe/src/test/java/org/apache/impala/planner/StatsExtrapolationTest.java
@@ -20,6 +20,7 @@ package org.apache.impala.planner;
 import static org.junit.Assert.assertEquals;
 
 import java.util.HashMap;
+import java.util.Map;
 
 import org.apache.hadoop.hive.common.StatsSetupConst;
 import org.apache.impala.catalog.HdfsTable;
@@ -32,28 +33,36 @@ import org.junit.Test;
 import com.google.common.base.Preconditions;
 
 /**
- * Tests the behavior of stats extrapolation with valid, invalid, and unset stats,
- * as well as extreme values and other edge cases.
+ * Tests the configuration options and behavior of stats extrapolation with valid,
+ * invalid, and unset stats, as well as extreme values and other edge cases.
  */
 public class StatsExtrapolationTest extends FrontendTestBase {
 
   /**
    * Sets the row count and total file size stats in the given table.
    * Unsets the corresponding statistic if a null value is passed.
+   * Preserves existing table properties.
    */
   private void setStats(Table tbl, Long rowCount, Long totalSize) {
-    org.apache.hadoop.hive.metastore.api.Table msTbl =
-        new org.apache.hadoop.hive.metastore.api.Table();
-    msTbl.setParameters(new HashMap<String, String>());
+    org.apache.hadoop.hive.metastore.api.Table msTbl = tbl.getMetaStoreTable();
+    if (msTbl == null) {
+      msTbl = new org.apache.hadoop.hive.metastore.api.Table();
+      msTbl.setParameters(new HashMap<String, String>());
+    }
+    if (msTbl.getParameters() == null) {
+      msTbl.setParameters(new HashMap<String, String>());
+    }
+    Map<String, String> params = msTbl.getParameters();
     if (rowCount != null) {
-      msTbl.getParameters().put(StatsSetupConst.ROW_COUNT,
-          String.valueOf(rowCount));
+      params.put(StatsSetupConst.ROW_COUNT, String.valueOf(rowCount));
+    } else {
+      params.remove(StatsSetupConst.ROW_COUNT);
     }
     if (totalSize != null) {
-      msTbl.getParameters().put(StatsSetupConst.TOTAL_SIZE,
-          String.valueOf(totalSize));
+      params.put(StatsSetupConst.TOTAL_SIZE, String.valueOf(totalSize));
+    } else {
+      params.remove(StatsSetupConst.TOTAL_SIZE);
     }
-    tbl.setMetaStoreTable(msTbl);
     tbl.setTableStats(msTbl);
   }
 
@@ -61,8 +70,8 @@ public class StatsExtrapolationTest extends FrontendTestBase {
       long fileBytes, long expectedExtrapNumRows) {
     Preconditions.checkState(tbl instanceof HdfsTable);
     setStats(tbl, rowCount, totalSize);
-    long actualExrtapNumRows = ((HdfsTable)tbl).getExtrapolatedNumRows(fileBytes);
-    assertEquals(expectedExtrapNumRows, actualExrtapNumRows);
+    long actualExtrapNumRows = ((HdfsTable)tbl).getExtrapolatedNumRows(fileBytes);
+    assertEquals(expectedExtrapNumRows, actualExtrapNumRows);
   }
 
   private void testInvalidStats(Table tbl, Long rowCount, Long totalSize) {
@@ -79,7 +88,7 @@ public class StatsExtrapolationTest extends FrontendTestBase {
     addTestDb("extrap_stats", null);
     Table tbl = addTestTable("create table extrap_stats.t (i int)");
 
-    // Modify/restore the backend config for this test.
+    // Replace/restore the static backend config for this test.
     TBackendGflags gflags = BackendConfig.INSTANCE.getBackendCfg();
     boolean origEnableStatsExtrapolation = gflags.isEnable_stats_extrapolation();
     try {
@@ -134,24 +143,56 @@ public class StatsExtrapolationTest extends FrontendTestBase {
   }
 
   @Test
-  public void TestStatsExtrapolationDisabled() {
-    addTestDb("extrap_stats", null);
-    Table tbl = addTestTable("create table extrap_stats.t (i int)");
-
-    // Modify/restore the backend config for this test.
+  public void TestStatsExtrapolationConfig() {
+    addTestDb("extrap_config", null);
+    Table propUnsetTbl =
+        addTestTable("create table extrap_config.tbl_prop_unset (i int)");
+    Table propFalseTbl =
+        addTestTable("create table extrap_config.tbl_prop_false (i int) " +
+        "tblproperties('impala.enable.stats.extrapolation'='false')");
+    Table propTrueTbl =
+        addTestTable("create table extrap_config.tbl_prop_true (i int) " +
+        "tblproperties('impala.enable.stats.extrapolation'='true')");
+
+    // Replace/restore the static backend config for this test.
     TBackendGflags gflags = BackendConfig.INSTANCE.getBackendCfg();
     boolean origEnableStatsExtrapolation = gflags.isEnable_stats_extrapolation();
     try {
+      // Test --enable_stats_extrapolation=false
       gflags.setEnable_stats_extrapolation(false);
-
-      // Always expect -1 even with legitimate stats.
-      runTest(tbl, 100L, 1000L, 0, -1);
-      runTest(tbl, 100L, 1000L, 100, -1);
-      runTest(tbl, 100L, 1000L, 1000000000, -1);
-      runTest(tbl, 100L, 1000L, Long.MAX_VALUE, -1);
-      runTest(tbl, 100L, 1000L, -100, -1);
+      // Table property unset --> Extrapolation disabled
+      configTestExtrapolationDisabled(propUnsetTbl);
+      // Table property false --> Extrapolation disabled
+      configTestExtrapolationDisabled(propFalseTbl);
+      // Table property true --> Extrapolation enabled
+      configTestExtrapolationEnabled(propTrueTbl);
+
+      // Test --enable_stats_extrapolation=true
+      gflags.setEnable_stats_extrapolation(true);
+      // Table property unset --> Extrapolation enabled
+      configTestExtrapolationEnabled(propUnsetTbl);
+      // Table property false --> Extrapolation disabled
+      configTestExtrapolationDisabled(propFalseTbl);
+      // Table property true --> Extrapolation enabled
+      configTestExtrapolationEnabled(propTrueTbl);
     } finally {
       gflags.setEnable_stats_extrapolation(origEnableStatsExtrapolation);
     }
   }
+
+  private void configTestExtrapolationDisabled(Table tbl) {
+    runTest(tbl, 100L, 1000L, 0, -1);
+    runTest(tbl, 100L, 1000L, 100, -1);
+    runTest(tbl, 100L, 1000L, 1000000000, -1);
+    runTest(tbl, 100L, 1000L, Long.MAX_VALUE, -1);
+    runTest(tbl, 100L, 1000L, -100, -1);
+  }
+
+  private void configTestExtrapolationEnabled(Table tbl) {
+    runTest(tbl, 100L, 1000L, 0, 0);
+    runTest(tbl, 100L, 1000L, 100, 10);
+    runTest(tbl, 100L, 1000L, 1000000000, 100000000);
+    runTest(tbl, 100L, 1000L, Long.MAX_VALUE, 922337203685477632L);
+    runTest(tbl, 100L, 1000L, -100, -1);
+  }
 }
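
The expected values in configTestExtrapolationEnabled() follow from simple
proportional scaling: with stats of 100 rows over 1000 bytes, the extrapolated
count is roughly file_bytes * (num_rows / total_file_bytes), computed in double
precision (hence the not-quite-exact value for Long.MAX_VALUE). A rough Python
model of getExtrapolatedNumRows() implied by those expectations; the exact
rounding in the FE may differ:

  JAVA_LONG_MAX = 2**63 - 1

  def extrapolate_num_rows(num_rows, total_file_bytes, file_bytes, enabled=True):
      if not enabled:
          return -1                      # extrapolation disabled
      if file_bytes == 0:
          return 0
      if file_bytes < 0 or num_rows < 0 or total_file_bytes <= 0:
          return -1                      # missing or invalid stats
      rows_per_byte = num_rows / float(total_file_bytes)
      return max(1, int(round(rows_per_byte * float(file_bytes))))

  assert extrapolate_num_rows(100, 1000, 0) == 0
  assert extrapolate_num_rows(100, 1000, 100) == 10
  assert extrapolate_num_rows(100, 1000, 1000000000) == 100000000
  assert extrapolate_num_rows(100, 1000, JAVA_LONG_MAX) == 922337203685477632
  assert extrapolate_num_rows(100, 1000, -100) == -1
  assert extrapolate_num_rows(100, 1000, 100, enabled=False) == -1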

http://git-wip-us.apache.org/repos/asf/impala/blob/1a1927b0/testdata/workloads/functional-query/queries/QueryTest/stats-extrapolation.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/stats-extrapolation.test b/testdata/workloads/functional-query/queries/QueryTest/stats-extrapolation.test
index b8081ee..7da7baf 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/stats-extrapolation.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/stats-extrapolation.test
@@ -2,6 +2,7 @@
 ---- QUERY
 # This test relies on a deterministic row order so we use "sort by (id)".
 create table alltypes sort by (id) like functional_parquet.alltypes;
+alter table alltypes set tblproperties("impala.enable.stats.extrapolation"="true");
 insert into alltypes partition(year, month)
 select * from functional_parquet.alltypes where year = 2009;
 ====

http://git-wip-us.apache.org/repos/asf/impala/blob/1a1927b0/tests/custom_cluster/test_stats_extrapolation.py
----------------------------------------------------------------------
diff --git a/tests/custom_cluster/test_stats_extrapolation.py b/tests/custom_cluster/test_stats_extrapolation.py
index 42c3820..2910da2 100644
--- a/tests/custom_cluster/test_stats_extrapolation.py
+++ b/tests/custom_cluster/test_stats_extrapolation.py
@@ -15,16 +15,17 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from os import path
+import pytest
 from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
 from tests.common.test_dimensions import (
     create_exec_option_dimension,
     create_single_exec_option_dimension,
     create_uncompressed_text_dimension)
-from tests.util.hdfs_util import NAMENODE
-
 
 class TestStatsExtrapolation(CustomClusterTestSuite):
+  """Minimal end-to-end test for the --enable_stats_extrapolation impalad flag. This test
+  primarily checks that the flag is propagated to the FE. More testing is done in FE unit
+  tests and metadata/test_stats_extrapolation.py."""
 
   @classmethod
   def get_workload(self):
@@ -37,121 +38,21 @@ class TestStatsExtrapolation(CustomClusterTestSuite):
     cls.ImpalaTestMatrix.add_dimension(
         create_uncompressed_text_dimension(cls.get_workload()))
 
-  @CustomClusterTestSuite.with_args(impalad_args=('--enable_stats_extrapolation=true'))
+  @pytest.mark.execute_serially
+  @CustomClusterTestSuite.with_args(impalad_args="--enable_stats_extrapolation=true")
   def test_stats_extrapolation(self, vector, unique_database):
-    vector.get_value('exec_option')['num_nodes'] = 1
-    vector.get_value('exec_option')['explain_level'] = 2
-    self.run_test_case('QueryTest/stats-extrapolation', vector, unique_database)
-
-  @CustomClusterTestSuite.with_args(impalad_args=('--enable_stats_extrapolation=true'))
-  def test_compute_stats_tablesample(self, vector, unique_database):
-    """COMPUTE STATS TABLESAMPLE is inherently non-deterministic due to its use of
-    SAMPLED_NDV() so we test it specially. The goal of this test is to ensure that
-    COMPUTE STATS TABLESAMPLE computes in-the-right-ballpark stats and successfully
-    stores them in the HMS."""
-
-    # Since our test tables are small, set the minimum sample size to 0 to make sure
-    # we exercise the sampling code paths.
-    self.client.execute("set compute_stats_min_sample_size=0")
-
-    # Test partitioned table.
+    # Test row count extrapolation
+    self.client.execute("set explain_level=2")
+    explain_result = self.client.execute("explain select * from functional.alltypes")
+    assert "extrapolated-rows=7300" in " ".join(explain_result.data)
+    # Test COMPUTE STATS TABLESAMPLE
     part_test_tbl = unique_database + ".alltypes"
     self.clone_table("functional.alltypes", part_test_tbl, True, vector)
-    self.__run_sampling_test(part_test_tbl, "", "functional.alltypes", 1, 3)
-    self.__run_sampling_test(part_test_tbl, "", "functional.alltypes", 10, 7)
-    self.__run_sampling_test(part_test_tbl, "", "functional.alltypes", 20, 13)
-    self.__run_sampling_test(part_test_tbl, "", "functional.alltypes", 100, 99)
-
-    # Test unpartitioned table.
-    nopart_test_tbl = unique_database + ".alltypesnopart"
-    self.client.execute("create table {0} as select * from functional.alltypes"\
-      .format(nopart_test_tbl))
-    # Clone to use as a baseline. We run the regular COMPUTE STATS on this table.
-    nopart_test_tbl_exp = unique_database + ".alltypesnopart_exp"
-    self.clone_table(nopart_test_tbl, nopart_test_tbl_exp, False, vector)
-    self.client.execute("compute stats {0}".format(nopart_test_tbl_exp))
-    self.__run_sampling_test(nopart_test_tbl, "", nopart_test_tbl_exp, 1, 3)
-    self.__run_sampling_test(nopart_test_tbl, "", nopart_test_tbl_exp, 10, 7)
-    self.__run_sampling_test(nopart_test_tbl, "", nopart_test_tbl_exp, 20, 13)
-    self.__run_sampling_test(nopart_test_tbl, "", nopart_test_tbl_exp, 100, 99)
-
-    # Test empty table.
-    empty_test_tbl = unique_database + ".empty_tbl"
-    self.clone_table("functional.alltypes", empty_test_tbl, False, vector)
-    self.__run_sampling_test(empty_test_tbl, "", empty_test_tbl, 10, 7)
-
-    # Test wide table. Should not crash or error. This takes a few minutes so restrict
-    # to exhaustive.
-    if self.exploration_strategy() == "exhaustive":
-      wide_test_tbl = unique_database + ".wide"
-      self.clone_table("functional.widetable_1000_cols", wide_test_tbl, False, vector)
-      self.client.execute(
-        "compute stats {0} tablesample system(10)".format(wide_test_tbl))
-
-    # Test column subset.
-    column_subset_tbl = unique_database + ".column_subset"
-    columns = "(int_col, string_col)"
-    self.clone_table("functional.alltypes", column_subset_tbl, True, vector)
-    self.__run_sampling_test(column_subset_tbl, columns, "functional.alltypes", 1, 3)
-    self.__run_sampling_test(column_subset_tbl, columns, "functional.alltypes", 10, 7)
-    self.__run_sampling_test(column_subset_tbl, columns, "functional.alltypes", 20, 13)
-    self.__run_sampling_test(column_subset_tbl, columns, "functional.alltypes", 100, 99)
-
-    # Test no columns.
-    no_column_tbl = unique_database + ".no_columns"
-    columns = "()"
-    self.clone_table("functional.alltypes", no_column_tbl, True, vector)
-    self.__run_sampling_test(no_column_tbl, columns, "functional.alltypes", 10, 7)
-
-  def __run_sampling_test(self, tbl, cols, expected_tbl, perc, seed):
-    """Drops stats on 'tbl' and then runs COMPUTE STATS TABLESAMPLE on 'tbl' with the
-    given column restriction clause, sampling percent and random seed. Checks that
-    the resulting table and column stats are reasoanbly close to those of
-    'expected_tbl'."""
-    self.client.execute("drop stats {0}".format(tbl))
-    self.client.execute("compute stats {0}{1} tablesample system ({2}) repeatable ({3})"\
-      .format(tbl, cols, perc, seed))
-    self.__check_table_stats(tbl, expected_tbl)
-    self.__check_column_stats(tbl, expected_tbl)
-
-  def __check_table_stats(self, tbl, expected_tbl):
-    """Checks that the row counts reported in SHOW TABLE STATS on 'tbl' are within 2x
-    of those reported for 'expected_tbl'. Assumes that COMPUTE STATS was previously run
-    on 'expected_table' and that COMPUTE STATS TABLESAMPLE was run on 'tbl'."""
-    actual = self.client.execute("show table stats {0}".format(tbl))
-    expected = self.client.execute("show table stats {0}".format(expected_tbl))
-    assert len(actual.data) == len(expected.data)
-    assert len(actual.schema.fieldSchemas) == len(expected.schema.fieldSchemas)
-    col_names = [fs.name.upper() for fs in actual.schema.fieldSchemas]
-    rows_col_idx = col_names.index("#ROWS")
-    extrap_rows_col_idx = col_names.index("EXTRAP #ROWS")
-    for i in xrange(0, len(actual.data)):
-      act_cols = actual.data[i].split("\t")
-      exp_cols = expected.data[i].split("\t")
-      assert int(exp_cols[rows_col_idx]) >= 0
-      self.appx_equals(\
-        int(act_cols[extrap_rows_col_idx]), int(exp_cols[rows_col_idx]), 2)
-      # Only the table-level row count is stored. The partition row counts
-      # are extrapolated.
-      if act_cols[0] == "Total":
-        self.appx_equals(
-          int(act_cols[rows_col_idx]), int(exp_cols[rows_col_idx]), 2)
-      elif len(actual.data) > 1:
-        # Partition row count is expected to not be set.
-        assert int(act_cols[rows_col_idx]) == -1
-
-  def __check_column_stats(self, tbl, expected_tbl):
-    """Checks that the NDVs in SHOW COLUMNS STATS on 'tbl' are within 2x of those
-    reported for 'expected_tbl'. Assumes that COMPUTE STATS was previously run
-    on 'expected_table' and that COMPUTE STATS TABLESAMPLE was run on 'tbl'."""
-    actual = self.client.execute("show column stats {0}".format(tbl))
-    expected = self.client.execute("show column stats {0}".format(expected_tbl))
-    assert len(actual.data) == len(expected.data)
-    assert len(actual.schema.fieldSchemas) == len(expected.schema.fieldSchemas)
-    col_names = [fs.name.upper() for fs in actual.schema.fieldSchemas]
+    self.client.execute(
+        "compute stats {0} tablesample system (13)".format(part_test_tbl))
+    # Check that column stats were set.
+    col_stats = self.client.execute("show column stats {0}".format(part_test_tbl))
+    col_names = [fs.name.upper() for fs in col_stats.schema.fieldSchemas]
     ndv_col_idx = col_names.index("#DISTINCT VALUES")
-    for i in xrange(0, len(actual.data)):
-      act_cols = actual.data[i].split("\t")
-      exp_cols = expected.data[i].split("\t")
-      assert int(exp_cols[ndv_col_idx]) >= 0
-      self.appx_equals(int(act_cols[ndv_col_idx]), int(exp_cols[ndv_col_idx]), 2)
+    for row in col_stats.data:
+      assert int(row.split("\t")[ndv_col_idx]) >= 0

http://git-wip-us.apache.org/repos/asf/impala/blob/1a1927b0/tests/metadata/test_stats_extrapolation.py
----------------------------------------------------------------------
diff --git a/tests/metadata/test_stats_extrapolation.py b/tests/metadata/test_stats_extrapolation.py
new file mode 100644
index 0000000..61bdb39
--- /dev/null
+++ b/tests/metadata/test_stats_extrapolation.py
@@ -0,0 +1,175 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from os import path
+from tests.common.impala_test_suite import ImpalaTestSuite
+from tests.common.test_dimensions import (
+    create_exec_option_dimension,
+    create_single_exec_option_dimension,
+    create_uncompressed_text_dimension)
+
+class TestStatsExtrapolation(ImpalaTestSuite):
+  """Test stats extrapolation and compute stats tablesample. Stats extrapolation is
+  enabled via a table property and not via the impalad startup flag, so these tests can be
+  run as regular tests (non-custom-cluster) and in parallel with other tests."""
+
+  @classmethod
+  def get_workload(self):
+    return 'functional-query'
+
+  @classmethod
+  def add_test_dimensions(cls):
+    super(TestStatsExtrapolation, cls).add_test_dimensions()
+    cls.ImpalaTestMatrix.add_dimension(create_single_exec_option_dimension())
+    cls.ImpalaTestMatrix.add_dimension(
+        create_uncompressed_text_dimension(cls.get_workload()))
+
+  def test_stats_extrapolation(self, vector, unique_database):
+    vector.get_value('exec_option')['num_nodes'] = 1
+    vector.get_value('exec_option')['explain_level'] = 2
+    self.run_test_case('QueryTest/stats-extrapolation', vector, unique_database)
+
+  def test_compute_stats_tablesample(self, vector, unique_database):
+    """COMPUTE STATS TABLESAMPLE is inherently non-deterministic due to its use of
+    SAMPLED_NDV() so we test it specially. The goal of this test is to ensure that
+    COMPUTE STATS TABLESAMPLE computes in-the-right-ballpark stats and successfully
+    stores them in the HMS."""
+
+    # Since our test tables are small, set the minimum sample size to 0 to make sure
+    # we exercise the sampling code paths.
+    self.client.execute("set compute_stats_min_sample_size=0")
+
+    # Test partitioned table.
+    part_test_tbl = unique_database + ".alltypes"
+    self.clone_table("functional.alltypes", part_test_tbl, True, vector)
+    # Clone to use as a baseline. We run the regular COMPUTE STATS on this table.
+    part_test_tbl_base = unique_database + ".alltypes_base"
+    self.clone_table(part_test_tbl, part_test_tbl_base, True, vector)
+    self.client.execute("compute stats {0}".format(part_test_tbl_base))
+    # Enable stats extrapolation on both tables to match SHOW output.
+    self.__set_extrapolation_tblprop(part_test_tbl)
+    self.__set_extrapolation_tblprop(part_test_tbl_base)
+    self.__run_sampling_test(part_test_tbl, "", part_test_tbl_base, 1, 3)
+    self.__run_sampling_test(part_test_tbl, "", part_test_tbl_base, 10, 7)
+    self.__run_sampling_test(part_test_tbl, "", part_test_tbl_base, 20, 13)
+    self.__run_sampling_test(part_test_tbl, "", part_test_tbl_base, 100, 99)
+
+    # Test unpartitioned table.
+    nopart_test_tbl = unique_database + ".alltypesnopart"
+    self.client.execute("create table {0} as select * from functional.alltypes"\
+      .format(nopart_test_tbl))
+    # Clone to use as a baseline. We run the regular COMPUTE STATS on this table.
+    nopart_test_tbl_base = unique_database + ".alltypesnopart_base"
+    self.clone_table(nopart_test_tbl, nopart_test_tbl_base, False, vector)
+    self.client.execute("compute stats {0}".format(nopart_test_tbl_base))
+    # Enable stats extrapolation on both tables to match SHOW output.
+    self.__set_extrapolation_tblprop(nopart_test_tbl)
+    self.__set_extrapolation_tblprop(nopart_test_tbl_base)
+    self.__run_sampling_test(nopart_test_tbl, "", nopart_test_tbl_base, 1, 3)
+    self.__run_sampling_test(nopart_test_tbl, "", nopart_test_tbl_base, 10, 7)
+    self.__run_sampling_test(nopart_test_tbl, "", nopart_test_tbl_base, 20, 13)
+    self.__run_sampling_test(nopart_test_tbl, "", nopart_test_tbl_base, 100, 99)
+
+    # Test empty table.
+    empty_test_tbl = unique_database + ".empty_tbl"
+    self.clone_table("functional.alltypes", empty_test_tbl, False, vector)
+    self.__set_extrapolation_tblprop(empty_test_tbl)
+    self.__run_sampling_test(empty_test_tbl, "", empty_test_tbl, 10, 7)
+
+    # Test column subset.
+    column_subset_tbl = unique_database + ".column_subset"
+    columns = "(int_col, string_col)"
+    self.clone_table("functional.alltypes", column_subset_tbl, True, vector)
+    self.__set_extrapolation_tblprop(column_subset_tbl)
+    self.__run_sampling_test(column_subset_tbl, columns, part_test_tbl_base, 1, 3)
+    self.__run_sampling_test(column_subset_tbl, columns, part_test_tbl_base, 10, 7)
+    self.__run_sampling_test(column_subset_tbl, columns, part_test_tbl_base, 20, 13)
+    self.__run_sampling_test(column_subset_tbl, columns, part_test_tbl_base, 100, 99)
+
+    # Test no columns.
+    no_column_tbl = unique_database + ".no_columns"
+    columns = "()"
+    self.clone_table("functional.alltypes", no_column_tbl, True, vector)
+    self.__set_extrapolation_tblprop(no_column_tbl)
+    self.__run_sampling_test(no_column_tbl, columns, part_test_tbl_base, 10, 7)
+
+    # Test wide table. Should not crash or error. This takes a few minutes so restrict
+    # to exhaustive.
+    if self.exploration_strategy() == "exhaustive":
+      wide_test_tbl = unique_database + ".wide"
+      self.clone_table("functional.widetable_1000_cols", wide_test_tbl, False, vector)
+      self.__set_extrapolation_tblprop(wide_test_tbl)
+      self.client.execute(
+        "compute stats {0} tablesample system(10)".format(wide_test_tbl))
+
+  def __set_extrapolation_tblprop(self, tbl):
+    """Alters the given table to enable stats extrapolation via tblproperty."""
+    self.client.execute("alter table {0} set "\
+      "tblproperties('impala.enable.stats.extrapolation'='true')".format(tbl))
+
+  def __run_sampling_test(self, tbl, cols, expected_tbl, perc, seed):
+    """Drops stats on 'tbl' and then runs COMPUTE STATS TABLESAMPLE on 'tbl' with the
+    given column restriction clause, sampling percent and random seed. Checks that
+    the resulting table and column stats are reasonably close to those of
+    'expected_tbl'."""
+    self.client.execute("drop stats {0}".format(tbl))
+    self.client.execute("compute stats {0}{1} tablesample system ({2}) repeatable ({3})"\
+      .format(tbl, cols, perc, seed))
+    self.__check_table_stats(tbl, expected_tbl)
+    self.__check_column_stats(tbl, expected_tbl)
+
+  def __check_table_stats(self, tbl, expected_tbl):
+    """Checks that the row counts reported in SHOW TABLE STATS on 'tbl' are within 2x
+    of those reported for 'expected_tbl'. Assumes that COMPUTE STATS was previously run
+    on 'expected_tbl' and that COMPUTE STATS TABLESAMPLE was run on 'tbl'."""
+    actual = self.client.execute("show table stats {0}".format(tbl))
+    expected = self.client.execute("show table stats {0}".format(expected_tbl))
+    assert len(actual.data) == len(expected.data)
+    assert len(actual.schema.fieldSchemas) == len(expected.schema.fieldSchemas)
+    col_names = [fs.name.upper() for fs in actual.schema.fieldSchemas]
+    rows_col_idx = col_names.index("#ROWS")
+    extrap_rows_col_idx = col_names.index("EXTRAP #ROWS")
+    for i in xrange(0, len(actual.data)):
+      act_cols = actual.data[i].split("\t")
+      exp_cols = expected.data[i].split("\t")
+      assert int(exp_cols[rows_col_idx]) >= 0
+      self.appx_equals(\
+        int(act_cols[extrap_rows_col_idx]), int(exp_cols[rows_col_idx]), 2)
+      # Only the table-level row count is stored. The partition row counts
+      # are extrapolated.
+      if act_cols[0] == "Total":
+        self.appx_equals(
+          int(act_cols[rows_col_idx]), int(exp_cols[rows_col_idx]), 2)
+      elif len(actual.data) > 1:
+        # Partition row count is expected to not be set.
+        assert int(act_cols[rows_col_idx]) == -1
+
+  def __check_column_stats(self, tbl, expected_tbl):
+    """Checks that the NDVs in SHOW COLUMNS STATS on 'tbl' are within 2x of those
+    reported for 'expected_tbl'. Assumes that COMPUTE STATS was previously run
+    on 'expected_table' and that COMPUTE STATS TABLESAMPLE was run on 'tbl'."""
+    actual = self.client.execute("show column stats {0}".format(tbl))
+    expected = self.client.execute("show column stats {0}".format(expected_tbl))
+    assert len(actual.data) == len(expected.data)
+    assert len(actual.schema.fieldSchemas) == len(expected.schema.fieldSchemas)
+    col_names = [fs.name.upper() for fs in actual.schema.fieldSchemas]
+    ndv_col_idx = col_names.index("#DISTINCT VALUES")
+    for i in xrange(0, len(actual.data)):
+      act_cols = actual.data[i].split("\t")
+      exp_cols = expected.data[i].split("\t")
+      assert int(exp_cols[ndv_col_idx]) >= 0
+      self.appx_equals(int(act_cols[ndv_col_idx]), int(exp_cols[ndv_col_idx]), 2)


[3/6] impala git commit: IMPALA-4924 addendum: Change result type to decimal in a TPCH query

Posted by lv...@apache.org.
IMPALA-4924 addendum: Change result type to decimal in a TPCH query

Change the expected result type of Kudu TPCH Q17 to Decimal because
DECIMAL_V2 is now enabled by default. This was not done earlier because
we were not running TPCH on Kudu regularly.

Cherry-picks: not for 2.x

Change-Id: I46fc038d40969547622707ce77a037494f0ed0a9
Reviewed-on: http://gerrit.cloudera.org:8080/9208
Reviewed-by: Taras Bobrovytsky <tb...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/a493a016
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/a493a016
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/a493a016

Branch: refs/heads/master
Commit: a493a016450dc217d4d034d49f4a150429828454
Parents: c201753
Author: Taras Bobrovytsky <tb...@cloudera.com>
Authored: Fri Feb 2 17:23:06 2018 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Sat Feb 3 05:22:24 2018 +0000

----------------------------------------------------------------------
 testdata/workloads/tpch/queries/tpch-kudu-q17.test | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/a493a016/testdata/workloads/tpch/queries/tpch-kudu-q17.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/tpch/queries/tpch-kudu-q17.test b/testdata/workloads/tpch/queries/tpch-kudu-q17.test
index 81f9d35..4f66748 100644
--- a/testdata/workloads/tpch/queries/tpch-kudu-q17.test
+++ b/testdata/workloads/tpch/queries/tpch-kudu-q17.test
@@ -21,5 +21,5 @@ where
 ---- RESULTS
 348406.05
 ---- TYPES
-double
-====
\ No newline at end of file
+decimal
+====