You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by mi...@apache.org on 2017/12/18 17:55:58 UTC
[5/6] impala git commit: IMPALA-5310: Part 3: Use SAMPLED_NDV() in COMPUTE STATS.

IMPALA-5310: Part 3: Use SAMPLED_NDV() in COMPUTE STATS.

Modifies COMPUTE STATS TABLESAMPLE to use the new SAMPLED_NDV()
function.

Testing:
- modified/improved existing functional tests
- core/hdfs run passed

Change-Id: I6ec0831f77698695975e45ec0bc0364c765d819b
Reviewed-on: http://gerrit.cloudera.org:8080/8840
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/1f7b3b00
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/1f7b3b00
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/1f7b3b00

Branch: refs/heads/master
Commit: 1f7b3b00e921d68857abb22f48656ee4444c194c
Parents: 5a7c10e
Author: Alex Behm <al...@cloudera.com>
Authored: Wed Dec 13 12:34:00 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Sat Dec 16 04:58:59 2017 +0000

----------------------------------------------------------------------
 be/src/exec/catalog-op-executor.cc              |   3 -
 common/thrift/JniCatalog.thrift                 |   8 --
 .../impala/analysis/ComputeStatsStmt.java       | 124 +++++++++++------
 .../impala/service/CatalogOpExecutor.java       |  27 ----
 .../QueryTest/compute-stats-tablesample.test    | 137 -------------------
 tests/common/impala_test_suite.py               |  13 ++
 .../custom_cluster/test_stats_extrapolation.py  | 117 +++++++++++-----
 tests/query_test/test_aggregation.py            |   9 +-
 8 files changed, 179 insertions(+), 259 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/1f7b3b00/be/src/exec/catalog-op-executor.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/catalog-op-executor.cc b/be/src/exec/catalog-op-executor.cc
index bb9508b..7490ed1 100644
--- a/be/src/exec/catalog-op-executor.cc
+++ b/be/src/exec/catalog-op-executor.cc
@@ -117,9 +117,6 @@ Status CatalogOpExecutor::ExecComputeStats(
     update_stats_params.table_stats.__set_total_file_bytes(
         compute_stats_params.total_file_bytes);
   }
-  if (compute_stats_params.__isset.sample_file_bytes) {
-    update_stats_params.__set_sample_file_bytes(compute_stats_params.sample_file_bytes);
-  }
   // col_stats_schema and col_stats_data will be empty if there was no column stats query.
   if (!col_stats_schema.columns.empty()) {
     if (compute_stats_params.is_incremental) {

http://git-wip-us.apache.org/repos/asf/impala/blob/1f7b3b00/common/thrift/JniCatalog.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/JniCatalog.thrift b/common/thrift/JniCatalog.thrift
index 18f30fe..6e746c7 100644
--- a/common/thrift/JniCatalog.thrift
+++ b/common/thrift/JniCatalog.thrift
@@ -295,10 +295,6 @@ struct TAlterTableUpdateStatsParams {
 
   // If true, this is the result of an incremental stats computation
   6: optional bool is_incremental
-
-  // Sum of file sizes in the table sample. Only set when TABLESAMPLE was specified.
-  // Only set for tables of type HDFS_TABLE and if is_incremental is false.
-  7: optional i64 sample_file_bytes
 }
 
 // Parameters for ALTER TABLE SET [PARTITION partitionSet] CACHED|UNCACHED
@@ -505,10 +501,6 @@ struct TComputeStatsParams {
   // Sum of file sizes in the table. Only set for tables of type HDFS_TABLE and if
   // is_incremental is false.
   9: optional i64 total_file_bytes
-
-  // Sum of file sizes in the table sample. Only set when TABLESAMPLE was specified.
-  // Only set for tables of type HDFS_TABLE and if is_incremental is false.
-  10: optional i64 sample_file_bytes
 }
 
 // Parameters for CREATE/DROP ROLE

http://git-wip-us.apache.org/repos/asf/impala/blob/1f7b3b00/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java b/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
index 64dc9bf..ed81f89 100644
--- a/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
+++ b/fe/src/main/java/org/apache/impala/analysis/ComputeStatsStmt.java
@@ -58,11 +58,11 @@ import com.google.common.collect.Sets;
  *   Computes and replaces the table-level row count and total file size, as well as all
  *   table-level column statistics. Existing partition-objects and their row count are
  *   not modified at all. The TABLESAMPLE clause can be used to limit the scanned data
- *   volume to a desired percentage. When sampling, the results of the COMPUTE STATS
- *   queries are sent to the CatalogServer. There, the stats are extrapolated before
- *   storing them into the HMS so as not to confuse other engines like Hive/SparkSQL
- *   which may rely on the shared HMS fields representing to the whole table and not
- *   a sample. See {@link CatalogOpExecutor#getExtrapolatedStatsVal}.
+ *   volume to a desired percentage. When sampling, the COMPUTE STATS queries directly
+ *   produce extrapolated stats which are then stored in the HMS via the CatalogServer.
+ *   We store extrapolated stats in the HMS so as not to confuse other engines like
+ *   Hive/SparkSQL which may rely on the shared HMS fields representing to the whole
+ *   table and not a sample. See {@link CatalogOpExecutor#getExtrapolatedStatsVal}.
  * - Stats extrapolation disabled:
  *   Computes and replaces the table-level row count and total file size, the row counts
  *   for all partitions (if applicable), as well as all table-level column statistics.
@@ -101,9 +101,9 @@ public class ComputeStatsStmt extends StatementBase {
   // Set during analysis.
   protected Table table_;
 
-  // Total number of bytes in the file sample of the target HDFS table. Set to -1 for
-  // non-HDFS tables or when TABLESAMPLE is not specified.
-  protected long sampleFileBytes_ = -1;
+  // Effective sampling percent based on the total number of bytes in the files sample.
+  // Set to -1 for non-HDFS tables or if TABLESAMPLE was not specified.
+  protected double effectiveSamplePerc_ = -1;
 
   // The Null count is not currently being used in optimization or run-time,
   // and compute stats runs 2x faster in many cases when not counting NULLs.
@@ -202,7 +202,11 @@ public class ComputeStatsStmt extends StatementBase {
       String colRefSql = ToSqlUtils.getIdentSql(c.getName());
       if (isIncremental_) {
         columnStatsSelectList.add("NDV_NO_FINALIZE(" + colRefSql + ") AS " + colRefSql);
+      } else if (effectiveSamplePerc_ > 0) {
+        columnStatsSelectList.add(String.format("SAMPLED_NDV(%s, %.10f) AS %s",
+            colRefSql, effectiveSamplePerc_, colRefSql));
       } else {
+        // Regular (non-incremental) compute stats without sampling.
         columnStatsSelectList.add("NDV(" + colRefSql + ") AS " + colRefSql);
       }
 
@@ -241,39 +245,58 @@ public class ComputeStatsStmt extends StatementBase {
   }
 
   /**
-   * Constructs two queries to compute statistics for 'tableName_', if that table exists
-   * (although if we can detect that no work needs to be done for either query, that query
-   * will be 'null' and not executed).
+   * Constructs two SQL queries for computing the row-count and column statistics and
+   * sets them in 'tableStatsQueryStr_' and 'columnStatsQueryStr_', respectively.
+   * The queries are generated as follows.
    *
-   * The first query computes the number of rows (on a per-partition basis if the table is
-   * partitioned) and has the form "SELECT COUNT(*) FROM tbl GROUP BY part_col1,
-   * part_col2...", with an optional WHERE clause for incremental computation (see below).
+   * 1. Regular COMPUTE STATS (not incremental and no sampling)
+   * 1.1 Row counts:
+   * SELECT COUNT(*) FROM tbl [GROUP BY part_col1, part_col2 ...]
+   * The GROUP BY clause is added if the target is a partitioned HDFS table and
+   * stats extrapolation is disabled. Otherwise, no GROUP BY is used.
    *
-   * The second query computes the NDV estimate, the average width, the maximum width and,
-   * optionally, the number of nulls for each column. For non-partitioned tables (or
-   * non-incremental computations), the query is simple:
+   * 1.2 Column stats:
+   * SELECT NDV(c1), CAST(-1 as typeof(c1)), MAX(length(c1)), AVG(length(c1)),
+   *        NDV(c2), CAST(-1 as typeof(c2)), MAX(length(c2)), AVG(length(c2)),
+   *        ...
+   * FROM tbl
    *
-   * SELECT NDV(col), COUNT(<nulls>), MAX(length(col)), AVG(length(col)) FROM tbl
+   * 2. COMPUTE STATS with TABLESAMPLE
+   * 2.1 Row counts:
+   * SELECT ROUND(COUNT(*) / <effective_sample_perc>)
+   * FROM tbl TABLESAMPLE SYSTEM(<sample_perc>) REPEATABLE (<random_seed>)
    *
-   * (For non-string columns, the widths are hard-coded as they are known at query
-   * construction time).
+   * 2.1 Column stats:
+   * SELECT SAMPLED_NDV(c1, p), CAST(-1 as typeof(c1)), MAX(length(c1)), AVG(length(c1)),
+   *        SAMPLED_NDV(c2, p), CAST(-1 as typeof(c2)), MAX(length(c2)), AVG(length(c2)),
+   *        ...
+   * FROM tbl TABLESAMPLE SYSTEM(<sample_perc>) REPEATABLE (<random_seed>)
+   * SAMPLED_NDV() is a specialized aggregation function that estimates the NDV based on
+   * a sample. The "p" passed to the SAMPLED_NDV() is the effective sampling rate.
    *
-   * If computation is incremental (i.e. the original statement was COMPUTE INCREMENTAL
-   * STATS.., and the underlying table is a partitioned HdfsTable), some modifications are
-   * made to the non-incremental per-column query. First, a different UDA,
-   * NDV_NO_FINALIZE() is used to retrieve and serialise the intermediate state from each
-   * column. Second, the results are grouped by partition, as with the row count query, so
-   * that the intermediate NDV computation state can be stored per-partition. The number
-   * of rows per-partition are also recorded.
+   * 3. COMPUTE INCREMENTAL STATS
+   * 3.1 Row counts:
+   * SELECT COUNT(*) FROM tbl GROUP BY part_col1, part_col2 ...
+   * [WHERE ((part_col1 = p1_val1) AND (part_col2 = p1_val2)) OR
+   *        ((part_col1 = p2_val1) AND (part_col2 = p2_val2)) OR ...]
+   * The WHERE clause is constructed to select the relevant partitions.
    *
-   * For both the row count query, and the column stats query, the query's WHERE clause is
-   * used to restrict execution only to partitions that actually require new statistics to
-   * be computed.
-   *
-   * SELECT NDV_NO_FINALIZE(col), <nulls, max, avg>, COUNT(col) FROM tbl
+   * 3.2 Column stats:
+   * SELECT NDV_NO_FINALIZE(c1), <nulls, max, avg>, COUNT(c1),
+   *        NDV_NO_FINALIZE(c2), <nulls, max, avg>, COUNT(c2),
+   *        ...
+   * FROM tbl
    * GROUP BY part_col1, part_col2, ...
-   * WHERE ((part_col1 = p1_val1) AND (part_col2 = p1_val2)) OR
-   *       ((part_col1 = p2_val1) AND (part_col2 = p2_val2)) OR ...
+   * [WHERE ((part_col1 = p1_val1) AND (part_col2 = p1_val2)) OR
+   *       ((part_col1 = p2_val1) AND (part_col2 = p2_val2)) OR ...]
+   * The WHERE clause is constructed to select the relevant partitions.
+   * NDV_NO_FINALIZE() produces a non-finalized HyperLogLog intermediate byte array.
+   *
+   * 4. For all COMPUTE STATS variants:
+   * - The MAX() and AVG() for the column stats queries are only relevant for var-len
+   *   columns like STRING. For fixed-len columns MAX() and AVG() are replaced with the
+   *   appropriate literals.
+   * - Queries will be set to null if we can detect that no work needs to be performed.
    */
   @Override
   public void analyze(Analyzer analyzer) throws AnalysisException {
@@ -432,10 +455,17 @@ public class ComputeStatsStmt extends StatementBase {
       validPartStats_.clear();
     }
 
+    // Tablesample clause to be used for all child queries.
+    String tableSampleSql = analyzeTableSampleClause(analyzer);
 
     // Query for getting the per-partition row count and the total row count.
     StringBuilder tableStatsQueryBuilder = new StringBuilder("SELECT ");
-    List<String> tableStatsSelectList = Lists.newArrayList("COUNT(*)");
+    String countSql = "COUNT(*)";
+    if (effectiveSamplePerc_ > 0) {
+      // Extrapolate the count based on the effective sampling rate.
+      countSql = String.format("ROUND(COUNT(*) / %.10f)", effectiveSamplePerc_);
+    }
+    List<String> tableStatsSelectList = Lists.newArrayList(countSql);
     // Add group by columns for incremental stats or with extrapolation disabled.
     List<String> groupByCols = Lists.newArrayList();
     if (!updateTableStatsOnly()) {
@@ -445,8 +475,6 @@ public class ComputeStatsStmt extends StatementBase {
       tableStatsSelectList.addAll(groupByCols);
     }
     tableStatsQueryBuilder.append(Joiner.on(", ").join(tableStatsSelectList));
-    // Tablesample clause to be used for all child queries.
-    String tableSampleSql = analyzeTableSampleClause(analyzer);
     tableStatsQueryBuilder.append(" FROM " + tableName_.toSql() + tableSampleSql);
 
     // Query for getting the per-column NDVs and number of NULLs.
@@ -495,7 +523,8 @@ public class ComputeStatsStmt extends StatementBase {
   }
 
   /**
-   * Analyzes the TABLESAMPLE clause and computes the sample to set 'sampleFileBytes_'.
+   * Analyzes the TABLESAMPLE clause and computes the files sample to set
+   * 'effectiveSamplePerc_'.
    * Returns the TABLESAMPLE SQL to be used for all child queries or an empty string if
    * not sampling. If sampling, the returned SQL includes a fixed random seed so all
    * child queries generate a consistent sample, even if the user did not originally
@@ -524,10 +553,20 @@ public class ComputeStatsStmt extends StatementBase {
     HdfsTable hdfsTable = (HdfsTable) table_;
     Map<Long, List<FileDescriptor>> sample = hdfsTable.getFilesSample(
         hdfsTable.getPartitions(), sampleParams_.getPercentBytes(), sampleSeed);
-    sampleFileBytes_ = 0;
+    long sampleFileBytes = 0;
     for (List<FileDescriptor> fds: sample.values()) {
-      for (FileDescriptor fd: fds) sampleFileBytes_ += fd.getFileLength();
+      for (FileDescriptor fd: fds) sampleFileBytes += fd.getFileLength();
     }
+
+    // Compute effective sampling percent.
+    long totalFileBytes = ((HdfsTable)table_).getTotalHdfsBytes();
+    if (totalFileBytes > 0) {
+      effectiveSamplePerc_ = (double) sampleFileBytes / (double) totalFileBytes;
+    } else {
+      effectiveSamplePerc_ = 0;
+    }
+    Preconditions.checkState(effectiveSamplePerc_ >= 0.0 && effectiveSamplePerc_ <= 1.0);
+
     return " " + sampleParams_.toSql(sampleSeed);
   }
 
@@ -666,13 +705,8 @@ public class ComputeStatsStmt extends StatementBase {
     if (isIncremental_) {
       params.setNum_partition_cols(((HdfsTable)table_).getNumClusteringCols());
     }
-
     if (table_ instanceof HdfsTable) {
       params.setTotal_file_bytes(((HdfsTable)table_).getTotalHdfsBytes());
-      if (sampleParams_ != null) {
-        Preconditions.checkState(sampleFileBytes_ >= 0);
-        if (sampleFileBytes_ != -1) params.setSample_file_bytes(sampleFileBytes_);
-      }
     }
     return params;
   }

http://git-wip-us.apache.org/repos/asf/impala/blob/1f7b3b00/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java b/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
index ba42583..b0ed45f 100644
--- a/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
+++ b/fe/src/main/java/org/apache/impala/service/CatalogOpExecutor.java
@@ -801,12 +801,6 @@ public class CatalogOpExecutor {
       org.apache.hadoop.hive.metastore.api.Table msTbl) throws ImpalaException {
     Preconditions.checkState(params.isSetTable_stats());
     long numRows = params.table_stats.num_rows;
-    // Extrapolate based on sampling (if applicable).
-    if (params.isSetSample_file_bytes() && params.table_stats.isSetTotal_file_bytes()) {
-      numRows = getExtrapolatedStatsVal(numRows, params.sample_file_bytes,
-          params.table_stats.total_file_bytes);
-    }
-
     // Update the table's ROW_COUNT and TOTAL_SIZE parameters.
     msTbl.putToParameters(StatsSetupConst.ROW_COUNT, String.valueOf(numRows));
     if (params.getTable_stats().isSetTotal_file_bytes()) {
@@ -852,33 +846,12 @@ public class CatalogOpExecutor {
     return colStats;
   }
 
-  /**
-   * Returns 'val' extrapolated based on the sampled and total file bytes. Uses a basic
-   * linear extrapolation. All parameters must be >= 0.
-   * The returned value is >= 'val' and >= 0. Returns Long.MAX_VALUE if a computation
-   * overflows.
-   */
-  private static long getExtrapolatedStatsVal(long val, long sampleFileBytes,
-      long totalFileBytes) {
-    Preconditions.checkArgument(val >= 0 && sampleFileBytes >= 0 && totalFileBytes >= 0);
-    double mult = 0.0;
-    if (sampleFileBytes > 0) mult = (double) totalFileBytes / sampleFileBytes;
-    // The round() caps the returned value at Long.MAX_VALUE.
-    return Math.round(val * mult);
-  }
-
   private static ColumnStatisticsData createHiveColStatsData(
       TAlterTableUpdateStatsParams params, TColumnStats colStats, Type colType) {
     ColumnStatisticsData colStatsData = new ColumnStatisticsData();
     long ndv = colStats.getNum_distinct_values();
     // Cap NDV at row count if available.
     if (params.isSetTable_stats()) ndv = Math.min(ndv, params.table_stats.num_rows);
-    // Extrapolate NDV based on sampling if applicable.
-    if (params.isSetSample_file_bytes() && params.isSetTable_stats()
-        && params.table_stats.isSetTotal_file_bytes()) {
-      ndv = getExtrapolatedStatsVal(ndv, params.sample_file_bytes,
-          params.table_stats.total_file_bytes);
-    }
 
     long numNulls = colStats.getNum_nulls();
     switch(colType.getPrimitiveType()) {

http://git-wip-us.apache.org/repos/asf/impala/blob/1f7b3b00/testdata/workloads/functional-query/queries/QueryTest/compute-stats-tablesample.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/compute-stats-tablesample.test b/testdata/workloads/functional-query/queries/QueryTest/compute-stats-tablesample.test
deleted file mode 100644
index 893f428..0000000
--- a/testdata/workloads/functional-query/queries/QueryTest/compute-stats-tablesample.test
+++ /dev/null
@@ -1,137 +0,0 @@
-====
----- QUERY
-compute stats alltypes tablesample system (50) repeatable (1)
----- RESULTS
-'Updated 1 partition(s) and 11 column(s).'
----- TYPES
-STRING
-====
----- QUERY
-# Only the table-level row count is stored. The partition row counts are extrapolated.
-show table stats alltypes
----- LABELS
-YEAR, MONTH, #ROWS, EXTRAP #ROWS, #FILES, SIZE, BYTES CACHED, CACHE REPLICATION, FORMAT, INCREMENTAL STATS, LOCATION
----- RESULTS
-'2009','1',-1,305,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=1'
-'2009','2',-1,277,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=2'
-'2009','3',-1,306,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=3'
-'2009','4',-1,299,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=4'
-'2009','5',-1,311,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=5'
-'2009','6',-1,301,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=6'
-'2009','7',-1,311,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=7'
-'2009','8',-1,311,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=8'
-'2009','9',-1,301,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=9'
-'2009','10',-1,311,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=10'
-'2009','11',-1,301,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=11'
-'2009','12',-1,311,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=12'
-'Total','',3643,3643,12,regex:.*B,'0B','','','',''
----- TYPES
-STRING,STRING,BIGINT,BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING
-====
----- QUERY
-show column stats alltypes
----- LABELS
-COLUMN, TYPE, #DISTINCT VALUES, #NULLS, MAX SIZE, AVG SIZE
----- RESULTS
-'id','INT',3643,-1,4,4
-'bool_col','BOOLEAN',2,-1,1,1
-'tinyint_col','TINYINT',17,-1,1,1
-'smallint_col','SMALLINT',17,-1,2,2
-'int_col','INT',17,-1,4,4
-'bigint_col','BIGINT',17,-1,8,8
-'float_col','FLOAT',17,-1,4,4
-'double_col','DOUBLE',17,-1,8,8
-'date_string_col','STRING',364,-1,8,8
-'string_col','STRING',17,-1,1,1
-'timestamp_col','TIMESTAMP',3612,-1,16,16
-'year','INT',1,0,4,4
-'month','INT',12,0,4,4
----- TYPES
-STRING,STRING,BIGINT,BIGINT,BIGINT,DOUBLE
-====
----- QUERY
-# Repeat tests on a very small sample.
-compute stats alltypes tablesample system (1) repeatable (1)
----- RESULTS
-'Updated 1 partition(s) and 11 column(s).'
----- TYPES
-STRING
-====
----- QUERY
-show table stats alltypes
----- LABELS
-YEAR, MONTH, #ROWS, EXTRAP #ROWS, #FILES, SIZE, BYTES CACHED, CACHE REPLICATION, FORMAT, INCREMENTAL STATS, LOCATION
----- RESULTS
-'2009','1',-1,304,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=1'
-'2009','2',-1,276,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=2'
-'2009','3',-1,305,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=3'
-'2009','4',-1,298,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=4'
-'2009','5',-1,310,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=5'
-'2009','6',-1,300,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=6'
-'2009','7',-1,310,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=7'
-'2009','8',-1,310,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=8'
-'2009','9',-1,300,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=9'
-'2009','10',-1,310,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=10'
-'2009','11',-1,300,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=11'
-'2009','12',-1,310,1,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/alltypes/year=2009/month=12'
-'Total','',3633,3633,12,regex:.*B,'0B','','','',''
----- TYPES
-STRING,STRING,BIGINT,BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING
-====
----- QUERY
-show column stats alltypes
----- LABELS
-COLUMN, TYPE, #DISTINCT VALUES, #NULLS, MAX SIZE, AVG SIZE
----- RESULTS
-'id','INT',3633,-1,4,4
-'bool_col','BOOLEAN',2,-1,1,1
-'tinyint_col','TINYINT',117,-1,1,1
-'smallint_col','SMALLINT',117,-1,2,2
-'int_col','INT',117,-1,4,4
-'bigint_col','BIGINT',117,-1,8,8
-'float_col','FLOAT',117,-1,4,4
-'double_col','DOUBLE',117,-1,8,8
-'date_string_col','STRING',352,-1,8,8
-'string_col','STRING',117,-1,1,1
-'timestamp_col','TIMESTAMP',3633,-1,16,16
-'year','INT',1,0,4,4
-'month','INT',12,0,4,4
----- TYPES
-STRING,STRING,BIGINT,BIGINT,BIGINT,DOUBLE
-====
----- QUERY
-# Test unpartitioned table.
-compute stats alltypesnopart tablesample system (10) repeatable (999)
----- RESULTS
-'Updated 1 partition(s) and 11 column(s).'
----- TYPES
-STRING
-====
----- QUERY
-show table stats alltypesnopart
----- LABELS
-#ROWS, EXTRAP #ROWS, #FILES, SIZE, BYTES CACHED, CACHE REPLICATION, FORMAT, INCREMENTAL STATS, LOCATION
----- RESULTS
-3660,3660,12,regex:.*B,'NOT CACHED','NOT CACHED','TEXT','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypesnopart'
----- TYPES
-BIGINT,BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING
-====
----- QUERY
-show column stats alltypesnopart
----- LABELS
-COLUMN, TYPE, #DISTINCT VALUES, #NULLS, MAX SIZE, AVG SIZE
----- RESULTS
-'id','INT',3660,-1,4,4
-'bool_col','BOOLEAN',2,-1,1,1
-'tinyint_col','TINYINT',63,-1,1,1
-'smallint_col','SMALLINT',63,-1,2,2
-'int_col','INT',63,-1,4,4
-'bigint_col','BIGINT',63,-1,8,8
-'float_col','FLOAT',63,-1,4,4
-'double_col','DOUBLE',63,-1,8,8
-'date_string_col','STRING',360,-1,8,8
-'string_col','STRING',63,-1,1,1
-'timestamp_col','TIMESTAMP',3660,-1,16,16
----- TYPES
-STRING,STRING,BIGINT,BIGINT,BIGINT,DOUBLE
-====

http://git-wip-us.apache.org/repos/asf/impala/blob/1f7b3b00/tests/common/impala_test_suite.py
----------------------------------------------------------------------
diff --git a/tests/common/impala_test_suite.py b/tests/common/impala_test_suite.py
index b35e054..86bbf71 100644
--- a/tests/common/impala_test_suite.py
+++ b/tests/common/impala_test_suite.py
@@ -623,6 +623,19 @@ class ImpalaTestSuite(BaseTestSuite):
     self.hive_client.drop_table(db_name, table_name, True)
     self.hive_client.create_table(table)
 
+  def clone_table(self, src_tbl, dst_tbl, recover_partitions, vector):
+    src_loc = self._get_table_location(src_tbl, vector)
+    self.client.execute("create external table {0} like {1} location '{2}'"\
+        .format(dst_tbl, src_tbl, src_loc))
+    if recover_partitions:
+      self.client.execute("alter table {0} recover partitions".format(dst_tbl))
+
+  def appx_equals(self, a, b, diff_perc):
+    """Returns True if 'a' and 'b' are within 'diff_perc' percent of each other,
+    False otherwise. 'diff_perc' must be a float in [0,1]."""
+    if a == b: return True # Avoid division by 0
+    assert abs(a - b) / float(max(a,b)) <= diff_perc
+
   def _get_table_location(self, table_name, vector):
     """ Returns the HDFS location of the table """
     result = self.execute_query_using_client(self.client,

http://git-wip-us.apache.org/repos/asf/impala/blob/1f7b3b00/tests/custom_cluster/test_stats_extrapolation.py
----------------------------------------------------------------------
diff --git a/tests/custom_cluster/test_stats_extrapolation.py b/tests/custom_cluster/test_stats_extrapolation.py
index acf259a..6d0e34d 100644
--- a/tests/custom_cluster/test_stats_extrapolation.py
+++ b/tests/custom_cluster/test_stats_extrapolation.py
@@ -45,40 +45,93 @@ class TestStatsExtrapolation(CustomClusterTestSuite):
 
   @CustomClusterTestSuite.with_args(impalad_args=('--enable_stats_extrapolation=true'))
   def test_compute_stats_tablesample(self, vector, unique_database):
-    # Create a partitioned and unpartitioned text table. Use the existing files from
-    # functional.alltypes as data because those have a known, stable file size. This
-    # test is sensitive to changes in file sizes across test runs because the sampling
-    # is file based. Creating test tables with INSERT does not guarantee that the same
-    # file sample is selected across test runs, even with REPEATABLE.
+    """COMPUTE STATS TABLESAMPLE is inherently non-deterministic due to its use of
+    SAMPLED_NDV() so we test it specially. The goal of this test is to ensure that
+    COMPUTE STATS TABLESAMPLE computes in-the-right-ballpark stats and successfully
+    stores them in the HMS."""
 
-    # Create partitioned test table. External to avoid dropping files from alltypes.
+    # Test partitioned table.
     part_test_tbl = unique_database + ".alltypes"
-    self.client.execute(
-      "create external table %s like functional.alltypes" % part_test_tbl)
-    alltypes_loc = self._get_table_location("functional.alltypes", vector)
-    for m in xrange(1, 13):
-      part_loc = path.join(alltypes_loc, "year=2009/month=%s" % m)
-      self.client.execute(
-        "alter table %s add partition (year=2009,month=%s) location '%s'"
-        % (part_test_tbl, m, part_loc))
+    self.clone_table("functional.alltypes", part_test_tbl, True, vector)
+    self.__run_sampling_test(part_test_tbl, "functional.alltypes", 1, 3)
+    self.__run_sampling_test(part_test_tbl, "functional.alltypes", 10, 7)
+    self.__run_sampling_test(part_test_tbl, "functional.alltypes", 20, 13)
+    self.__run_sampling_test(part_test_tbl, "functional.alltypes", 100, 99)
 
-    # Create unpartitioned test table.
+    # Test unpartitioned table.
     nopart_test_tbl = unique_database + ".alltypesnopart"
-    self.client.execute("drop table if exists %s" % nopart_test_tbl)
-    self.client.execute(
-      "create table %s like functional.alltypesnopart" % nopart_test_tbl)
-    nopart_test_tbl_loc = self._get_table_location(nopart_test_tbl, vector)
-    # Remove NameNode prefix and first '/' because PyWebHdfs expects that
-    if nopart_test_tbl_loc.startswith(NAMENODE):
-      nopart_test_tbl_loc = nopart_test_tbl_loc[len(NAMENODE)+1:]
-    for m in xrange(1, 13):
-      src_part_loc = alltypes_loc + "/year=2009/month=%s" % m
-      # Remove NameNode prefix and first '/' because PyWebHdfs expects that
-      if src_part_loc.startswith(NAMENODE): src_part_loc = src_part_loc[len(NAMENODE)+1:]
-      file_names = self.filesystem_client.ls(src_part_loc)
-      for f in file_names:
-        self.filesystem_client.copy(path.join(src_part_loc, f),
-                                    path.join(nopart_test_tbl_loc, f))
-    self.client.execute("refresh %s" % nopart_test_tbl)
+    self.client.execute("create table {0} as select * from functional.alltypes"\
+      .format(nopart_test_tbl))
+    # Clone to use as a baseline. We run the regular COMPUTE STATS on this table.
+    nopart_test_tbl_exp = unique_database + ".alltypesnopart_exp"
+    self.clone_table(nopart_test_tbl, nopart_test_tbl_exp, False, vector)
+    self.client.execute("compute stats {0}".format(nopart_test_tbl_exp))
+    self.__run_sampling_test(nopart_test_tbl, nopart_test_tbl_exp, 1, 3)
+    self.__run_sampling_test(nopart_test_tbl, nopart_test_tbl_exp, 10, 7)
+    self.__run_sampling_test(nopart_test_tbl, nopart_test_tbl_exp, 20, 13)
+    self.__run_sampling_test(nopart_test_tbl, nopart_test_tbl_exp, 100, 99)
+
+    # Test empty table.
+    empty_test_tbl = unique_database + ".empty"
+    self.clone_table("functional.alltypes", empty_test_tbl, False, vector)
+    self.__run_sampling_test(empty_test_tbl, empty_test_tbl, 10, 7)
+
+    # Test wide table. Should not crash or error. This takes a few minutes so restrict
+    # to exhaustive.
+    if self.exploration_strategy() == "exhaustive":
+      wide_test_tbl = unique_database + ".wide"
+      self.clone_table("functional.widetable_1000_cols", wide_test_tbl, False, vector)
+      self.client.execute(
+        "compute stats {0} tablesample system(10)".format(wide_test_tbl))
+
+  def __run_sampling_test(self, tbl, expected_tbl, perc, seed):
+    """Drops stats on 'tbl' and then runs COMPUTE STATS TABLESAMPLE on 'tbl' with the
+    given sampling percent and random seed. Checks that the resulting table and column
+    stats are reasoanbly close to those of 'expected_tbl'."""
+    self.client.execute("drop stats {0}".format(tbl))
+    self.client.execute("compute stats {0} tablesample system ({1}) repeatable ({2})"\
+      .format(tbl, perc, seed))
+    self.__check_table_stats(tbl, expected_tbl)
+    self.__check_column_stats(tbl, expected_tbl)
+
+  def __check_table_stats(self, tbl, expected_tbl):
+    """Checks that the row counts reported in SHOW TABLE STATS on 'tbl' are within 2x
+    of those reported for 'expected_tbl'. Assumes that COMPUTE STATS was previously run
+    on 'expected_table' and that COMPUTE STATS TABLESAMPLE was run on 'tbl'."""
+    actual = self.client.execute("show table stats {0}".format(tbl))
+    expected = self.client.execute("show table stats {0}".format(expected_tbl))
+    assert len(actual.data) == len(expected.data)
+    assert len(actual.schema.fieldSchemas) == len(expected.schema.fieldSchemas)
+    col_names = [fs.name.upper() for fs in actual.schema.fieldSchemas]
+    rows_col_idx = col_names.index("#ROWS")
+    extrap_rows_col_idx = col_names.index("EXTRAP #ROWS")
+    for i in xrange(0, len(actual.data)):
+      act_cols = actual.data[i].split("\t")
+      exp_cols = expected.data[i].split("\t")
+      assert int(exp_cols[rows_col_idx]) >= 0
+      self.appx_equals(\
+        int(act_cols[extrap_rows_col_idx]), int(exp_cols[rows_col_idx]), 2)
+      # Only the table-level row count is stored. The partition row counts
+      # are extrapolated.
+      if act_cols[0] == "Total":
+        self.appx_equals(
+          int(act_cols[rows_col_idx]), int(exp_cols[rows_col_idx]), 2)
+      elif len(actual.data) > 1:
+        # Partition row count is expected to not be set.
+        assert int(act_cols[rows_col_idx]) == -1
 
-    self.run_test_case('QueryTest/compute-stats-tablesample', vector, unique_database)
+  def __check_column_stats(self, tbl, expected_tbl):
+    """Checks that the NDVs in SHOW COLUMNS STATS on 'tbl' are within 2x of those
+    reported for 'expected_tbl'. Assumes that COMPUTE STATS was previously run
+    on 'expected_table' and that COMPUTE STATS TABLESAMPLE was run on 'tbl'."""
+    actual = self.client.execute("show column stats {0}".format(tbl))
+    expected = self.client.execute("show column stats {0}".format(expected_tbl))
+    assert len(actual.data) == len(expected.data)
+    assert len(actual.schema.fieldSchemas) == len(expected.schema.fieldSchemas)
+    col_names = [fs.name.upper() for fs in actual.schema.fieldSchemas]
+    ndv_col_idx = col_names.index("#DISTINCT VALUES")
+    for i in xrange(0, len(actual.data)):
+      act_cols = actual.data[i].split("\t")
+      exp_cols = expected.data[i].split("\t")
+      assert int(exp_cols[ndv_col_idx]) >= 0
+      self.appx_equals(int(act_cols[ndv_col_idx]), int(exp_cols[ndv_col_idx]), 2)

http://git-wip-us.apache.org/repos/asf/impala/blob/1f7b3b00/tests/query_test/test_aggregation.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_aggregation.py b/tests/query_test/test_aggregation.py
index 233c33a..aee2236 100644
--- a/tests/query_test/test_aggregation.py
+++ b/tests/query_test/test_aggregation.py
@@ -329,7 +329,7 @@ class TestAggregationQueries(ImpalaTestSuite):
       # Low NDV columns. We expect a reasonaby accurate estimate regardless of the
       # sampling percent.
       for i in xrange(0, 14):
-        self.__appx_equals(int(sampled_ndv_vals[i]), int(ndv_vals[i]), 0.1)
+        self.appx_equals(int(sampled_ndv_vals[i]), int(ndv_vals[i]), 0.1)
       # High NDV columns. We expect the estimate to have high variance and error.
       # Since we give NDV() and SAMPLED_NDV() the same input data, i.e., we are not
       # actually sampling for SAMPLED_NDV(), we expect the result of SAMPLED_NDV() to
@@ -337,12 +337,7 @@ class TestAggregationQueries(ImpalaTestSuite):
       # For example, the column 'id' is a PK so we expect the result of SAMPLED_NDV()
       # with a sampling percent of 0.1 to be approximately 10x of the NDV().
       for i in xrange(14, 16):
-        self.__appx_equals(int(sampled_ndv_vals[i]) * sample_perc, int(ndv_vals[i]), 2.0)
-
-  def __appx_equals(self, a, b, diff_perc):
-    """Returns True if 'a' and 'b' are within 'diff_perc' percent of each other,
-    False otherwise. 'diff_perc' must be a float in [0,1]."""
-    assert abs(a - b) / float(max(a, b)) <= diff_perc
+        self.appx_equals(int(sampled_ndv_vals[i]) * sample_perc, int(ndv_vals[i]), 2.0)
 
 class TestWideAggregationQueries(ImpalaTestSuite):
   """Test that aggregations with many grouping columns work"""