You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by to...@apache.org on 2018/07/12 02:47:18 UTC

[2/3] impala git commit: IMPALA-7140 (part 6): fetch column stats for LocalTable

IMPALA-7140 (part 6): fetch column stats for LocalTable

This adds fetching of column statistics for LocalTable. Currently, all
column stats are fetched when the table is loaded, even for simple
statements like 'DESCRIBE' where they aren't necessary. This is because
I couldn't find a convenient spot during analysis at which time the set
of necessary columns are known. I left a TODO for this potential
improvement.

With this change I can see that 'SHOW COLUMN STATS' shows the expected
results for functional.alltypes. A new simple unit test verifies this.

Planner tests still don't pass due to some NullPointerExceptions related
to loading functions from the builtins DB -- most of the tests seem to
rely on simple built-ins like COUNT and CAST.

Change-Id: Ib6403c2bedf4ee29c5e6f90e947382cb44f46e0c
Reviewed-on: http://gerrit.cloudera.org:8080/10797
Tested-by: Impala Public Jenkins <im...@cloudera.com>
Reviewed-by: Todd Lipcon <to...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/9df9efc5
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/9df9efc5
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/9df9efc5

Branch: refs/heads/master
Commit: 9df9efc5f33d2e31f218b69a6053b575a30696a6
Parents: 9b4e6d8
Author: Todd Lipcon <to...@cloudera.com>
Authored: Wed Jun 20 15:42:39 2018 -0700
Committer: Todd Lipcon <to...@apache.org>
Committed: Thu Jul 12 02:36:55 2018 +0000

----------------------------------------------------------------------
 .../apache/impala/catalog/FeCatalogUtils.java   | 29 ++++++++++++++++++++
 .../java/org/apache/impala/catalog/Table.java   | 19 ++-----------
 .../catalog/local/DirectMetaProvider.java       | 10 +++++++
 .../impala/catalog/local/LocalFsTable.java      | 24 ++++++++++++++++
 .../apache/impala/catalog/local/LocalTable.java | 29 +++++++++++++++++---
 .../impala/catalog/local/MetaProvider.java      |  7 +++++
 .../impala/catalog/local/LocalCatalogTest.java  | 16 +++++++++++
 7 files changed, 114 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/9df9efc5/fe/src/main/java/org/apache/impala/catalog/FeCatalogUtils.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/catalog/FeCatalogUtils.java b/fe/src/main/java/org/apache/impala/catalog/FeCatalogUtils.java
index dc16629..12b152f 100644
--- a/fe/src/main/java/org/apache/impala/catalog/FeCatalogUtils.java
+++ b/fe/src/main/java/org/apache/impala/catalog/FeCatalogUtils.java
@@ -24,6 +24,7 @@ import java.util.Map;
 
 import org.apache.hadoop.hive.common.FileUtils;
 import org.apache.hadoop.hive.common.StatsSetupConst;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
 import org.apache.hadoop.hive.metastore.api.FieldSchema;
 import org.apache.impala.analysis.Expr;
 import org.apache.impala.analysis.LiteralExpr;
@@ -129,6 +130,34 @@ public abstract class FeCatalogUtils {
   }
 
   /**
+   * Given the list of column stats returned from the metastore, inject those
+   * stats into matching columns in 'table'.
+   */
+  public static void injectColumnStats(List<ColumnStatisticsObj> colStats,
+      FeTable table) {
+    for (ColumnStatisticsObj stats: colStats) {
+      Column col = table.getColumn(stats.getColName());
+      Preconditions.checkNotNull(col, "Unable to find column %s in table %s",
+          stats.getColName(), table.getFullName());
+      if (!ColumnStats.isSupportedColType(col.getType())) {
+        LOG.warn(String.format(
+            "Statistics for %s, column %s are not supported as column " +
+            "has type %s", table.getFullName(), col.getName(), col.getType()));
+        continue;
+      }
+
+      if (!col.updateStats(stats.getStatsData())) {
+        LOG.warn(String.format(
+            "Failed to load column stats for %s, column %s. Stats may be " +
+            "incompatible with column type %s. Consider regenerating statistics " +
+            "for %s.", table.getFullName(), col.getName(), col.getType(),
+            table.getFullName()));
+        continue;
+      }
+    }
+  }
+
+  /**
    * Returns the value of the ROW_COUNT constant, or -1 if not found.
    */
   public static long getRowCount(Map<String, String> parameters) {

http://git-wip-us.apache.org/repos/asf/impala/blob/9df9efc5/fe/src/main/java/org/apache/impala/catalog/Table.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/catalog/Table.java b/fe/src/main/java/org/apache/impala/catalog/Table.java
index 7fdc3c9..06c4500 100644
--- a/fe/src/main/java/org/apache/impala/catalog/Table.java
+++ b/fe/src/main/java/org/apache/impala/catalog/Table.java
@@ -214,6 +214,8 @@ public abstract class Table extends CatalogObjectImpl implements FeTable {
 
     // We need to only query those columns which may have stats; asking HMS for other
     // columns causes loadAllColumnStats() to return nothing.
+    // TODO(todd): this no longer seems to be true - asking for a non-existent column
+    // is just ignored, and the columns that do exist are returned.
     List<String> colNames = getColumnNamesWithHmsStats();
 
     try {
@@ -223,22 +225,7 @@ public abstract class Table extends CatalogObjectImpl implements FeTable {
       return;
     }
 
-    for (ColumnStatisticsObj stats: colStats) {
-      Column col = getColumn(stats.getColName());
-      Preconditions.checkNotNull(col);
-      if (!ColumnStats.isSupportedColType(col.getType())) {
-        LOG.warn(String.format("Statistics for %s, column %s are not supported as " +
-                "column has type %s", getFullName(), col.getName(), col.getType()));
-        continue;
-      }
-
-      if (!col.updateStats(stats.getStatsData())) {
-        LOG.warn(String.format("Failed to load column stats for %s, column %s. Stats " +
-            "may be incompatible with column type %s. Consider regenerating statistics " +
-            "for %s.", getFullName(), col.getName(), col.getType(), getFullName()));
-        continue;
-      }
-    }
+    FeCatalogUtils.injectColumnStats(colStats, this);
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/impala/blob/9df9efc5/fe/src/main/java/org/apache/impala/catalog/local/DirectMetaProvider.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/catalog/local/DirectMetaProvider.java b/fe/src/main/java/org/apache/impala/catalog/local/DirectMetaProvider.java
index aadbf23..59acd9d 100644
--- a/fe/src/main/java/org/apache/impala/catalog/local/DirectMetaProvider.java
+++ b/fe/src/main/java/org/apache/impala/catalog/local/DirectMetaProvider.java
@@ -28,6 +28,7 @@ import org.apache.hadoop.fs.LocatedFileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.hadoop.hive.common.FileUtils;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
 import org.apache.hadoop.hive.metastore.api.Database;
 import org.apache.hadoop.hive.metastore.api.MetaException;
 import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
@@ -160,6 +161,15 @@ class DirectMetaProvider implements MetaProvider {
     return ret;
   }
 
+
+  @Override
+  public List<ColumnStatisticsObj> loadTableColumnStatistics(String dbName,
+      String tblName, List<String> colNames) throws TException {
+    try (MetaStoreClient c = msClientPool_.getClient()) {
+      return c.getHiveClient().getTableColumnStatistics(dbName, tblName, colNames);
+    }
+  }
+
   @Override
   public List<LocatedFileStatus> loadFileMetadata(Path dir) throws IOException {
     Preconditions.checkNotNull(dir);

http://git-wip-us.apache.org/repos/asf/impala/blob/9df9efc5/fe/src/main/java/org/apache/impala/catalog/local/LocalFsTable.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/catalog/local/LocalFsTable.java b/fe/src/main/java/org/apache/impala/catalog/local/LocalFsTable.java
index 6871f90..42d3b03 100644
--- a/fe/src/main/java/org/apache/impala/catalog/local/LocalFsTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/local/LocalFsTable.java
@@ -31,6 +31,7 @@ import org.apache.impala.analysis.LiteralExpr;
 import org.apache.impala.analysis.NullLiteral;
 import org.apache.impala.catalog.CatalogException;
 import org.apache.impala.catalog.Column;
+import org.apache.impala.catalog.ColumnStats;
 import org.apache.impala.catalog.FeCatalogUtils;
 import org.apache.impala.catalog.FeFsPartition;
 import org.apache.impala.catalog.FeFsTable;
@@ -375,6 +376,29 @@ public class LocalFsTable extends LocalTable implements FeFsTable {
     partitionSpecs_ = b.build();
   }
 
+  /**
+   * Override base implementation to populate column stats for
+   * clustering columns based on the partition map.
+   */
+  @Override
+  protected void loadColumnStats() {
+    super.loadColumnStats();
+    // TODO(todd): this is called for all tables even if not necessary,
+    // which means we need to load all partition names, even if not
+    // necessary.
+    loadPartitionValueMap();
+    for (int i = 0; i < getNumClusteringCols(); i++) {
+      ColumnStats stats = getColumns().get(i).getStats();
+      int nonNullParts = partitionValueMap_.get(i).size();
+      int nullParts = nullPartitionIds_.get(i).size();
+      stats.setNumDistinctValues(nonNullParts + nullParts);
+      // TODO(todd): this calculation ends up setting the num_nulls stat
+      // to the number of partitions with null rows, not the number of rows.
+      // However, it maintains the existing behavior from HdfsTable.
+      stats.setNumNulls(nullParts);
+    }
+  }
+
   @Override
   public int parseSkipHeaderLineCount(StringBuilder error) {
     // TODO Auto-generated method stub

http://git-wip-us.apache.org/repos/asf/impala/blob/9df9efc5/fe/src/main/java/org/apache/impala/catalog/local/LocalTable.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/catalog/local/LocalTable.java b/fe/src/main/java/org/apache/impala/catalog/local/LocalTable.java
index a0a7bd7..ae697b5 100644
--- a/fe/src/main/java/org/apache/impala/catalog/local/LocalTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/local/LocalTable.java
@@ -22,6 +22,7 @@ import java.util.List;
 
 import javax.annotation.concurrent.Immutable;
 
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
 import org.apache.hadoop.hive.metastore.api.Table;
 import org.apache.hadoop.hive.serde.serdeConstants;
 import org.apache.impala.analysis.TableName;
@@ -37,6 +38,7 @@ import org.apache.impala.catalog.StructType;
 import org.apache.impala.catalog.TableLoadingException;
 import org.apache.impala.thrift.TCatalogObjectType;
 import org.apache.impala.thrift.TTableStats;
+import org.apache.log4j.Logger;
 import org.apache.thrift.TException;
 
 import com.google.common.base.Preconditions;
@@ -52,6 +54,8 @@ import com.google.common.collect.Lists;
  * each catalog instance.
  */
 abstract class LocalTable implements FeTable {
+  private static final Logger LOG = Logger.getLogger(LocalTable.class);
+
   protected final LocalDb db_;
   /** The lower-case name of the table. */
   protected final String name_;
@@ -61,15 +65,22 @@ abstract class LocalTable implements FeTable {
     // In order to know which kind of table subclass to instantiate, we need
     // to eagerly grab and parse the top-level Table object from the HMS.
     SchemaInfo schemaInfo = SchemaInfo.load(db, tblName);
+    LocalTable t;
     if (HdfsFileFormat.isHdfsInputFormatClass(
         schemaInfo.msTable_.getSd().getInputFormat())) {
-      return new LocalFsTable(db, tblName, schemaInfo);
+      t = new LocalFsTable(db, tblName, schemaInfo);
+    } else {
+      throw new LocalCatalogException("Unknown table type for table " +
+          db.getName() + "." + tblName);
     }
 
-    throw new LocalCatalogException("Unknown table type for table " +
-        db.getName() + "." + tblName);
+    // TODO(todd): it would be preferable to only load stats for those columns
+    // referenced in a query, but there doesn't seem to be a convenient spot
+    // in between slot reference resolution and where the stats are needed.
+    // So, for now, we'll just load all the column stats up front.
+    t.loadColumnStats();
+    return t;
   }
-
   public LocalTable(LocalDb db, String tblName, SchemaInfo schemaInfo) {
     this.db_ = Preconditions.checkNotNull(db);
     this.name_ = Preconditions.checkNotNull(tblName);
@@ -179,6 +190,16 @@ abstract class LocalTable implements FeTable {
     return schemaInfo_.tableStats_;
   }
 
+  protected void loadColumnStats() {
+    try {
+      List<ColumnStatisticsObj> stats = db_.getCatalog().getMetaProvider()
+          .loadTableColumnStatistics(db_.getName(), getName(), getColumnNames());
+      FeCatalogUtils.injectColumnStats(stats, this);
+    } catch (TException e) {
+      LOG.warn("Could not load column statistics for: " + getFullName(), e);
+    }
+  }
+
   /**
    * The table schema, loaded from the HMS Table object. This is common
    * to all Table implementations and includes the column definitions and

http://git-wip-us.apache.org/repos/asf/impala/blob/9df9efc5/fe/src/main/java/org/apache/impala/catalog/local/MetaProvider.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/catalog/local/MetaProvider.java b/fe/src/main/java/org/apache/impala/catalog/local/MetaProvider.java
index f59a7c9..9f433f8 100644
--- a/fe/src/main/java/org/apache/impala/catalog/local/MetaProvider.java
+++ b/fe/src/main/java/org/apache/impala/catalog/local/MetaProvider.java
@@ -23,6 +23,7 @@ import java.util.Map;
 
 import org.apache.hadoop.fs.LocatedFileStatus;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
 import org.apache.hadoop.hive.metastore.api.Database;
 import org.apache.hadoop.hive.metastore.api.MetaException;
 import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
@@ -70,6 +71,12 @@ interface MetaProvider {
       throws MetaException, TException;
 
   /**
+   * Load statistics for the given columns from the given table.
+   */
+  List<ColumnStatisticsObj> loadTableColumnStatistics(String dbName,
+      String tblName, List<String> colNames) throws TException;
+
+  /**
    * Load file metadata and block locations for the files in the given
    * partition directory.
    */

http://git-wip-us.apache.org/repos/asf/impala/blob/9df9efc5/fe/src/test/java/org/apache/impala/catalog/local/LocalCatalogTest.java
----------------------------------------------------------------------
diff --git a/fe/src/test/java/org/apache/impala/catalog/local/LocalCatalogTest.java b/fe/src/test/java/org/apache/impala/catalog/local/LocalCatalogTest.java
index a242b1c..100ba47 100644
--- a/fe/src/test/java/org/apache/impala/catalog/local/LocalCatalogTest.java
+++ b/fe/src/test/java/org/apache/impala/catalog/local/LocalCatalogTest.java
@@ -23,6 +23,7 @@ import java.util.List;
 import java.util.Set;
 
 import org.apache.impala.catalog.CatalogTest;
+import org.apache.impala.catalog.ColumnStats;
 import org.apache.impala.catalog.FeCatalogUtils;
 import org.apache.impala.catalog.FeDb;
 import org.apache.impala.catalog.FeFsPartition;
@@ -149,4 +150,19 @@ public class LocalCatalogTest {
     }
     assertEquals(24, totalFds);
   }
+
+  @Test
+  public void testColumnStats() throws Exception {
+    FeFsTable t = (FeFsTable) catalog_.getTable("functional",  "alltypesagg");
+    // Verify expected stats for a partitioning column.
+    // 'days' has 10 non-NULL plus one NULL partition
+    ColumnStats stats = t.getColumn("day").getStats();
+    assertEquals(11, stats.getNumDistinctValues());
+    assertEquals(1, stats.getNumNulls());
+
+    // Verify expected stats for timestamp.
+    stats = t.getColumn("timestamp_col").getStats();
+    assertEquals(10210, stats.getNumDistinctValues());
+    assertEquals(-1, stats.getNumNulls());
+  }
 }