You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/02/28 22:30:42 UTC

[impala] 03/07: IMPALA-6897: Catalog web-ui exposes top-n tables with most num of files.

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 1879436955e06b094da52835881528e453274d29
Author: Yongzhi Chen <yc...@cloudera.com>
AuthorDate: Thu Feb 14 12:25:44 2019 -0500

    IMPALA-6897: Catalog web-ui exposes top-n tables with most num of files.
    
    Add functions in CatalogUsageMonitor to monitor and report the catalog
    usage of the tables that have the largest number of files.
    List those tables in the Catalog server web-ui, sorted by their
    number of files.
    
    Testing:
    Add tests to check table usage information is in the catalogd's
    catalog page.
    
    Change-Id: I04df5756641bb10dbb86d813b8001c4a04d7dc9b
    Reviewed-on: http://gerrit.cloudera.org:8080/12483
    Reviewed-by: Bharath Vissapragada <bh...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/catalog/catalog-server.cc                   | 26 ++++++++++++++
 be/src/catalog/catalog-server.h                    | 11 ++++--
 common/thrift/JniCatalog.thrift                    |  7 ++++
 .../impala/catalog/CatalogServiceCatalog.java      |  9 +++++
 .../apache/impala/catalog/CatalogUsageMonitor.java | 25 ++++++++++++--
 .../java/org/apache/impala/catalog/HdfsTable.java  |  1 +
 .../main/java/org/apache/impala/catalog/Table.java | 14 ++++++++
 tests/webserver/test_web_pages.py                  | 35 ++++++++++++++++++-
 www/catalog.tmpl                                   | 40 ++++++++++++++++++++++
 9 files changed, 162 insertions(+), 6 deletions(-)

diff --git a/be/src/catalog/catalog-server.cc b/be/src/catalog/catalog-server.cc
index da73424..3da5ee0 100644
--- a/be/src/catalog/catalog-server.cc
+++ b/be/src/catalog/catalog-server.cc
@@ -511,6 +511,32 @@ void CatalogServer::GetCatalogUsage(Document* document) {
   num_frequent_tables.SetInt(catalog_usage_result.frequently_accessed_tables.size());
   document->AddMember("num_frequent_tables", num_frequent_tables,
       document->GetAllocator());
+
+  // Collect information about the tables with the largest number of files.
+  Value high_filecount_tbls(kArrayType);
+  for (int i = 0; i < catalog_usage_result.high_file_count_tables.size(); ++i) {
+    Value tbl_obj(kObjectType);
+    const auto& high_filecount_tbl = catalog_usage_result.high_file_count_tables[i];
+    Value tbl_name(Substitute("$0.$1", high_filecount_tbl.table_name.db_name,
+        high_filecount_tbl.table_name.table_name).c_str(), document->GetAllocator());
+    tbl_obj.AddMember("name", tbl_name, document->GetAllocator());
+    Value num_files;
+    DCHECK(high_filecount_tbl.__isset.num_files);
+    num_files.SetInt64(high_filecount_tbl.num_files);
+    tbl_obj.AddMember("num_files", num_files,
+        document->GetAllocator());
+    high_filecount_tbls.PushBack(tbl_obj, document->GetAllocator());
+  }
+  Value has_high_filecount_tbls;
+  has_high_filecount_tbls.SetBool(true);
+  document->AddMember("has_high_file_count_tables", has_high_filecount_tbls,
+      document->GetAllocator());
+  document->AddMember("high_file_count_tables", high_filecount_tbls,
+      document->GetAllocator());
+  Value num_high_filecount_tbls;
+  num_high_filecount_tbls.SetInt(catalog_usage_result.high_file_count_tables.size());
+  document->AddMember("num_high_file_count_tables", num_high_filecount_tbls,
+      document->GetAllocator());
 }
 
 void CatalogServer::EventMetricsUrlCallback(
diff --git a/be/src/catalog/catalog-server.h b/be/src/catalog/catalog-server.h
index 4d3cd3e..9e51ccd 100644
--- a/be/src/catalog/catalog-server.h
+++ b/be/src/catalog/catalog-server.h
@@ -183,8 +183,9 @@ class CatalogServer {
 
   /// Retrieves from the FE information about the current catalog usage and populates
   /// the /catalog debug webpage. The catalog usage includes information about the TOP-N
-  /// frequently used (in terms of number of metadata operations) tables as well as the
-  /// TOP-N tables with the highest memory requirements.
+  /// frequently used (in terms of number of metadata operations) tables, the TOP-N
+  /// tables with the highest memory requirements, and the TOP-N tables with the
+  /// largest number of files.
   ///
   /// Example output:
   /// "large_tables": [
@@ -199,6 +200,12 @@ class CatalogServer {
   ///        "frequency": 10
   ///      }
   ///  ]
+  ///  "high_file_count_tables": [
+  ///      {
+  ///        "name": "functional.alltypesagg",
+  ///        "num_files": 30
+  ///      }
+  ///  ]
   void GetCatalogUsage(rapidjson::Document* document);
 
   /// Debug webpage handler that is used to dump all the registered metrics of a
diff --git a/common/thrift/JniCatalog.thrift b/common/thrift/JniCatalog.thrift
index 24571a8..f641a60 100644
--- a/common/thrift/JniCatalog.thrift
+++ b/common/thrift/JniCatalog.thrift
@@ -706,6 +706,10 @@ struct TTableUsageMetrics {
 
   // Number of metadata operations performed on the table since it was loaded.
   3: optional i64 num_metadata_operations
+
+  // Number of files in this table. For a partitioned table, this includes the file
+  // counts across all partitions.
+  4: optional i64 num_files
 }
 
 // Response to a GetCatalogUsage request.
@@ -716,6 +720,9 @@ struct TGetCatalogUsageResponse{
   // List of the most frequently accessed (in terms of number of metadata operations)
   // tables.
   2: required list<TTableUsageMetrics> frequently_accessed_tables
+
+  // List of the tables that have the largest number of files.
+  3: required list<TTableUsageMetrics> high_file_count_tables
 }
 
 struct TColumnName {
diff --git a/fe/src/main/java/org/apache/impala/catalog/CatalogServiceCatalog.java b/fe/src/main/java/org/apache/impala/catalog/CatalogServiceCatalog.java
index 63c66f2..678c5fb 100644
--- a/fe/src/main/java/org/apache/impala/catalog/CatalogServiceCatalog.java
+++ b/fe/src/main/java/org/apache/impala/catalog/CatalogServiceCatalog.java
@@ -2372,6 +2372,7 @@ public class CatalogServiceCatalog extends Catalog {
     TGetCatalogUsageResponse usage = new TGetCatalogUsageResponse();
     usage.setLarge_tables(new ArrayList<>());
     usage.setFrequently_accessed_tables(new ArrayList<>());
+    usage.setHigh_file_count_tables(new ArrayList<>());
     for (Table largeTable: CatalogUsageMonitor.INSTANCE.getLargestTables()) {
       TTableUsageMetrics tableUsageMetrics =
           new TTableUsageMetrics(largeTable.getTableName().toThrift());
@@ -2385,6 +2386,14 @@ public class CatalogServiceCatalog extends Catalog {
       tableUsageMetrics.setNum_metadata_operations(frequentTable.getMetadataOpsCount());
       usage.addToFrequently_accessed_tables(tableUsageMetrics);
     }
+
+    for (Table mostFilesTable:
+        CatalogUsageMonitor.INSTANCE.getHighFileCountTables()) {
+      TTableUsageMetrics tableUsageMetrics =
+          new TTableUsageMetrics(mostFilesTable.getTableName().toThrift());
+      tableUsageMetrics.setNum_files(mostFilesTable.getNumFiles());
+      usage.addToHigh_file_count_tables(tableUsageMetrics);
+    }
     return usage;
   }
 
diff --git a/fe/src/main/java/org/apache/impala/catalog/CatalogUsageMonitor.java b/fe/src/main/java/org/apache/impala/catalog/CatalogUsageMonitor.java
index a2e8d7e..72cf5d7 100644
--- a/fe/src/main/java/org/apache/impala/catalog/CatalogUsageMonitor.java
+++ b/fe/src/main/java/org/apache/impala/catalog/CatalogUsageMonitor.java
@@ -25,9 +25,10 @@ import com.google.common.base.Function;
 
 /**
  * Singleton class that monitors catalog usage. Currently, it tracks the most
- * frequently accessed tables (in terms of number of metadata operations) as well as
- * the tables with the highest (estimated) memory requirements. This class is
- * thread-safe.
+ * frequently accessed tables (in terms of number of metadata operations),
+ * the tables with the highest (estimated) memory requirements, and
+ * the tables with the largest number of files.
+ * This class is thread-safe.
  */
 public final class CatalogUsageMonitor {
 
@@ -37,6 +38,8 @@ public final class CatalogUsageMonitor {
 
   private final TopNCache<Table, Long> largestTables_;
 
+  private final TopNCache<Table, Long> highFileCountTables_;
+
   private CatalogUsageMonitor() {
     final int num_tables_tracked = Integer.getInteger(
         "org.apache.impala.catalog.CatalogUsageMonitor.NUM_TABLES_TRACKED", 25);
@@ -51,6 +54,13 @@ public final class CatalogUsageMonitor {
           @Override
           public Long apply(Table tbl) { return tbl.getEstimatedMetadataSize(); }
         }, num_tables_tracked, false);
+
+    highFileCountTables_ = new TopNCache<Table, Long>(
+        new Function<Table, Long>() {
+          @Override
+          public Long apply(Table tbl) { return tbl.getNumFiles(); }
+        }, num_tables_tracked, false);
+
   }
 
   public void updateFrequentlyAccessedTables(Table tbl) {
@@ -59,9 +69,14 @@ public final class CatalogUsageMonitor {
 
   public void updateLargestTables(Table tbl) { largestTables_.putOrUpdate(tbl); }
 
+  public void updateHighFileCountTables(Table tbl) {
+    highFileCountTables_.putOrUpdate(tbl);
+  }
+
   public void removeTable(Table tbl) {
     frequentlyAccessedTables_.remove(tbl);
     largestTables_.remove(tbl);
+    highFileCountTables_.remove(tbl);
   }
 
   public List<Table> getFrequentlyAccessedTables() {
@@ -69,4 +84,8 @@ public final class CatalogUsageMonitor {
   }
 
   public List<Table> getLargestTables() { return largestTables_.listEntries(); }
+
+  public List<Table> getHighFileCountTables() {
+    return highFileCountTables_.listEntries();
+  }
 }
diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
index 1612b38..e8a6e71 100644
--- a/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
+++ b/fe/src/main/java/org/apache/impala/catalog/HdfsTable.java
@@ -1852,6 +1852,7 @@ public class HdfsTable extends Table implements FeFsTable {
     memUsageEstimate += fileMetadataStats_.numFiles * PER_FD_MEM_USAGE_BYTES +
         fileMetadataStats_.numBlocks * PER_BLOCK_MEM_USAGE_BYTES;
     setEstimatedMetadataSize(memUsageEstimate);
+    setNumFiles(fileMetadataStats_.numFiles);
     THdfsTable hdfsTable = new THdfsTable(hdfsBaseDir_, getColumnNames(),
         nullPartitionKeyValue_, nullColumnValue_, idToPartition, prototypePartition);
     hdfsTable.setAvroSchema(avroSchema_);
diff --git a/fe/src/main/java/org/apache/impala/catalog/Table.java b/fe/src/main/java/org/apache/impala/catalog/Table.java
index e6b4dbe..38e650d 100644
--- a/fe/src/main/java/org/apache/impala/catalog/Table.java
+++ b/fe/src/main/java/org/apache/impala/catalog/Table.java
@@ -89,6 +89,11 @@ public abstract class Table extends CatalogObjectImpl implements FeTable {
   // table lock.
   protected AtomicLong metadataOpsCount_ = new AtomicLong(0);
 
+  // Number of files that the table has.
+  // Stored in an AtomicLong to allow this field to be accessed without holding the
+  // table lock.
+  protected AtomicLong numFiles_ = new AtomicLong(0);
+
   // Metrics for this table
   protected final Metrics metrics_ = new Metrics();
 
@@ -144,6 +149,8 @@ public abstract class Table extends CatalogObjectImpl implements FeTable {
 
   public long getMetadataOpsCount() { return metadataOpsCount_.get(); }
   public long getEstimatedMetadataSize() { return estimatedMetadataSize_.get(); }
+  public long getNumFiles() { return numFiles_.get(); }
+
   public void setEstimatedMetadataSize(long estimatedMetadataSize) {
     estimatedMetadataSize_.set(estimatedMetadataSize);
     if (!isStoredInImpaladCatalogCache()) {
@@ -158,6 +165,13 @@ public abstract class Table extends CatalogObjectImpl implements FeTable {
     }
   }
 
+  public void setNumFiles(long numFiles) {
+    numFiles_.set(numFiles);
+    if (!isStoredInImpaladCatalogCache()) {
+      CatalogUsageMonitor.INSTANCE.updateHighFileCountTables(this);
+    }
+  }
+
   public void initMetrics() {
     metrics_.addTimer(REFRESH_DURATION_METRIC);
     metrics_.addTimer(ALTER_DURATION_METRIC);
diff --git a/tests/webserver/test_web_pages.py b/tests/webserver/test_web_pages.py
index 99724b4..c03e143 100644
--- a/tests/webserver/test_web_pages.py
+++ b/tests/webserver/test_web_pages.py
@@ -25,7 +25,6 @@ import pytest
 import re
 import requests
 
-
 class TestWebPage(ImpalaTestSuite):
 
   ROOT_URL = "http://localhost:{0}/"
@@ -280,7 +279,9 @@ class TestWebPage(ImpalaTestSuite):
     self.__test_catalog_object("functional", "alltypesnopart")
     self.__test_catalog_object("functional_kudu", "alltypes")
     self.__test_table_metrics("functional", "alltypes", "total-file-size-bytes")
+    self.__test_table_metrics("functional", "alltypes", "num-files")
     self.__test_table_metrics("functional_kudu", "alltypes", "alter-duration")
+    self.__test_catalog_tablesfilesusage("functional", "alltypes", "24")
 
   def __test_catalog_object(self, db_name, tbl_name):
     """Tests the /catalog_object endpoint for the given db/table. Runs
@@ -299,6 +300,38 @@ class TestWebPage(ImpalaTestSuite):
     self.get_and_check_status(self.TABLE_METRICS_URL +
       "?name=%s.%s" % (db_name, tbl_name), metric, ports_to_test=self.CATALOG_TEST_PORT)
 
+  def __test_catalog_tablesfilesusage(self, db_name, tbl_name, numfiles):
+    """Test the list of tables with  most number of files in the catalog page.
+    Make sure the loaded table is in the list and with correct file number."""
+    self.client.execute("refresh %s.%s" % (db_name, tbl_name))
+    response = self.get_and_check_status(self.CATALOG_URL,
+      "Tables with Most Number of Files", ports_to_test=self.CATALOG_TEST_PORT)
+    list_file_str = re.search('<table id="high-file-count-tables"( .*?)</table>',
+      response[0].text, re.MULTILINE | re.DOTALL)
+    target_metric = "%s.%s-metric" % (db_name, tbl_name)
+    # Check the db table is in the list
+    assert target_metric in list_file_str.group(0)
+    list_files = re.findall('<tr>(.*?)</tr>', list_file_str.group(0),
+      re.MULTILINE | re.DOTALL)
+    for trow in list_files:
+      # Find the entry for the db table and verify its file count.
+      if re.search(target_metric, trow) is not None:
+        # Get the number following <td> in the entry
+        nfiles = re.search('(?<=\<td\>)\d+', trow)
+        assert nfiles.group(0) == numfiles
+    response = self.get_and_check_status(self.CATALOG_URL + "?json",
+      "high_file_count_tables", ports_to_test=self.CATALOG_TEST_PORT)
+    response_json = json.loads(response[0].text)
+    high_filecount_tbls = response_json["high_file_count_tables"]
+    tbl_fname = "%s.%s" % (db_name, tbl_name)
+    hasTbl = 0
+    assert len(high_filecount_tbls) > 0
+    for tblinfo in high_filecount_tbls:
+      if tblinfo["name"] == tbl_fname:
+        assert tblinfo["num_files"] == int(numfiles)
+        hasTbl = 1
+    assert hasTbl == 1
+
   def __run_query_and_get_debug_page(self, query, page_url, query_options=None,
                                      expected_state=None):
     """Runs a query to obtain the content of the debug page pointed to by page_url, then
diff --git a/www/catalog.tmpl b/www/catalog.tmpl
index 35ebf5f..b6c99e6 100644
--- a/www/catalog.tmpl
+++ b/www/catalog.tmpl
@@ -101,6 +101,46 @@ under the License.
 </script>
 {{/has_frequent_tables}}
 
+{{?has_high_file_count_tables}}
+<div class="panel panel-info">
+  <div class="panel-heading">
+      <h2 class="panel-title">
+      Top-{{num_high_file_count_tables}} Tables with Most Number of Files
+      </h2>
+  </div>
+  <div class="panel-body">
+    <table id="high-file-count-tables" class='table table-hover table-bordered'>
+      <thead>
+        <tr>
+          <th>Name</th>
+          <th>Number of Files</th>
+          <th>Metrics</th>
+        </tr>
+      </thead>
+      <tbody>
+        {{#high_file_count_tables}}
+        <tr>
+          <td><a href="catalog_object?object_type=TABLE&object_name={{name}}">{{name}}</a>
+          </td>
+          <td>{{num_files}}</td>
+          <td><a href="table_metrics?name={{name}}">{{name}}-metrics</a></td>
+        </tr>
+        {{/high_file_count_tables}}
+      </tbody>
+    </table>
+  </div>
+</div>
+
+<script>
+    $(document).ready(function() {
+        $('#high-file-count-tables').DataTable({
+            "order": [[ 1, "desc" ]],
+            "pageLength": 10
+        });
+    });
+</script>
+{{/has_high_file_count_tables}}
+
 <h3>Databases</h3>
 <ol class="breadcrumb">
 {{#databases}}