You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2020/10/21 06:38:35 UTC

[impala] 02/02: IMPALA-10168: Expose JSON catalog objects in catalogd's debug page

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit ee4043e1a0940ae5711c68336d1ad522631d0e35
Author: stiga-huang <hu...@gmail.com>
AuthorDate: Tue Oct 13 09:24:20 2020 +0800

    IMPALA-10168: Expose JSON catalog objects in catalogd's debug page
    
    Catalogd has a debug page at '/catalog_object' showing catalog objects
    in thrift debug strings. It's inconvenient for tests to parse the thrift
    string and get interesting infos.
    
    This patch extends this page to support returning JSON results, which
    eases tests to extract complex infos from the catalog objects, e.g.
    partition ids of a hdfs table. Just like getting json results from other
    pages, the usage is adding a ‘json’ argument in the URL, e.g.
    http://localhost:25020/catalog_object?json&object_type=TABLE&object_name=db1.tbl1
    
    Implementation:
    Csaba helped to find that Thrift has a protocol, TSimpleJSONProtocol,
    which can convert thrift objects to human readable JSON strings. This
    simplifies the implementation a lot. However, TSimpleJSONProtocol is not
    implemented in cpp yet (THRIFT-2476). So we do the conversion in FE to
    use its java implementation.
    
    Tests:
     - Add tests to verify json fields existence.
    
    Change-Id: I15f256b4e3f5206c7140746694106e03b0a4ad92
    Reviewed-on: http://gerrit.cloudera.org:8080/16449
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/catalog/catalog-server.cc                   | 28 +++++++++---
 be/src/catalog/catalog.cc                          |  5 +++
 be/src/catalog/catalog.h                           |  6 +++
 .../java/org/apache/impala/service/JniCatalog.java | 13 ++++++
 tests/webserver/test_web_pages.py                  | 50 ++++++++++++++++++++++
 5 files changed, 96 insertions(+), 6 deletions(-)

diff --git a/be/src/catalog/catalog-server.cc b/be/src/catalog/catalog-server.cc
index 0148745..a66e256 100644
--- a/be/src/catalog/catalog-server.cc
+++ b/be/src/catalog/catalog-server.cc
@@ -19,6 +19,7 @@
 
 #include <gutil/strings/substitute.h>
 #include <thrift/protocol/TDebugProtocol.h>
+#include <thrift/protocol/TJSONProtocol.h>
 
 #include "catalog/catalog-util.h"
 #include "exec/read-write-util.h"
@@ -621,6 +622,7 @@ void CatalogServer::CatalogObjectsUrlCallback(const Webserver::WebRequest& req,
   const auto& args = req.parsed_args;
   Webserver::ArgumentMap::const_iterator object_type_arg = args.find("object_type");
   Webserver::ArgumentMap::const_iterator object_name_arg = args.find("object_name");
+  Webserver::ArgumentMap::const_iterator json_arg = args.find("json");
   if (object_type_arg != args.end() && object_name_arg != args.end()) {
     TCatalogObjectType::type object_type =
         TCatalogObjectTypeFromName(object_type_arg->second);
@@ -630,13 +632,27 @@ void CatalogServer::CatalogObjectsUrlCallback(const Webserver::WebRequest& req,
     Status status =
         TCatalogObjectFromObjectName(object_type, object_name_arg->second, &request);
 
-    // Get the object and dump its contents.
-    TCatalogObject result;
-    if (status.ok()) status = catalog_->GetCatalogObject(request, &result);
-    if (status.ok()) {
-      Value debug_string(ThriftDebugString(result).c_str(), document->GetAllocator());
-      document->AddMember("thrift_string", debug_string, document->GetAllocator());
+    if (json_arg != args.end()) {
+      // Get the JSON string from FE since Thrift doesn't have a cpp implementation for
+      // SimpleJsonProtocol (THRIFT-2476), so we use it's java implementation.
+      // TODO: switch to use cpp implementation of SimpleJsonProtocol after THRIFT-2476
+      //  is resolved.
+      string json_str;
+      if (status.ok()) status = catalog_->GetJsonCatalogObject(request, &json_str);
+      if (status.ok()) {
+        Value debug_string(json_str.c_str(), document->GetAllocator());
+        document->AddMember("json_string", debug_string, document->GetAllocator());
+      }
     } else {
+      // Get the object and dump its contents.
+      TCatalogObject result;
+      if (status.ok()) status = catalog_->GetCatalogObject(request, &result);
+      if (status.ok()) {
+        Value debug_string(ThriftDebugString(result).c_str(), document->GetAllocator());
+        document->AddMember("thrift_string", debug_string, document->GetAllocator());
+      }
+    }
+    if (!status.ok()) {
       Value error(status.GetDetail().c_str(), document->GetAllocator());
       document->AddMember("error", error, document->GetAllocator());
     }
diff --git a/be/src/catalog/catalog.cc b/be/src/catalog/catalog.cc
index e7401de..d75484f 100644
--- a/be/src/catalog/catalog.cc
+++ b/be/src/catalog/catalog.cc
@@ -57,6 +57,7 @@ Catalog::Catalog() {
     {"getDbs", "([B)[B", &get_dbs_id_},
     {"getFunctions", "([B)[B", &get_functions_id_},
     {"getCatalogObject", "([B)[B", &get_catalog_object_id_},
+    {"getJsonCatalogObject", "([B)Ljava/lang/String;", &get_json_catalog_object_id_},
     {"getPartialCatalogObject", "([B)[B", &get_partial_catalog_object_id_},
     {"getCatalogDelta", "([B)[B", &get_catalog_delta_id_},
     {"getCatalogUsage", "()[B", &get_catalog_usage_id_},
@@ -92,6 +93,10 @@ Status Catalog::GetCatalogObject(const TCatalogObject& req,
   return JniUtil::CallJniMethod(catalog_, get_catalog_object_id_, req, resp);
 }
 
+Status Catalog::GetJsonCatalogObject(const TCatalogObject& req, string* res) {
+  return JniUtil::CallJniMethod(catalog_, get_json_catalog_object_id_, req, res);
+}
+
 Status Catalog::GetPartialCatalogObject(const TGetPartialCatalogObjectRequest& req,
     TGetPartialCatalogObjectResponse* resp) {
   return JniUtil::CallJniMethod(catalog_, get_partial_catalog_object_id_, req, resp);
diff --git a/be/src/catalog/catalog.h b/be/src/catalog/catalog.h
index 1287532..10c2410 100644
--- a/be/src/catalog/catalog.h
+++ b/be/src/catalog/catalog.h
@@ -72,6 +72,11 @@ class Catalog {
   /// information on the error will be returned.
   Status GetCatalogObject(const TCatalogObject& request, TCatalogObject* response);
 
+  /// Like the above method but get the json string of the catalog object. The json
+  /// string can't be deserialized to Thrift objects so can only be used in showing debug
+  /// infos.
+  Status GetJsonCatalogObject(const TCatalogObject& req, std::string* res);
+
   /// Return partial information about a Catalog object.
   /// Returns OK if the operation was successful, otherwise a Status object with
   /// information on the error will be returned.
@@ -141,6 +146,7 @@ class Catalog {
   jmethodID exec_ddl_id_;  // JniCatalog.execDdl()
   jmethodID reset_metadata_id_;  // JniCatalog.resetMetdata()
   jmethodID get_catalog_object_id_;  // JniCatalog.getCatalogObject()
+  jmethodID get_json_catalog_object_id_;  // JniCatalog.getJsonCatalogObject()
   jmethodID get_partial_catalog_object_id_;  // JniCatalog.getPartialCatalogObject()
   jmethodID get_catalog_delta_id_;  // JniCatalog.getCatalogDelta()
   jmethodID get_catalog_version_id_;  // JniCatalog.getCatalogVersion()
diff --git a/fe/src/main/java/org/apache/impala/service/JniCatalog.java b/fe/src/main/java/org/apache/impala/service/JniCatalog.java
index e76a2ab..3b0fbd4 100644
--- a/fe/src/main/java/org/apache/impala/service/JniCatalog.java
+++ b/fe/src/main/java/org/apache/impala/service/JniCatalog.java
@@ -72,6 +72,7 @@ import org.apache.impala.util.PatternMatcher;
 import org.apache.thrift.TException;
 import org.apache.thrift.TSerializer;
 import org.apache.thrift.protocol.TBinaryProtocol;
+import org.apache.thrift.protocol.TSimpleJSONProtocol;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -245,6 +246,18 @@ public class JniCatalog {
     return serializer.serialize(catalog_.getTCatalogObject(objectDescription));
   }
 
+  /**
+   * Gets the json string of a catalog object. It can only be used in showing debug
+   * messages and can't be deserialized to a thrift object.
+   */
+  public String getJsonCatalogObject(byte[] thriftParams) throws ImpalaException,
+      TException {
+    TCatalogObject objectDescription = new TCatalogObject();
+    JniUtil.deserializeThrift(protocolFactory_, objectDescription, thriftParams);
+    TSerializer jsonSerializer = new TSerializer(new TSimpleJSONProtocol.Factory());
+    return jsonSerializer.toString(catalog_.getTCatalogObject(objectDescription));
+  }
+
   public byte[] getPartialCatalogObject(byte[] thriftParams) throws ImpalaException,
       TException {
     TGetPartialCatalogObjectRequest req =
diff --git a/tests/webserver/test_web_pages.py b/tests/webserver/test_web_pages.py
index 4026e67..01661a4 100644
--- a/tests/webserver/test_web_pages.py
+++ b/tests/webserver/test_web_pages.py
@@ -299,6 +299,10 @@ class TestWebPage(ImpalaTestSuite):
     self.__test_catalog_object(unique_database, "foo_kudu", cluster_properties)
     self.__test_catalog_object(unique_database, "foo_part_parquet", cluster_properties)
     self.__test_catalog_object(unique_database, "foo", cluster_properties)
+    self.__test_json_db_object(unique_database)
+    self.__test_json_table_object(unique_database, "foo")
+    self.__test_json_table_object(unique_database, "foo_part")
+    self.__test_json_table_object(unique_database, "foo_part_parquet")
     self.__test_table_metrics(unique_database, "foo_part", "total-file-size-bytes")
     self.__test_table_metrics(unique_database, "foo_part", "num-files")
     self.__test_table_metrics(unique_database, "foo_part", "alter-duration")
@@ -335,6 +339,52 @@ class TestWebPage(ImpalaTestSuite):
       self.get_and_check_status(obj_url, impalad_expected_str,
           ports_to_test=self.IMPALAD_TEST_PORT)
 
+  def __test_json_db_object(self, db_name):
+    """Tests the /catalog_object?json endpoint of catalogd for the given db."""
+    obj_url = self.CATALOG_OBJECT_URL + \
+              "?json&object_type=DATABASE&object_name={0}".format(db_name)
+    responses = self.get_and_check_status(obj_url, ports_to_test=self.CATALOG_TEST_PORT)
+    obj = json.loads(json.loads(responses[0].text)["json_string"])
+    assert obj["type"] == 2, "type should be DATABASE"
+    assert "catalog_version" in obj, "TCatalogObject should have catalog_version"
+    db_obj = obj["db"]
+    assert db_obj["db_name"] == db_name
+    assert "metastore_db" in db_obj, "Loaded database should have metastore_db"
+
+  def __test_json_table_object(self, db_name, tbl_name):
+    """Tests the /catalog_object?json endpoint of catalogd for the given db/table. Runs
+    against an unloaded as well as a loaded table."""
+    obj_url = self.CATALOG_OBJECT_URL + \
+              "?json&object_type=TABLE&object_name={0}.{1}".format(db_name, tbl_name)
+    self.client.execute("invalidate metadata %s.%s" % (db_name, tbl_name))
+    responses = self.get_and_check_status(obj_url, ports_to_test=self.CATALOG_TEST_PORT)
+    obj = json.loads(json.loads(responses[0].text)["json_string"])
+    assert obj["type"] == 3, "type should be TABLE"
+    assert "catalog_version" in obj, "TCatalogObject should have catalog_version"
+    tbl_obj = obj["table"]
+    assert tbl_obj["db_name"] == db_name
+    assert tbl_obj["tbl_name"] == tbl_name
+    assert "hdfs_table" not in tbl_obj, "Unloaded table should not have hdfs_table"
+
+    self.client.execute("refresh %s.%s" % (db_name, tbl_name))
+    responses = self.get_and_check_status(obj_url, ports_to_test=self.CATALOG_TEST_PORT)
+    obj = json.loads(json.loads(responses[0].text)["json_string"])
+    assert obj["type"] == 3, "type should be TABLE"
+    assert "catalog_version" in obj, "TCatalogObject should have catalog_version"
+    tbl_obj = obj["table"]
+    assert tbl_obj["db_name"] == db_name
+    assert tbl_obj["tbl_name"] == tbl_name
+    assert "columns" in tbl_obj, "Loaded TTable should have columns"
+    assert tbl_obj["table_type"] == 0, "table_type should be HDFS_TABLE"
+    assert "metastore_table" in tbl_obj
+    hdfs_tbl_obj = tbl_obj["hdfs_table"]
+    assert "hdfsBaseDir" in hdfs_tbl_obj
+    assert "colNames" in hdfs_tbl_obj
+    assert "nullPartitionKeyValue" in hdfs_tbl_obj
+    assert "nullColumnValue" in hdfs_tbl_obj
+    assert "partitions" in hdfs_tbl_obj
+    assert "prototype_partition" in hdfs_tbl_obj
+
   def check_endpoint_is_disabled(self, url, string_to_search="", ports_to_test=None):
     """Helper method that verifies the given url does not exist."""
     if ports_to_test is None: