You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by tm...@apache.org on 2021/03/03 00:38:48 UTC

[impala] 06/06: IMPALA-10450: Catalogd crashes due to exception in ThriftDebugString

This is an automated email from the ASF dual-hosted git repository.

tmarshall pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit a47700ed790c2415e52a85e40063bed53a7cb9e8
Author: Vihang Karajgaonkar <vi...@apache.org>
AuthorDate: Fri Feb 19 15:30:07 2021 -0800

    IMPALA-10450: Catalogd crashes due to exception in ThriftDebugString
    
    This patch adds a wrapper around ThriftDebugString method provided
    in the Thrift library. The thrift's method can throw exceptions
    like (bad_alloc or TProtocolException) when the object cannot be
    serialized into a string representation. This exception is not
    caught on the catalogd side and it crashes the catalogd.
    
    The error was specifically seen in the catalogd's debug UI
    which provides a way to display a Table object. An exception
    thrown when rendering the table on the UI would have crashed
    the catalogd before the patch. In order to simulate this crash a new debug
    action called EXCEPTION was added. A new custom cluster test
    was added which simulates a exception thrown in this method and
    makes sure that fetching the table from catalogd's debug UI
    does not crash the catalogd.
    
    Tests:
    1. Added a new custom cluster test which reproduces the crash.
    2. Created a large table which has ~270K partitions and reduced
    the memory of the catalogd to 16GB. This configuration throws
    bad_alloc exception in the ThriftDebugString method and crashes
    the catalogd. After the patch the crash is averted and we see
    a error message on the debug UI instead. I also looped around
    the catalog web UI call for more than an hour to see if there
    are any other stability issues. I could not see any problems.
    
    Change-Id: I42cee6186a3d5bacc1117bae5961ac60ac9f7a66
    Reviewed-on: http://gerrit.cloudera.org:8080/17110
    Reviewed-by: Vihang Karajgaonkar <vi...@cloudera.com>
    Tested-by: Vihang Karajgaonkar <vi...@cloudera.com>
---
 be/src/catalog/catalog-server.cc                   | 13 +++---
 be/src/util/debug-util.cc                          | 14 ++++++
 be/src/util/debug-util.h                           |  8 ++++
 be/src/util/thrift-debug-util.h                    | 41 +++++++++++++++++
 .../test_thrift_debug_string_exception.py          | 51 ++++++++++++++++++++++
 5 files changed, 121 insertions(+), 6 deletions(-)

diff --git a/be/src/catalog/catalog-server.cc b/be/src/catalog/catalog-server.cc
index e14c982..9731dd3 100644
--- a/be/src/catalog/catalog-server.cc
+++ b/be/src/catalog/catalog-server.cc
@@ -32,6 +32,7 @@
 #include "util/event-metrics.h"
 #include "util/logging-support.h"
 #include "util/metrics.h"
+#include "util/thrift-debug-util.h"
 #include "util/webserver.h"
 
 #include "common/names.h"
@@ -88,7 +89,6 @@ DEFINE_int64(topic_update_tbl_max_wait_time_ms, 500, "Maximum time "
      "table lock. A value of 0 disables the timeout based locking which means topic "
      "update thread will always block until table lock is acquired.");
 
-
 DECLARE_string(debug_actions);
 DECLARE_string(state_store_host);
 DECLARE_int32(state_store_subscriber_port);
@@ -186,7 +186,7 @@ class CatalogServiceThriftIf : public CatalogServiceIf {
     Status status = catalog_server_->catalog()->GetCatalogObject(req.object_desc,
         &resp.catalog_object);
     if (!status.ok()) LOG(ERROR) << status.GetDetail();
-    VLOG_RPC << "GetCatalogObject(): response=" << ThriftDebugString(resp);
+    VLOG_RPC << "GetCatalogObject(): response=" << ThriftDebugStringNoThrow(resp);
   }
 
   void GetPartialCatalogObject(TGetPartialCatalogObjectResponse& resp,
@@ -204,7 +204,7 @@ class CatalogServiceThriftIf : public CatalogServiceIf {
     TStatus thrift_status;
     status.ToThrift(&thrift_status);
     resp.__set_status(thrift_status);
-    VLOG_RPC << "GetPartialCatalogObject(): response=" << ThriftDebugString(resp);
+    VLOG_RPC << "GetPartialCatalogObject(): response=" << ThriftDebugStringNoThrow(resp);
   }
 
   void GetPartitionStats(TGetPartitionStatsResponse& resp,
@@ -215,7 +215,7 @@ class CatalogServiceThriftIf : public CatalogServiceIf {
     TStatus thrift_status;
     status.ToThrift(&thrift_status);
     resp.__set_status(thrift_status);
-    VLOG_RPC << "GetPartitionStats(): response=" << ThriftDebugString(resp);
+    VLOG_RPC << "GetPartitionStats(): response=" << ThriftDebugStringNoThrow(resp);
   }
 
   // Prioritizes the loading of metadata for one or more catalog objects. Currently only
@@ -662,8 +662,9 @@ void CatalogServer::CatalogObjectsUrlCallback(const Webserver::WebRequest& req,
       // Get the object and dump its contents.
       TCatalogObject result;
       if (status.ok()) status = catalog_->GetCatalogObject(request, &result);
-      if (status.ok()) {
-        Value debug_string(ThriftDebugString(result).c_str(), document->GetAllocator());
+      if(status.ok()) {
+        Value debug_string(
+            ThriftDebugStringNoThrow(result).c_str(), document->GetAllocator());
         document->AddMember("thrift_string", debug_string, document->GetAllocator());
       }
     }
diff --git a/be/src/util/debug-util.cc b/be/src/util/debug-util.cc
index 1ca8fd6..de43741 100644
--- a/be/src/util/debug-util.cc
+++ b/be/src/util/debug-util.cc
@@ -426,6 +426,20 @@ Status DebugActionImpl(
         ImpaladMetrics::DEBUG_ACTION_NUM_FAIL->Increment(1l);
       }
       return Status(TErrorCode::INTERNAL_ERROR, error_msg);
+    } else if (iequals(cmd, "EXCEPTION")) {
+      //EXCEPTION@<exception_type>
+      if (tokens.size() != 2) {
+        return Status(Substitute(ERROR_MSG, components[0], action_str,
+            "expected EXCEPTION@<exception_type>"));
+      }
+      static const auto end = EXCEPTION_STR_MAP.end();
+      auto it = EXCEPTION_STR_MAP.find(tokens[1]);
+      if (it != end) {
+        it->second();
+      } else {
+        return Status(
+            Substitute(ERROR_MSG, components[0], action_str, "Invalid exception type"));
+      }
     } else {
       DCHECK(false) << "Invalid debug action";
       return Status(Substitute(ERROR_MSG, components[0], action_str, "invalid command"));
diff --git a/be/src/util/debug-util.h b/be/src/util/debug-util.h
index e477acb..0165093 100644
--- a/be/src/util/debug-util.h
+++ b/be/src/util/debug-util.h
@@ -21,6 +21,7 @@
 #include <vector>
 
 #include <thrift/protocol/TDebugProtocol.h>
+#include <thrift/Thrift.h>
 
 #include "common/compiler-util.h"
 #include "common/config.h"
@@ -190,6 +191,13 @@ static inline void DebugActionNoFail(
   DebugActionNoFail(query_options.debug_action, label);
 }
 
+/// Map of exception string to the exception throwing function which is used when
+/// executing the EXCEPTION debug action.
+static const std::unordered_map<std::string,std::function<void()>> EXCEPTION_STR_MAP {
+        {"exception",   [](){ throw std::exception(); }},
+        {"bad_alloc",   [](){ throw std::bad_alloc(); }},
+        {"TException", [](){ throw apache::thrift::TException(); }}
+};
 // FILE_CHECKs are conditions that we expect to be true but could fail due to a malformed
 // input file. They differentiate these cases from DCHECKs, which indicate conditions that
 // are true unless there's a bug in Impala. We would ideally always return a bad Status
diff --git a/be/src/util/thrift-debug-util.h b/be/src/util/thrift-debug-util.h
new file mode 100644
index 0000000..8169837
--- /dev/null
+++ b/be/src/util/thrift-debug-util.h
@@ -0,0 +1,41 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+#include <string>
+#include <thrift/protocol/TDebugProtocol.h>
+#include "util/debug-util.h"
+
+DECLARE_string(debug_actions);
+
+namespace impala {
+
+  /// This method is a wrapper over Thrift's ThriftDebugString. It catches any exception
+  /// that might be thrown by the underlying Thrift method. Typically, we use this method
+  /// when we suspect the ThriftStruct could be a large object like a HdfsTable whose
+  /// string representation could have significantly larger memory requirements than the
+  /// binary representation.
+  template<typename ThriftStruct>
+  std::string ThriftDebugStringNoThrow(ThriftStruct ts) noexcept {
+    try {
+      Status status = DebugAction(FLAGS_debug_actions, "THRIFT_DEBUG_STRING");
+      return apache::thrift::ThriftDebugString(ts);
+    } catch (std::exception& e) {
+      return "Unexpected exception received: " + std::string(e.what());
+    }
+  }
+}
diff --git a/tests/custom_cluster/test_thrift_debug_string_exception.py b/tests/custom_cluster/test_thrift_debug_string_exception.py
new file mode 100644
index 0000000..af03abd
--- /dev/null
+++ b/tests/custom_cluster/test_thrift_debug_string_exception.py
@@ -0,0 +1,51 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
+
+
+class TestThriftDebugStringExceptions(CustomClusterTestSuite):
+  """Regression tests for IMPALA-10450"""
+
+  @CustomClusterTestSuite.with_args(
+    catalogd_args="--debug_actions=THRIFT_DEBUG_STRING:EXCEPTION@bad_alloc")
+  def test_thrift_debug_str_bad_alloc(self):
+    """The test executes a API call to get a catalog object from the debug UI and makes
+    sure that catalogd does not crash if the ThriftDebugString throws bad_alloc
+    exception."""
+    obj = self._get_catalog_object()
+    assert "Unexpected exception received" in obj
+
+  @CustomClusterTestSuite.with_args(
+    catalogd_args="--debug_actions=THRIFT_DEBUG_STRING:EXCEPTION@TException")
+  def test_thrift_debug_str_texception(self):
+    """The test executes a API call to get a catalog object from the debug UI and makes
+    sure that catalogd does not crash if the ThriftDebugString throws a TException."""
+    obj = self._get_catalog_object()
+    assert "Unexpected exception received" in obj
+
+  @CustomClusterTestSuite.with_args()
+  def test_thrift_debug_str(self):
+    """Sanity test which executes API call to get a catalog object and make sure that
+    it does not return a error message under normal circumstances."""
+    obj = self._get_catalog_object()
+    assert "Unexpected exception received" not in obj
+
+  def _get_catalog_object(self):
+    """ Return the catalog object of functional.alltypes serialized to string. """
+    return self.cluster.catalogd.service.read_debug_webpage(
+      "catalog_object?object_type=TABLE&object_name=functional.alltypes")