You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by tm...@apache.org on 2021/03/03 00:38:48 UTC
[impala] 06/06: IMPALA-10450: Catalogd crashes due to exception in
ThriftDebugString
This is an automated email from the ASF dual-hosted git repository.
tmarshall pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit a47700ed790c2415e52a85e40063bed53a7cb9e8
Author: Vihang Karajgaonkar <vi...@apache.org>
AuthorDate: Fri Feb 19 15:30:07 2021 -0800
IMPALA-10450: Catalogd crashes due to exception in ThriftDebugString
This patch adds a wrapper around ThriftDebugString method provided
in the Thrift library. The thrift's method can throw exceptions
like (bad_alloc or TProtocolException) when the object cannot be
serialized into a string representation. This exception is not
caught on the catalogd side and it crashes the catalogd.
The error was specifically seen in the catalogd's debug UI
which provides a way to display a Table object. An exception
thrown when rendering the table on the UI would have crashed
the catalogd before the patch. In order to simulate this crash a new debug
action called EXCEPTION was added. A new custom cluster test
was added which simulates a exception thrown in this method and
makes sure that fetching the table from catalogd's debug UI
does not crash the catalogd.
Tests:
1. Added a new custom cluster test which reproduces the crash.
2. Created a large table which has ~270K partitions and reduced
the memory of the catalogd to 16GB. This configuration throws
bad_alloc exception in the ThriftDebugString method and crashes
the catalogd. After the patch the crash is averted and we see
a error message on the debug UI instead. I also looped around
the catalog web UI call for more than an hour to see if there
are any other stability issues. I could not see any problems.
Change-Id: I42cee6186a3d5bacc1117bae5961ac60ac9f7a66
Reviewed-on: http://gerrit.cloudera.org:8080/17110
Reviewed-by: Vihang Karajgaonkar <vi...@cloudera.com>
Tested-by: Vihang Karajgaonkar <vi...@cloudera.com>
---
be/src/catalog/catalog-server.cc | 13 +++---
be/src/util/debug-util.cc | 14 ++++++
be/src/util/debug-util.h | 8 ++++
be/src/util/thrift-debug-util.h | 41 +++++++++++++++++
.../test_thrift_debug_string_exception.py | 51 ++++++++++++++++++++++
5 files changed, 121 insertions(+), 6 deletions(-)
diff --git a/be/src/catalog/catalog-server.cc b/be/src/catalog/catalog-server.cc
index e14c982..9731dd3 100644
--- a/be/src/catalog/catalog-server.cc
+++ b/be/src/catalog/catalog-server.cc
@@ -32,6 +32,7 @@
#include "util/event-metrics.h"
#include "util/logging-support.h"
#include "util/metrics.h"
+#include "util/thrift-debug-util.h"
#include "util/webserver.h"
#include "common/names.h"
@@ -88,7 +89,6 @@ DEFINE_int64(topic_update_tbl_max_wait_time_ms, 500, "Maximum time "
"table lock. A value of 0 disables the timeout based locking which means topic "
"update thread will always block until table lock is acquired.");
-
DECLARE_string(debug_actions);
DECLARE_string(state_store_host);
DECLARE_int32(state_store_subscriber_port);
@@ -186,7 +186,7 @@ class CatalogServiceThriftIf : public CatalogServiceIf {
Status status = catalog_server_->catalog()->GetCatalogObject(req.object_desc,
&resp.catalog_object);
if (!status.ok()) LOG(ERROR) << status.GetDetail();
- VLOG_RPC << "GetCatalogObject(): response=" << ThriftDebugString(resp);
+ VLOG_RPC << "GetCatalogObject(): response=" << ThriftDebugStringNoThrow(resp);
}
void GetPartialCatalogObject(TGetPartialCatalogObjectResponse& resp,
@@ -204,7 +204,7 @@ class CatalogServiceThriftIf : public CatalogServiceIf {
TStatus thrift_status;
status.ToThrift(&thrift_status);
resp.__set_status(thrift_status);
- VLOG_RPC << "GetPartialCatalogObject(): response=" << ThriftDebugString(resp);
+ VLOG_RPC << "GetPartialCatalogObject(): response=" << ThriftDebugStringNoThrow(resp);
}
void GetPartitionStats(TGetPartitionStatsResponse& resp,
@@ -215,7 +215,7 @@ class CatalogServiceThriftIf : public CatalogServiceIf {
TStatus thrift_status;
status.ToThrift(&thrift_status);
resp.__set_status(thrift_status);
- VLOG_RPC << "GetPartitionStats(): response=" << ThriftDebugString(resp);
+ VLOG_RPC << "GetPartitionStats(): response=" << ThriftDebugStringNoThrow(resp);
}
// Prioritizes the loading of metadata for one or more catalog objects. Currently only
@@ -662,8 +662,9 @@ void CatalogServer::CatalogObjectsUrlCallback(const Webserver::WebRequest& req,
// Get the object and dump its contents.
TCatalogObject result;
if (status.ok()) status = catalog_->GetCatalogObject(request, &result);
- if (status.ok()) {
- Value debug_string(ThriftDebugString(result).c_str(), document->GetAllocator());
+ if(status.ok()) {
+ Value debug_string(
+ ThriftDebugStringNoThrow(result).c_str(), document->GetAllocator());
document->AddMember("thrift_string", debug_string, document->GetAllocator());
}
}
diff --git a/be/src/util/debug-util.cc b/be/src/util/debug-util.cc
index 1ca8fd6..de43741 100644
--- a/be/src/util/debug-util.cc
+++ b/be/src/util/debug-util.cc
@@ -426,6 +426,20 @@ Status DebugActionImpl(
ImpaladMetrics::DEBUG_ACTION_NUM_FAIL->Increment(1l);
}
return Status(TErrorCode::INTERNAL_ERROR, error_msg);
+ } else if (iequals(cmd, "EXCEPTION")) {
+ //EXCEPTION@<exception_type>
+ if (tokens.size() != 2) {
+ return Status(Substitute(ERROR_MSG, components[0], action_str,
+ "expected EXCEPTION@<exception_type>"));
+ }
+ static const auto end = EXCEPTION_STR_MAP.end();
+ auto it = EXCEPTION_STR_MAP.find(tokens[1]);
+ if (it != end) {
+ it->second();
+ } else {
+ return Status(
+ Substitute(ERROR_MSG, components[0], action_str, "Invalid exception type"));
+ }
} else {
DCHECK(false) << "Invalid debug action";
return Status(Substitute(ERROR_MSG, components[0], action_str, "invalid command"));
diff --git a/be/src/util/debug-util.h b/be/src/util/debug-util.h
index e477acb..0165093 100644
--- a/be/src/util/debug-util.h
+++ b/be/src/util/debug-util.h
@@ -21,6 +21,7 @@
#include <vector>
#include <thrift/protocol/TDebugProtocol.h>
+#include <thrift/Thrift.h>
#include "common/compiler-util.h"
#include "common/config.h"
@@ -190,6 +191,13 @@ static inline void DebugActionNoFail(
DebugActionNoFail(query_options.debug_action, label);
}
+/// Map of exception string to the exception throwing function which is used when
+/// executing the EXCEPTION debug action.
+static const std::unordered_map<std::string,std::function<void()>> EXCEPTION_STR_MAP {
+ {"exception", [](){ throw std::exception(); }},
+ {"bad_alloc", [](){ throw std::bad_alloc(); }},
+ {"TException", [](){ throw apache::thrift::TException(); }}
+};
// FILE_CHECKs are conditions that we expect to be true but could fail due to a malformed
// input file. They differentiate these cases from DCHECKs, which indicate conditions that
// are true unless there's a bug in Impala. We would ideally always return a bad Status
diff --git a/be/src/util/thrift-debug-util.h b/be/src/util/thrift-debug-util.h
new file mode 100644
index 0000000..8169837
--- /dev/null
+++ b/be/src/util/thrift-debug-util.h
@@ -0,0 +1,41 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+#include <string>
+#include <thrift/protocol/TDebugProtocol.h>
+#include "util/debug-util.h"
+
+DECLARE_string(debug_actions);
+
+namespace impala {
+
+ /// This method is a wrapper over Thrift's ThriftDebugString. It catches any exception
+ /// that might be thrown by the underlying Thrift method. Typically, we use this method
+ /// when we suspect the ThriftStruct could be a large object like a HdfsTable whose
+ /// string representation could have significantly larger memory requirements than the
+ /// binary representation.
+ template<typename ThriftStruct>
+ std::string ThriftDebugStringNoThrow(ThriftStruct ts) noexcept {
+ try {
+ Status status = DebugAction(FLAGS_debug_actions, "THRIFT_DEBUG_STRING");
+ return apache::thrift::ThriftDebugString(ts);
+ } catch (std::exception& e) {
+ return "Unexpected exception received: " + std::string(e.what());
+ }
+ }
+}
diff --git a/tests/custom_cluster/test_thrift_debug_string_exception.py b/tests/custom_cluster/test_thrift_debug_string_exception.py
new file mode 100644
index 0000000..af03abd
--- /dev/null
+++ b/tests/custom_cluster/test_thrift_debug_string_exception.py
@@ -0,0 +1,51 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
+
+
+class TestThriftDebugStringExceptions(CustomClusterTestSuite):
+ """Regression tests for IMPALA-10450"""
+
+ @CustomClusterTestSuite.with_args(
+ catalogd_args="--debug_actions=THRIFT_DEBUG_STRING:EXCEPTION@bad_alloc")
+ def test_thrift_debug_str_bad_alloc(self):
+ """The test executes a API call to get a catalog object from the debug UI and makes
+ sure that catalogd does not crash if the ThriftDebugString throws bad_alloc
+ exception."""
+ obj = self._get_catalog_object()
+ assert "Unexpected exception received" in obj
+
+ @CustomClusterTestSuite.with_args(
+ catalogd_args="--debug_actions=THRIFT_DEBUG_STRING:EXCEPTION@TException")
+ def test_thrift_debug_str_texception(self):
+ """The test executes a API call to get a catalog object from the debug UI and makes
+ sure that catalogd does not crash if the ThriftDebugString throws a TException."""
+ obj = self._get_catalog_object()
+ assert "Unexpected exception received" in obj
+
+ @CustomClusterTestSuite.with_args()
+ def test_thrift_debug_str(self):
+ """Sanity test which executes API call to get a catalog object and make sure that
+ it does not return a error message under normal circumstances."""
+ obj = self._get_catalog_object()
+ assert "Unexpected exception received" not in obj
+
+ def _get_catalog_object(self):
+ """ Return the catalog object of functional.alltypes serialized to string. """
+ return self.cluster.catalogd.service.read_debug_webpage(
+ "catalog_object?object_type=TABLE&object_name=functional.alltypes")