You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by mi...@apache.org on 2018/01/08 17:04:04 UTC

[3/3] impala git commit: IMPALA-6348: Redact only sensitive fields in runtime profiles

IMPALA-6348: Redact only sensitive fields in runtime profiles

Without this patch, redaction is applied to every field in the
runtime profile. This approach has an undesired side effect when
Kerberos auth + email redaction is in place.

Since the redaction applies to every field, even principals
(from Connected/Delegated User fields) are redacted, as the Kerberos
principal format generally pattern matches with an email redactor
template.

This is particularly problematic for monitoring tools that consume
runtime profiles and use these fields to group the queries by user.

This patch fixes the problem by redacting only the following sensitive
fields.

- Query Statement
- Error logs (since they can contain column references etc.)
- Query Status
- Query Plan

Other fields in the runtime profile are left unredacted.

Change-Id: Iae3b6726009bf458a7ec73131e5d659b12ab73cf
Reviewed-on: http://gerrit.cloudera.org:8080/8934
Reviewed-by: Bharath Vissapragada <bh...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/6a87eb20
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/6a87eb20
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/6a87eb20

Branch: refs/heads/master
Commit: 6a87eb20a5d55b0a2f6f9102375ff8da4b98ccba
Parents: ce65b43
Author: Bharath Vissapragada <bh...@cloudera.com>
Authored: Wed Jan 3 15:31:32 2018 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Sat Jan 6 22:54:17 2018 +0000

----------------------------------------------------------------------
 be/src/service/client-request-state.cc |  7 ++++---
 be/src/service/client-request-state.h  | 10 ++++++++-
 be/src/service/impala-server.cc        |  4 ++--
 be/src/util/runtime-profile.cc         |  9 +++++---
 be/src/util/runtime-profile.h          |  7 ++++++-
 tests/custom_cluster/test_redaction.py | 32 ++++++++++++++++++++++++++---
 6 files changed, 56 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/6a87eb20/be/src/service/client-request-state.cc
----------------------------------------------------------------------
diff --git a/be/src/service/client-request-state.cc b/be/src/service/client-request-state.cc
index 5e95bb2..87ac8ff 100644
--- a/be/src/service/client-request-state.cc
+++ b/be/src/service/client-request-state.cc
@@ -124,7 +124,8 @@ ClientRequestState::ClientRequestState(
   summary_profile_->AddInfoString("Network Address",
       lexical_cast<string>(session_->network_address));
   summary_profile_->AddInfoString("Default Db", default_db());
-  summary_profile_->AddInfoString("Sql Statement", query_ctx_.client_request.stmt);
+  summary_profile_->AddInfoStringRedacted(
+      "Sql Statement", query_ctx_.client_request.stmt);
   summary_profile_->AddInfoString("Coordinator",
       TNetworkAddressToString(exec_env->backend_address()));
 }
@@ -419,7 +420,7 @@ Status ClientRequestState::ExecQueryOrDmlRequest(
     plan_ss << "\n----------------\n"
             << query_exec_request.query_plan
             << "----------------";
-    summary_profile_->AddInfoString("Plan", plan_ss.str());
+    summary_profile_->AddInfoStringRedacted("Plan", plan_ss.str());
   }
   // Add info strings consumed by CM: Estimated mem and tables missing stats.
   if (query_exec_request.__isset.per_host_mem_estimate) {
@@ -741,7 +742,7 @@ Status ClientRequestState::UpdateQueryStatus(const Status& status) {
   if (!status.ok() && query_status_.ok()) {
     UpdateQueryState(beeswax::QueryState::EXCEPTION);
     query_status_ = status;
-    summary_profile_->AddInfoString("Query Status", query_status_.GetDetail());
+    summary_profile_->AddInfoStringRedacted("Query Status", query_status_.GetDetail());
   }
 
   return status;

http://git-wip-us.apache.org/repos/asf/impala/blob/6a87eb20/be/src/service/client-request-state.h
----------------------------------------------------------------------
diff --git a/be/src/service/client-request-state.h b/be/src/service/client-request-state.h
index 87f870a..d93512e 100644
--- a/be/src/service/client-request-state.h
+++ b/be/src/service/client-request-state.h
@@ -296,10 +296,18 @@ class ClientRequestState {
   /// * server_profile_ tracks time spent inside the ImpalaServer,
   ///   but not inside fragment execution, i.e. the time taken to
   ///   register and set-up the query and for rows to be fetched.
-  //
+  ///
   /// There's a fourth profile which is not built here (but is a
   /// child of profile_); the execution profile which tracks the
   /// actual fragment execution.
+  ///
+  /// Redaction: Only the following info strings in the profile are redacted as they
+  /// are expected to contain sensitive information like schema/column references etc.
+  /// Other fields are left unredacted.
+  /// - Query Statement
+  /// - Query Plan
+  /// - Query Status
+  /// - Error logs
   RuntimeProfile* const profile_;
   RuntimeProfile* const server_profile_;
   RuntimeProfile* const summary_profile_;

http://git-wip-us.apache.org/repos/asf/impala/blob/6a87eb20/be/src/service/impala-server.cc
----------------------------------------------------------------------
diff --git a/be/src/service/impala-server.cc b/be/src/service/impala-server.cc
index 49009b6..5b895ee 100644
--- a/be/src/service/impala-server.cc
+++ b/be/src/service/impala-server.cc
@@ -1030,8 +1030,8 @@ Status ImpalaServer::UnregisterQuery(const TUniqueId& query_id, bool check_infli
     TExecSummary t_exec_summary;
     request_state->coord()->GetTExecSummary(&t_exec_summary);
     string exec_summary = PrintExecSummary(t_exec_summary);
-    request_state->summary_profile()->AddInfoString("ExecSummary", exec_summary);
-    request_state->summary_profile()->AddInfoString("Errors",
+    request_state->summary_profile()->AddInfoStringRedacted("ExecSummary", exec_summary);
+    request_state->summary_profile()->AddInfoStringRedacted("Errors",
         request_state->coord()->GetErrorLog());
 
     const PerBackendExecParams& per_backend_params =

http://git-wip-us.apache.org/repos/asf/impala/blob/6a87eb20/be/src/util/runtime-profile.cc
----------------------------------------------------------------------
diff --git a/be/src/util/runtime-profile.cc b/be/src/util/runtime-profile.cc
index abb691d..a05e55c 100644
--- a/be/src/util/runtime-profile.cc
+++ b/be/src/util/runtime-profile.cc
@@ -466,14 +466,17 @@ void RuntimeProfile::AddInfoString(const string& key, const string& value) {
   return AddInfoStringInternal(key, value, false);
 }
 
+void RuntimeProfile::AddInfoStringRedacted(const string& key, const string& value) {
+  return AddInfoStringInternal(key, value, false, true);
+}
+
 void RuntimeProfile::AppendInfoString(const string& key, const string& value) {
   return AddInfoStringInternal(key, value, true);
 }
 
 void RuntimeProfile::AddInfoStringInternal(
-    const string& key, const string& value, bool append) {
-  // Values may contain sensitive data, such as a query.
-  const string& info = RedactCopy(value);
+    const string& key, const string& value, bool append, bool redact) {
+  const string& info = redact ? RedactCopy(value): value;
   lock_guard<SpinLock> l(info_strings_lock_);
   InfoStrings::iterator it = info_strings_.find(key);
   if (it == info_strings_.end()) {

http://git-wip-us.apache.org/repos/asf/impala/blob/6a87eb20/be/src/util/runtime-profile.h
----------------------------------------------------------------------
diff --git a/be/src/util/runtime-profile.h b/be/src/util/runtime-profile.h
index 8348161..320edf3 100644
--- a/be/src/util/runtime-profile.h
+++ b/be/src/util/runtime-profile.h
@@ -209,6 +209,10 @@ class RuntimeProfile { // NOLINT: This struct is not packed, but there are not s
   /// the value will be updated.
   void AddInfoString(const std::string& key, const std::string& value);
 
+  /// Same as AddInfoString(), except that this method applies the redaction
+  /// rules on 'value' before adding it to the runtime profile.
+  void AddInfoStringRedacted(const std::string& key, const std::string& value);
+
   /// Adds a string to the runtime profile.  If a value already exists for 'key',
   /// 'value' will be appended to the previous value, with ", " separating them.
   void AppendInfoString(const std::string& key, const std::string& value);
@@ -473,8 +477,9 @@ class RuntimeProfile { // NOLINT: This struct is not packed, but there are not s
 
   /// Implementation of AddInfoString() and AppendInfoString(). If 'append' is false,
   /// implements AddInfoString(), otherwise implements AppendInfoString().
+  /// Redaction rules are applied on the info string if 'redact' is true.
   void AddInfoStringInternal(
-      const std::string& key, const std::string& value, bool append);
+      const std::string& key, const std::string& value, bool append, bool redact = false);
 
   /// Name of the counter maintaining the total time.
   static const std::string TOTAL_TIME_COUNTER_NAME;

http://git-wip-us.apache.org/repos/asf/impala/blob/6a87eb20/tests/custom_cluster/test_redaction.py
----------------------------------------------------------------------
diff --git a/tests/custom_cluster/test_redaction.py b/tests/custom_cluster/test_redaction.py
index 46fef9b..729c2ca 100644
--- a/tests/custom_cluster/test_redaction.py
+++ b/tests/custom_cluster/test_redaction.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import getpass
 import logging
 import os
 import pytest
@@ -160,6 +161,14 @@ class TestRedaction(CustomClusterTestSuite, unittest.TestCase):
       assert results, "Web page %s should contain '%s' but does not" \
           % (url, search)
 
+  def assert_query_profile_contains(self, query_id, search):
+    ''' Asserts that the query profile for 'query_id' contains 'search' string'''
+    impala_service = self.create_impala_service()
+    url = 'query_profile?query_id=%s' % query_id
+    results = self.grep_file(impala_service.open_debug_webpage(url), search)
+    assert results, "Query profile %s should contain '%s' but does not" \
+        % (url, search)
+
   def assert_file_in_dir_contains(self, dir, search):
     '''Asserts that at least one file in the 'dir' contains the 'search' term.'''
     results = self.grep_dir(dir,search)
@@ -271,6 +280,7 @@ class TestRedaction(CustomClusterTestSuite, unittest.TestCase):
     '''Check that redaction rules prevent 'sensitive' data from leaking into the
        logs and web ui.
     '''
+    current_user = getpass.getuser()
     self.start_cluster_using_rules(r"""
       {
         "version": 1,
@@ -285,17 +295,26 @@ class TestRedaction(CustomClusterTestSuite, unittest.TestCase):
             "description": "Don't show credit cards numbers",
             "search": "\\d{4}-\\d{4}-\\d{4}-\\d{4}",
             "replace": "*credit card*"
+          },
+          {
+            "description": "Don't show current username in queries",
+            "search": "%s",
+            "replace": "redacted user"
           }
         ]
-      }""")
+      }""" % current_user)
     email = 'FOO@bar.com'
     # GROUP BY an expr containing the email so the expr will also be shown in the exec
     # node summary, ie HASH(string_col = ...).
-    self.execute_query_expect_success(self.client,
-        "SELECT string_col = '%s', COUNT(*) FROM functional.alltypes GROUP BY 1" % email)
+    query_template =\
+        "SELECT string_col = '%s', COUNT(*) FROM functional.alltypes GROUP BY 1"
+    self.execute_query_expect_success(self.client, query_template % email)
 
+    user_profile_pattern = "User: %s" % current_user
     # The email should be replaced with '*email*'.
     self.assert_web_ui_redaction(self.find_last_query_id(), email, "*email*")
+    # User field should not be redacted from the query profile.
+    self.assert_query_profile_contains(self.find_last_query_id(), user_profile_pattern)
     # Wait for the logs to be written.
     sleep(5)
     self.assert_log_redaction(email, "*email*")
@@ -306,10 +325,17 @@ class TestRedaction(CustomClusterTestSuite, unittest.TestCase):
     # This assertion below relies on the fact that there is a syntax error be near
     # the credit card number so the number would have appeared in the message.
     self.assert_web_ui_redaction(self.find_last_query_id(), credit_card, "*credit card*")
+    # User field should not be redacted from the query profile.
+    self.assert_query_profile_contains(self.find_last_query_id(), user_profile_pattern)
     sleep(5)
     # Apparently an invalid query doesn't generate an audit log entry.
     self.assert_log_redaction(credit_card, "*credit card*", expect_audit=False)
 
+    # Assert that the username in the query stmt is redacted but not from the user fields.
+    self.execute_query_expect_success(self.client, query_template % current_user)
+    self.assert_query_profile_contains(self.find_last_query_id(), user_profile_pattern)
+    self.assert_query_profile_contains(self.find_last_query_id(), "redacted user")
+
     # Since all the tests passed, the log dir shouldn't be of interest and can be
     # deleted.
     shutil.rmtree(self.tmp_dir)