You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/04/19 21:07:14 UTC

[impala] branch master updated: IMPALA-8270: fix MemTracker teardown in FeSupport

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new 8412c77  IMPALA-8270: fix MemTracker teardown in FeSupport
8412c77 is described below

commit 8412c772cb551c59fdb6e9dd59b49043f4aa9ef6
Author: Tim Armstrong <ta...@cloudera.com>
AuthorDate: Wed Apr 17 09:30:14 2019 -0700

    IMPALA-8270: fix MemTracker teardown in FeSupport
    
    This patch tries to simplify and standardise the order
    in which control structures are torn down. As a
    consequence the bug is fixed. I've described the bug
    below.
    
    The changes are:
    * Make more control structures owned directly by
      QueryState::obj_pool_, so that they are all
      destroyed at the same time via ~QueryState.
    * Tear down local_query_state_ explicitly before
      other destructors run.
    
    Either change is sufficient to fix the bug, but
    I preferred to do both  to reduce the chances
    of similar bugs in future.
    
    Description of bug:
    ===================
    In the normal query execution flow:
    - RuntimeState is in QueryState::obj_pool_
    - RuntimeState owns RuntimeState::instance_mem_tracker_ via scoped_ptr
    - QueryState::query_mem_tracker_ is in QueryState::obj_pool_
    - QueryState::query_mem_tracker_ has a reference to
      RuntimeState::instance_mem_tracker_
    The tear-down works because ~QueryState unregisters query_mem_tracker_
    from its parent, making the whole subtree unreachable before destroying
    QueryState::obj_pool_. It is thus safe to destroy instance_mem_tracker_
    along with the rest of obj_pool_.
    
    FeSupport messes this up by having RuntimeState own the QueryState
    RuntimeState::local_query_state_ via a scoped_ptr, and the implied
    destructor order means that RuntimeState::instance_mem_tracker_ is
    destroyed before RuntimeState::local_query_state_, which breaks the
    above flow and the destroyed instance_mem_tracker_ is reachable from
    the process MemTracker via QueryState::query_mem_tracker_ for a
    small window until it is unregistered.
    
    Testing:
    Added a backend test that reproduced the ASAN use-after-free
    failure when run against unmodified RuntimeState code. I did
    not make it a unified backend test so that it would be easier
    to backport this fix to older versions that don't have unified
    tests.
    
    Change-Id: If815130cd4db00917746f10b28514f779ee254f0
    Reviewed-on: http://gerrit.cloudera.org:8080/13057
    Reviewed-by: Tim Armstrong <ta...@cloudera.com>
    Reviewed-by: Joe McDonnell <jo...@cloudera.com>
    Reviewed-by: Bikramjeet Vig <bi...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/runtime/CMakeLists.txt        |  1 +
 be/src/runtime/runtime-state-test.cc | 71 ++++++++++++++++++++++++++++++++++++
 be/src/runtime/runtime-state.cc      | 17 ++++++---
 be/src/runtime/runtime-state.h       | 15 +++++---
 4 files changed, 92 insertions(+), 12 deletions(-)

diff --git a/be/src/runtime/CMakeLists.txt b/be/src/runtime/CMakeLists.txt
index a84344d..a8c9b8b 100644
--- a/be/src/runtime/CMakeLists.txt
+++ b/be/src/runtime/CMakeLists.txt
@@ -101,3 +101,4 @@ ADD_BE_LSAN_TEST(tmp-file-mgr-test)
 ADD_BE_LSAN_TEST(row-batch-serialize-test)
 ADD_BE_LSAN_TEST(row-batch-test)
 ADD_BE_LSAN_TEST(collection-value-builder-test)
+ADD_BE_LSAN_TEST(runtime-state-test)
diff --git a/be/src/runtime/runtime-state-test.cc b/be/src/runtime/runtime-state-test.cc
new file mode 100644
index 0000000..0e536f7
--- /dev/null
+++ b/be/src/runtime/runtime-state-test.cc
@@ -0,0 +1,71 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <memory>
+
+#include <boost/thread/thread.hpp>
+
+#include "codegen/llvm-codegen.h"
+#include "common/atomic.h"
+#include "runtime/mem-tracker.h"
+#include "runtime/runtime-state.h"
+#include "runtime/test-env.h"
+#include "service/fe-support.h"
+#include "testutil/gtest-util.h"
+
+#include "common/names.h"
+
+namespace impala {
+
+// Regression test for IMPALA-8270: MemTracker teardown order leading to
+// use-after-free from LogUsage.
+TEST(RuntimeStateTest, MemTrackerTeardown) {
+  const int LOG_USAGE_THREADS = 4;
+  const int CONSTRUCT_ITERS = 10;
+  TestEnv test_env;
+  ASSERT_OK(test_env.Init());
+  // Poll LogUsage() from other threads to reproduce race.
+  AtomicBool stop_threads(false);
+  thread_group log_usage_threads;
+  for (int i = 0; i < LOG_USAGE_THREADS; ++i) {
+    log_usage_threads.add_thread(new thread([&test_env, &stop_threads]() {
+      while (!stop_threads.Load()) {
+        string usage = test_env.exec_env()->process_mem_tracker()->LogUsage(100);
+      }
+    }));
+  }
+
+  // Construct and tear down RuntimeState repeatedly to try to reproduce race.
+  for (int i = 0; i < CONSTRUCT_ITERS; ++i) {
+    TQueryCtx query_ctx;
+    unique_ptr<RuntimeState> state(new RuntimeState(query_ctx, test_env.exec_env()));
+    ASSERT_TRUE(state->instance_mem_tracker() != nullptr);
+    state->ReleaseResources();
+    state.reset();
+  }
+  stop_threads.Store(true);
+  log_usage_threads.join_all();
+}
+}
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  impala::InitCommonRuntime(argc, argv, true, impala::TestInfo::BE_TEST);
+  impala::InitFeSupport();
+  ABORT_IF_ERROR(impala::LlvmCodeGen::InitializeLlvm());
+  return RUN_ALL_TESTS();
+}
diff --git a/be/src/runtime/runtime-state.cc b/be/src/runtime/runtime-state.cc
index 8ab60b7..09dbd60 100644
--- a/be/src/runtime/runtime-state.cc
+++ b/be/src/runtime/runtime-state.cc
@@ -75,7 +75,7 @@ RuntimeState::RuntimeState(QueryState* query_state, const TPlanFragmentCtx& frag
     local_time_zone_(&TimezoneDatabase::GetUtcTimezone()),
     profile_(RuntimeProfile::Create(
         obj_pool(), "Fragment " + PrintId(instance_ctx.fragment_instance_id))),
-    instance_buffer_reservation_(new ReservationTracker) {
+    instance_buffer_reservation_(obj_pool()->Add(new ReservationTracker)) {
   Init();
 }
 
@@ -95,7 +95,8 @@ RuntimeState::RuntimeState(
     now_(new TimestampValue(TimestampValue::Parse(qctx.now_string))),
     utc_timestamp_(new TimestampValue(TimestampValue::Parse(qctx.utc_timestamp_string))),
     local_time_zone_(&TimezoneDatabase::GetUtcTimezone()),
-    profile_(RuntimeProfile::Create(obj_pool(), "<unnamed>")) {
+    profile_(RuntimeProfile::Create(obj_pool(), "<unnamed>")),
+    instance_buffer_reservation_(nullptr) {
   // We may use execution resources while evaluating exprs, etc. Decremented in
   // ReleaseResources() to release resources.
   local_query_state_->AcquireBackendResourceRefcount();
@@ -108,6 +109,10 @@ RuntimeState::RuntimeState(
 
 RuntimeState::~RuntimeState() {
   DCHECK(released_resources_) << "Must call ReleaseResources()";
+  // IMPALA-8270: run local_query_state_ destructor *before* other destructors so that
+  // teardown order for the TestEnv/fe-support RuntimeState matches the teardown order
+  // for the "real" RuntimeState. The previous divergence lead to hard-to-find bugs.
+  if (local_query_state_ != nullptr) local_query_state_.reset();
 }
 
 void RuntimeState::Init() {
@@ -126,12 +131,12 @@ void RuntimeState::Init() {
   total_network_send_timer_ = ADD_TIMER(runtime_profile(), "TotalNetworkSendTime");
   total_network_receive_timer_ = ADD_TIMER(runtime_profile(), "TotalNetworkReceiveTime");
 
-  instance_mem_tracker_.reset(new MemTracker(
+  instance_mem_tracker_ = obj_pool()->Add(new MemTracker(
       runtime_profile(), -1, runtime_profile()->name(), query_mem_tracker()));
 
   if (instance_buffer_reservation_ != nullptr) {
     instance_buffer_reservation_->InitChildTracker(profile_,
-        query_state_->buffer_reservation(), instance_mem_tracker_.get(),
+        query_state_->buffer_reservation(), instance_mem_tracker_,
         numeric_limits<int64_t>::max());
   }
 
@@ -163,7 +168,7 @@ Status RuntimeState::CreateCodegen() {
   if (codegen_.get() != NULL) return Status::OK();
   // TODO: add the fragment ID to the codegen ID as well
   RETURN_IF_ERROR(LlvmCodeGen::CreateImpalaCodegen(this,
-      instance_mem_tracker_.get(), PrintId(fragment_instance_id()), &codegen_));
+      instance_mem_tracker_, PrintId(fragment_instance_id()), &codegen_));
   codegen_->EnableOptimizations(true);
   profile_->AddChild(codegen_->runtime_profile());
   return Status::OK();
@@ -269,7 +274,7 @@ void RuntimeState::SetMemLimitExceeded(MemTracker* tracker,
 Status RuntimeState::CheckQueryState() {
   DCHECK(instance_mem_tracker_ != nullptr);
   if (UNLIKELY(instance_mem_tracker_->AnyLimitExceeded(MemLimit::HARD))) {
-    SetMemLimitExceeded(instance_mem_tracker_.get());
+    SetMemLimitExceeded(instance_mem_tracker_);
   }
   return GetQueryStatus();
 }
diff --git a/be/src/runtime/runtime-state.h b/be/src/runtime/runtime-state.h
index 5025215..d619bb0 100644
--- a/be/src/runtime/runtime-state.h
+++ b/be/src/runtime/runtime-state.h
@@ -107,10 +107,10 @@ class RuntimeState {
         ? instance_ctx_->fragment_instance_id
         : no_instance_id_;
   }
-  MemTracker* instance_mem_tracker() { return instance_mem_tracker_.get(); }
+  MemTracker* instance_mem_tracker() { return instance_mem_tracker_; }
   MemTracker* query_mem_tracker();  // reference to the query_state_'s memtracker
   ReservationTracker* instance_buffer_reservation() {
-    return instance_buffer_reservation_.get();
+    return instance_buffer_reservation_;
   }
   ThreadResourcePool* resource_pool() { return resource_pool_.get(); }
 
@@ -368,12 +368,15 @@ class RuntimeState {
   /// Counters for bytes sent over the network in this instance's tree, not owned.
   std::vector<RuntimeProfile::Counter*> bytes_sent_counters_;
 
-  /// Memory usage of this fragment instance, a child of 'query_mem_tracker_'.
-  boost::scoped_ptr<MemTracker> instance_mem_tracker_;
+  /// Memory usage of this fragment instance, a child of 'query_mem_tracker_'. Owned by
+  /// 'query_state_' and destroyed with the rest of the query's MemTracker hierarchy.
+  /// See IMPALA-8270 for a reason why having the QueryState own this is important.
+  MemTracker* instance_mem_tracker_ = nullptr;
 
   /// Buffer reservation for this fragment instance - a child of the query buffer
-  /// reservation. Non-NULL if 'query_state_' is not NULL.
-  boost::scoped_ptr<ReservationTracker> instance_buffer_reservation_;
+  /// reservation. Non-NULL if this is a finstance's RuntimeState used for query
+  /// execution. Owned by 'query_state_'.
+  ReservationTracker* const instance_buffer_reservation_;
 
   /// if true, execution should stop with a CANCELLED status
   bool is_cancelled_ = false;