You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by kw...@apache.org on 2017/03/10 02:16:22 UTC

incubator-impala git commit: IMPALA-4929: Safe concurrent access to IR function call graph

Repository: incubator-impala
Updated Branches:
  refs/heads/master 0467b0d54 -> 5f2fcd9f6


IMPALA-4929: Safe concurrent access to IR function call graph

Previously, LlvmCodeGen creates a map (fn_refs_map_) which
maps an IR function to its set of callees. That map was initialized
once at Impalad's start time and it is supposed to be read-only
afterwards.

However, the map was unintentionally modified by its user
LlvmCodeGen::MaterializeFunctionHelper() due to the use of
operator []. In particular, that operator may create a new
entry in the map if it doesn't exist already. This is possible
because the map initially only contains entries for functions
with at least one callee function. Using the operator [] with
functions with no callee function may modify the map. Since the
map is shared by all fragment instances, such unsafe concurrent
modification can cause missing or wrong callee functions to be
materialized, leading to failure to resolve symbols in LLVM.

This change fixes the problem above by:
1. Create an entry for all functions even if it has no callee.
   In fact, LlvmCodeGen::LinkModule() assumes all functions
   defined in main module will have entries in the map.

2. Introduce a new class CodegenCallGraph which encapsulates
   the map described above. The map was established once at
   initialization time and there is no interface to modify the
   map afterwards, thus preventing any accidental modification
   to the map at run time.

Change-Id: I1acd6bad80341121c8189d817e0fe62f2862f28a
Reviewed-on: http://gerrit.cloudera.org:8080/6326
Reviewed-by: Michael Ho <kw...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/5f2fcd9f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/5f2fcd9f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/5f2fcd9f

Branch: refs/heads/master
Commit: 5f2fcd9f6ae97c52f395f509b7b221a3b27d2c45
Parents: 0467b0d
Author: Michael Ho <kw...@cloudera.com>
Authored: Wed Mar 8 20:17:31 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Fri Mar 10 02:00:47 2017 +0000

----------------------------------------------------------------------
 be/src/codegen/CMakeLists.txt       |  1 +
 be/src/codegen/codegen-callgraph.cc | 95 ++++++++++++++++++++++++++++++++
 be/src/codegen/codegen-callgraph.h  | 73 ++++++++++++++++++++++++
 be/src/codegen/llvm-codegen.cc      | 77 +++++---------------------
 be/src/codegen/llvm-codegen.h       | 23 ++------
 5 files changed, 188 insertions(+), 81 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5f2fcd9f/be/src/codegen/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/be/src/codegen/CMakeLists.txt b/be/src/codegen/CMakeLists.txt
index d002c68..edd78a0 100644
--- a/be/src/codegen/CMakeLists.txt
+++ b/be/src/codegen/CMakeLists.txt
@@ -28,6 +28,7 @@ set(IR_NO_SSE_C_FILE $ENV{IMPALA_HOME}/be/generated-sources/impala-ir/impala-no-
 
 add_library(CodeGen
   codegen-anyval.cc
+  codegen-callgraph.cc
   codegen-symbol-emitter.cc
   llvm-codegen.cc
   instruction-counter.cc

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5f2fcd9f/be/src/codegen/codegen-callgraph.cc
----------------------------------------------------------------------
diff --git a/be/src/codegen/codegen-callgraph.cc b/be/src/codegen/codegen-callgraph.cc
new file mode 100644
index 0000000..73f5d18
--- /dev/null
+++ b/be/src/codegen/codegen-callgraph.cc
@@ -0,0 +1,95 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "codegen/codegen-callgraph.h"
+
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/GlobalVariable.h>
+
+#include "runtime/lib-cache.h"
+
+#include "common/names.h"
+
+using namespace llvm;
+using namespace strings;
+
+namespace impala {
+
+bool CodegenCallGraph::IsDefinedInImpalad(const string& fn_name) {
+  void* fn_ptr = nullptr;
+  Status status =
+      LibCache::instance()->GetSoFunctionPtr("", fn_name, &fn_ptr, nullptr, true);
+  return status.ok();
+}
+
+void CodegenCallGraph::FindGlobalUsers(User* val, vector<GlobalObject*>* users) {
+  for (Use& u: val->uses()) {
+    User* user = u.getUser();
+    if (isa<Instruction>(user)) {
+      Instruction* inst = dyn_cast<Instruction>(u.getUser());
+      users->push_back(inst->getFunction());
+    } else if (isa<GlobalVariable>(user)) {
+      GlobalVariable* gv = cast<GlobalVariable>(user);
+      string val_name = gv->getName();
+      // We strip global ctors and dtors out of the modules as they are not run.
+      if (val_name.find("llvm.global_ctors") == string::npos &&
+          val_name.find("llvm.global_dtors") == string::npos) {
+        users->push_back(gv);;
+      }
+    } else if (isa<Constant>(user)) {
+      FindGlobalUsers(user, users);
+    } else {
+      DCHECK(false) << "Unknown user's types for " << val->getName().str();
+    }
+  }
+}
+
+void CodegenCallGraph::Init(Module* module) {
+  DCHECK(!inited_);
+  // Create a mapping of functions to their referenced functions.
+  for (Function& fn: module->functions()) {
+    if (fn.isIntrinsic() || fn.isDeclaration()) continue;
+    string fn_name = fn.getName();
+    // Create an entry for a function if it doesn't exist already.
+    // This creates entries for functions which don't have any callee.
+    if (call_graph_.find(fn_name) == call_graph_.end()) {
+      call_graph_.emplace(fn_name, unordered_set<string>());
+    }
+    vector<GlobalObject*> users;
+    FindGlobalUsers(&fn, &users);
+    for (GlobalValue* val: users) {
+      const string& caller_name = val->getName();
+      DCHECK(isa<GlobalVariable>(val) || isa<Function>(val));
+      // 'call_graph_' contains functions which need to be materialized when a certain
+      // IR Function is materialized. We choose to include functions referenced by
+      // another IR function in the map even if it's defined in Impalad binary so it
+      // can be inlined for further optimization. This is not applicable for functions
+      // referenced by global variables only.
+      if (isa<GlobalVariable>(val)) {
+        if (IsDefinedInImpalad(fn_name)) continue;
+        fns_referenced_by_gv_.insert(fn_name);
+      } else {
+        // There may not be an entry for 'caller_name' yet. Create an entry if needed.
+        call_graph_[caller_name].insert(fn_name);
+      }
+    }
+  }
+  inited_ = true;
+}
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5f2fcd9f/be/src/codegen/codegen-callgraph.h
----------------------------------------------------------------------
diff --git a/be/src/codegen/codegen-callgraph.h b/be/src/codegen/codegen-callgraph.h
new file mode 100644
index 0000000..72427e7
--- /dev/null
+++ b/be/src/codegen/codegen-callgraph.h
@@ -0,0 +1,73 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef IMPALA_CODEGEN_CODEGEN_CALLGRAPH_H
+#define IMPALA_CODEGEN_CODEGEN_CALLGRAPH_H
+
+#include <boost/unordered_map.hpp>
+#include <boost/unordered_set.hpp>
+#include <llvm/IR/Module.h>
+
+#include "codegen/llvm-codegen.h"
+
+namespace impala {
+
+/// This class implements the callgraph of functions in a LLVM module.
+class CodegenCallGraph {
+ public:
+  /// Creates an uninitialized call graph.
+  CodegenCallGraph() : inited_(false) { }
+
+  /// Initializes the call graph based on the functions defined in 'module'.
+  void Init(llvm::Module* module);
+
+  /// Returns the callees of the function with name 'fn_name'.
+  /// Returns NULL if the function is not found in the call graph.
+  const boost::unordered_set<std::string>* GetCallees(const std::string& fn_name) const {
+    DCHECK(inited_);
+    CallGraph::const_iterator iter = call_graph_.find(fn_name);
+    return LIKELY(iter != call_graph_.end()) ? &iter->second : nullptr;
+  }
+
+  /// Returns the set of functions referenced by global variables in 'module'
+  const boost::unordered_set<std::string>& fns_referenced_by_gv() const {
+    DCHECK(inited_);
+    return fns_referenced_by_gv_;
+  }
+
+ private:
+  bool inited_;
+
+  /// Map of functions' names to their callee functions' names.
+  typedef boost::unordered_map<std::string, boost::unordered_set<std::string>> CallGraph;
+  CallGraph call_graph_;
+
+  /// The set of all functions referenced by global variables.
+  /// They always need to be materialized.
+  boost::unordered_set<std::string> fns_referenced_by_gv_;
+
+  /// Returns true if the function 'fn' is defined in the Impalad native code.
+  static bool IsDefinedInImpalad(const std::string& fn);
+
+  /// Find all global variables and functions which reference the llvm::Value 'val'
+  /// and append them to 'users'.
+  static void FindGlobalUsers(llvm::User* val, std::vector<llvm::GlobalObject*>* users);
+};
+
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5f2fcd9f/be/src/codegen/llvm-codegen.cc
----------------------------------------------------------------------
diff --git a/be/src/codegen/llvm-codegen.cc b/be/src/codegen/llvm-codegen.cc
index a548654..7977c01 100644
--- a/be/src/codegen/llvm-codegen.cc
+++ b/be/src/codegen/llvm-codegen.cc
@@ -53,6 +53,7 @@
 #include <llvm/Transforms/Utils/Cloning.h>
 
 #include "codegen/codegen-anyval.h"
+#include "codegen/codegen-callgraph.h"
 #include "codegen/codegen-symbol-emitter.h"
 #include "codegen/impala-ir-data.h"
 #include "codegen/instruction-counter.h"
@@ -101,46 +102,15 @@ DECLARE_string(local_library_dir);
 namespace impala {
 
 bool LlvmCodeGen::llvm_initialized_ = false;
-
 string LlvmCodeGen::cpu_name_;
 vector<string> LlvmCodeGen::cpu_attrs_;
-unordered_set<string> LlvmCodeGen::fns_to_always_materialize_;
-FnRefsMap LlvmCodeGen::fn_refs_map_;
+CodegenCallGraph LlvmCodeGen::shared_call_graph_;
 
 [[noreturn]] static void LlvmCodegenHandleError(
     void* user_data, const std::string& reason, bool gen_crash_diag) {
   LOG(FATAL) << "LLVM hit fatal error: " << reason.c_str();
 }
 
-bool LlvmCodeGen::IsDefinedInImpalad(const string& fn_name) {
-  void* fn_ptr = NULL;
-  Status status =
-      LibCache::instance()->GetSoFunctionPtr("", fn_name, &fn_ptr, NULL, true);
-  return status.ok();
-}
-
-void LlvmCodeGen::FindGlobalUsers(User* val, vector<GlobalObject*>* users) {
-  for (Use& u: val->uses()) {
-    User* user = u.getUser();
-    if (isa<Instruction>(user)) {
-      Instruction* inst = dyn_cast<Instruction>(u.getUser());
-      users->push_back(inst->getFunction());
-    } else if (isa<GlobalVariable>(user)) {
-      GlobalVariable* gv = cast<GlobalVariable>(user);
-      string val_name = gv->getName();
-      // We strip global ctors and dtors out of the modules as they are not run.
-      if (val_name.find("llvm.global_ctors") == string::npos &&
-          val_name.find("llvm.global_dtors") == string::npos) {
-        users->push_back(gv);;
-      }
-    } else if (isa<Constant>(user)) {
-      FindGlobalUsers(user, users);
-    } else {
-      DCHECK(false) << "Unknown user's types for " << val->getName().str();
-    }
-  }
-}
-
 Status LlvmCodeGen::InitializeLlvm(bool load_backend) {
   DCHECK(!llvm_initialized_);
   llvm::remove_fatal_error_handler();
@@ -187,28 +157,8 @@ Status LlvmCodeGen::InitializeLlvm(bool load_backend) {
     }
   }
 
-  // Create a mapping of functions to their referenced functions.
-  for (Function& fn: init_codegen->module_->functions()) {
-    if (fn.isIntrinsic() || fn.isDeclaration()) continue;
-    string fn_name = fn.getName();
-    vector<GlobalObject*> users;
-    FindGlobalUsers(&fn, &users);
-    for (GlobalValue* val: users) {
-      string key = val->getName();
-      DCHECK(isa<GlobalVariable>(val) || isa<Function>(val));
-      // 'fn_refs_map_' contains functions which need to be materialized when a certain
-      // IR Function is materialized. We choose to include functions referenced by
-      // another IR function in the map even if it's defined in Impalad binary so it
-      // can be inlined for further optimization. This is not applicable for functions
-      // referenced by global variables only.
-      if (isa<GlobalVariable>(val)) {
-        if (IsDefinedInImpalad(fn_name)) continue;
-        fns_to_always_materialize_.insert(fn_name);
-      } else {
-        fn_refs_map_[key].insert(fn_name);
-      }
-    }
-  }
+  // Initialize the global shared call graph.
+  shared_call_graph_.Init(init_codegen->module_);
   return Status::OK();
 }
 
@@ -336,8 +286,9 @@ Status LlvmCodeGen::LinkModule(const string& file) {
   // are chosen by the linker or referenced by functions in the new module. Note that
   // linkModules() will materialize functions defined only in the new module.
   for (Function& fn: new_module->functions()) {
-    if (fn_refs_map_.find(fn.getName()) != fn_refs_map_.end()) {
-      Function* local_fn = module_->getFunction(fn.getName());
+    const string& fn_name = fn.getName();
+    if (shared_call_graph_.GetCallees(fn_name) != nullptr) {
+      Function* local_fn = module_->getFunction(fn_name);
       RETURN_IF_ERROR(MaterializeFunction(local_fn));
     }
   }
@@ -388,7 +339,7 @@ Status LlvmCodeGen::CreateImpalaCodegen(RuntimeState* state,
   }
 
   // Materialize functions referenced by the global variables.
-  for (const string& fn_name : fns_to_always_materialize_) {
+  for (const string& fn_name : shared_call_graph_.fns_referenced_by_gv()) {
     Function* fn = codegen->module_->getFunction(fn_name);
     DCHECK(fn != nullptr);
     RETURN_IF_ERROR(codegen->MaterializeFunction(fn));
@@ -638,11 +589,13 @@ Status LlvmCodeGen::MaterializeFunctionHelper(Function *fn) {
 
   // Materialized functions are marked as not materializable by LLVM.
   DCHECK(!fn->isMaterializable());
-  const unordered_set<string>& callees = fn_refs_map_[fn->getName().str()];
-  for (const string& callee: callees) {
-    Function* callee_fn = module_->getFunction(callee);
-    DCHECK(callee_fn != nullptr);
-    RETURN_IF_ERROR(MaterializeFunctionHelper(callee_fn));
+  const unordered_set<string>* callees = shared_call_graph_.GetCallees(fn->getName());
+  if (callees != nullptr) {
+    for (const string& callee : *callees) {
+      Function* callee_fn = module_->getFunction(callee);
+      DCHECK(callee_fn != nullptr);
+      RETURN_IF_ERROR(MaterializeFunctionHelper(callee_fn));
+    }
   }
   return Status::OK();
 }

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/5f2fcd9f/be/src/codegen/llvm-codegen.h
----------------------------------------------------------------------
diff --git a/be/src/codegen/llvm-codegen.h b/be/src/codegen/llvm-codegen.h
index 058403d..5a8c494 100644
--- a/be/src/codegen/llvm-codegen.h
+++ b/be/src/codegen/llvm-codegen.h
@@ -27,7 +27,6 @@
 #include <vector>
 #include <boost/scoped_ptr.hpp>
 
-#include <boost/unordered_map.hpp>
 #include <boost/unordered_set.hpp>
 
 #include <llvm/IR/DerivedTypes.h>
@@ -72,6 +71,7 @@ namespace llvm {
 
 namespace impala {
 
+class CodegenCallGraph;
 class CodegenSymbolEmitter;
 class ImpalaMCJITMemoryManager;
 class SubExprElimination;
@@ -82,8 +82,6 @@ class LlvmBuilder : public llvm::IRBuilder<> {
   using llvm::IRBuilder<>::IRBuilder;
 };
 
-/// Map of functions' names to their callee functions' names.
-typedef boost::unordered_map<std::string, boost::unordered_set<std::string>> FnRefsMap;
 
 /// LLVM code generator.  This is the top level object to generate jitted code.
 //
@@ -521,13 +519,6 @@ class LlvmCodeGen {
   friend class LlvmCodeGenTest;
   friend class SubExprElimination;
 
-  /// Returns true if the function 'fn' is defined in the Impalad native code.
-  static bool IsDefinedInImpalad(const std::string& fn);
-
-  /// Find all global variables and functions which reference the llvm::Value 'val'
-  /// and return them in 'users'.
-  static void FindGlobalUsers(llvm::User* val, std::vector<llvm::GlobalObject*>* users);
-
   /// Top level codegen object. 'module_id' is used for debugging when outputting the IR.
   LlvmCodeGen(RuntimeState* state, ObjectPool* pool, MemTracker* parent_mem_tracker,
       const std::string& module_id);
@@ -643,15 +634,9 @@ class LlvmCodeGen {
   static std::string cpu_name_;
   static std::vector<std::string> cpu_attrs_;
 
-  /// A call graph for all IR functions in the main module. Used for determining
-  /// dependencies when materializing IR functions.
-  static FnRefsMap fn_refs_map_;
-
-  /// This set contains names of all functions which always need to be materialized.
-  /// They are referenced by global variables but NOT defined in the Impalad native
-  /// code (they may have been inlined by gcc). These functions are always materialized
-  /// when a module is loaded to ensure that LLVM can resolve references to them.
-  static boost::unordered_set<std::string> fns_to_always_materialize_;
+  /// A global shared call graph for all IR functions in the main module.
+  /// Used for determining dependencies when materializing IR functions.
+  static CodegenCallGraph shared_call_graph_;
 
   /// Pointer to the RuntimeState which owns this codegen object. Needed in
   /// InlineConstFnAttr() to access the query options.