You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by GitBox <gi...@apache.org> on 2021/04/07 18:28:50 UTC

[GitHub] [tvm] csullivan commented on a change in pull request #7518: [RFC]TECompiler: Staged refactor and removal of compile engine

csullivan commented on a change in pull request #7518:
URL: https://github.com/apache/tvm/pull/7518#discussion_r608872546



##########
File path: src/relay/backend/graph_executor_codegen.cc
##########
@@ -349,65 +380,47 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
     return AddNode(node, GetRef<Expr>(op));
   }
 
-  std::vector<GraphNodeRef> VisitExpr_(const CallNode* op) override {
-    Expr expr = GetRef<Expr>(op);
-    Function func;
-    if (op->op.as<OpNode>()) {
-      LOG(FATAL) << "Operators should be transformed away; try applying"
-                 << "the fuse_ops transformation to the expression.";
-    } else if (op->op.as<GlobalVarNode>()) {
-      LOG(FATAL) << "Not implemented";
-    } else if (op->op.as<FunctionNode>()) {
-      func = GetRef<Function>(op->op.as<FunctionNode>());
-    } else {
-      LOG(FATAL) << "TVM runtime does not support calls to " << op->op->GetTypeKey();
-    }
-    if (!func->HasNonzeroAttr(attr::kPrimitive)) {
-      LOG(FATAL) << "TVM only support calls to primitive functions "
-                 << "(i.e functions composed of fusable operator invocations)";
-    }
+  std::vector<GraphNodeRef> VisitExpr_(const CallNode* call_node) override {
+    relay::Call call = GetRef<Call>(call_node);
+    if (auto global_node = call->op.as<GlobalVarNode>()) {
 
-    auto pf0 = GetPackedFunc("relay.backend._make_CCacheKey");
-    auto pf1 = GetPackedFunc("relay.backend._CompileEngineLower");
-    Target target;
-    // Handle external function
-    if (func->GetAttr<String>(attr::kCompiler).defined()) {
-      target = Target("ext_dev");
-      CCacheKey key = (*pf0)(func, target);
-      CachedFunc ext_func = (*pf1)(compile_engine_, key);
-      ICHECK(ext_func.defined()) << "External function is not defined.";
-      UpdateConstants(func, &params_);
-      return GraphAddCallNode(op, ext_func->func_name, ext_func->func_name);
-    }
+      auto prim_fn_name = global_node->name_hint;
 
-    ICHECK_GE(storage_device_map_.count(expr), 0);
-    auto& device_type = storage_device_map_[expr][1];
-    auto call_dev_type = device_type[0]->value;
-    // Normal Relay Function
-    if (targets_.size() == 1) {
-      // homogeneous execution.
-      const auto& it = targets_.begin();
-      target = (*it).second;
-    } else {
-      // heterogeneous execution.
-      std::string call_dev_name;
-      if (call_dev_type == 0) {
-        call_dev_name = "llvm";
+      Target target;
+
+      // // Handle external function
+      // if (func->GetAttr<String>(attr::kCompiler).defined()) {
+      //   UpdateConstants(func, &params_);
+      //   return GraphAddCallNode(call_node, prim_fn_name, prim_fn_name);

Review comment:
       We still need to emit a call for the external function, no?

##########
File path: src/relay/backend/graph_executor_codegen.cc
##########
@@ -33,10 +33,15 @@
 #include <vector>
 
 #include "compile_engine.h"
+#include "te_compiler.h"
 #include "utils.h"
 
 namespace tvm {
 namespace relay {
+
+/// TODO(@jroesch, @chris): declare directly elsewhere

Review comment:
       ```suggestion
   /// TODO(@jroesch, @csullivan): declare directly elsewhere
   ```

##########
File path: tests/python/relay/test_backend_graph_executor.py
##########
@@ -126,6 +126,7 @@ def test_plan_memory():
     func = relay.Function([x, y], z)
     mod = tvm.IRModule.from_expr(func)
     mod = relay.transform.InferType()(mod)
+    print(mod)

Review comment:
       ```suggestion
   ```

##########
File path: tests/python/relay/test_backend_graph_executor.py
##########
@@ -231,10 +232,12 @@ def test_graph_executor_nested_tuples():
 
 
 if __name__ == "__main__":
-    test_plan_memory()
-    test_with_params()
-    test_add_op_scalar()
     test_add_op_tensor()
-    test_add_op_broadcast()
-    test_gru_like()
-    test_compile_nested_tuples()
+    # test_plan_memory()
+    # test_with_params()
+    # test_add_op_scalar()
+    # test_add_op_tensor()
+    # test_add_op_broadcast()
+    # test_gru_like()
+    # test_compile_nested_tuples()
+    # test_add_op_tensor()

Review comment:
       Shouldn't need to disable these:
   ```suggestion
       test_plan_memory()
       test_with_params()
       test_add_op_scalar()
       test_add_op_tensor()
       test_add_op_broadcast()
       test_gru_like()
       test_compile_nested_tuples()
       test_add_op_tensor()
   ```

##########
File path: src/relay/backend/compile_engine.cc
##########
@@ -731,50 +170,50 @@ class CompileEngineImpl : public CompileEngineNode {
     // No need to lower external functions for now. We will invoke the external
     // codegen tool once and lower all functions together.
     if (key->source_func->GetAttr<String>(attr::kCompiler).defined()) {
-      auto cache_node = make_object<CachedFuncNode>();
+      auto ir_module = IRModule();
       const auto name_node = key->source_func->GetAttr<String>(tvm::attr::kGlobalSymbol);
       ICHECK(name_node.defined()) << "External function has not been attached a name yet.";
-      cache_node->func_name = std::string(name_node.value());
-      cache_node->target = Target("ext_dev");
-      cache_node->funcs->Add(GlobalVar(cache_node->func_name), key->source_func);
-      value->cached_func = CachedFunc(cache_node);
+      auto func_name = std::string(name_node.value());
+      auto target = Target("ext_dev");
+      auto global_var = GlobalVar(func_name);
+      global_var->checked_type_ = key->source_func->checked_type();
+      ir_module->Add(global_var, key->source_func);
+      value->cached_func = CachedFunc(target, global_var, {}, {}, te::Schedule(), {}, ir_module);
       return value;
     }
+
     // Enforce use the target.
     With<Target> target_scope(key->target);
 
     ICHECK(!value->cached_func.defined());
-    auto cfunc = CreateSchedule(key->source_func, key->target);
-    auto cache_node = make_object<CachedFuncNode>(*(cfunc.operator->()));
+    auto cfunc = PrimFuncFor(key->source_func, key->target,
+                             [&](std::string name) { return GetUniqueName(name, name_map_); });
 
     // Skip lowering for device copy node.
     const Expr body = (key->source_func)->body;
     if (const CallNode* call_node = body.as<CallNode>()) {
       if (call_node->attrs.as<DeviceCopyAttrs>()) {
-        value->cached_func = CachedFunc(cache_node);
+        value->cached_func = cfunc;
         return value;
       }
     }
 
-    cache_node->func_name = GetUniqueName(cache_node->func_name);
     // NOTE: array will copy on write.
-    Array<te::Tensor> all_args = cache_node->inputs;
-    for (te::Tensor arg : cache_node->outputs) {
+    Array<te::Tensor> all_args = Array<te::Tensor>(cfunc->inputs);
+    for (te::Tensor arg : cfunc->outputs) {
       all_args.push_back(arg);
     }
-    // lower the function
-    if (const auto* f = runtime::Registry::Get("relay.backend.lower")) {
-      cache_node->funcs = (*f)(cfunc->schedule, all_args, cache_node->func_name, key->source_func);
-    } else {
-      using tvm::transform::PassContext;
-      With<PassContext> fresh_pass_ctx_scope(PassContext::Create());
 
-      std::unordered_map<te::Tensor, tir::Buffer> binds;
-      cache_node->funcs = tvm::lower(cfunc->schedule, all_args, cache_node->func_name, binds);
-    }
-    value->cached_func = CachedFunc(cache_node);
+    using tvm::transform::PassContext;
+    With<PassContext> fresh_pass_ctx_scope(PassContext::Create());
+
+    std::unordered_map<te::Tensor, tir::Buffer> binds;
+    auto func_name = cfunc->prim_fn_var->name_hint;
+    cfunc->funcs->Update(tvm::lower(cfunc->schedule, all_args, func_name, binds));

Review comment:
       It looks like the compiler has been relying on `if (const auto* f = runtime::Registry::Get("relay.backend.lower")) {` for long enough that the set of passes run in the python impl of tvm.lower (in python/driver/build_module.py) is different than the c++ `tvm::lower`. 
   
   My feeling is that we should use the same flow as before to avoid too much disruption and then refactor after this lands. 

##########
File path: src/runtime/graph_executor/graph_executor.cc
##########
@@ -372,6 +372,7 @@ void GraphExecutor::SetupOpExecs() {
       uint32_t eid = this->entry_id(e);
       args.push_back(*(data_entry_[eid].operator->()));
     }
+

Review comment:
       ```suggestion
   ```

##########
File path: src/relay/backend/te_compiler.cc
##########
@@ -0,0 +1,372 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "te_compiler.h"
+
+#include <tvm/driver/driver_api.h>
+#include <tvm/ir/type_functor.h>
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/attrs/device_copy.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/op.h>
+#include <tvm/relay/op_attr_types.h>
+#include <tvm/runtime/container.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/te/operation.h>
+#include <tvm/te/schedule.h>
+#include <tvm/te/schedule_pass.h>
+#include <tvm/topi/tags.h>
+
+#include <functional>
+#include <limits>
+#include <mutex>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "../transforms/pass_utils.h"
+#include "te_compiler_cache.h"
+#include "utils.h"
+
+namespace tvm {
+namespace relay {
+namespace tec {
+
+using namespace tvm::relay::transform;
+
+TVM_REGISTER_OBJECT_TYPE(TECompilerNode);
+
+class TECompilerImpl : public TECompilerNode {
+ public:
+  // Lower the function.
+  CachedFunc Lower(const CCacheKey& key) { return LowerInternal(key)->cached_func; }
+
+  // For now, build one module per function.
+  PackedFunc JIT(const CCacheKey& key) final {
+    CCacheValue value = LowerInternal(key);
+    if (value->packed_func != nullptr) {
+      return value->packed_func;
+    }
+    auto m = build(value->cached_func->funcs, key->target, Target(nullptr));
+    value->packed_func = m.GetFunction(value->cached_func->prim_fn_var->name_hint);
+    return value->packed_func;
+  }
+
+  CachedFunc LowerShapeFunc(const CCacheKey& key) final {
+    return LowerShapeFuncInternal(key)->cached_func;
+  }
+
+  Map<String, IRModule> GetLoweredFunctions() {
+    Map<String, IRModule> lowered_functions;
+    for (const auto& it : cache_) {
+      auto source_func = it.first;
+      auto lowered_func = it.second;
+      auto target = source_func->target;
+
+      if (!lowered_functions.count(target->str())) {
+        lowered_functions.Set(target->str(), IRModule(Map<GlobalVar, BaseFunc>({})));
+      }
+
+      lowered_functions[target->str()]->Update(lowered_func->cached_func->funcs);
+    }
+    return lowered_functions;
+  }
+
+  Array<tvm::runtime::Module> LowerExternalFunctions() {
+    Array<tvm::runtime::Module> ret;
+    std::unordered_map<std::string, std::string> cached_symbol;
+    std::vector<CCacheKey> cached_ext_funcs;
+    for (const auto& it : cache_) {
+      auto src_func = it.first->source_func;
+      ICHECK(src_func.defined());
+      if (src_func->GetAttr<String>(attr::kCompiler).defined()) {
+        auto code_gen = src_func->GetAttr<String>(attr::kCompiler);
+        ICHECK(code_gen.defined()) << "No external codegen is set";
+        std::string code_gen_name = code_gen.value();
+        cached_ext_funcs.push_back(it.first);
+
+        auto symbol_name = src_func->GetAttr<String>(tvm::attr::kGlobalSymbol);
+        ICHECK(symbol_name.defined()) << "No external symbol is set for:\n"
+                                      << AsText(src_func, false);
+
+        std::string sn = symbol_name.value();
+        if (cached_symbol.count(sn)) {
+          cached_symbol[sn] = code_gen_name;
+        } else {
+          ICHECK_NE(sn, code_gen_name)
+              << "Found duplicated symbol: " << sn << " for: " << code_gen_name;
+        }
+
+        std::string ext_name = "relay.ext." + code_gen_name;
+        auto pf = tvm::runtime::Registry::Get(ext_name);
+        ICHECK(pf) << "Failed to find the codegen tool for " << ext_name << "\n";
+        // No need to keep compiler attribute at this point, functions have been
+        // extracted for specific codegen.
+        src_func = WithAttr(std::move(src_func), attr::kCompiler, NullValue<ObjectRef>());
+        runtime::Module ext_mod = (*pf)(src_func);
+
+        ICHECK(ext_mod.defined()) << "No external runtime is generated.";
+        ret.push_back(ext_mod);
+      }
+    }
+
+    // No need to cache external functions as we collected them all to create
+    // external runtime modules.
+    for (const auto& it : cached_ext_funcs) {
+      cache_.erase(it);
+    }
+    return ret;
+  }
+
+  void Clear() final { cache_.clear(); }
+
+  // List all items in the cache.
+  Array<ObjectRef> ListItems() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    Array<ObjectRef> items;
+    for (auto& kv : cache_) {
+      items.push_back(kv.first);
+      items.push_back(kv.second);
+    }
+    return items;
+  }
+
+  /*!
+   * \brief Get the cache key of the function that is being lowered currently
+   * \return the cache key
+   */
+  CCacheKey GetCurrentCCacheKey() { return cur_ccache_key_; }
+
+ private:
+  // implement lowered func
+  CCacheValue LowerInternal(const CCacheKey& key) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    CCacheValue value;
+    auto it = cache_.find(key);
+    if (it != cache_.end()) {
+      it->second->use_count += 1;
+      if (it->second->cached_func.defined()) return it->second;
+      value = it->second;
+    } else {
+      value = CCacheValue(make_object<CCacheValueNode>());
+      value->use_count = 0;
+      if (!backend::IsCompileEngineCacheDisabled()) {
+        cache_[key] = value;
+      }
+    }
+    cur_ccache_key_ = key;
+
+    // No need to lower external functions for now. We will invoke the external
+    // codegen tool once and lower all functions together.
+    if (key->source_func->GetAttr<String>(attr::kCompiler).defined()) {
+      auto ir_module = IRModule();
+      const auto name_node = key->source_func->GetAttr<String>(tvm::attr::kGlobalSymbol);
+      ICHECK(name_node.defined()) << "External function has not been attached a name yet.";
+      auto func_name = std::string(name_node.value());
+      auto target = Target("ext_dev");
+      auto global_var = GlobalVar(func_name);
+      global_var->checked_type_ = key->source_func->checked_type();
+      ir_module->Add(global_var, key->source_func);
+      value->cached_func = CachedFunc(target, global_var, {}, {}, te::Schedule(), {}, ir_module);
+      return value;
+    }
+    // Enforce use the target.
+    With<Target> target_scope(key->target);
+
+    ICHECK(!value->cached_func.defined());
+    auto cfunc = PrimFuncFor(key->source_func, key->target,
+                             [&](std::string name) { return GetUniqueName(name, name_map_); });
+
+    // Skip lowering for device copy node.
+    const Expr body = (key->source_func)->body;
+    if (const CallNode* call_node = body.as<CallNode>()) {
+      if (call_node->attrs.as<DeviceCopyAttrs>()) {
+        value->cached_func = cfunc;
+        return value;
+      }
+    }
+
+    std::cout << "Input Size: " << cfunc->inputs.size() << std::endl;
+    std::cout << "Output Size: " << cfunc->outputs.size() << std::endl;

Review comment:
       ```suggestion
   ```

##########
File path: src/relay/backend/graph_executor_codegen.cc
##########
@@ -181,23 +186,56 @@ class GraphOpNode : public GraphNode {
   const std::string op_type_name_{"tvm_op"};
 };
 
-/*! \brief Code generator for graph executor */
+/*! \brief Code generator for the graph executor, produces a module containing the graph JSON,
+ * module, and parameters.
+ */
 class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<GraphNodeRef>> {
  public:
   GraphExecutorCodegen(runtime::Module* mod, const TargetsMap& targets) : mod_(mod) {
-    compile_engine_ = CompileEngine::Global();
     targets_ = targets;
   }
 
   LoweredOutput Codegen(relay::Function func) {
-    auto pf = GetPackedFunc("relay.backend.GraphPlanMemory");
-    storage_device_map_ = (*pf)(func);
+    // Jared: why do we do this? just call C++ API.
+    // auto pf = GetPackedFunc("relay.backend.GraphPlanMemory");
+    // storage_device_map_ = (*pf)(func);
+    storage_device_map_ = GraphPlanMemory(func);

Review comment:
       Likely it's better to run the CollectDeviceInfo (and future CollectStorageInfo) pass prior to GraphPlanMemory, and then provide the collected info to the memory planner. In that case we would not need to invoke the memory planner twice, and could instead do memory planning only after we have lowered everything.
    
   ```suggestion
       // TODO(csullivan): Consider refactoring CollectDeviceInfo out of memory
       // planner and provide as an argument for use during planning. In this way
       // we can avoid running the memory planner prior to lowering and do so only
       // thereafter.
       storage_device_map_ = GraphPlanMemory(func);
   ```

##########
File path: src/relay/backend/te_compiler.cc
##########
@@ -0,0 +1,372 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "te_compiler.h"
+
+#include <tvm/driver/driver_api.h>
+#include <tvm/ir/type_functor.h>
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/attrs/device_copy.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/op.h>
+#include <tvm/relay/op_attr_types.h>
+#include <tvm/runtime/container.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/te/operation.h>
+#include <tvm/te/schedule.h>
+#include <tvm/te/schedule_pass.h>
+#include <tvm/topi/tags.h>
+
+#include <functional>
+#include <limits>
+#include <mutex>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "../transforms/pass_utils.h"
+#include "te_compiler_cache.h"
+#include "utils.h"
+
+namespace tvm {
+namespace relay {
+namespace tec {
+
+using namespace tvm::relay::transform;
+
+TVM_REGISTER_OBJECT_TYPE(TECompilerNode);
+
+class TECompilerImpl : public TECompilerNode {
+ public:
+  // Lower the function.
+  CachedFunc Lower(const CCacheKey& key) { return LowerInternal(key)->cached_func; }
+
+  // For now, build one module per function.
+  PackedFunc JIT(const CCacheKey& key) final {
+    CCacheValue value = LowerInternal(key);
+    if (value->packed_func != nullptr) {
+      return value->packed_func;
+    }
+    auto m = build(value->cached_func->funcs, key->target, Target(nullptr));
+    value->packed_func = m.GetFunction(value->cached_func->prim_fn_var->name_hint);
+    return value->packed_func;
+  }
+
+  CachedFunc LowerShapeFunc(const CCacheKey& key) final {
+    return LowerShapeFuncInternal(key)->cached_func;
+  }
+
+  Map<String, IRModule> GetLoweredFunctions() {
+    Map<String, IRModule> lowered_functions;
+    for (const auto& it : cache_) {
+      auto source_func = it.first;
+      auto lowered_func = it.second;
+      auto target = source_func->target;
+
+      if (!lowered_functions.count(target->str())) {
+        lowered_functions.Set(target->str(), IRModule(Map<GlobalVar, BaseFunc>({})));
+      }
+
+      lowered_functions[target->str()]->Update(lowered_func->cached_func->funcs);
+    }
+    return lowered_functions;
+  }
+
+  Array<tvm::runtime::Module> LowerExternalFunctions() {
+    Array<tvm::runtime::Module> ret;
+    std::unordered_map<std::string, std::string> cached_symbol;
+    std::vector<CCacheKey> cached_ext_funcs;
+    for (const auto& it : cache_) {
+      auto src_func = it.first->source_func;
+      ICHECK(src_func.defined());
+      if (src_func->GetAttr<String>(attr::kCompiler).defined()) {
+        auto code_gen = src_func->GetAttr<String>(attr::kCompiler);
+        ICHECK(code_gen.defined()) << "No external codegen is set";
+        std::string code_gen_name = code_gen.value();
+        cached_ext_funcs.push_back(it.first);
+
+        auto symbol_name = src_func->GetAttr<String>(tvm::attr::kGlobalSymbol);
+        ICHECK(symbol_name.defined()) << "No external symbol is set for:\n"
+                                      << AsText(src_func, false);
+
+        std::string sn = symbol_name.value();
+        if (cached_symbol.count(sn)) {
+          cached_symbol[sn] = code_gen_name;
+        } else {
+          ICHECK_NE(sn, code_gen_name)
+              << "Found duplicated symbol: " << sn << " for: " << code_gen_name;
+        }
+
+        std::string ext_name = "relay.ext." + code_gen_name;
+        auto pf = tvm::runtime::Registry::Get(ext_name);
+        ICHECK(pf) << "Failed to find the codegen tool for " << ext_name << "\n";
+        // No need to keep compiler attribute at this point, functions have been
+        // extracted for specific codegen.
+        src_func = WithAttr(std::move(src_func), attr::kCompiler, NullValue<ObjectRef>());
+        runtime::Module ext_mod = (*pf)(src_func);
+
+        ICHECK(ext_mod.defined()) << "No external runtime is generated.";
+        ret.push_back(ext_mod);
+      }
+    }
+
+    // No need to cache external functions as we collected them all to create
+    // external runtime modules.
+    for (const auto& it : cached_ext_funcs) {
+      cache_.erase(it);
+    }
+    return ret;
+  }
+
+  void Clear() final { cache_.clear(); }
+
+  // List all items in the cache.
+  Array<ObjectRef> ListItems() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    Array<ObjectRef> items;
+    for (auto& kv : cache_) {
+      items.push_back(kv.first);
+      items.push_back(kv.second);
+    }
+    return items;
+  }
+
+  /*!
+   * \brief Get the cache key of the function that is being lowered currently
+   * \return the cache key
+   */
+  CCacheKey GetCurrentCCacheKey() { return cur_ccache_key_; }
+
+ private:
+  // implement lowered func
+  CCacheValue LowerInternal(const CCacheKey& key) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    CCacheValue value;
+    auto it = cache_.find(key);
+    if (it != cache_.end()) {
+      it->second->use_count += 1;
+      if (it->second->cached_func.defined()) return it->second;
+      value = it->second;
+    } else {
+      value = CCacheValue(make_object<CCacheValueNode>());
+      value->use_count = 0;
+      if (!backend::IsCompileEngineCacheDisabled()) {
+        cache_[key] = value;
+      }
+    }
+    cur_ccache_key_ = key;
+
+    // No need to lower external functions for now. We will invoke the external
+    // codegen tool once and lower all functions together.
+    if (key->source_func->GetAttr<String>(attr::kCompiler).defined()) {
+      auto ir_module = IRModule();
+      const auto name_node = key->source_func->GetAttr<String>(tvm::attr::kGlobalSymbol);
+      ICHECK(name_node.defined()) << "External function has not been attached a name yet.";
+      auto func_name = std::string(name_node.value());
+      auto target = Target("ext_dev");
+      auto global_var = GlobalVar(func_name);
+      global_var->checked_type_ = key->source_func->checked_type();
+      ir_module->Add(global_var, key->source_func);
+      value->cached_func = CachedFunc(target, global_var, {}, {}, te::Schedule(), {}, ir_module);
+      return value;
+    }
+    // Enforce use the target.
+    With<Target> target_scope(key->target);
+
+    ICHECK(!value->cached_func.defined());
+    auto cfunc = PrimFuncFor(key->source_func, key->target,
+                             [&](std::string name) { return GetUniqueName(name, name_map_); });
+
+    // Skip lowering for device copy node.
+    const Expr body = (key->source_func)->body;
+    if (const CallNode* call_node = body.as<CallNode>()) {
+      if (call_node->attrs.as<DeviceCopyAttrs>()) {
+        value->cached_func = cfunc;
+        return value;
+      }
+    }
+
+    std::cout << "Input Size: " << cfunc->inputs.size() << std::endl;
+    std::cout << "Output Size: " << cfunc->outputs.size() << std::endl;
+    // NOTE: array will copy on write.
+    Array<te::Tensor> all_args = Array<te::Tensor>(cfunc->inputs);
+    for (te::Tensor arg : cfunc->outputs) {
+      all_args.push_back(arg);
+    }
+
+    std::cout << "Allargs Size: " << all_args.size() << std::endl;
+
+    using tvm::transform::PassContext;
+    With<PassContext> fresh_pass_ctx_scope(PassContext::Create());
+
+    std::unordered_map<te::Tensor, tir::Buffer> binds;
+    auto func_name = cfunc->prim_fn_var->name_hint;
+    cfunc->funcs->Update(tvm::lower(cfunc->schedule, all_args, func_name, binds));

Review comment:
       Same comment as above, 
   > It looks like the compiler has been relying on `if (const auto* f = runtime::Registry::Get("relay.backend.lower")) {` for long enough that the set of passes run in the python impl of tvm.lower (in python/driver/build_module.py) is different than the c++ `tvm::lower`. 
   > 
   > My feeling is that we should use the same flow as before to avoid too much disruption and then refactor after this lands. 

##########
File path: src/runtime/graph_executor/graph_executor.cc
##########
@@ -433,6 +434,7 @@ GraphExecutor::CreateTVMOp(const TVMOpParam& param, const std::vector<DLTensor>&
   ICHECK(pf != nullptr) << "no such function in module: " << param.func_name;
 
   auto fexec = [arg_ptr, pf]() {
+    std::cout << "Number of args: " << static_cast<int>(arg_ptr->arg_values.size());

Review comment:
       ```suggestion
   ```

##########
File path: src/relay/backend/te_compiler.cc
##########
@@ -0,0 +1,372 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "te_compiler.h"
+
+#include <tvm/driver/driver_api.h>
+#include <tvm/ir/type_functor.h>
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/attrs/device_copy.h>
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/op.h>
+#include <tvm/relay/op_attr_types.h>
+#include <tvm/runtime/container.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/te/operation.h>
+#include <tvm/te/schedule.h>
+#include <tvm/te/schedule_pass.h>
+#include <tvm/topi/tags.h>
+
+#include <functional>
+#include <limits>
+#include <mutex>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "../transforms/pass_utils.h"
+#include "te_compiler_cache.h"
+#include "utils.h"
+
+namespace tvm {
+namespace relay {
+namespace tec {
+
+using namespace tvm::relay::transform;
+
+TVM_REGISTER_OBJECT_TYPE(TECompilerNode);
+
+class TECompilerImpl : public TECompilerNode {
+ public:
+  // Lower the function.
+  CachedFunc Lower(const CCacheKey& key) { return LowerInternal(key)->cached_func; }
+
+  // For now, build one module per function.
+  PackedFunc JIT(const CCacheKey& key) final {
+    CCacheValue value = LowerInternal(key);
+    if (value->packed_func != nullptr) {
+      return value->packed_func;
+    }
+    auto m = build(value->cached_func->funcs, key->target, Target(nullptr));
+    value->packed_func = m.GetFunction(value->cached_func->prim_fn_var->name_hint);
+    return value->packed_func;
+  }
+
+  CachedFunc LowerShapeFunc(const CCacheKey& key) final {
+    return LowerShapeFuncInternal(key)->cached_func;
+  }
+
+  Map<String, IRModule> GetLoweredFunctions() {
+    Map<String, IRModule> lowered_functions;
+    for (const auto& it : cache_) {
+      auto source_func = it.first;
+      auto lowered_func = it.second;
+      auto target = source_func->target;
+
+      if (!lowered_functions.count(target->str())) {
+        lowered_functions.Set(target->str(), IRModule(Map<GlobalVar, BaseFunc>({})));
+      }
+
+      lowered_functions[target->str()]->Update(lowered_func->cached_func->funcs);
+    }
+    return lowered_functions;
+  }
+
+  Array<tvm::runtime::Module> LowerExternalFunctions() {
+    Array<tvm::runtime::Module> ret;
+    std::unordered_map<std::string, std::string> cached_symbol;
+    std::vector<CCacheKey> cached_ext_funcs;
+    for (const auto& it : cache_) {
+      auto src_func = it.first->source_func;
+      ICHECK(src_func.defined());
+      if (src_func->GetAttr<String>(attr::kCompiler).defined()) {
+        auto code_gen = src_func->GetAttr<String>(attr::kCompiler);
+        ICHECK(code_gen.defined()) << "No external codegen is set";
+        std::string code_gen_name = code_gen.value();
+        cached_ext_funcs.push_back(it.first);
+
+        auto symbol_name = src_func->GetAttr<String>(tvm::attr::kGlobalSymbol);
+        ICHECK(symbol_name.defined()) << "No external symbol is set for:\n"
+                                      << AsText(src_func, false);
+
+        std::string sn = symbol_name.value();
+        if (cached_symbol.count(sn)) {
+          cached_symbol[sn] = code_gen_name;
+        } else {
+          ICHECK_NE(sn, code_gen_name)
+              << "Found duplicated symbol: " << sn << " for: " << code_gen_name;
+        }
+
+        std::string ext_name = "relay.ext." + code_gen_name;
+        auto pf = tvm::runtime::Registry::Get(ext_name);
+        ICHECK(pf) << "Failed to find the codegen tool for " << ext_name << "\n";
+        // No need to keep compiler attribute at this point, functions have been
+        // extracted for specific codegen.
+        src_func = WithAttr(std::move(src_func), attr::kCompiler, NullValue<ObjectRef>());
+        runtime::Module ext_mod = (*pf)(src_func);
+
+        ICHECK(ext_mod.defined()) << "No external runtime is generated.";
+        ret.push_back(ext_mod);
+      }
+    }
+
+    // No need to cache external functions as we collected them all to create
+    // external runtime modules.
+    for (const auto& it : cached_ext_funcs) {
+      cache_.erase(it);
+    }
+    return ret;
+  }
+
+  void Clear() final { cache_.clear(); }
+
+  // List all items in the cache.
+  Array<ObjectRef> ListItems() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    Array<ObjectRef> items;
+    for (auto& kv : cache_) {
+      items.push_back(kv.first);
+      items.push_back(kv.second);
+    }
+    return items;
+  }
+
+  /*!
+   * \brief Get the cache key of the function that is being lowered currently
+   * \return the cache key
+   */
+  CCacheKey GetCurrentCCacheKey() { return cur_ccache_key_; }
+
+ private:
+  // implement lowered func
+  CCacheValue LowerInternal(const CCacheKey& key) {
+    std::lock_guard<std::mutex> lock(mutex_);
+    CCacheValue value;
+    auto it = cache_.find(key);
+    if (it != cache_.end()) {
+      it->second->use_count += 1;
+      if (it->second->cached_func.defined()) return it->second;
+      value = it->second;
+    } else {
+      value = CCacheValue(make_object<CCacheValueNode>());
+      value->use_count = 0;
+      if (!backend::IsCompileEngineCacheDisabled()) {
+        cache_[key] = value;
+      }
+    }
+    cur_ccache_key_ = key;
+
+    // No need to lower external functions for now. We will invoke the external
+    // codegen tool once and lower all functions together.
+    if (key->source_func->GetAttr<String>(attr::kCompiler).defined()) {
+      auto ir_module = IRModule();
+      const auto name_node = key->source_func->GetAttr<String>(tvm::attr::kGlobalSymbol);
+      ICHECK(name_node.defined()) << "External function has not been attached a name yet.";
+      auto func_name = std::string(name_node.value());
+      auto target = Target("ext_dev");
+      auto global_var = GlobalVar(func_name);
+      global_var->checked_type_ = key->source_func->checked_type();
+      ir_module->Add(global_var, key->source_func);
+      value->cached_func = CachedFunc(target, global_var, {}, {}, te::Schedule(), {}, ir_module);
+      return value;
+    }
+    // Enforce use the target.
+    With<Target> target_scope(key->target);
+
+    ICHECK(!value->cached_func.defined());
+    auto cfunc = PrimFuncFor(key->source_func, key->target,
+                             [&](std::string name) { return GetUniqueName(name, name_map_); });
+
+    // Skip lowering for device copy node.
+    const Expr body = (key->source_func)->body;
+    if (const CallNode* call_node = body.as<CallNode>()) {
+      if (call_node->attrs.as<DeviceCopyAttrs>()) {
+        value->cached_func = cfunc;
+        return value;
+      }
+    }
+
+    std::cout << "Input Size: " << cfunc->inputs.size() << std::endl;
+    std::cout << "Output Size: " << cfunc->outputs.size() << std::endl;
+    // NOTE: array will copy on write.
+    Array<te::Tensor> all_args = Array<te::Tensor>(cfunc->inputs);
+    for (te::Tensor arg : cfunc->outputs) {
+      all_args.push_back(arg);
+    }
+
+    std::cout << "Allargs Size: " << all_args.size() << std::endl;

Review comment:
       ```suggestion
   ```

##########
File path: src/relay/backend/graph_executor_codegen.cc
##########
@@ -349,65 +380,47 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
     return AddNode(node, GetRef<Expr>(op));
   }
 
-  std::vector<GraphNodeRef> VisitExpr_(const CallNode* op) override {
-    Expr expr = GetRef<Expr>(op);
-    Function func;
-    if (op->op.as<OpNode>()) {
-      LOG(FATAL) << "Operators should be transformed away; try applying"
-                 << "the fuse_ops transformation to the expression.";
-    } else if (op->op.as<GlobalVarNode>()) {
-      LOG(FATAL) << "Not implemented";
-    } else if (op->op.as<FunctionNode>()) {
-      func = GetRef<Function>(op->op.as<FunctionNode>());
-    } else {
-      LOG(FATAL) << "TVM runtime does not support calls to " << op->op->GetTypeKey();
-    }
-    if (!func->HasNonzeroAttr(attr::kPrimitive)) {
-      LOG(FATAL) << "TVM only support calls to primitive functions "
-                 << "(i.e functions composed of fusable operator invocations)";
-    }
+  std::vector<GraphNodeRef> VisitExpr_(const CallNode* call_node) override {
+    relay::Call call = GetRef<Call>(call_node);
+    if (auto global_node = call->op.as<GlobalVarNode>()) {
 
-    auto pf0 = GetPackedFunc("relay.backend._make_CCacheKey");
-    auto pf1 = GetPackedFunc("relay.backend._CompileEngineLower");
-    Target target;
-    // Handle external function
-    if (func->GetAttr<String>(attr::kCompiler).defined()) {
-      target = Target("ext_dev");
-      CCacheKey key = (*pf0)(func, target);
-      CachedFunc ext_func = (*pf1)(compile_engine_, key);
-      ICHECK(ext_func.defined()) << "External function is not defined.";
-      UpdateConstants(func, &params_);
-      return GraphAddCallNode(op, ext_func->func_name, ext_func->func_name);
-    }
+      auto prim_fn_name = global_node->name_hint;
 
-    ICHECK_GE(storage_device_map_.count(expr), 0);
-    auto& device_type = storage_device_map_[expr][1];
-    auto call_dev_type = device_type[0]->value;
-    // Normal Relay Function
-    if (targets_.size() == 1) {
-      // homogeneous execution.
-      const auto& it = targets_.begin();
-      target = (*it).second;
-    } else {
-      // heterogeneous execution.
-      std::string call_dev_name;
-      if (call_dev_type == 0) {
-        call_dev_name = "llvm";
+      Target target;
+
+      // // Handle external function
+      // if (func->GetAttr<String>(attr::kCompiler).defined()) {
+      //   UpdateConstants(func, &params_);
+      //   return GraphAddCallNode(call_node, prim_fn_name, prim_fn_name);
+      // }
+
+      ICHECK_GE(storage_device_map_.count(call), 0);
+      auto& device_type = storage_device_map_[call][1];
+      auto call_dev_type = device_type[0]->value;
+      // Normal Relay Function
+      if (targets_.size() == 1) {
+        // homogeneous execution.
+        const auto& it = targets_.begin();
+        target = (*it).second;
       } else {
-        call_dev_name = runtime::DeviceName(call_dev_type);
-      }
-      if (targets_.count(call_dev_type) == 0) {
-        LOG(FATAL) << "No target is provided for device " << call_dev_name;
+        // heterogeneous execution.
+        std::string call_dev_name;
+        if (call_dev_type == 0) {
+          call_dev_name = "llvm";
+        } else {
+          call_dev_name = runtime::DeviceName(call_dev_type);
+        }
+        if (targets_.count(call_dev_type) == 0) {
+          LOG(FATAL) << "No target is provided for device " << call_dev_name;
+        }
+        target = targets_[call_dev_type];
       }
-      target = targets_[call_dev_type];
-    }
-    CCacheKey key = (*pf0)(func, target);
-    CachedFunc lowered_func = (*pf1)(compile_engine_, key);
-    if (!lowered_funcs_.count(target->str())) {
-      lowered_funcs_[target->str()] = IRModule(Map<GlobalVar, BaseFunc>({}));
+
+      return GraphAddCallNode(call_node, _GetUniqueName(prim_fn_name), prim_fn_name);
+    } else {
+      LOG(FATAL) << "BadCase: " << PrettyPrint(call) << std::endl;

Review comment:
```suggestion
      LOG(FATAL) << "Unhandled call type, GraphExec can only lower primitive function calls: " << PrettyPrint(call) << std::endl;
```

##########
File path: src/target/llvm/llvm_module.cc
##########
@@ -234,7 +238,7 @@ class LLVMModuleNode final : public runtime::ModuleNode {
       }
       funcs.push_back(f);
     }
-    ICHECK(funcs.size() > 0 || (could_have_linked_params && found_linked_params));
+    // ICHECK(funcs.size() > 0 || (could_have_linked_params && found_linked_params));

Review comment:
       This should still be a valid check, no?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org