You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by GitBox <gi...@apache.org> on 2022/05/20 02:34:52 UTC

[GitHub] [tvm] ruxiliang commented on a diff in pull request #11286: Dynamic management

ruxiliang commented on code in PR #11286:
URL: https://github.com/apache/tvm/pull/11286#discussion_r877682467


##########
include/tvm/runtime/ndarray.h:
##########
@@ -73,7 +73,7 @@ class NDArray : public ObjectRef {
   explicit NDArray(ObjectPtr<Object> data) : ObjectRef(data) {}
 
   /*! \brief reset the content of NDArray to be nullptr */
-  inline void reset();
+  inline void reset(){ data_.reset(); }

Review Comment:
   As I recall, if I do not add this, the linker (ld) cannot find the `reset` method. I will check this again.



##########
src/runtime/graph_executor/graph_executor.cc:
##########
@@ -59,10 +61,98 @@ inline size_t GetDataAlignment(const DLTensor& arr) {
 void GraphExecutor::Run() {
   // setup the array and requirements.
   for (size_t i = 0; i < op_execs_.size(); ++i) {
-    if (op_execs_[i]) op_execs_[i]();
+    if (op_execs_[i]) {
+      PerformOp(i);
+    }
+  }
+}
+
+void GraphExecutor::RematerializeTensor(size_t nid) { PerformOp(nid); }
+
+void GraphExecutor::PerformOp(size_t nid) {
+  auto node = nodes_[nid];
+  // first we need to get all the input tensors and add the reference count
+  // if they are not in memory, rematerialize them
+  for (const auto& input : node.inputs) {
+    // also we do not care about cpu
+    if (pool_entry_[get_sid(entry_id(input))].device_type == (int)(kDLCPU)) continue;
+    // we do not need to care about input and parameter
+    if (std::find(input_nodes_.begin(), input_nodes_.end(), entry_id(input)) != input_nodes_.end())
+      continue;
+    if (std::find(nodes_not_evicted_.begin(), nodes_not_evicted_.end(), entry_id(input)) != nodes_not_evicted_.end())
+      continue;
+    // otherwise we should rematerialize the input tensors that are not in the memory
+    // and update the reference count at the same time
+
+    if (is_evicted(entry_id(input))) {
+        RematerializeTensor(entry_id(input));
+    }
+    ref_cnt[entry_id(input)] += 1;
+  }
+  // now all of the inputs are recovered, we can set up to compute the location of input/output
+  // compute current input memory location of the tensor
+  std::unordered_set<uint32_t> input_node_eids;
+  for (unsigned int nid_ : input_nodes_) {
+    input_node_eids.insert(entry_id(nid_, 0));
+  }
+  std::unordered_set<uint32_t> output_node_eids;
+  for (auto& output : outputs_) {
+    output_node_eids.insert(entry_id(output));
+  }
+
+  const auto& inode = nodes_[nid];
+  std::vector<DLTensor> args;
+
+  for (const auto& e : inode.inputs) {
+    uint32_t eid = this->entry_id(e);
+    args.push_back(*(data_entry_[eid].operator->()));
+  }
+
+  /*
+   * Note that some output tensors might not be allocated because some op might produce plural
+   * values Thus we should check if the outputs were allocated and set up the data entry as well
+   */
+  for (uint32_t index = 0; index < inode.param.num_outputs; ++index) {
+    uint32_t eid = this->entry_id(nid, index);
+    set_tensor(eid);
+    args.push_back(*(data_entry_[eid].operator->()));
+  }
+
+  // Now we should update the memory location in TVM Operator
+  std::shared_ptr<OpArgs> updated_args = UpdateTVMOp(nid, nodes_[nid].param, args);
+
+  UpdateInputOutputTensors(input_node_eids, output_node_eids, nid, updated_args);
+
+  // now we can execute the operator
+  op_execs_[nid]();
+  // at last, we should perform eviction
+  PerformEviction(nid);
+}
+
+void GraphExecutor::PerformEviction(size_t nid) {
+  auto node = nodes_[nid];
+  for (const auto& input : node.inputs) {
+    // we do not care about cpu

Review Comment:
   I think leaving host tensors alone is not necessary; I ignored those tensors only so that I could get the experimental results quickly.



##########
src/runtime/graph_executor/graph_executor.cc:
##########
@@ -59,10 +61,98 @@ inline size_t GetDataAlignment(const DLTensor& arr) {
 void GraphExecutor::Run() {
   // setup the array and requirements.
   for (size_t i = 0; i < op_execs_.size(); ++i) {
-    if (op_execs_[i]) op_execs_[i]();
+    if (op_execs_[i]) {
+      PerformOp(i);
+    }
+  }
+}
+
+void GraphExecutor::RematerializeTensor(size_t nid) { PerformOp(nid); }

Review Comment:
   I will try to subclass it.



##########
src/runtime/graph_executor/graph_executor.cc:
##########
@@ -422,14 +527,31 @@ void GraphExecutor::SetupStorage() {
   for (size_t i = 0; i < data_entry_.size(); ++i) {
     int storage_id = attrs_.storage_id[i];
     ICHECK_LT(static_cast<size_t>(storage_id), storage_pool_.size());
-    data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]);
+    // just make view for allocated parameters and tensors in cpu
+    if (pool_entry_[storage_id].device_type == static_cast<int>(kDLCPU)){
+      data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]);
+    }
 
-    const DLTensor* tmp = data_entry_[i].operator->();
-    data_alignment_[i] = details::GetDataAlignment(*tmp);
+    // determine memory usage using grabbed vtype
+    size_t align = (vtype[i].bits / 8) * vtype[i].lanes;
+    data_alignment_[i] = align < kAllocAlignment ? kAllocAlignment : align;
   }
 }
 
 void GraphExecutor::SetupOpExecs() {
+  ref_cnt.resize(num_node_entries(),0);
+  nodes_not_evicted_ = {};
+  for(size_t nid = 0;nid < num_node_entries();nid++){
+    auto slice_node = nodes_[nid];
+    if (slice_node.name.find("slice") != std::string::npos){

Review Comment:
   In my experiments, I found that if I do not leave the slice nodes alone, the result of the computation is wrong, so I added this logic to ensure the correctness of the computation.



##########
src/runtime/graph_executor/graph_executor.cc:
##########
@@ -59,10 +61,98 @@ inline size_t GetDataAlignment(const DLTensor& arr) {
 void GraphExecutor::Run() {
   // setup the array and requirements.
   for (size_t i = 0; i < op_execs_.size(); ++i) {
-    if (op_execs_[i]) op_execs_[i]();
+    if (op_execs_[i]) {
+      PerformOp(i);
+    }
+  }
+}
+
+void GraphExecutor::RematerializeTensor(size_t nid) { PerformOp(nid); }
+
+void GraphExecutor::PerformOp(size_t nid) {
+  auto node = nodes_[nid];
+  // first we need to get all the input tensors and add the reference count
+  // if they are not in memory, rematerialize them
+  for (const auto& input : node.inputs) {
+    // also we do not care about cpu
+    if (pool_entry_[get_sid(entry_id(input))].device_type == (int)(kDLCPU)) continue;
+    // we do not need to care about input and parameter
+    if (std::find(input_nodes_.begin(), input_nodes_.end(), entry_id(input)) != input_nodes_.end())
+      continue;
+    if (std::find(nodes_not_evicted_.begin(), nodes_not_evicted_.end(), entry_id(input)) != nodes_not_evicted_.end())
+      continue;
+    // otherwise we should rematerialize the input tensors that are not in the memory
+    // and update the reference count at the same time
+
+    if (is_evicted(entry_id(input))) {
+        RematerializeTensor(entry_id(input));
+    }
+    ref_cnt[entry_id(input)] += 1;
+  }
+  // now all of the inputs are recovered, we can set up to compute the location of input/output
+  // compute current input memory location of the tensor
+  std::unordered_set<uint32_t> input_node_eids;
+  for (unsigned int nid_ : input_nodes_) {
+    input_node_eids.insert(entry_id(nid_, 0));
+  }
+  std::unordered_set<uint32_t> output_node_eids;
+  for (auto& output : outputs_) {
+    output_node_eids.insert(entry_id(output));
+  }
+
+  const auto& inode = nodes_[nid];
+  std::vector<DLTensor> args;
+
+  for (const auto& e : inode.inputs) {
+    uint32_t eid = this->entry_id(e);
+    args.push_back(*(data_entry_[eid].operator->()));
+  }
+
+  /*
+   * Note that some output tensors might not be allocated because some op might produce plural
+   * values Thus we should check if the outputs were allocated and set up the data entry as well
+   */
+  for (uint32_t index = 0; index < inode.param.num_outputs; ++index) {
+    uint32_t eid = this->entry_id(nid, index);
+    set_tensor(eid);
+    args.push_back(*(data_entry_[eid].operator->()));
+  }
+
+  // Now we should update the memory location in TVM Operator
+  std::shared_ptr<OpArgs> updated_args = UpdateTVMOp(nid, nodes_[nid].param, args);
+
+  UpdateInputOutputTensors(input_node_eids, output_node_eids, nid, updated_args);
+
+  // now we can execute the operator
+  op_execs_[nid]();
+  // at last, we should perform eviction
+  PerformEviction(nid);
+}
+
+void GraphExecutor::PerformEviction(size_t nid) {

Review Comment:
   I will change it.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org