Posted to commits@tvm.apache.org by GitBox <gi...@apache.org> on 2022/05/19 21:07:54 UTC

[GitHub] [tvm] altanh commented on a diff in pull request #11286: Dynamic management

altanh commented on code in PR #11286:
URL: https://github.com/apache/tvm/pull/11286#discussion_r877531120


##########
include/tvm/runtime/ndarray.h:
##########
@@ -73,7 +73,7 @@ class NDArray : public ObjectRef {
   explicit NDArray(ObjectPtr<Object> data) : ObjectRef(data) {}
 
   /*! \brief reset the content of NDArray to be nullptr */
-  inline void reset();
+  inline void reset() { data_.reset(); }

Review Comment:
   is this change necessary?



##########
src/runtime/graph_executor/graph_executor.cc:
##########
@@ -59,10 +61,98 @@ inline size_t GetDataAlignment(const DLTensor& arr) {
 void GraphExecutor::Run() {
   // setup the array and requirements.
   for (size_t i = 0; i < op_execs_.size(); ++i) {
-    if (op_execs_[i]) op_execs_[i]();
+    if (op_execs_[i]) {
+      PerformOp(i);
+    }
+  }
+}
+
+void GraphExecutor::RematerializeTensor(size_t nid) { PerformOp(nid); }
+
+void GraphExecutor::PerformOp(size_t nid) {
+  auto node = nodes_[nid];
+  // first, bump the reference count of every input tensor;
+  // rematerialize any input that is no longer in memory
+  for (const auto& input : node.inputs) {
+    // CPU tensors are never evicted, so skip them
+    if (pool_entry_[get_sid(entry_id(input))].device_type == static_cast<int>(kDLCPU)) continue;
+    // graph inputs and parameters never need rematerialization
+    if (std::find(input_nodes_.begin(), input_nodes_.end(), entry_id(input)) != input_nodes_.end())
+      continue;
+    if (std::find(nodes_not_evicted_.begin(), nodes_not_evicted_.end(), entry_id(input)) != nodes_not_evicted_.end())
+      continue;
+    // otherwise, rematerialize any input tensor that is not in memory
+    // and update its reference count at the same time
+
+    if (is_evicted(entry_id(input))) {
+        RematerializeTensor(entry_id(input));
+    }
+    ref_cnt[entry_id(input)] += 1;
+  }
+  // now that all inputs are resident, compute the current memory
+  // locations of the input/output tensors
+  std::unordered_set<uint32_t> input_node_eids;
+  for (unsigned int nid_ : input_nodes_) {
+    input_node_eids.insert(entry_id(nid_, 0));
+  }
+  std::unordered_set<uint32_t> output_node_eids;
+  for (auto& output : outputs_) {
+    output_node_eids.insert(entry_id(output));
+  }
+
+  const auto& inode = nodes_[nid];
+  std::vector<DLTensor> args;
+
+  for (const auto& e : inode.inputs) {
+    uint32_t eid = this->entry_id(e);
+    args.push_back(*(data_entry_[eid].operator->()));
+  }
+
+  /*
+   * Note that some output tensors might not be allocated yet, because an op can
+   * produce multiple values. Thus we check whether each output was allocated and
+   * set up its data entry as well.
+   */
+  for (uint32_t index = 0; index < inode.param.num_outputs; ++index) {
+    uint32_t eid = this->entry_id(nid, index);
+    set_tensor(eid);
+    args.push_back(*(data_entry_[eid].operator->()));
+  }
+
+  // now update the memory locations recorded in the TVM op's arguments
+  std::shared_ptr<OpArgs> updated_args = UpdateTVMOp(nid, nodes_[nid].param, args);
+
+  UpdateInputOutputTensors(input_node_eids, output_node_eids, nid, updated_args);
+
+  // now we can execute the operator
+  op_execs_[nid]();
+  // finally, perform eviction
+  PerformEviction(nid);
+}
+
+void GraphExecutor::PerformEviction(size_t nid) {
+  auto node = nodes_[nid];
+  for (const auto& input : node.inputs) {
+    // CPU tensors are never evicted, so skip them

Review Comment:
   I understand the reasoning, but should this be hardcoded?
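   
   for illustration, one way to avoid the hardcoded check is to route it through a
   small overridable predicate. a minimal sketch, reusing the PR's `pool_entry_`,
   `get_sid`, and `entry_id` helpers; the name `IsEvictableDevice` is hypothetical:
   
       // devices whose tensors participate in eviction/rematerialization;
       // defaults to "everything except CPU", but a subclass could override it
       bool GraphExecutor::IsEvictableDevice(int device_type) const {
         return device_type != static_cast<int>(kDLCPU);
       }
   
       // call site, replacing the hardcoded comparison:
       if (!IsEvictableDevice(pool_entry_[get_sid(entry_id(input))].device_type)) continue;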



##########
src/runtime/graph_executor/graph_executor.cc:
##########
@@ -422,14 +527,31 @@ void GraphExecutor::SetupStorage() {
   for (size_t i = 0; i < data_entry_.size(); ++i) {
     int storage_id = attrs_.storage_id[i];
     ICHECK_LT(static_cast<size_t>(storage_id), storage_pool_.size());
-    data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]);
+    // only create views for parameters and tensors allocated on CPU
+    if (pool_entry_[storage_id].device_type == static_cast<int>(kDLCPU)) {
+      data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]);
+    }
 
-    const DLTensor* tmp = data_entry_[i].operator->();
-    data_alignment_[i] = details::GetDataAlignment(*tmp);
+    // determine the required data alignment from the recorded vtype
+    size_t align = (vtype[i].bits / 8) * vtype[i].lanes;
+    data_alignment_[i] = align < kAllocAlignment ? kAllocAlignment : align;
   }
 }
 
 void GraphExecutor::SetupOpExecs() {
+  ref_cnt.resize(num_node_entries(), 0);
+  nodes_not_evicted_ = {};
+  for (size_t nid = 0; nid < num_node_entries(); nid++) {
+    auto slice_node = nodes_[nid];
+    if (slice_node.name.find("slice") != std::string::npos) {

Review Comment:
   can you explain this?



##########
src/runtime/graph_executor/graph_executor.cc:
##########
@@ -59,10 +61,98 @@ inline size_t GetDataAlignment(const DLTensor& arr) {
 void GraphExecutor::Run() {
   // setup the array and requirements.
   for (size_t i = 0; i < op_execs_.size(); ++i) {
-    if (op_execs_[i]) op_execs_[i]();
+    if (op_execs_[i]) {
+      PerformOp(i);
+    }
+  }
+}
+
+void GraphExecutor::RematerializeTensor(size_t nid) { PerformOp(nid); }
+
+void GraphExecutor::PerformOp(size_t nid) {
+  auto node = nodes_[nid];
+  // first, bump the reference count of every input tensor;
+  // rematerialize any input that is no longer in memory
+  for (const auto& input : node.inputs) {
+    // CPU tensors are never evicted, so skip them
+    if (pool_entry_[get_sid(entry_id(input))].device_type == static_cast<int>(kDLCPU)) continue;
+    // graph inputs and parameters never need rematerialization
+    if (std::find(input_nodes_.begin(), input_nodes_.end(), entry_id(input)) != input_nodes_.end())
+      continue;
+    if (std::find(nodes_not_evicted_.begin(), nodes_not_evicted_.end(), entry_id(input)) != nodes_not_evicted_.end())
+      continue;
+    // otherwise, rematerialize any input tensor that is not in memory
+    // and update its reference count at the same time
+
+    if (is_evicted(entry_id(input))) {
+        RematerializeTensor(entry_id(input));
+    }
+    ref_cnt[entry_id(input)] += 1;
+  }
+  // now that all inputs are resident, compute the current memory
+  // locations of the input/output tensors
+  std::unordered_set<uint32_t> input_node_eids;
+  for (unsigned int nid_ : input_nodes_) {
+    input_node_eids.insert(entry_id(nid_, 0));
+  }
+  std::unordered_set<uint32_t> output_node_eids;
+  for (auto& output : outputs_) {
+    output_node_eids.insert(entry_id(output));
+  }
+
+  const auto& inode = nodes_[nid];
+  std::vector<DLTensor> args;
+
+  for (const auto& e : inode.inputs) {
+    uint32_t eid = this->entry_id(e);
+    args.push_back(*(data_entry_[eid].operator->()));
+  }
+
+  /*
+   * Note that some output tensors might not be allocated yet, because an op can
+   * produce multiple values. Thus we check whether each output was allocated and
+   * set up its data entry as well.
+   */
+  for (uint32_t index = 0; index < inode.param.num_outputs; ++index) {
+    uint32_t eid = this->entry_id(nid, index);
+    set_tensor(eid);
+    args.push_back(*(data_entry_[eid].operator->()));
+  }
+
+  // now update the memory locations recorded in the TVM op's arguments
+  std::shared_ptr<OpArgs> updated_args = UpdateTVMOp(nid, nodes_[nid].param, args);
+
+  UpdateInputOutputTensors(input_node_eids, output_node_eids, nid, updated_args);
+
+  // now we can execute the operator
+  op_execs_[nid]();
+  // finally, perform eviction
+  PerformEviction(nid);
+}
+
+void GraphExecutor::PerformEviction(size_t nid) {

Review Comment:
   if I understand correctly, this function decreases the ref counts and tries to eagerly evict the inputs of `nid`. I would prefer if the logic were split out so that the reference decrement + eager eviction apply directly to `nid` (and then this function could be called in a loop over the inputs once an op has been performed).
   
   would also suggest a different name, maybe call it `ReleaseTensor` or something similar, since this is different from the core eviction loop in DTR
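   
   a minimal sketch of what that split might look like (`ReleaseTensor` and the
   helper `EvictTensor` are assumptions, mirroring the PR's `ref_cnt` / `entry_id`
   bookkeeping):
   
       // decrement the ref count of a single entry; once no pending op
       // needs it anymore, evict its storage eagerly
       void GraphExecutor::ReleaseTensor(uint32_t eid) {
         if (ref_cnt[eid] > 0) --ref_cnt[eid];
         if (ref_cnt[eid] == 0) {
           EvictTensor(eid);  // hypothetical helper that frees the backing storage
         }
       }
   
       // after op `nid` has been performed, release each of its inputs:
       for (const auto& input : nodes_[nid].inputs) {
         ReleaseTensor(entry_id(input));
       }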



##########
src/runtime/graph_executor/graph_executor.cc:
##########
@@ -59,10 +61,98 @@ inline size_t GetDataAlignment(const DLTensor& arr) {
 void GraphExecutor::Run() {
   // setup the array and requirements.
   for (size_t i = 0; i < op_execs_.size(); ++i) {
-    if (op_execs_[i]) op_execs_[i]();
+    if (op_execs_[i]) {
+      PerformOp(i);
+    }
+  }
+}
+
+void GraphExecutor::RematerializeTensor(size_t nid) { PerformOp(nid); }

Review Comment:
   could you subclass the `GraphExecutor` so that all these changes don't become default behavior? hopefully it's not too tricky to just override the relevant methods
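   
   roughly what that could look like (the class name is hypothetical, and the
   relevant base methods would need to be made virtual first):
   
       // opt-in executor: the stock GraphExecutor keeps its current behavior
       class DynamicGraphExecutor : public GraphExecutor {
        public:
         void Run() override {
           for (size_t i = 0; i < op_execs_.size(); ++i) {
             if (op_execs_[i]) PerformOp(i);
           }
         }
         // ... PerformOp, PerformEviction, SetupStorage, etc. overridden here ...
       };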


