You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by ma...@apache.org on 2020/12/05 10:21:36 UTC

[tvm] branch main updated: [AutoScheduler] Remove `max_registers_per_block` in HardwareParams (#7040)

This is an automated email from the ASF dual-hosted git repository.

masahi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new 878a0a9  [AutoScheduler] Remove `max_registers_per_block` in HardwareParams (#7040)
878a0a9 is described below

commit 878a0a999724796725e8516a83684b2b6ae7ba97
Author: Lianmin Zheng <li...@gmail.com>
AuthorDate: Sat Dec 5 02:21:19 2020 -0800

    [AutoScheduler] Remove `max_registers_per_block` in HardwareParams (#7040)
    
    * [AutoScheduler] Fix hardware params
    
    * address comments
---
 include/tvm/auto_scheduler/search_task.h | 10 +++++-----
 python/tvm/auto_scheduler/search_task.py |  8 ++++----
 src/auto_scheduler/feature.cc            |  2 +-
 src/auto_scheduler/measure_record.cc     |  4 ++--
 src/auto_scheduler/search_task.cc        | 19 ++++++++++---------
 5 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/include/tvm/auto_scheduler/search_task.h b/include/tvm/auto_scheduler/search_task.h
index 6d85835..60e721b 100755
--- a/include/tvm/auto_scheduler/search_task.h
+++ b/include/tvm/auto_scheduler/search_task.h
@@ -46,8 +46,8 @@ class HardwareParamsNode : public Object {
   // GPU related parameters got from device query API
   /*! \brief The max shared memory per block in bytes. */
   int max_shared_memory_per_block;
-  /*! \brief The max number of register per block. */
-  int max_registers_per_block;
+  /*! \brief The max local memory per block in bytes. */
+  int max_local_memory_per_block;
   /*! \brief The max number of threads per block. */
   int max_threads_per_block;
   /*! \brief The max vthread extent. */
@@ -60,7 +60,7 @@ class HardwareParamsNode : public Object {
     v->Visit("vector_unit_bytes", &vector_unit_bytes);
     v->Visit("cache_line_bytes", &cache_line_bytes);
     v->Visit("max_shared_memory_per_block", &max_shared_memory_per_block);
-    v->Visit("max_registers_per_block", &max_registers_per_block);
+    v->Visit("max_local_memory_per_block", &max_local_memory_per_block);
     v->Visit("max_threads_per_block", &max_threads_per_block);
     v->Visit("max_vthread_extent", &max_vthread_extent);
     v->Visit("warp_size", &warp_size);
@@ -90,13 +90,13 @@ class HardwareParams : public ObjectRef {
    * \param vector_unit_bytes The width of vector units in bytes.
    * \param cache_line_bytes The size of cache line in bytes.
    * \param max_shared_memory_per_block The max amount of shared memory per block for GPU.
-   * \param max_registers_per_block The max number of registers per block for GPU.
+   * \param max_local_memory_per_block The max amount of local memory per block for GPU.
    * \param max_threads_per_block The max number of threads per block for GPU.
    * \param max_vthread_extent The max extent of vthread for GPU.
    * \param warp_size The warp size for GPU
    */
   HardwareParams(int num_cores, int vector_unit_bytes, int cache_line_bytes,
-                 int max_shared_memory_per_block, int max_registers_per_block,
+                 int max_shared_memory_per_block, int max_local_memory_per_block,
                  int max_threads_per_block, int max_vthread_extent, int warp_size);
 
   TVM_DEFINE_OBJECT_REF_METHODS(HardwareParams, ObjectRef, HardwareParamsNode);
diff --git a/python/tvm/auto_scheduler/search_task.py b/python/tvm/auto_scheduler/search_task.py
index a22cd6e..b75c979 100644
--- a/python/tvm/auto_scheduler/search_task.py
+++ b/python/tvm/auto_scheduler/search_task.py
@@ -49,8 +49,8 @@ class HardwareParams(Object):
         The size of cache line in bytes.
     max_shared_memory_per_block : int
         The max shared memory per block in bytes.
-    max_registers_per_block : int
-        The max number of register per block.
+    max_local_memory_per_block : int
+        The max local memory per block in bytes.
     max_threads_per_block : int
         The max number of threads per block.
     max_vthread_extent : int
@@ -65,7 +65,7 @@ class HardwareParams(Object):
         vector_unit_bytes,
         cache_line_bytes,
         max_shared_memory_per_block,
-        max_registers_per_block,
+        max_local_memory_per_block,
         max_threads_per_block,
         max_vthread_extent,
         warp_size,
@@ -76,7 +76,7 @@ class HardwareParams(Object):
             vector_unit_bytes,
             cache_line_bytes,
             max_shared_memory_per_block,
-            max_registers_per_block,
+            max_local_memory_per_block,
             max_threads_per_block,
             max_vthread_extent,
             warp_size,
diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc
index 0df69b9..0a3d705 100755
--- a/src/auto_scheduler/feature.cc
+++ b/src/auto_scheduler/feature.cc
@@ -1310,7 +1310,7 @@ void GetPerStoreFeaturesWorkerFunc(const SearchTask& task, const State& state, i
       pass_list.push_back(tir::transform::Simplify());
       tvm::Map<String, tvm::PrimExpr> gpu_params{
           {"max_shared_memory_per_block", task->hardware_params->max_shared_memory_per_block},
-          {"max_local_memory_per_block", task->hardware_params->max_registers_per_block},
+          {"max_local_memory_per_block", task->hardware_params->max_local_memory_per_block},
           {"max_threads_per_block", task->hardware_params->max_threads_per_block},
           {"max_vector_bytes", task->hardware_params->vector_unit_bytes},
           {"max_vthread", task->hardware_params->max_vthread_extent},
diff --git a/src/auto_scheduler/measure_record.cc b/src/auto_scheduler/measure_record.cc
index 1bc2c78..d57e2f2 100644
--- a/src/auto_scheduler/measure_record.cc
+++ b/src/auto_scheduler/measure_record.cc
@@ -116,7 +116,7 @@ struct Handler<::tvm::auto_scheduler::HardwareParamsNode> {
     writer->WriteArrayItem(data.vector_unit_bytes);
     writer->WriteArrayItem(data.cache_line_bytes);
     writer->WriteArrayItem(data.max_shared_memory_per_block);
-    writer->WriteArrayItem(data.max_registers_per_block);
+    writer->WriteArrayItem(data.max_local_memory_per_block);
     writer->WriteArrayItem(data.max_threads_per_block);
     writer->WriteArrayItem(data.max_vthread_extent);
     writer->WriteArrayItem(data.warp_size);
@@ -140,7 +140,7 @@ struct Handler<::tvm::auto_scheduler::HardwareParamsNode> {
     reader->Read(&data->max_shared_memory_per_block);
     s = reader->NextArrayItem();
     CHECK(s);
-    reader->Read(&data->max_registers_per_block);
+    reader->Read(&data->max_local_memory_per_block);
     s = reader->NextArrayItem();
     CHECK(s);
     reader->Read(&data->max_threads_per_block);
diff --git a/src/auto_scheduler/search_task.cc b/src/auto_scheduler/search_task.cc
index 48b3fc5..4c8cc6d 100755
--- a/src/auto_scheduler/search_task.cc
+++ b/src/auto_scheduler/search_task.cc
@@ -36,14 +36,14 @@ TVM_REGISTER_NODE_TYPE(HardwareParamsNode);
 TVM_REGISTER_NODE_TYPE(SearchTaskNode);
 
 HardwareParams::HardwareParams(int num_cores, int vector_unit_bytes, int cache_line_bytes,
-                               int max_shared_memory_per_block, int max_registers_per_block,
+                               int max_shared_memory_per_block, int max_local_memory_per_block,
                                int max_threads_per_block, int max_vthread_extent, int warp_size) {
   auto node = make_object<HardwareParamsNode>();
   node->num_cores = num_cores;
   node->vector_unit_bytes = vector_unit_bytes;
   node->cache_line_bytes = cache_line_bytes;
   node->max_shared_memory_per_block = max_shared_memory_per_block;
-  node->max_registers_per_block = max_registers_per_block;
+  node->max_local_memory_per_block = max_local_memory_per_block;
   node->max_threads_per_block = max_threads_per_block;
   node->max_vthread_extent = max_vthread_extent;
   node->warp_size = warp_size;
@@ -64,8 +64,9 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target
     device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kMaxSharedMemoryPerBlock, &ret);
     int max_shared_memory_per_block = ret;
 
-    device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kMaxRegistersPerBlock, &ret);
-    int max_registers_per_block = ret;
+    // There is no explicit local memory limitation in CUDA runtime,
+    // so we can use INT32_MAX to disable the check on local_memory.
+    int max_local_memory_per_block = INT32_MAX;
 
     device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kMaxThreadsPerBlock, &ret);
     int max_threads_per_block = ret;
@@ -74,17 +75,17 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target
     int warp_size = ret;
 
     int max_vthread_extent = warp_size / 4;
-    return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_registers_per_block,
+    return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_local_memory_per_block,
                           max_threads_per_block, max_vthread_extent, warp_size);
   } else if (target->kind->device_type == kDLMetal) {
     // Reference: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
     // This setting looks working for Metal GPUs later than A10
     int max_shared_memory_per_block = 32 * 1024;
-    int max_registers_per_block = 4 * 1024;
+    int max_local_memory_per_block = INT32_MAX;  // skip the check on local memory
     int max_threads_per_block = 1024;
     int warp_size = 8;
     int max_vthread_extent = warp_size / 4;
-    return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_registers_per_block,
+    return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_local_memory_per_block,
                           max_threads_per_block, max_vthread_extent, warp_size);
   } else {
     LOG(FATAL) << "No default hardware parameters for target: " << target;
@@ -110,10 +111,10 @@ SearchTask::SearchTask(ComputeDAG compute_dag, String workload_key, Target targe
 
 TVM_REGISTER_GLOBAL("auto_scheduler.HardwareParams")
     .set_body_typed([](int num_cores, int vector_unit_bytes, int cache_line_bytes,
-                       int max_shared_memory_per_block, int max_registers_per_block,
+                       int max_shared_memory_per_block, int max_local_memory_per_block,
                        int max_threads_per_block, int max_vthread_extent, int warp_size) {
       return HardwareParams(num_cores, vector_unit_bytes, cache_line_bytes,
-                            max_shared_memory_per_block, max_registers_per_block,
+                            max_shared_memory_per_block, max_local_memory_per_block,
                             max_threads_per_block, max_vthread_extent, warp_size);
     });