You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by ma...@apache.org on 2020/12/05 10:21:36 UTC
[tvm] branch main updated: [AutoScheduler] Remove
`max_registers_per_block` in HardwareParams (#7040)
This is an automated email from the ASF dual-hosted git repository.
masahi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 878a0a9 [AutoScheduler] Remove `max_registers_per_block` in HardwareParams (#7040)
878a0a9 is described below
commit 878a0a999724796725e8516a83684b2b6ae7ba97
Author: Lianmin Zheng <li...@gmail.com>
AuthorDate: Sat Dec 5 02:21:19 2020 -0800
[AutoScheduler] Remove `max_registers_per_block` in HardwareParams (#7040)
* [AutoScheduler] Fix hardware params
* address comments
---
include/tvm/auto_scheduler/search_task.h | 10 +++++-----
python/tvm/auto_scheduler/search_task.py | 8 ++++----
src/auto_scheduler/feature.cc | 2 +-
src/auto_scheduler/measure_record.cc | 4 ++--
src/auto_scheduler/search_task.cc | 19 ++++++++++---------
5 files changed, 22 insertions(+), 21 deletions(-)
diff --git a/include/tvm/auto_scheduler/search_task.h b/include/tvm/auto_scheduler/search_task.h
index 6d85835..60e721b 100755
--- a/include/tvm/auto_scheduler/search_task.h
+++ b/include/tvm/auto_scheduler/search_task.h
@@ -46,8 +46,8 @@ class HardwareParamsNode : public Object {
// GPU related parameters got from device query API
/*! \brief The max shared memory per block in bytes. */
int max_shared_memory_per_block;
- /*! \brief The max number of register per block. */
- int max_registers_per_block;
+ /*! \brief The max local memory per block in bytes. */
+ int max_local_memory_per_block;
/*! \brief The max number of threads per block. */
int max_threads_per_block;
/*! \brief The max vthread extent. */
@@ -60,7 +60,7 @@ class HardwareParamsNode : public Object {
v->Visit("vector_unit_bytes", &vector_unit_bytes);
v->Visit("cache_line_bytes", &cache_line_bytes);
v->Visit("max_shared_memory_per_block", &max_shared_memory_per_block);
- v->Visit("max_registers_per_block", &max_registers_per_block);
+ v->Visit("max_local_memory_per_block", &max_local_memory_per_block);
v->Visit("max_threads_per_block", &max_threads_per_block);
v->Visit("max_vthread_extent", &max_vthread_extent);
v->Visit("warp_size", &warp_size);
@@ -90,13 +90,13 @@ class HardwareParams : public ObjectRef {
* \param vector_unit_bytes The width of vector units in bytes.
* \param cache_line_bytes The size of cache line in bytes.
* \param max_shared_memory_per_block The max amount of shared memory per block for GPU.
- * \param max_registers_per_block The max number of registers per block for GPU.
+ * \param max_local_memory_per_block The max amount of local memory per block for GPU.
* \param max_threads_per_block The max number of threads per block for GPU.
* \param max_vthread_extent The max extent of vthread for GPU.
* \param warp_size The warp size for GPU
*/
HardwareParams(int num_cores, int vector_unit_bytes, int cache_line_bytes,
- int max_shared_memory_per_block, int max_registers_per_block,
+ int max_shared_memory_per_block, int max_local_memory_per_block,
int max_threads_per_block, int max_vthread_extent, int warp_size);
TVM_DEFINE_OBJECT_REF_METHODS(HardwareParams, ObjectRef, HardwareParamsNode);
diff --git a/python/tvm/auto_scheduler/search_task.py b/python/tvm/auto_scheduler/search_task.py
index a22cd6e..b75c979 100644
--- a/python/tvm/auto_scheduler/search_task.py
+++ b/python/tvm/auto_scheduler/search_task.py
@@ -49,8 +49,8 @@ class HardwareParams(Object):
The size of cache line in bytes.
max_shared_memory_per_block : int
The max shared memory per block in bytes.
- max_registers_per_block : int
- The max number of register per block.
+ max_local_memory_per_block : int
+ The max local memory per block in bytes.
max_threads_per_block : int
The max number of threads per block.
max_vthread_extent : int
@@ -65,7 +65,7 @@ class HardwareParams(Object):
vector_unit_bytes,
cache_line_bytes,
max_shared_memory_per_block,
- max_registers_per_block,
+ max_local_memory_per_block,
max_threads_per_block,
max_vthread_extent,
warp_size,
@@ -76,7 +76,7 @@ class HardwareParams(Object):
vector_unit_bytes,
cache_line_bytes,
max_shared_memory_per_block,
- max_registers_per_block,
+ max_local_memory_per_block,
max_threads_per_block,
max_vthread_extent,
warp_size,
diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc
index 0df69b9..0a3d705 100755
--- a/src/auto_scheduler/feature.cc
+++ b/src/auto_scheduler/feature.cc
@@ -1310,7 +1310,7 @@ void GetPerStoreFeaturesWorkerFunc(const SearchTask& task, const State& state, i
pass_list.push_back(tir::transform::Simplify());
tvm::Map<String, tvm::PrimExpr> gpu_params{
{"max_shared_memory_per_block", task->hardware_params->max_shared_memory_per_block},
- {"max_local_memory_per_block", task->hardware_params->max_registers_per_block},
+ {"max_local_memory_per_block", task->hardware_params->max_local_memory_per_block},
{"max_threads_per_block", task->hardware_params->max_threads_per_block},
{"max_vector_bytes", task->hardware_params->vector_unit_bytes},
{"max_vthread", task->hardware_params->max_vthread_extent},
diff --git a/src/auto_scheduler/measure_record.cc b/src/auto_scheduler/measure_record.cc
index 1bc2c78..d57e2f2 100644
--- a/src/auto_scheduler/measure_record.cc
+++ b/src/auto_scheduler/measure_record.cc
@@ -116,7 +116,7 @@ struct Handler<::tvm::auto_scheduler::HardwareParamsNode> {
writer->WriteArrayItem(data.vector_unit_bytes);
writer->WriteArrayItem(data.cache_line_bytes);
writer->WriteArrayItem(data.max_shared_memory_per_block);
- writer->WriteArrayItem(data.max_registers_per_block);
+ writer->WriteArrayItem(data.max_local_memory_per_block);
writer->WriteArrayItem(data.max_threads_per_block);
writer->WriteArrayItem(data.max_vthread_extent);
writer->WriteArrayItem(data.warp_size);
@@ -140,7 +140,7 @@ struct Handler<::tvm::auto_scheduler::HardwareParamsNode> {
reader->Read(&data->max_shared_memory_per_block);
s = reader->NextArrayItem();
CHECK(s);
- reader->Read(&data->max_registers_per_block);
+ reader->Read(&data->max_local_memory_per_block);
s = reader->NextArrayItem();
CHECK(s);
reader->Read(&data->max_threads_per_block);
diff --git a/src/auto_scheduler/search_task.cc b/src/auto_scheduler/search_task.cc
index 48b3fc5..4c8cc6d 100755
--- a/src/auto_scheduler/search_task.cc
+++ b/src/auto_scheduler/search_task.cc
@@ -36,14 +36,14 @@ TVM_REGISTER_NODE_TYPE(HardwareParamsNode);
TVM_REGISTER_NODE_TYPE(SearchTaskNode);
HardwareParams::HardwareParams(int num_cores, int vector_unit_bytes, int cache_line_bytes,
- int max_shared_memory_per_block, int max_registers_per_block,
+ int max_shared_memory_per_block, int max_local_memory_per_block,
int max_threads_per_block, int max_vthread_extent, int warp_size) {
auto node = make_object<HardwareParamsNode>();
node->num_cores = num_cores;
node->vector_unit_bytes = vector_unit_bytes;
node->cache_line_bytes = cache_line_bytes;
node->max_shared_memory_per_block = max_shared_memory_per_block;
- node->max_registers_per_block = max_registers_per_block;
+ node->max_local_memory_per_block = max_local_memory_per_block;
node->max_threads_per_block = max_threads_per_block;
node->max_vthread_extent = max_vthread_extent;
node->warp_size = warp_size;
@@ -64,8 +64,9 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target
device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kMaxSharedMemoryPerBlock, &ret);
int max_shared_memory_per_block = ret;
- device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kMaxRegistersPerBlock, &ret);
- int max_registers_per_block = ret;
+ // There is no explicit local memory limitation in CUDA runtime,
+ // so we can use INT32_MAX to disable the check on local_memory.
+ int max_local_memory_per_block = INT32_MAX;
device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kMaxThreadsPerBlock, &ret);
int max_threads_per_block = ret;
@@ -74,17 +75,17 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target
int warp_size = ret;
int max_vthread_extent = warp_size / 4;
- return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_registers_per_block,
+ return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_local_memory_per_block,
max_threads_per_block, max_vthread_extent, warp_size);
} else if (target->kind->device_type == kDLMetal) {
// Reference: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
// This setting looks working for Metal GPUs later than A10
int max_shared_memory_per_block = 32 * 1024;
- int max_registers_per_block = 4 * 1024;
+ int max_local_memory_per_block = INT32_MAX; // skip the check on local memory
int max_threads_per_block = 1024;
int warp_size = 8;
int max_vthread_extent = warp_size / 4;
- return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_registers_per_block,
+ return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_local_memory_per_block,
max_threads_per_block, max_vthread_extent, warp_size);
} else {
LOG(FATAL) << "No default hardware parameters for target: " << target;
@@ -110,10 +111,10 @@ SearchTask::SearchTask(ComputeDAG compute_dag, String workload_key, Target targe
TVM_REGISTER_GLOBAL("auto_scheduler.HardwareParams")
.set_body_typed([](int num_cores, int vector_unit_bytes, int cache_line_bytes,
- int max_shared_memory_per_block, int max_registers_per_block,
+ int max_shared_memory_per_block, int max_local_memory_per_block,
int max_threads_per_block, int max_vthread_extent, int warp_size) {
return HardwareParams(num_cores, vector_unit_bytes, cache_line_bytes,
- max_shared_memory_per_block, max_registers_per_block,
+ max_shared_memory_per_block, max_local_memory_per_block,
max_threads_per_block, max_vthread_extent, warp_size);
});