You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by lm...@apache.org on 2021/02/05 22:01:37 UTC
[tvm] 02/02: update

This is an automated email from the ASF dual-hosted git repository.

lmzheng pushed a commit to branch custom_tile_size
in repository https://gitbox.apache.org/repos/asf/tvm.git

commit b1fcd45d2f9c04d2ff544c390a4f0bb7dd43c692
Author: Lianmin Zheng <li...@gmail.com>
AuthorDate: Fri Feb 5 22:01:11 2021 +0000

    update
---
 tutorials/autotvm/tune_conv2d_cuda.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py
index a00fe5f..aa7449e 100644
--- a/tutorials/autotvm/tune_conv2d_cuda.py
+++ b/tutorials/autotvm/tune_conv2d_cuda.py
@@ -100,9 +100,9 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
     rc, ry, rx = s[conv].op.reduce_axis
 
     cfg = autotvm.get_config()
-    cfg.define_split("tile_f", f, num_outputs=4)          # filter / output channel
-    cfg.define_split("tile_y", y, num_outputs=4)          # height
-    cfg.define_split("tile_x", x, num_outputs=4)          # width
+    cfg.define_split("tile_f", f, num_outputs=4)          # filter / output channel -> blockIdx.z, vthread, threadIdx.z, thread_inner
+    cfg.define_split("tile_y", y, num_outputs=4)          # height                  -> blockIdx.y, vthread, threadIdx.y, thread_inner
+    cfg.define_split("tile_x", x, num_outputs=4)          # width                   -> blockIdx.x, vthread, threadIdx.x, thread_inner
     cfg.define_split("tile_rc", rc, num_outputs=3)        # input channel
     cfg.define_split("tile_ry", ry, num_outputs=3)        # kernel height
     cfg.define_split("tile_rx", rx, num_outputs=3)        # kernel width
@@ -110,6 +110,13 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
     cfg.define_knob("unroll_explicit", [0])               # disable auto unroll
     ##### space definition end #####
 
+    # Constraints  (read this from deviceQuery)
+    #  blockIdx.x < 2^31, blockIdx.y < 2^16, blockIdx.z < 2^16         (Max dimension size of a grid size)
+    #  threadIdx.x <= 1024, threadIdx.y <= 1024, threadIdx.z <= 64     (Max dimension size of a thread block)
+    #  threadIdx.x * threadIdx.y * threadIdx.z <= 1024                 (Maximum number of threads per block)
+    #
+    #  input buffer + weight buffer in each block <= 49152 bytes       (Total amount of shared memory per block)
+
     # inline padding
     pad_data = s[conv].op.input_tensors[0]
     s[pad_data].compute_inline()
@@ -221,6 +228,7 @@ best_config = dispatch_context.query(task.target, task.workload)
 
 # Plug your own tile sizes
 #best_config._entity_map['tile_f'] = SplitEntity([-1, 2, 8, 8])
+#                                [-1, 2, 8, 8] will be mapped to [blockId, vthread, threadId, local_id]
 
 print("\nBest config:")
 print(best_config)