Posted to commits@tvm.apache.org by lm...@apache.org on 2021/02/05 22:01:35 UTC

[tvm] branch custom_tile_size created (now b1fcd45)

This is an automated email from the ASF dual-hosted git repository.

lmzheng pushed a change to branch custom_tile_size
in repository https://gitbox.apache.org/repos/asf/tvm.git.


      at b1fcd45  update

This branch includes the following new commits:

     new 9082cf8  update tune_conv2d_cuda.py
     new b1fcd45  update

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tvm] 01/02: update tune_conv2d_cuda.py

Posted by lm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

lmzheng pushed a commit to branch custom_tile_size
in repository https://gitbox.apache.org/repos/asf/tvm.git

commit 9082cf81e473701ca6985cf78f6bf9ddf8869ea8
Author: Lianmin Zheng <li...@gmail.com>
AuthorDate: Fri Feb 5 21:53:35 2021 +0000

    update tune_conv2d_cuda.py
---
 tutorials/autotvm/tune_conv2d_cuda.py | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py
index dc8e6e5..a00fe5f 100644
--- a/tutorials/autotvm/tune_conv2d_cuda.py
+++ b/tutorials/autotvm/tune_conv2d_cuda.py
@@ -58,6 +58,8 @@ from tvm.topi.testing import conv2d_nchw_python
 import tvm.testing
 
 from tvm import autotvm
+from tvm.autotvm.task.space import SplitEntity
+from tvm.autotvm.task.dispatcher import ApplyConfig
 
 ######################################################################
 # Step 1:  Define the search space
@@ -98,14 +100,14 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
     rc, ry, rx = s[conv].op.reduce_axis
 
     cfg = autotvm.get_config()
-    cfg.define_split("tile_f", f, num_outputs=4)
-    cfg.define_split("tile_y", y, num_outputs=4)
-    cfg.define_split("tile_x", x, num_outputs=4)
-    cfg.define_split("tile_rc", rc, num_outputs=3)
-    cfg.define_split("tile_ry", ry, num_outputs=3)
-    cfg.define_split("tile_rx", rx, num_outputs=3)
-    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
-    cfg.define_knob("unroll_explicit", [0, 1])
+    cfg.define_split("tile_f", f, num_outputs=4)          # filter / output channel
+    cfg.define_split("tile_y", y, num_outputs=4)          # height
+    cfg.define_split("tile_x", x, num_outputs=4)          # width
+    cfg.define_split("tile_rc", rc, num_outputs=3)        # input channel
+    cfg.define_split("tile_ry", ry, num_outputs=3)        # kernel width
+    cfg.define_split("tile_rx", rx, num_outputs=3)        # kernel height
+    cfg.define_knob("auto_unroll_max_step", [0])          # disable auto unroll
+    cfg.define_knob("unroll_explicit", [0])               # disable auto unroll
     ##### space definition end #####
 
     # inline padding
@@ -204,7 +206,7 @@ measure_option = autotvm.measure_option(
 # see many error reports. As long as you can see non-zero GFLOPS, it is okay.
 tuner = autotvm.tuner.XGBTuner(task)
 tuner.tune(
-    n_trial=20,
+    n_trial=100,
     measure_option=measure_option,
     callbacks=[autotvm.callback.log_to_file("conv2d.log")],
 )
@@ -216,11 +218,16 @@ tuner.tune(
 # inspect the best config
 dispatch_context = autotvm.apply_history_best("conv2d.log")
 best_config = dispatch_context.query(task.target, task.workload)
+
+# Plug in your own tile sizes
+#best_config._entity_map['tile_f'] = SplitEntity([-1, 2, 8, 8])
+
 print("\nBest config:")
 print(best_config)
 
 # apply history best from log file
-with autotvm.apply_history_best("conv2d.log"):
+#with autotvm.apply_history_best("conv2d.log"):
+with ApplyConfig(best_config):
     with tvm.target.Target("cuda"):
         s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)
         func = tvm.build(s, arg_bufs)
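
Taken together, the hunks above boil down to the following pattern: query the best record from the tuning log, optionally overwrite individual split decisions with hand-picked tile sizes, and build under ApplyConfig instead of apply_history_best. The snippet below is a condensed sketch of that pattern, not an exact excerpt of the file; it assumes task, N, H, W, CO, CI, KH, KW, strides, padding, and conv2d_no_batching are already defined as in tune_conv2d_cuda.py.

    import tvm
    from tvm import autotvm
    from tvm.autotvm.task.space import SplitEntity
    from tvm.autotvm.task.dispatcher import ApplyConfig

    # query the best config found during tuning
    dispatch_context = autotvm.apply_history_best("conv2d.log")
    best_config = dispatch_context.query(task.target, task.workload)

    # plug in your own tile sizes; -1 marks the factor that is
    # inferred from the axis length
    best_config._entity_map["tile_f"] = SplitEntity([-1, 2, 8, 8])

    # build with the (possibly modified) config applied explicitly
    with ApplyConfig(best_config):
        with tvm.target.Target("cuda"):
            s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)
            func = tvm.build(s, arg_bufs)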


[tvm] 02/02: update

Posted by lm...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

lmzheng pushed a commit to branch custom_tile_size
in repository https://gitbox.apache.org/repos/asf/tvm.git

commit b1fcd45d2f9c04d2ff544c390a4f0bb7dd43c692
Author: Lianmin Zheng <li...@gmail.com>
AuthorDate: Fri Feb 5 22:01:11 2021 +0000

    update
---
 tutorials/autotvm/tune_conv2d_cuda.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py
index a00fe5f..aa7449e 100644
--- a/tutorials/autotvm/tune_conv2d_cuda.py
+++ b/tutorials/autotvm/tune_conv2d_cuda.py
@@ -100,9 +100,9 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
     rc, ry, rx = s[conv].op.reduce_axis
 
     cfg = autotvm.get_config()
-    cfg.define_split("tile_f", f, num_outputs=4)          # filter / output channel
-    cfg.define_split("tile_y", y, num_outputs=4)          # height
-    cfg.define_split("tile_x", x, num_outputs=4)          # width
+    cfg.define_split("tile_f", f, num_outputs=4)          # filter / output channel -> blockIdx.z, vthread, threadIdx.z, thread_inner
+    cfg.define_split("tile_y", y, num_outputs=4)          # height                  -> blockIdx.y, vthread, threadIdx.y, thread_inner
+    cfg.define_split("tile_x", x, num_outputs=4)          # width                   -> blockIdx.x, vthread, threadIdx.x, thread_inner
     cfg.define_split("tile_rc", rc, num_outputs=3)        # input channel
     cfg.define_split("tile_ry", ry, num_outputs=3)        # kernel width
     cfg.define_split("tile_rx", rx, num_outputs=3)        # kernel height
@@ -110,6 +110,13 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
     cfg.define_knob("unroll_explicit", [0])               # disable auto unroll
     ##### space definition end #####
 
+    # Constraints  (read these from deviceQuery)
+    #  blockIdx.x < 2^31, blockIdx.y < 2^16, blockIdx.z < 2^16        (Max dimension size of a grid size)
+    #  threadIdx.x <= 1024, threadIdx.y <= 1024, threadIdx.z <= 64    (Max dimension size of a thread block)
+    #  threadIdx.x * threadIdx.y * threadIdx.z <= 1024                (Maximum number of threads per block)
+    #
+    #  input buffer + weight buffer in each block < 49152 bytes        (Total amount of shared memory per block)
+
     # inline padding
     pad_data = s[conv].op.input_tensors[0]
     s[pad_data].compute_inline()
@@ -221,6 +228,7 @@ best_config = dispatch_context.query(task.target, task.workload)
 
 # Plug in your own tile sizes
 #best_config._entity_map['tile_f'] = SplitEntity([-1, 2, 8, 8])
+#                                [-1, 2, 8, 8] will be mapped to [blockId, vthread, threadId, local_id]
 
 print("\nBest config:")
 print(best_config)
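
Before compiling a hand-picked configuration, the launch-dimension constraints listed in this commit can be checked mechanically. The helper below is an illustrative sketch only: check_launch_limits is a made-up name, the limits are generic CUDA values like the ones quoted in the comment (read the exact numbers for your GPU from deviceQuery), and each 4-way split is assumed to follow the [blockIdx, vthread, threadIdx, inner] mapping noted above. The shared-memory bound is not checked here, since it also depends on the data and weight tiles staged per block.

    # Illustrative sketch: sanity-check hand-picked tile sizes against CUDA
    # launch limits. Replace any -1 placeholder with the concrete factor first.
    def check_launch_limits(tile_f, tile_y, tile_x):
        # each 4-way split is [blockIdx, vthread, threadIdx, inner]
        bz, _, tz, _ = tile_f   # tile_f -> blockIdx.z, threadIdx.z
        by, _, ty, _ = tile_y   # tile_y -> blockIdx.y, threadIdx.y
        bx, _, tx, _ = tile_x   # tile_x -> blockIdx.x, threadIdx.x
        assert bx < 2**31 and by < 2**16 and bz < 2**16, "grid dimension too large"
        assert tx <= 1024 and ty <= 1024 and tz <= 64, "block dimension too large"
        assert tx * ty * tz <= 1024, "too many threads per block"

    # example with made-up factors: 4 x 8 x 8 = 256 threads per block
    check_launch_limits(tile_f=[8, 2, 8, 8], tile_y=[7, 1, 8, 1], tile_x=[7, 1, 4, 2])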