You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by tq...@apache.org on 2022/09/20 02:38:18 UTC
[tvm-site] branch asf-site updated: deploying docs (apache/tvm@a75dcabd3f5306ed1c792c0877becab219004ed8)
This is an automated email from the ASF dual-hosted git repository.
tqchen pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/tvm-site.git
The following commit(s) were added to refs/heads/asf-site by this push:
new d535fbc61d deploying docs (apache/tvm@a75dcabd3f5306ed1c792c0877becab219004ed8)
d535fbc61d is described below
commit d535fbc61deb3fbdaba6ea87bedcd50040badda8
Author: tvm-bot <95...@users.noreply.github.com>
AuthorDate: Tue Sep 20 02:38:11 2022 +0000
deploying docs (apache/tvm@a75dcabd3f5306ed1c792c0877becab219004ed8)
---
.../how_to/compile_models/from_darknet.rst.txt | 2 +-
.../how_to/compile_models/from_keras.rst.txt | 2 +-
.../how_to/compile_models/from_mxnet.rst.txt | 2 +-
.../how_to/compile_models/from_oneflow.rst.txt | 2 +-
.../how_to/compile_models/from_pytorch.rst.txt | 2 +-
.../how_to/compile_models/from_tensorflow.rst.txt | 2 +-
.../compile_models/sg_execution_times.rst.txt | 22 +-
.../deploy_models/deploy_model_on_android.rst.txt | 2 +-
.../deploy_object_detection_pytorch.rst.txt | 4 +-
.../deploy_models/deploy_prequantized.rst.txt | 6 +-
.../deploy_prequantized_tflite.rst.txt | 4 +-
.../how_to/deploy_models/deploy_quantized.rst.txt | 2 +-
.../deploy_models/deploy_ssd_gluoncv.rst.txt | 4 +-
.../deploy_models/sg_execution_times.rst.txt | 18 +-
.../extend_tvm/bring_your_own_datatypes.rst.txt | 2 +-
.../how_to/extend_tvm/sg_execution_times.rst.txt | 10 +-
.../how_to/extend_tvm/use_pass_instrument.rst.txt | 16 +-
.../optimize_operators/opt_conv_cuda.rst.txt | 2 +-
.../optimize_operators/opt_conv_tensorcore.rst.txt | 2 +-
.../how_to/optimize_operators/opt_gemm.rst.txt | 16 +-
.../optimize_operators/sg_execution_times.rst.txt | 8 +-
.../sg_execution_times.rst.txt | 14 +-
.../tune_conv2d_layer_cuda.rst.txt | 1679 +++++++++-----------
.../tune_network_cuda.rst.txt | 2 +-
.../tune_network_x86.rst.txt | 4 +-
.../tune_sparse_x86.rst.txt | 101 +-
.../tune_with_autotvm/sg_execution_times.rst.txt | 10 +-
.../tune_with_autotvm/tune_conv2d_cuda.rst.txt | 26 +-
.../work_with_microtvm/micro_autotune.rst.txt | 16 +-
.../how_to/work_with_microtvm/micro_train.rst.txt | 16 +-
.../work_with_microtvm/sg_execution_times.rst.txt | 10 +-
.../work_with_relay/sg_execution_times.rst.txt | 8 +-
.../how_to/work_with_schedules/intrin_math.rst.txt | 2 +-
.../work_with_schedules/sg_execution_times.rst.txt | 14 +-
.../how_to/work_with_schedules/tensorize.rst.txt | 2 +-
.../tutorials/autotvm/sg_execution_times.rst.txt | 4 +-
.../frontend/deploy_classification.rst.txt | 2 +-
.../tutorials/frontend/deploy_detection.rst.txt | 2 +-
.../tutorials/frontend/sg_execution_times.rst.txt | 6 +-
.../tutorials/optimize/sg_execution_times.rst.txt | 6 +-
.../topic/vta/tutorials/sg_execution_times.rst.txt | 6 +-
.../tutorial/auto_scheduler_matmul_x86.rst.txt | 9 +-
docs/_sources/tutorial/autotvm_matmul_x86.rst.txt | 20 +-
docs/_sources/tutorial/autotvm_relay_x86.rst.txt | 58 +-
.../tutorial/cross_compilation_and_rpc.rst.txt | 2 +-
docs/_sources/tutorial/intro_topi.rst.txt | 2 +-
docs/_sources/tutorial/sg_execution_times.rst.txt | 24 +-
.../tutorial/tensor_expr_get_started.rst.txt | 40 +-
docs/commit_hash | 2 +-
docs/how_to/compile_models/from_darknet.html | 2 +-
docs/how_to/compile_models/from_keras.html | 2 +-
docs/how_to/compile_models/from_mxnet.html | 2 +-
docs/how_to/compile_models/from_oneflow.html | 15 +-
docs/how_to/compile_models/from_pytorch.html | 4 +-
docs/how_to/compile_models/from_tensorflow.html | 2 +-
docs/how_to/compile_models/sg_execution_times.html | 30 +-
.../deploy_models/deploy_model_on_android.html | 2 +-
.../deploy_object_detection_pytorch.html | 59 +-
docs/how_to/deploy_models/deploy_prequantized.html | 9 +-
.../deploy_models/deploy_prequantized_tflite.html | 4 +-
docs/how_to/deploy_models/deploy_quantized.html | 2 +-
docs/how_to/deploy_models/deploy_ssd_gluoncv.html | 38 +-
docs/how_to/deploy_models/sg_execution_times.html | 18 +-
.../extend_tvm/bring_your_own_datatypes.html | 2 +-
docs/how_to/extend_tvm/sg_execution_times.html | 10 +-
docs/how_to/extend_tvm/use_pass_instrument.html | 16 +-
docs/how_to/optimize_operators/opt_conv_cuda.html | 2 +-
.../optimize_operators/opt_conv_tensorcore.html | 2 +-
docs/how_to/optimize_operators/opt_gemm.html | 16 +-
.../optimize_operators/sg_execution_times.html | 8 +-
.../sg_execution_times.html | 14 +-
.../tune_conv2d_layer_cuda.html | 1679 +++++++++-----------
.../tune_with_autoscheduler/tune_network_cuda.html | 2 +-
.../tune_with_autoscheduler/tune_network_x86.html | 4 +-
.../tune_with_autoscheduler/tune_sparse_x86.html | 101 +-
.../tune_with_autotvm/sg_execution_times.html | 10 +-
.../how_to/tune_with_autotvm/tune_conv2d_cuda.html | 26 +-
docs/how_to/work_with_microtvm/micro_autotune.html | 16 +-
docs/how_to/work_with_microtvm/micro_train.html | 16 +-
.../work_with_microtvm/sg_execution_times.html | 10 +-
.../how_to/work_with_relay/sg_execution_times.html | 8 +-
docs/how_to/work_with_schedules/intrin_math.html | 2 +-
.../work_with_schedules/sg_execution_times.html | 14 +-
docs/how_to/work_with_schedules/tensorize.html | 2 +-
docs/reference/api/doxygen/block__scope_8h.html | 2 +-
.../api/doxygen/block__scope_8h__dep__incl.svg | 282 ++--
docs/reference/api/doxygen/classes.html | 28 +-
...stvm_1_1meta__schedule_1_1Database-members.html | 2 +-
.../classtvm_1_1meta__schedule_1_1Database.html | 31 +-
...classtvm_1_1meta__schedule_1_1DatabaseNode.html | 8 +-
...a__schedule_1_1DatabaseNode__inherit__graph.svg | 166 +-
..._1meta__schedule_1_1PyDatabaseNode-members.html | 26 +-
...asstvm_1_1meta__schedule_1_1PyDatabaseNode.html | 328 +++-
...ta__schedule_1_1PyDatabaseNode__coll__graph.svg | 362 +++--
..._schedule_1_1PyDatabaseNode__inherit__graph.svg | 166 +-
docs/reference/api/doxygen/database_8h.html | 3 +-
.../api/doxygen/database_8h__dep__incl.svg | 48 +-
docs/reference/api/doxygen/database_8h__incl.svg | 1285 +++++++--------
docs/reference/api/doxygen/database_8h_source.html | 80 +-
docs/reference/api/doxygen/dir_000004_000011.html | 2 +-
.../dir_4378f18824ae7d4ad48f8d7785cd7ac8_dep.svg | 4 +-
.../dir_b4c7d8e826c599ba55146c099a14beb5_dep.svg | 4 +-
docs/reference/api/doxygen/functions_f.html | 30 +-
docs/reference/api/doxygen/functions_func_p.html | 2 +-
docs/reference/api/doxygen/functions_func_q.html | 3 +
docs/reference/api/doxygen/functions_func_s.html | 2 +-
docs/reference/api/doxygen/functions_p.html | 2 +-
docs/reference/api/doxygen/functions_q.html | 3 +
docs/reference/api/doxygen/functions_s.html | 4 +-
docs/reference/api/doxygen/functions_t.html | 6 +-
docs/reference/api/doxygen/functions_type_f.html | 9 +
docs/reference/api/doxygen/functions_v.html | 8 +-
docs/reference/api/doxygen/functions_vars_f.html | 9 +
docs/reference/api/doxygen/hierarchy.html | 93 +-
docs/reference/api/doxygen/index__map_8h.html | 2 +-
.../api/doxygen/index__map_8h__dep__incl.svg | 808 +++++-----
docs/reference/api/doxygen/inherit_graph_11.svg | 16 +-
docs/reference/api/doxygen/inherit_graph_117.svg | 32 +-
docs/reference/api/doxygen/inherit_graph_162.svg | 18 +-
docs/reference/api/doxygen/inherit_graph_163.svg | 18 +-
docs/reference/api/doxygen/inherit_graph_164.svg | 18 +-
docs/reference/api/doxygen/inherit_graph_165.svg | 21 +-
docs/reference/api/doxygen/inherit_graph_166.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_167.svg | 19 +-
docs/reference/api/doxygen/inherit_graph_168.svg | 24 +-
docs/reference/api/doxygen/inherit_graph_169.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_170.svg | 24 +-
docs/reference/api/doxygen/inherit_graph_171.svg | 21 +-
docs/reference/api/doxygen/inherit_graph_172.svg | 18 +-
docs/reference/api/doxygen/inherit_graph_173.svg | 21 +-
docs/reference/api/doxygen/inherit_graph_174.svg | 18 +-
docs/reference/api/doxygen/inherit_graph_175.svg | 18 +-
docs/reference/api/doxygen/inherit_graph_176.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_177.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_178.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_179.svg | 15 +-
docs/reference/api/doxygen/inherit_graph_180.svg | 15 +-
docs/reference/api/doxygen/inherit_graph_181.svg | 15 +-
docs/reference/api/doxygen/inherit_graph_182.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_183.svg | 15 +-
docs/reference/api/doxygen/inherit_graph_184.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_185.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_186.svg | 14 +-
docs/reference/api/doxygen/inherit_graph_187.svg | 16 +-
docs/reference/api/doxygen/inherit_graph_188.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_189.svg | 14 +-
docs/reference/api/doxygen/inherit_graph_190.svg | 16 +-
docs/reference/api/doxygen/inherit_graph_191.svg | 15 +-
docs/reference/api/doxygen/inherit_graph_192.svg | 14 +-
docs/reference/api/doxygen/inherit_graph_193.svg | 15 +-
docs/reference/api/doxygen/inherit_graph_194.svg | 14 +-
docs/reference/api/doxygen/inherit_graph_195.svg | 15 +-
docs/reference/api/doxygen/inherit_graph_196.svg | 15 +-
docs/reference/api/doxygen/inherit_graph_197.svg | 15 +-
docs/reference/api/doxygen/inherit_graph_198.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_199.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_200.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_201.svg | 15 +-
docs/reference/api/doxygen/inherit_graph_202.svg | 16 +-
docs/reference/api/doxygen/inherit_graph_203.svg | 15 +-
docs/reference/api/doxygen/inherit_graph_204.svg | 15 +-
docs/reference/api/doxygen/inherit_graph_205.svg | 17 +-
docs/reference/api/doxygen/inherit_graph_206.svg | 14 +-
docs/reference/api/doxygen/inherit_graph_207.svg | 16 +-
docs/reference/api/doxygen/inherit_graph_208.svg | 79 +-
docs/reference/api/doxygen/inherit_graph_209.svg | 79 +-
docs/reference/api/doxygen/inherit_graph_210.svg | 17 +-
docs/reference/api/doxygen/inherit_graph_211.svg | 78 +-
docs/reference/api/doxygen/inherit_graph_212.svg | 78 +-
docs/reference/api/doxygen/inherit_graph_213.svg | 15 +-
docs/reference/api/doxygen/inherit_graph_214.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_215.svg | 15 +-
docs/reference/api/doxygen/inherit_graph_216.svg | 15 +-
docs/reference/api/doxygen/inherit_graph_217.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_218.svg | 19 +-
docs/reference/api/doxygen/inherit_graph_219.svg | 14 +-
docs/reference/api/doxygen/inherit_graph_220.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_221.svg | 19 +-
docs/reference/api/doxygen/inherit_graph_222.svg | 29 +-
docs/reference/api/doxygen/inherit_graph_223.svg | 30 +-
docs/reference/api/doxygen/inherit_graph_224.svg | 15 +-
docs/reference/api/doxygen/inherit_graph_225.svg | 30 +-
docs/reference/api/doxygen/inherit_graph_226.svg | 30 +-
docs/reference/api/doxygen/inherit_graph_227.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_228.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_229.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_230.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_231.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_232.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_233.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_234.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_235.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_236.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_237.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_238.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_239.svg | 12 +-
docs/reference/api/doxygen/inherit_graph_240.svg | 12 +-
...inherit_graph_238.svg => inherit_graph_241.svg} | 0
...inherit_graph_239.svg => inherit_graph_242.svg} | 0
...inherit_graph_240.svg => inherit_graph_243.svg} | 0
docs/reference/api/doxygen/inherit_graph_41.svg | 16 +-
docs/reference/api/doxygen/inherit_graph_44.svg | 8 +-
docs/reference/api/doxygen/inherit_graph_45.svg | 8 +-
docs/reference/api/doxygen/inherit_graph_99.svg | 8 +-
docs/reference/api/doxygen/inherits.html | 164 +-
.../api/doxygen/instruction_8h__dep__incl.svg | 210 +--
.../api/doxygen/measure__callback_8h.html | 2 +-
.../api/doxygen/measure__callback_8h__incl.svg | 1280 +++++++--------
docs/reference/api/doxygen/random__engine_8h.html | 2 +-
.../api/doxygen/random__engine_8h__dep__incl.svg | 280 ++--
docs/reference/api/doxygen/search/all_10.js | 2 +-
docs/reference/api/doxygen/search/all_11.js | 2 +-
docs/reference/api/doxygen/search/all_12.js | 6 +-
docs/reference/api/doxygen/search/all_13.js | 2 +-
docs/reference/api/doxygen/search/all_14.js | 14 +-
docs/reference/api/doxygen/search/all_15.js | 13 +-
docs/reference/api/doxygen/search/all_17.js | 4 +-
docs/reference/api/doxygen/search/all_4.js | 2 +-
docs/reference/api/doxygen/search/all_7.js | 8 +-
docs/reference/api/doxygen/search/all_d.js | 2 +-
docs/reference/api/doxygen/search/classes_10.js | 6 +-
docs/reference/api/doxygen/search/classes_11.js | 5 +-
docs/reference/api/doxygen/search/classes_13.js | 2 +-
docs/reference/api/doxygen/search/classes_5.js | 2 +-
docs/reference/api/doxygen/search/classes_9.js | 2 +-
docs/reference/api/doxygen/search/functions_10.js | 2 +-
docs/reference/api/doxygen/search/functions_11.js | 6 +-
docs/reference/api/doxygen/search/functions_13.js | 4 +-
docs/reference/api/doxygen/search/functions_3.js | 2 +-
docs/reference/api/doxygen/search/functions_f.js | 2 +-
docs/reference/api/doxygen/search/typedefs_5.js | 3 +
docs/reference/api/doxygen/search/typedefs_e.js | 2 +-
docs/reference/api/doxygen/search/variables_6.js | 3 +
.../reference/api/doxygen/search__strategy_8h.html | 2 +-
.../api/doxygen/search__strategy_8h__incl.svg | 1154 +++++++-------
docs/reference/api/doxygen/state_8h.html | 2 +-
docs/reference/api/doxygen/state_8h__dep__incl.svg | 272 ++--
docs/reference/api/doxygen/task__scheduler_8h.html | 2 +-
.../api/doxygen/task__scheduler_8h__incl.svg | 1394 ++++++++--------
docs/reference/api/doxygen/tir_2function_8h.html | 2 +-
.../api/doxygen/tir_2function_8h__dep__incl.svg | 660 ++++----
.../api/doxygen/tir_2schedule_2schedule_8h.html | 2 +-
.../tir_2schedule_2schedule_8h__dep__incl.svg | 262 +--
docs/reference/api/doxygen/trace_8h__dep__incl.svg | 200 +--
docs/reference/api/doxygen/tune__context_8h.html | 2 +-
.../api/doxygen/tune__context_8h__incl.svg | 1292 +++++++--------
docs/reference/api/python/auto_scheduler.html | 4 +-
.../api/typedoc/classes/bytestreamreader.html | 12 +-
.../api/typedoc/classes/cachedcallstack.html | 34 +-
docs/reference/api/typedoc/classes/dldatatype.html | 12 +-
docs/reference/api/typedoc/classes/dldevice.html | 10 +-
.../reference/api/typedoc/classes/environment.html | 12 +-
docs/reference/api/typedoc/classes/ffilibrary.html | 20 +-
.../api/typedoc/classes/graphexecutor.html | 16 +-
docs/reference/api/typedoc/classes/instance.html | 40 +-
docs/reference/api/typedoc/classes/memory.html | 34 +-
docs/reference/api/typedoc/classes/module.html | 10 +-
docs/reference/api/typedoc/classes/ndarray.html | 22 +-
.../api/typedoc/classes/packedfunccell.html | 6 +-
docs/reference/api/typedoc/classes/rpcserver.html | 14 +-
docs/reference/api/typedoc/classes/scalar.html | 6 +-
.../api/typedoc/classes/webgpucontext.html | 12 +-
docs/reference/api/typedoc/enums/argtypecode.html | 30 +-
.../api/typedoc/enums/aynccallbackcode.html | 4 +-
.../api/typedoc/enums/dldatatypecode.html | 8 +-
.../api/typedoc/enums/rpcserverstate.html | 12 +-
docs/reference/api/typedoc/enums/sizeof.html | 18 +-
docs/reference/api/typedoc/index.html | 112 +-
.../api/typedoc/interfaces/disposable.html | 2 +-
.../api/typedoc/interfaces/functioninfo.html | 6 +-
.../api/typedoc/interfaces/libraryprovider.html | 4 +-
docs/searchindex.js | 2 +-
.../vta/tutorials/autotvm/sg_execution_times.html | 4 +-
.../tutorials/frontend/deploy_classification.html | 2 +-
.../vta/tutorials/frontend/deploy_detection.html | 2 +-
.../vta/tutorials/frontend/sg_execution_times.html | 6 +-
.../vta/tutorials/optimize/sg_execution_times.html | 6 +-
docs/topic/vta/tutorials/sg_execution_times.html | 6 +-
docs/tutorial/auto_scheduler_matmul_x86.html | 4 +-
docs/tutorial/autotvm_matmul_x86.html | 20 +-
docs/tutorial/autotvm_relay_x86.html | 262 +--
docs/tutorial/cross_compilation_and_rpc.html | 2 +-
docs/tutorial/intro_topi.html | 2 +-
docs/tutorial/sg_execution_times.html | 28 +-
docs/tutorial/tensor_expr_get_started.html | 40 +-
285 files changed, 9099 insertions(+), 8768 deletions(-)
diff --git a/docs/_sources/how_to/compile_models/from_darknet.rst.txt b/docs/_sources/how_to/compile_models/from_darknet.rst.txt
index 921be3e7da..b4113d0dd9 100644
--- a/docs/_sources/how_to/compile_models/from_darknet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_darknet.rst.txt
@@ -315,7 +315,7 @@ The process is no different from other examples.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 4.869 seconds)
+ **Total running time of the script:** ( 1 minutes 4.971 seconds)
.. _sphx_glr_download_how_to_compile_models_from_darknet.py:
diff --git a/docs/_sources/how_to/compile_models/from_keras.rst.txt b/docs/_sources/how_to/compile_models/from_keras.rst.txt
index d84df4a264..b011a12040 100644
--- a/docs/_sources/how_to/compile_models/from_keras.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_keras.rst.txt
@@ -228,7 +228,7 @@ Look up prediction top 1 index in 1000 class synset.
.. code-block:: none
Relay top-1 id: 285, class name: Egyptian cat
-
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 1s 960ms/step
+
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 1s 948ms/step
Keras top-1 id: 285, class name: Egyptian cat
diff --git a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
index f493080027..9353be8f39 100644
--- a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
@@ -115,7 +115,7 @@ In this section, we download a pretrained imagenet model and classify an image.
.. code-block:: none
- Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zipf6e85a27-255e-48c3-be6b-71655e6a85f9 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+ Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zipf899b4c2-08e5-4b21-98e5-f645fe0875be from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
x (1, 3, 224, 224)
diff --git a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
index 803981b983..24be1dd65c 100644
--- a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
@@ -116,7 +116,7 @@ Load a pretrained OneFlow model and save model
.. code-block:: none
Downloading: "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip" to /workspace/.oneflow/flowvision_cache/resnet18.zip
-
0%| | 0.00/41.5M [00:00<?, ?B/s]
15%|#5 | 6.33M/41.5M [00:00<00:00, 45.5MB/s]
26%|##5 | 10.7M/41.5M [00:00<00:01, 28.2MB/s]
39%|###8 | 16.0M/41.5M [00:00<00:00, 33.4MB/s]
58%|#####7 | 24.0M/41.5M [00:00<00:00, 38.9MB/s]
77%|#######7 | 32.0M/41.5M [00:00<00:00, 45.4MB/s]
92%|#########2| 38.3M/41.5M [00:01<00:00, 35.8MB/s]
100%|##########| 41.5M/41.5M [00:01<00:00, 36.8MB/s]
+
0%| | 0.00/41.5M [00:00<?, ?B/s]
15%|#5 | 6.33M/41.5M [00:00<00:01, 32.0MB/s]
23%|##2 | 9.38M/41.5M [00:00<00:01, 27.7MB/s]
36%|###6 | 15.0M/41.5M [00:00<00:00, 38.1MB/s]
47%|####6 | 19.3M/41.5M [00:00<00:00, 40.4MB/s]
58%|#####7 | 24.0M/41.5M [00:00<00:00, 32.7MB/s]
77%|#######7 | 32.0M/41.5M [00:00<00:00, 40.7MB/s]
92%|#########2| 38.3M/41.5M [00:01<00:00, 36.1MB/s]
100%|##########| 41.5M/41.5M [00:01<00:00, 36.6MB/s]
diff --git a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
index e018c4cdf7..c90470b445 100644
--- a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
@@ -94,7 +94,7 @@ Load a pretrained PyTorch model
.. code-block:: none
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
-
0%| | 0.00/44.7M [00:00<?, ?B/s]
32%|###1 | 14.1M/44.7M [00:00<00:00, 148MB/s]
89%|########9 | 39.9M/44.7M [00:00<00:00, 220MB/s]
100%|##########| 44.7M/44.7M [00:00<00:00, 208MB/s]
+
0%| | 0.00/44.7M [00:00<?, ?B/s]
42%|####2 | 18.9M/44.7M [00:00<00:00, 198MB/s]
89%|########8 | 39.6M/44.7M [00:00<00:00, 209MB/s]
100%|##########| 44.7M/44.7M [00:00<00:00, 208MB/s]
diff --git a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
index 815fdf3702..2fe355f676 100644
--- a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
@@ -416,7 +416,7 @@ Run the corresponding model on tensorflow
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 4.131 seconds)
+ **Total running time of the script:** ( 1 minutes 8.186 seconds)
.. _sphx_glr_download_how_to_compile_models_from_tensorflow.py:
diff --git a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
index 987fb352b2..62a2ee0aa2 100644
--- a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
@@ -5,26 +5,26 @@
Computation times
=================
-**05:10.376** total execution time for **how_to_compile_models** files:
+**05:13.283** total execution time for **how_to_compile_models** files:
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``) | 01:04.869 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:08.186 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:04.131 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``) | 01:04.971 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``) | 00:39.520 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``) | 00:39.256 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``) | 00:28.548 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``) | 00:28.430 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``) | 00:25.779 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``) | 00:26.136 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``) | 00:25.558 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``) | 00:25.930 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``) | 00:22.119 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``) | 00:21.534 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``) | 00:20.047 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``) | 00:19.789 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``) | 00:17.303 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``) | 00:16.648 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``) | 00:02.502 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``) | 00:02.405 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
index 1c9fb2f016..d57db1e858 100644
--- a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
@@ -434,7 +434,7 @@ Execute on TVM
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 15.8558 15.7675 16.3276 15.6436 0.2005
+ 15.5949 15.6058 15.7256 15.4827 0.0880
diff --git a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
index 23ce584c6b..b96e39b003 100644
--- a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
@@ -123,7 +123,7 @@ Load pre-trained maskrcnn from torchvision and do tracing
.. code-block:: none
Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
-
0%| | 0.00/170M [00:00<?, ?B/s]
6%|5 | 9.69M/170M [00:00<00:01, 101MB/s]
15%|#5 | 26.1M/170M [00:00<00:01, 143MB/s]
25%|##5 | 42.8M/170M [00:00<00:00, 158MB/s]
35%|###5 | 60.0M/170M [00:00<00:00, 166MB/s]
45%|####5 | 77.1M/170M [00:00<00:00, 171MB/s]
56%|#####5 | 94.7M/170M [00:00<00:00, 176MB/s]
66%|######6 | 112M/170M [00:00<00:00, 179MB/s]
77%|#######6 | 130M/170M [00:00<00:00, 182MB/s]
88%|########7 | 149M/170M [00:00<00:00, 185MB/s]
99%|#########8| 168M/170M [00:01<00:00, 189MB/s]
100%|##########| 170M/170M [00:01<00:00, 176MB/s]
+
0%| | 0.00/170M [00:00<?, ?B/s]
2%|2 | 3.73M/170M [00:00<00:04, 39.2MB/s]
5%|5 | 8.69M/170M [00:00<00:03, 45.4MB/s]
8%|7 | 13.0M/170M [00:00<00:04, 36.0MB/s]
10%|# | 17.1M/170M [00:00<00:04, 38.3MB/s]
13%|#3 | 22.6M/170M [00:00<00:03, 44.3MB/s]
16%|#5 | 26.9M/170M [00:00<00:03, 39.3MB/s]
18%|#8 | 30.9M/170M [00:00<00:04, 30.0MB/s]
20%|## | 34.1M/170M [00:01<00:04, 30.1MB/s]
22%|##2 | 37.7M/170M [00:01<00:04, 31.9MB/s]
25%|##4 | 41.7M/170M [00:01<00:03, 34.4MB/s]
27%|##6 | 45.2M/170M [00:01<00:04, 30.0MB/s]
28%|##8 | 48.2M/170M [00:01<00:04, 28.6MB/s]
31%|### | 51.8M/170M [00:01<00:04, 30.6MB/s]
33%|###3 | 56.1M/170M [00:01<00:03, 34.3MB/s]
35%|###5 | 59.5M/170M [00:01<00:03, 32.8MB/s]
38%|###7 | 63.9M/170M [00:01<00:03, 35.2MB/s]
40%|#### | 68.0M/170M [00:02<00:02, 37.1MB/
s]
42%|####2 | 71.6M/170M [00:02<00:03, 28.3MB/s]
44%|####4 | 75.2M/170M [00:02<00:03, 30.5MB/s]
46%|####6 | 78.4M/170M [00:02<00:03, 30.8MB/s]
48%|####8 | 81.6M/170M [00:02<00:02, 31.5MB/s]
50%|####9 | 84.8M/170M [00:02<00:02, 31.6MB/s]
52%|#####1 | 87.9M/170M [00:02<00:02, 31.7MB/s]
54%|#####4 | 92.2M/170M [00:02<00:02, 35.4MB/s]
56%|#####6 | 95.7M/170M [00:03<00:02, 32.7MB/s]
59%|#####8 | 99.4M/170M [00:03<00:02, 34.4MB/s]
61%|###### | 103M/170M [00:03<00:01, 36.3MB/s]
63%|######2 | 107M/170M [00:03<00:02, 32.3MB/s]
65%|######4 | 110M/170M [00:03<00:02, 29.6MB/s]
68%|######7 | 115M/170M [00:03<00:01, 34.8MB/s]
70%|######9 | 118M/170M [00:03<00:01, 34.2MB/s]
73%|#######2 | 124M/170M [00:03<00:01, 40.3MB/s]
75%|#######5 | 128M/170M [00:03<00:01, 38.3MB/s]
77%|#######7 | 131M/170M [00:04<00:01, 34.6MB/s]
79%|#######9 | 135M/170M [00:04<00:01, 28.5
MB/s]
82%|########1 | 139M/170M [00:04<00:01, 31.3MB/s]
85%|########4 | 144M/170M [00:04<00:00, 37.9MB/s]
87%|########7 | 148M/170M [00:04<00:00, 39.2MB/s]
90%|########9 | 152M/170M [00:04<00:00, 34.9MB/s]
92%|#########1| 156M/170M [00:04<00:00, 31.4MB/s]
94%|#########3| 159M/170M [00:05<00:00, 26.7MB/s]
95%|#########5| 162M/170M [00:05<00:00, 23.8MB/s]
97%|#########6| 164M/170M [00:05<00:00, 23.9MB/s]
98%|#########8| 167M/170M [00:05<00:00, 21.1MB/s]
99%|#########9| 169M/170M [00:05<00:00, 20.3MB/s]
100%|##########| 170M/170M [00:05<00:00, 31.5MB/s]
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:3878: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
for i in range(dim)
/usr/local/lib/python3.7/dist-packages/torchvision/models/detection/anchor_utils.py:127: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
@@ -288,7 +288,7 @@ Get boxes with score larger than 0.9
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 3 minutes 2.758 seconds)
+ **Total running time of the script:** ( 2 minutes 59.029 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_object_detection_pytorch.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
index 396ab1877e..71680efd30 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
@@ -232,7 +232,7 @@ training. Other models require a full post training calibration.
.. code-block:: none
Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
-
0%| | 0.00/13.6M [00:00<?, ?B/s]
67%|######6 | 9.06M/13.6M [00:00<00:00, 93.7MB/s]
100%|##########| 13.6M/13.6M [00:00<00:00, 108MB/s]
+
0%| | 0.00/13.6M [00:00<?, ?B/s]
25%|##4 | 3.34M/13.6M [00:00<00:00, 34.8MB/s]
49%|####9 | 6.66M/13.6M [00:00<00:00, 34.3MB/s]
100%|##########| 13.6M/13.6M [00:00<00:00, 56.4MB/s]
@@ -405,7 +405,7 @@ Here we give an example of how to measure performance of TVM compiled models.
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 89.4734 89.3994 91.1405 89.0812 0.2872
+ 90.3495 90.2487 95.1589 90.0888 0.5181
@@ -454,7 +454,7 @@ TODO
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 8.869 seconds)
+ **Total running time of the script:** ( 1 minutes 9.324 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_prequantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
index 51585e382e..b5d51393ed 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
@@ -432,7 +432,7 @@ Here we give an example of how to measure performance of TVM compiled models.
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 119.9012 119.8843 121.1959 119.0577 0.3285
+ 120.1116 120.0455 126.2227 119.3651 0.7084
@@ -469,7 +469,7 @@ Here we give an example of how to measure performance of TVM compiled models.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 57.841 seconds)
+ **Total running time of the script:** ( 1 minutes 57.140 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_prequantized_tflite.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
index 43720585a0..fe6bda41c9 100644
--- a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
@@ -253,7 +253,7 @@ We create a Relay VM to build and execute the model.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 30.867 seconds)
+ **Total running time of the script:** ( 1 minutes 34.522 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_quantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
index 8ed7001ce5..ef3fcca418 100644
--- a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
@@ -158,7 +158,7 @@ Convert and compile model for CPU.
data: None
input_sym_arg_type = in_param.infer_type()[0]
Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
-
0%| | 0/132723 [00:00<?, ?KB/s]
4%|3 | 5107/132723 [00:00<00:02, 49152.04KB/s]
9%|8 | 11727/132723 [00:00<00:02, 59018.85KB/s]
15%|#4 | 19482/132723 [00:00<00:01, 67401.10KB/s]
20%|## | 27086/132723 [00:00<00:01, 70790.45KB/s]
26%|##6 | 34809/132723 [00:00<00:01, 73095.50KB/s]
32%|###1 | 42467/132723 [00:00<00:01, 74275.36KB/s]
38%|###7 | 50205/132723 [00:00<00:01, 75282.06KB/s]
44%|####3 | 57943/132723 [00:00<00:00, 75943.67KB/s]
50%|####9 | 65709/132723 [00:00<00:00, 76478.14KB/s]
55%|#####5 | 73378/132723 [00:01<00:00, 76540.54KB/s]
61%|######1 | 81233/132723 [00:01<00:00, 77150.06KB/s]
67%|######7 | 89026/132723 [00:01<00:00, 77380.47KB/s]
73%|#######2 | 96824/132723 [00:01<00:00, 77558.36KB/s]
79%|#######8 | 104645/132723 [00:01<00:00, 77751.58KB/s]
85%|########4 | 112421/132723 [00:01<00:00, 77618.17KB/s]
91%|#########
| 120223/132723 [00:01<00:00, 77737.47KB/s]
96%|#########6| 128035/132723 [00:01<00:00, 77846.00KB/s]
100%|##########| 132723/132723 [00:01<00:00, 75201.37KB/s]
+
0%| | 0/132723 [00:00<?, ?KB/s]
4%|4 | 5512/132723 [00:00<00:02, 55115.98KB/s]
10%|9 | 13005/132723 [00:00<00:01, 66765.92KB/s]
16%|#5 | 20649/132723 [00:00<00:01, 71176.96KB/s]
21%|##1 | 28349/132723 [00:00<00:01, 73469.99KB/s]
27%|##6 | 35696/132723 [00:00<00:01, 64000.96KB/s]
33%|###2 | 43285/132723 [00:00<00:01, 67680.94KB/s]
38%|###8 | 51024/132723 [00:00<00:01, 70652.13KB/s]
44%|####4 | 58736/132723 [00:00<00:01, 72616.57KB/s]
50%|##### | 66381/132723 [00:00<00:00, 73777.73KB/s]
56%|#####5 | 73953/132723 [00:01<00:00, 74361.97KB/s]
61%|######1 | 81570/132723 [00:01<00:00, 74898.88KB/s]
67%|######7 | 89288/132723 [00:01<00:00, 75583.94KB/s]
73%|#######3 | 96980/132723 [00:01<00:00, 75983.65KB/s]
79%|#######8 | 104669/132723 [00:01<00:00, 76252.11KB/s]
85%|########4 | 112419/132723 [00:01<00:00, 76617.44KB/s]
91%|#########
| 120132/132723 [00:01<00:00, 76766.23KB/s]
96%|#########6| 127890/132723 [00:01<00:00, 77008.96KB/s]
100%|##########| 132723/132723 [00:01<00:00, 73557.21KB/s]
@@ -234,7 +234,7 @@ Display result
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 2 minutes 39.288 seconds)
+ **Total running time of the script:** ( 2 minutes 35.447 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_ssd_gluoncv.py:
diff --git a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
index d1ea9ada6d..68088e09b0 100644
--- a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
@@ -5,24 +5,24 @@
Computation times
=================
-**11:35.774** total execution time for **how_to_deploy_models** files:
+**11:30.316** total execution time for **how_to_deploy_models** files:
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 03:02.758 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 02:59.029 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``) | 02:39.288 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``) | 02:35.447 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``) | 01:57.841 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``) | 01:57.140 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``) | 01:30.867 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``) | 01:34.522 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``) | 01:08.869 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``) | 01:09.324 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``) | 00:30.106 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``) | 00:29.384 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_nano.py` (``deploy_model_on_nano.py``) | 00:23.249 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_nano.py` (``deploy_model_on_nano.py``) | 00:22.941 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``) | 00:22.791 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``) | 00:22.523 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_deploy_models_deploy_sparse.py` (``deploy_sparse.py``) | 00:00.007 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
index fd9c13badb..80e56077c7 100644
--- a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
@@ -472,7 +472,7 @@ First let us define two helper functions to get the mobilenet model and a cat im
.. code-block:: none
- Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip0e9f1f9d-057f-4eca-8050-7a526bd1e91b from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+ Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip39987a78-f4fe-40bb-8181-17bfdba0e090 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
diff --git a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
index ce4705730d..13e486b3c3 100644
--- a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
Computation times
=================
-**00:41.853** total execution time for **how_to_extend_tvm** files:
+**00:39.890** total execution time for **how_to_extend_tvm** files:
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:38.613 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:36.805 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``) | 00:02.262 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``) | 00:02.161 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``) | 00:00.971 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``) | 00:00.916 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_low_level_custom_pass.py` (``low_level_custom_pass.py``) | 00:00.007 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_low_level_custom_pass.py` (``low_level_custom_pass.py``) | 00:00.008 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
index 4570db5012..c91224af26 100644
--- a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
@@ -216,10 +216,10 @@ profile the execution time of each passes.
.. code-block:: none
Printing results of timing profile...
- InferType: 6983us [6983us] (46.28%; 46.28%)
- FoldScaleAxis: 8104us [7us] (53.72%; 53.72%)
- FoldConstant: 8097us [1650us] (53.67%; 99.91%)
- InferType: 6447us [6447us] (42.73%; 79.62%)
+ InferType: 6731us [6731us] (45.99%; 45.99%)
+ FoldScaleAxis: 7905us [5us] (54.01%; 54.01%)
+ FoldConstant: 7900us [1631us] (53.97%; 99.94%)
+ InferType: 6269us [6269us] (42.83%; 79.36%)
@@ -258,10 +258,10 @@ Refer to following sections and :py:func:`tvm.instrument.pass_instrument` for th
.. code-block:: none
Printing results of timing profile...
- InferType: 6562us [6562us] (44.93%; 44.93%)
- FoldScaleAxis: 8043us [6us] (55.07%; 55.07%)
- FoldConstant: 8037us [1685us] (55.03%; 99.93%)
- InferType: 6352us [6352us] (43.49%; 79.03%)
+ InferType: 6297us [6297us] (44.74%; 44.74%)
+ FoldScaleAxis: 7777us [4us] (55.26%; 55.26%)
+ FoldConstant: 7773us [1623us] (55.23%; 99.94%)
+ InferType: 6150us [6150us] (43.70%; 79.12%)
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
index f6eca0670a..da7affce10 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
@@ -340,7 +340,7 @@ latency of convolution.
.. code-block:: none
- Convolution: 35.264269 ms
+ Convolution: 33.716032 ms
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
index 6010454d97..784fab3f6c 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
@@ -671,7 +671,7 @@ be able to run on our build server
.. code-block:: none
- conv2d with tensor core: 13.373336 ms
+ conv2d with tensor core: 8.031158 ms
diff --git a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
index 381f8a91c3..2db68fe0b2 100644
--- a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
@@ -143,8 +143,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
.. code-block:: none
- Numpy running time: 0.019462
- Baseline: 3.449233
+ Numpy running time: 0.017948
+ Baseline: 3.417880
@@ -239,7 +239,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
.. code-block:: none
- Opt1: 0.304449
+ Opt1: 0.298977
@@ -342,7 +342,7 @@ In this tutorial, we chose to vectorize the inner loop row data since it is cach
.. code-block:: none
- Opt2: 0.336338
+ Opt2: 0.336220
@@ -438,7 +438,7 @@ the access pattern for A matrix is more cache friendly.
.. code-block:: none
- Opt3: 0.116521
+ Opt3: 0.116299
@@ -563,7 +563,7 @@ flattening.
.. code-block:: none
- Opt4: 0.108622
+ Opt4: 0.109688
@@ -685,7 +685,7 @@ write to C when all the block results are ready.
.. code-block:: none
- Opt5: 0.111192
+ Opt5: 0.110985
@@ -810,7 +810,7 @@ Furthermore, we can also utilize multi-core processors to do the thread-level pa
.. code-block:: none
- Opt6: 0.147711
+ Opt6: 0.146757
diff --git a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
index 6b427f887b..2122ab9e3c 100644
--- a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
Computation times
=================
-**00:35.024** total execution time for **how_to_optimize_operators** files:
+**00:34.317** total execution time for **how_to_optimize_operators** files:
+-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``) | 00:32.552 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``) | 00:32.100 | 0.0 MB |
+-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.372 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.234 | 0.0 MB |
+-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``) | 00:01.100 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``) | 00:00.983 | 0.0 MB |
+-----------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
index 59a2773487..50e73d728b 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
@@ -5,18 +5,18 @@
Computation times
=================
-**06:25.780** total execution time for **how_to_tune_with_autoscheduler** files:
+**06:33.101** total execution time for **how_to_tune_with_autoscheduler** files:
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 03:26.753 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 03:38.193 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``) | 01:24.039 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``) | 01:22.405 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``) | 00:57.241 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``) | 00:56.348 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``) | 00:19.789 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``) | 00:18.781 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``) | 00:09.077 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``) | 00:08.746 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``) | 00:08.880 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``) | 00:08.627 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
index 21721149c6..e41a531738 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
@@ -240,483 +240,414 @@ cooperative fetching, unrolling and operator fusion.
compute: Buffer(compute_2: Pointer(float32), float32, [25088], [])}
buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute}
preflattened_buffer_map = {data_1: data_3: Buffer(data_2, float32, [1, 512, 7, 7], []), kernel_1: kernel_3: Buffer(kernel_2, float32, [512, 512, 3, 3], []), bias_1: bias_3: Buffer(bias_2, float32, [1, 512, 1, 1], []), compute_1: compute_3: Buffer(compute_2, float32, [1, 512, 7, 7], [])} {
- attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 28;
- allocate(conv2d_nchw: Pointer(local float32), float32, [14]), storage_scope = local;
- allocate(pad_temp.shared: Pointer(shared float32), float32, [72]), storage_scope = shared;
+ attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 16;
+ allocate(conv2d_nchw: Pointer(local float32), float32, [7]), storage_scope = local;
+ allocate(pad_temp.shared: Pointer(shared float32), float32, [2016]), storage_scope = shared;
allocate(kernel.shared: Pointer(shared float32), float32, [3072]), storage_scope = shared;
- attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
- conv2d_nchw_1: Buffer(conv2d_nchw, float32, [14], [], scope="local", align=32)[0] = 0f32
+ attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224 {
+ conv2d_nchw_1: Buffer(conv2d_nchw, float32, [7], [], scope="local", align=16)[0] = 0f32
conv2d_nchw_1[1] = 0f32
conv2d_nchw_1[2] = 0f32
conv2d_nchw_1[3] = 0f32
conv2d_nchw_1[4] = 0f32
conv2d_nchw_1[5] = 0f32
conv2d_nchw_1[6] = 0f32
- conv2d_nchw_1[7] = 0f32
- conv2d_nchw_1[8] = 0f32
- conv2d_nchw_1[9] = 0f32
- conv2d_nchw_1[10] = 0f32
- conv2d_nchw_1[11] = 0f32
- conv2d_nchw_1[12] = 0f32
- conv2d_nchw_1[13] = 0f32
- for (rc.outer.outer: int32, 0, 64) {
- for (ry.outer.outer: int32, 0, 3) {
- let cse_var_2: int32 = (rc.outer.outer*72)
- let cse_var_1: int32 = (ry.outer.outer*3)
+ for (rc.outer.outer: int32, 0, 16) {
+ for (rx.outer.outer: int32, 0, 3) {
+ let cse_var_2: int32 = (rc.outer.outer*1568)
+ let cse_var_1: int32 = (rc.outer.outer*288)
{
- attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1: Buffer(pad_temp.shared, float32, [72], [], scope="shared")[(threadIdx.x_1*4)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1*4), 9))) && (floormod((threadIdx.x_1*4), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1*4), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod((threadIdx.x_1*4), 9)) - 8)], 0f3 [...]
- }
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1[((threadIdx.x_1*4) + 1)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 1), 9))) && (floormod(((threadIdx.x_1*4) + 1), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 1), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 1), 9)) - 8)], 0f32, dtype=float32)
- }
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1[((threadIdx.x_1*4) + 2)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 2), 9))) && (floormod(((threadIdx.x_1*4) + 2), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 2), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 2), 9)) - 8)], 0f32, dtype=float32)
- }
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1[((threadIdx.x_1*4) + 3)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 3), 9))) && (floormod(((threadIdx.x_1*4) + 3), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 3), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 3), 9)) - 8)], 0f32, dtype=float32)
- }
+ attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ pad_temp.shared_1: Buffer(pad_temp.shared, float32, [2016], [], scope="shared")[threadIdx.x_1] = @tir.if_then_else(((((7 <= floormod(threadIdx.x_1, 63)) && (floormod(threadIdx.x_1, 63) < 56)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[((((cse_var_2 + (floordiv(threadIdx.x_1, 63)*49)) + rx.outer.outer) + floormod(threadIdx.x_1, 63)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ pad_temp.shared_1[(threadIdx.x_1 + 224)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 5), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 224), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 5), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ pad_temp.shared_1[(threadIdx.x_1 + 448)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 1), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 448), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 1), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ pad_temp.shared_1[(threadIdx.x_1 + 672)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 6), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 672), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 6), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ pad_temp.shared_1[(threadIdx.x_1 + 896)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 2), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 896), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 2), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ pad_temp.shared_1[(threadIdx.x_1 + 1120)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 7), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1120), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 7), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ pad_temp.shared_1[(threadIdx.x_1 + 1344)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 3), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1344), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 3), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ pad_temp.shared_1[(threadIdx.x_1 + 1568)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 8), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1568), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 8), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ pad_temp.shared_1[(threadIdx.x_1 + 1792)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 4), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1792), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 4), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1: Buffer(kernel.shared, float32, [3072], [], scope="shared")[threadIdx.x_2] = kernel[(((((blockIdx.x*147456) + (floordiv(threadIdx.x_2, 96)*4608)) + cse_var_1) + (floormod(threadIdx.x_2, 96)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 224)] = kernel[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 224), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 96), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 448)] = kernel[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 448), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 96), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 672)] = kernel[((((((blockIdx.x*147456) + (floordiv(threadIdx.x_2, 96)*4608)) + cse_var_1) + (floormod(threadIdx.x_2, 96)*3)) + rx.outer.outer) + 32256)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 896)] = kernel[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 896), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 96), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 1120)] = kernel[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1120), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 96), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 1344)] = kernel[((((((blockIdx.x*147456) + (floordiv(threadIdx.x_2, 96)*4608)) + cse_var_1) + (floormod(threadIdx.x_2, 96)*3)) + rx.outer.outer) + 64512)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 1568)] = kernel[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1568), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 96), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 1792)] = kernel[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1792), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 96), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 2016)] = kernel[((((((blockIdx.x*147456) + (floordiv(threadIdx.x_2, 96)*4608)) + cse_var_1) + (floormod(threadIdx.x_2, 96)*3)) + rx.outer.outer) + 96768)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 2240)] = kernel[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2240), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 96), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 2464)] = kernel[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2464), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 96), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 2688)] = kernel[((((((blockIdx.x*147456) + (floordiv(threadIdx.x_2, 96)*4608)) + cse_var_1) + (floormod(threadIdx.x_2, 96)*3)) + rx.outer.outer) + 129024)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ if @tir.likely((threadIdx.x_2 < 160), dtype=bool) {
+ kernel.shared_1[(threadIdx.x_2 + 2912)] = kernel[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2912), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 96), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ }
+ for (rc.outer.inner: int32, 0, 2) {
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((rc.outer.inner*1008) + floormod(threadIdx.x, 7))]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48))]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 3)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 6)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 189)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 9)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 252)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 12)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 315)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 15)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 378)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 18)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 441)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 21)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 504)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 24)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 567)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 27)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 630)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 30)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 693)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 33)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 756)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 36)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 819)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 39)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 882)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 42)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 945)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 45)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 7)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48))]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 70)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 3)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 133)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 6)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 196)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 9)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 259)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 12)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 322)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 15)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 385)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 18)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 448)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 21)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 511)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 24)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 574)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 27)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 637)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 30)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 700)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 33)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 763)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 36)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 826)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 39)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 889)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 42)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 952)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 45)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 14)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48))]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 77)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 3)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 140)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 6)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 203)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 9)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 266)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 12)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 329)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 15)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 392)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 18)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 455)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 21)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 518)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 24)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 581)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 27)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 644)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 30)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 707)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 33)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 770)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 36)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 833)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 39)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 896)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 42)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 959)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 45)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 21)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48))]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 3)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 147)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 6)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 210)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 9)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 273)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 12)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 336)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 15)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 399)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 18)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 462)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 21)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 525)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 24)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 588)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 27)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 651)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 30)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 714)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 33)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 777)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 36)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 840)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 39)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 903)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 42)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 966)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 45)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48))]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 3)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 154)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 6)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 217)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 9)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 280)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 12)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 343)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 15)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 406)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 18)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 469)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 21)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 532)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 24)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 595)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 27)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 658)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 30)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 721)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 33)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 784)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 36)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 847)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 39)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 910)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 42)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 973)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 45)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 35)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48))]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 98)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 3)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 161)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 6)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 224)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 9)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 287)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 12)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 350)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 15)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 413)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 18)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 476)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 21)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 539)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 24)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 602)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 27)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 665)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 30)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 728)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 33)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 791)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 36)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 854)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 39)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 917)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 42)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 980)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 45)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 42)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48))]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 3)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 168)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 6)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 231)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 9)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 294)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 12)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 357)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 15)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 420)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 18)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 483)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 21)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 546)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 24)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 609)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 27)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 672)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 30)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 735)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 33)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 798)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 36)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 861)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 39)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 924)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 42)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 987)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 45)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 7)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 1)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 70)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 4)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 133)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 7)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 196)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 10)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 259)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 13)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 322)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 16)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 385)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 19)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 448)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 22)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 511)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 25)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 574)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 28)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 637)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 31)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 700)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 34)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 763)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 37)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 826)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 40)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 889)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 43)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 952)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 46)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 14)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 1)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 77)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 4)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 140)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 7)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 203)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 10)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 266)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 13)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 329)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 16)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 392)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 19)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 455)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 22)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 518)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 25)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 581)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 28)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 644)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 31)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 707)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 34)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 770)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 37)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 833)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 40)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 896)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 43)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 959)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 46)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 21)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 1)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 4)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 147)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 7)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 210)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 10)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 273)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 13)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 336)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 16)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 399)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 19)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 462)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 22)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 525)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 25)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 588)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 28)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 651)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 31)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 714)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 34)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 777)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 37)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 840)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 40)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 903)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 43)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 966)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 46)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 1)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 4)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 154)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 7)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 217)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 10)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 280)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 13)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 343)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 16)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 406)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 19)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 469)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 22)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 532)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 25)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 595)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 28)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 658)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 31)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 721)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 34)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 784)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 37)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 847)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 40)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 910)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 43)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 973)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 46)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 35)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 1)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 98)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 4)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 161)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 7)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 224)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 10)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 287)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 13)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 350)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 16)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 413)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 19)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 476)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 22)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 539)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 25)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 602)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 28)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 665)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 31)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 728)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 34)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 791)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 37)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 854)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 40)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 917)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 43)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 980)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 46)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 42)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 1)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 4)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 168)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 7)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 231)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 10)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 294)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 13)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 357)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 16)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 420)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 19)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 483)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 22)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 546)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 25)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 609)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 28)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 672)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 31)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 735)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 34)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 798)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 37)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 861)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 40)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 924)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 43)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 987)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 46)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 49)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 1)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 112)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 4)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 175)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 7)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 238)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 10)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 301)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 13)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 364)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 16)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 427)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 19)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 490)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 22)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 553)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 25)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 616)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 28)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 679)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 31)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 742)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 34)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 805)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 37)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 868)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 40)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 931)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 43)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 994)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 46)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 14)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 2)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 77)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 5)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 140)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 8)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 203)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 11)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 266)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 14)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 329)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 17)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 392)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 20)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 455)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 23)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 518)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 26)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 581)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 29)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 644)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 32)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 707)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 35)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 770)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 38)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 833)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 41)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 896)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 44)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 959)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 47)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 21)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 2)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 5)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 147)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 8)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 210)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 11)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 273)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 14)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 336)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 17)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 399)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 20)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 462)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 23)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 525)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 26)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 588)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 29)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 651)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 32)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 714)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 35)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 777)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 38)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 840)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 41)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 903)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 44)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 966)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 47)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 2)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 5)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 154)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 8)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 217)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 11)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 280)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 14)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 343)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 17)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 406)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 20)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 469)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 23)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 532)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 26)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 595)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 29)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 658)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 32)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 721)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 35)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 784)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 38)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 847)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 41)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 910)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 44)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 973)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 47)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 35)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 2)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 98)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 5)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 161)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 8)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 224)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 11)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 287)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 14)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 350)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 17)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 413)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 20)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 476)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 23)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 539)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 26)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 602)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 29)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 665)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 32)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 728)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 35)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 791)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 38)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 854)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 41)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 917)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 44)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 980)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 47)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 42)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 2)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 5)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 168)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 8)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 231)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 11)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 294)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 14)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 357)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 17)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 420)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 20)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 483)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 23)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 546)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 26)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 609)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 29)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 672)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 32)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 735)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 35)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 798)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 38)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 861)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 41)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 924)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 44)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 987)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 47)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 49)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 2)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 112)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 5)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 175)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 8)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 238)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 11)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 301)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 14)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 364)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 17)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 427)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 20)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 490)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 23)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 553)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 26)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 616)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 29)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 679)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 32)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 742)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 35)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 805)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 38)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 868)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 41)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 931)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 44)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 994)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 47)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 56)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 2)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 119)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 5)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 182)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 8)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 245)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 11)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 308)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 14)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 371)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 17)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 434)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 20)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 497)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 23)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 560)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 26)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 623)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 29)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 686)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 32)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 749)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 35)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 812)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 38)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 875)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 41)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 938)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 44)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 1001)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 47)]))
}
- attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1: Buffer(kernel.shared, float32, [3072], [], scope="shared")[threadIdx.x_2] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 64)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 64), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 128)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 128), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 192)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 36864)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 256)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 256), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 320)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 320), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 384)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 73728)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 448)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 448), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 512)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 512), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 576)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 110592)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 640)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 640), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 704)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 704), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 768)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 147456)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 832)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 832), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 896)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 896), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 960)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 184320)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1024)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1024), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1088)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1088), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1152)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 221184)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1216)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1216), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1280)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1280), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1344)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 258048)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1408)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1408), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1472)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1472), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1536)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 294912)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1600)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1600), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1664)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1664), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1728)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 331776)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1792)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1792), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1856)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1856), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1920)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 368640)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1984)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1984), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2048)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2048), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2112)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 405504)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2176)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2176), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2240)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2240), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2304)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 442368)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2368)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2368), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2432)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2432), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2496)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 479232)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2560)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2560), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2624)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2624), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2688)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 516096)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2752)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2752), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2816)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2816), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2880)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 552960)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2944)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2944), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 3008)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 3008), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[0]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[1]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[2]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[3]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[4]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[5]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[6]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[0]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 47)]))
}
}
}
- for (i1.inner: int32, 0, 2) {
- for (i3.inner: int32, 0, 7) {
- compute[(((((floordiv(blockIdx.x, 7)*6272) + (threadIdx.x*98)) + (i1.inner*49)) + (floormod(blockIdx.x, 7)*7)) + i3.inner)] = max((conv2d_nchw_1[((i1.inner*7) + i3.inner)] + bias[(((floordiv(blockIdx.x, 7)*128) + (threadIdx.x*2)) + i1.inner)]), 0f32)
- }
+ for (i2.inner: int32, 0, 7) {
+ compute[((((blockIdx.x*1568) + (floordiv(threadIdx.x, 7)*49)) + (i2.inner*7)) + floormod(threadIdx.x, 7))] = max((conv2d_nchw_1[i2.inner] + bias[((blockIdx.x*32) + floordiv(threadIdx.x, 7))]), 0f32)
}
}
}
@@ -771,7 +702,7 @@ We build the binary and check its correctness and performance.
.. code-block:: none
- Execution time of this operator: 0.365 ms
+ Execution time of this operator: 0.319 ms
@@ -820,35 +751,35 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_i, factor=1)
conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
- conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=2)
- conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=64)
+ conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=1)
+ conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=32)
conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
- conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
+ conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=7)
conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=1)
conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
- conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=7)
- conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
+ conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
+ conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7)
conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
- conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
- conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=4)
+ conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=16)
+ conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=2)
conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
- conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
+ conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=3)
conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1)
- conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=3)
+ conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=1)
s[conv2d_nchw].reorder(conv2d_nchw_nn_o_o_o_o, conv2d_nchw_ff_o_o_o_o, conv2d_nchw_yy_o_o_o_o, conv2d_nchw_xx_o_o_o_o, conv2d_nchw_nn_o_o_o_i, conv2d_nchw_ff_o_o_o_i, conv2d_nchw_yy_o_o_o_i, conv2d_nchw_xx_o_o_o_i, conv2d_nchw_nn_o_o_i, conv2d_nchw_ff_o_o_i, conv2d_nchw_yy_o_o_i, conv2d_nchw_xx_o_o_i, conv2d_nchw_rc_o_o, conv2d_nchw_ry_o_o, conv2d_nchw_rx_o_o, conv2d_nchw_rc_o_i, conv2d_nchw_ry_o_i, conv2d_nchw_rx_o_i, conv2d_nchw_nn_o_i, conv2d_nchw_ff_o_i, conv2d_nchw_yy_o_i, conv2 [...]
compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
- compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
- compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=64)
+ compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=1)
+ compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=32)
compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
- compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
+ compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=7)
compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
- compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
- compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
+ compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
+ compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7)
compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
@@ -868,12 +799,12 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
- kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+ kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=224)
s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
- pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
+ pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
- pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+ pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=224)
s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 512)
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True)
@@ -893,9 +824,9 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
#define int64_t long long
#define uint64_t unsigned long long
#endif
- extern "C" __global__ void __launch_bounds__(64) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
- float conv2d_nchw[14];
- __shared__ float pad_temp_shared[72];
+ extern "C" __global__ void __launch_bounds__(224) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+ float conv2d_nchw[7];
+ __shared__ float pad_temp_shared[2016];
__shared__ float kernel_shared[3072];
conv2d_nchw[0] = 0.000000e+00f;
conv2d_nchw[1] = 0.000000e+00f;
@@ -904,419 +835,377 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
conv2d_nchw[4] = 0.000000e+00f;
conv2d_nchw[5] = 0.000000e+00f;
conv2d_nchw[6] = 0.000000e+00f;
- conv2d_nchw[7] = 0.000000e+00f;
- conv2d_nchw[8] = 0.000000e+00f;
- conv2d_nchw[9] = 0.000000e+00f;
- conv2d_nchw[10] = 0.000000e+00f;
- conv2d_nchw[11] = 0.000000e+00f;
- conv2d_nchw[12] = 0.000000e+00f;
- conv2d_nchw[13] = 0.000000e+00f;
- for (int rc_outer_outer = 0; rc_outer_outer < 64; ++rc_outer_outer) {
- for (int ry_outer_outer = 0; ry_outer_outer < 3; ++ry_outer_outer) {
+ for (int rc_outer_outer = 0; rc_outer_outer < 16; ++rc_outer_outer) {
+ for (int rx_outer_outer = 0; rx_outer_outer < 3; ++rx_outer_outer) {
__syncthreads();
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[(((int)threadIdx.x) * 4)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) * 4) % 9))) && (((((int)threadIdx.x) * 4) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + (((((int)threadIdx.x) * 4) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) * 4) % 9)) - 8)] : 0.000000e+00f);
- }
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 1)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 1) % 9))) && ((((((int)threadIdx.x) * 4) + 1) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 1) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 1) % 9)) - 8)] : 0.000000e+00f);
- }
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 2)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 2) % 9))) && ((((((int)threadIdx.x) * 4) + 2) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 2) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 2) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[((int)threadIdx.x)] = (((((7 <= (((int)threadIdx.x) % 63)) && ((((int)threadIdx.x) % 63) < 56)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[(((((rc_outer_outer * 1568) + ((((int)threadIdx.x) / 63) * 49)) + rx_outer_outer) + (((int)threadIdx.x) % 63)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 224)] = (((((1 <= (((((int)threadIdx.x) / 7) + 5) % 9)) && ((((((int)threadIdx.x) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 224) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 5) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 448)] = (((((1 <= (((((int)threadIdx.x) / 7) + 1) % 9)) && ((((((int)threadIdx.x) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 448) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 1) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 672)] = (((((1 <= (((((int)threadIdx.x) / 7) + 6) % 9)) && ((((((int)threadIdx.x) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 672) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 6) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 896)] = (((((1 <= (((((int)threadIdx.x) / 7) + 2) % 9)) && ((((((int)threadIdx.x) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 896) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 2) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1120)] = (((((1 <= (((((int)threadIdx.x) / 7) + 7) % 9)) && ((((((int)threadIdx.x) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1120) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 7) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1344)] = (((((1 <= (((((int)threadIdx.x) / 7) + 3) % 9)) && ((((((int)threadIdx.x) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1344) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 3) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1568)] = (((((1 <= (((((int)threadIdx.x) / 7) + 8) % 9)) && ((((((int)threadIdx.x) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1568) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 8) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1792)] = (((((1 <= (((((int)threadIdx.x) / 7) + 4) % 9)) && ((((((int)threadIdx.x) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1792) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 4) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ kernel_shared[((int)threadIdx.x)] = kernel[(((((((int)blockIdx.x) * 147456) + ((((int)threadIdx.x) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) % 96) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 224)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 224) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 32) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 448)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 448) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 64) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 672)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((int)threadIdx.x) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) % 96) * 3)) + rx_outer_outer) + 32256)];
+ kernel_shared[(((int)threadIdx.x) + 896)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 896) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 32) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 1120)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1120) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 64) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((int)threadIdx.x) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) % 96) * 3)) + rx_outer_outer) + 64512)];
+ kernel_shared[(((int)threadIdx.x) + 1568)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1568) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 32) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1792) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 64) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 2016)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((int)threadIdx.x) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) % 96) * 3)) + rx_outer_outer) + 96768)];
+ kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2240) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 32) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 2464)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2464) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 64) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((int)threadIdx.x) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) % 96) * 3)) + rx_outer_outer) + 129024)];
+ if (((int)threadIdx.x) < 160) {
+ kernel_shared[(((int)threadIdx.x) + 2912)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2912) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 32) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
}
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 3)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 3) % 9))) && ((((((int)threadIdx.x) * 4) + 3) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 3) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 3) % 9)) - 8)] : 0.000000e+00f);
- }
- kernel_shared[((int)threadIdx.x)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3))];
- kernel_shared[(((int)threadIdx.x) + 64)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 64) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 128)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 128) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 192)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 36864)];
- kernel_shared[(((int)threadIdx.x) + 256)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 256) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 320)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 320) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 384)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 73728)];
- kernel_shared[(((int)threadIdx.x) + 448)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 448) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 512)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 512) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 576)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 110592)];
- kernel_shared[(((int)threadIdx.x) + 640)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 640) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 704)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 704) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 768)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 147456)];
- kernel_shared[(((int)threadIdx.x) + 832)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 832) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 896)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 896) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 960)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 184320)];
- kernel_shared[(((int)threadIdx.x) + 1024)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1024) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1088)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1088) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1152)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 221184)];
- kernel_shared[(((int)threadIdx.x) + 1216)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1216) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1280)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1280) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 258048)];
- kernel_shared[(((int)threadIdx.x) + 1408)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1408) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1472)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1472) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1536)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 294912)];
- kernel_shared[(((int)threadIdx.x) + 1600)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1600) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1664)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1664) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1728)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 331776)];
- kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1792) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1856)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1856) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1920)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 368640)];
- kernel_shared[(((int)threadIdx.x) + 1984)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1984) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2048)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2048) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2112)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 405504)];
- kernel_shared[(((int)threadIdx.x) + 2176)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2176) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2240) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2304)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 442368)];
- kernel_shared[(((int)threadIdx.x) + 2368)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2368) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2432)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2432) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2496)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 479232)];
- kernel_shared[(((int)threadIdx.x) + 2560)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2560) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2624)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2624) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 516096)];
- kernel_shared[(((int)threadIdx.x) + 2752)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2752) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2816)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2816) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2880)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 552960)];
- kernel_shared[(((int)threadIdx.x) + 2944)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2944) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 3008)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3008) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
__syncthreads();
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[0] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[1] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[2] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[3] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[4] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[5] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[6] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[0] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+ for (int rc_outer_inner = 0; rc_outer_inner < 2; ++rc_outer_inner) {
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7))] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48))]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 3)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 6)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 189)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 9)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 252)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 12)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 315)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 15)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 378)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 18)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 441)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 21)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 504)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 24)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 567)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 27)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 630)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 30)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 693)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 33)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 756)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 36)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 819)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 39)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 882)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 42)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 945)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 45)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 7)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48))]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 70)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 3)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 133)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 6)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 196)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 9)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 259)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 12)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 322)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 15)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 385)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 18)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 448)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 21)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 511)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 24)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 574)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 27)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 637)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 30)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 700)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 33)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 763)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 36)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 826)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 39)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 889)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 42)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 952)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 45)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 14)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48))]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 77)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 3)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 140)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 6)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 203)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 9)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 266)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 12)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 329)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 15)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 392)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 18)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 455)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 21)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 518)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 24)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 581)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 27)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 644)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 30)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 707)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 33)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 770)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 36)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 833)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 39)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 896)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 42)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 959)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 45)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 21)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48))]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 3)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 147)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 6)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 210)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 9)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 273)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 12)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 336)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 15)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 399)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 18)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 462)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 21)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 525)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 24)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 588)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 27)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 651)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 30)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 714)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 33)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 777)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 36)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 840)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 39)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 903)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 42)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 966)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 45)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48))]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 3)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 154)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 6)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 217)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 9)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 280)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 12)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 343)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 15)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 406)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 18)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 469)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 21)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 532)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 24)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 595)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 27)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 658)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 30)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 721)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 33)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 784)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 36)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 847)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 39)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 910)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 42)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 973)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 45)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 35)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48))]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 98)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 3)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 161)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 6)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 224)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 9)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 287)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 12)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 350)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 15)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 413)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 18)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 476)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 21)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 539)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 24)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 602)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 27)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 665)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 30)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 728)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 33)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 791)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 36)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 854)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 39)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 917)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 42)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 980)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 45)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 42)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48))]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 3)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 168)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 6)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 231)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 9)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 294)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 12)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 357)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 15)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 420)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 18)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 483)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 21)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 546)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 24)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 609)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 27)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 672)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 30)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 735)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 33)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 798)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 36)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 861)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 39)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 924)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 42)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 987)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 45)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 7)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 1)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 70)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 4)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 133)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 7)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 196)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 10)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 259)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 13)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 322)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 16)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 385)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 19)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 448)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 22)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 511)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 25)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 574)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 28)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 637)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 31)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 700)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 34)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 763)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 37)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 826)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 40)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 889)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 43)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 952)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 46)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 1)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 77)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 4)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 140)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 7)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 203)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 10)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 266)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 13)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 329)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 16)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 392)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 19)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 455)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 22)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 518)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 25)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 581)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 28)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 644)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 31)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 707)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 34)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 770)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 37)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 833)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 40)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 896)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 43)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 959)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 46)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 21)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 1)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 4)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 147)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 7)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 210)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 10)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 273)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 13)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 336)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 16)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 399)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 19)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 462)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 22)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 525)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 25)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 588)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 28)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 651)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 31)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 714)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 34)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 777)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 37)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 840)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 40)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 903)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 43)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 966)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 46)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 1)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 4)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 154)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 7)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 217)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 10)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 280)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 13)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 343)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 16)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 406)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 19)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 469)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 22)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 532)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 25)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 595)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 28)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 658)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 31)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 721)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 34)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 784)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 37)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 847)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 40)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 910)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 43)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 973)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 46)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 35)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 1)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 98)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 4)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 161)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 7)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 224)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 10)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 287)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 13)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 350)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 16)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 413)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 19)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 476)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 22)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 539)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 25)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 602)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 28)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 665)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 31)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 728)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 34)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 791)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 37)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 854)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 40)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 917)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 43)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 980)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 46)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 42)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 1)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 4)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 168)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 7)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 231)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 10)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 294)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 13)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 357)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 16)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 420)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 19)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 483)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 22)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 546)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 25)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 609)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 28)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 672)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 31)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 735)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 34)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 798)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 37)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 861)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 40)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 924)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 43)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 987)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 46)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 49)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 1)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 112)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 4)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 175)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 7)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 238)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 10)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 301)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 13)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 364)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 16)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 427)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 19)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 490)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 22)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 553)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 25)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 616)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 28)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 679)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 31)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 742)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 34)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 805)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 37)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 868)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 40)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 931)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 43)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 994)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 46)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 2)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 77)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 5)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 140)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 8)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 203)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 11)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 266)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 14)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 329)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 17)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 392)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 20)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 455)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 23)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 518)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 26)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 581)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 29)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 644)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 32)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 707)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 35)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 770)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 38)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 833)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 41)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 896)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 44)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 959)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 47)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 21)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 2)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 5)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 147)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 8)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 210)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 11)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 273)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 14)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 336)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 17)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 399)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 20)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 462)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 23)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 525)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 26)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 588)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 29)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 651)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 32)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 714)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 35)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 777)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 38)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 840)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 41)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 903)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 44)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 966)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 47)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 2)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 5)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 154)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 8)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 217)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 11)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 280)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 14)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 343)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 17)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 406)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 20)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 469)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 23)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 532)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 26)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 595)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 29)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 658)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 32)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 721)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 35)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 784)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 38)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 847)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 41)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 910)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 44)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 973)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 47)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 35)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 2)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 98)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 5)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 161)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 8)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 224)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 11)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 287)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 14)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 350)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 17)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 413)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 20)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 476)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 23)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 539)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 26)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 602)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 29)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 665)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 32)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 728)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 35)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 791)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 38)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 854)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 41)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 917)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 44)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 980)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 47)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 42)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 2)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 5)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 168)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 8)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 231)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 11)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 294)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 14)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 357)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 17)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 420)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 20)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 483)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 23)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 546)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 26)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 609)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 29)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 672)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 32)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 735)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 35)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 798)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 38)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 861)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 41)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 924)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 44)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 987)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 47)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 49)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 2)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 112)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 5)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 175)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 8)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 238)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 11)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 301)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 14)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 364)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 17)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 427)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 20)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 490)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 23)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 553)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 26)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 616)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 29)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 679)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 32)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 742)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 35)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 805)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 38)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 868)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 41)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 931)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 44)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 994)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 47)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 56)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 2)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 119)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 5)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 182)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 8)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 245)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 11)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 308)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 14)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 371)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 17)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 434)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 20)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 497)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 23)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 560)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 26)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 623)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 29)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 686)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 32)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 749)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 35)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 812)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 38)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 875)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 41)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 938)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 44)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 1001)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 47)]));
+ }
}
}
- for (int i1_inner = 0; i1_inner < 2; ++i1_inner) {
- for (int i3_inner = 0; i3_inner < 7; ++i3_inner) {
- compute[((((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 98)) + (i1_inner * 49)) + ((((int)blockIdx.x) % 7) * 7)) + i3_inner)] = max((conv2d_nchw[((i1_inner * 7) + i3_inner)] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) * 2)) + i1_inner)]), 0.000000e+00f);
- }
+ for (int i2_inner = 0; i2_inner < 7; ++i2_inner) {
+ compute[((((((int)blockIdx.x) * 1568) + ((((int)threadIdx.x) / 7) * 49)) + (i2_inner * 7)) + (((int)threadIdx.x) % 7))] = max((conv2d_nchw[i2_inner] + bias[((((int)blockIdx.x) * 32) + (((int)threadIdx.x) / 7))]), 0.000000e+00f);
}
}
@@ -1378,7 +1267,7 @@ In the example below we resume the status and do more 5 trials.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 3 minutes 26.753 seconds)
+ **Total running time of the script:** ( 3 minutes 38.193 seconds)
.. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
index 678e72753e..d0acd13a45 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
@@ -643,7 +643,7 @@ so we can read the log file and load the best schedules.
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 8.2227 8.2245 8.2248 8.2189 0.0027
+ 8.2273 8.2258 8.2365 8.2195 0.0070
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
index 2ab13a9028..b71beefe78 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
@@ -662,7 +662,7 @@ so we can read the log file and load the best schedules.
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 758.5054 757.6919 760.7528 757.0715 1.6092
+ 760.0965 759.7724 760.7544 759.7626 0.4652
@@ -690,7 +690,7 @@ Other Tips
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 24.039 seconds)
+ **Total running time of the script:** ( 1 minutes 22.405 seconds)
.. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_network_x86.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
index e5c372b669..6bbe13646f 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
@@ -397,103 +397,28 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
placeholder_4: Buffer(placeholder_14: Pointer(float32), float32, [65536], []),
compute: Buffer(compute_2: Pointer(float32), float32, [65536], [])}
buffer_map = {placeholder_5: placeholder, placeholder_6: placeholder_1, placeholder_7: placeholder_2, placeholder_8: placeholder_3, placeholder_9: placeholder_4, compute_1: compute}
- preflattened_buffer_map = {compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_6: placeholder_15: Buffer(placeholder_11, float32, [4916, 16, 1], []), placeholder_7: placeholder_16: Buffer(placeholder_12, int32, [4916], []), placeholder_5: placeholder_17: Buffer(placeholder_10, float32, [128, 256], []), placeholder_8: placeholder_18: Buffer(placeholder_13, int32, [33], []), placeholder_9: placeholder_19: Buffer(placeholder_14, float32, [128, 512], [])} {
+ preflattened_buffer_map = {placeholder_8: placeholder_15: Buffer(placeholder_13, int32, [33], []), placeholder_6: placeholder_16: Buffer(placeholder_11, float32, [4916, 16, 1], []), placeholder_7: placeholder_17: Buffer(placeholder_12, int32, [4916], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_5: placeholder_18: Buffer(placeholder_10, float32, [128, 256], []), placeholder_9: placeholder_19: Buffer(placeholder_14, float32, [128, 512], [])} {
for (i0.outer.i1.outer.fused: int32, 0, 32) "parallel" {
allocate(compute_4: Pointer(global float32), float32, [2048]), storage_scope = global {
- for (i.outer.inner: int32, 0, 2) {
+ for (nb_j.inner: int32, 0, 2) {
for (i.inner.init: int32, 0, 64) {
- let cse_var_1: int32 = ((i.outer.inner*1024) + (i.inner.init*16))
- {
- compute_5: Buffer(compute_4, float32, [2048], [])[cse_var_1] = 0f32
- compute_5[(cse_var_1 + 1)] = 0f32
- compute_5[(cse_var_1 + 2)] = 0f32
- compute_5[(cse_var_1 + 3)] = 0f32
- compute_5[(cse_var_1 + 4)] = 0f32
- compute_5[(cse_var_1 + 5)] = 0f32
- compute_5[(cse_var_1 + 6)] = 0f32
- compute_5[(cse_var_1 + 7)] = 0f32
- compute_5[(cse_var_1 + 8)] = 0f32
- compute_5[(cse_var_1 + 9)] = 0f32
- compute_5[(cse_var_1 + 10)] = 0f32
- compute_5[(cse_var_1 + 11)] = 0f32
- compute_5[(cse_var_1 + 12)] = 0f32
- compute_5[(cse_var_1 + 13)] = 0f32
- compute_5[(cse_var_1 + 14)] = 0f32
- compute_5[(cse_var_1 + 15)] = 0f32
+ for (j.init: int32, 0, 16) {
+ compute_5: Buffer(compute_4, float32, [2048], [])[(((i.inner.init*32) + (nb_j.inner*16)) + j.init)] = 0f32
}
}
- for (elem_idx: int32, 0, (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])) {
+ for (elem_idx: int32, 0, let cse_var_1: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner) in (placeholder_3[(cse_var_1 + 1)] - placeholder_3[cse_var_1])) {
for (i.inner: int32, 0, 64) {
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_2: int32 = ((i.outer.inner*1024) + (i.inner*16))
- compute_5[cse_var_2] = (compute_5[cse_var_2] + (placeholder_1[((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16))]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_3: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 1)
- compute_5[cse_var_3] = (compute_5[cse_var_3] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 1)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_4: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 2)
- compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 2)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_5: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 3)
- compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 3)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_6: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 4)
- compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 4)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_7: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 5)
- compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 5)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_8: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 6)
- compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 6)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_9: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 7)
- compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 7)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_10: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 8)
- compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 8)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_11: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 9)
- compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 9)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_12: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 10)
- compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 10)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_13: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 11)
- compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 11)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_14: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 12)
- compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 12)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_15: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 13)
- compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 13)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_16: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 14)
- compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 14)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_17: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 15)
- compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 15)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ for (j: int32, 0, 16) {
+ let cse_var_3: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)
+ let cse_var_2: int32 = (((i.inner*32) + (nb_j.inner*16)) + j)
+ compute_5[cse_var_2] = (compute_5[cse_var_2] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + j)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 16)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
}
}
}
}
- for (i0.inner: int32, 0, 128) {
- let cse_var_18: int32 = ((i0.inner*512) + (i0.outer.i1.outer.fused*16))
- compute[ramp(cse_var_18, 1, 16)] = max((compute_5[ramp((i0.inner*16), 1, 16)] + placeholder_4[ramp(cse_var_18, 1, 16)]), broadcast(0f32, 16))
+ for (i0.inner: int32, 0, 64) {
+ let cse_var_4: int32 = (((floordiv(i0.outer.i1.outer.fused, 16)*32768) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32))
+ compute[ramp(cse_var_4, 1, 32)] = max((compute_5[ramp((i0.inner*32), 1, 32)] + placeholder_4[ramp(cse_var_4, 1, 32)]), broadcast(0f32, 32))
}
}
}
@@ -549,7 +474,7 @@ We build the binary and check its correctness and performance.
.. code-block:: none
- Execution time of this operator: 1.811 ms
+ Execution time of this operator: 1.806 ms
diff --git a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
index 87f3bce043..7dee7c06de 100644
--- a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
@@ -5,16 +5,16 @@
Computation times
=================
-**00:45.742** total execution time for **how_to_tune_with_autotvm** files:
+**00:45.665** total execution time for **how_to_tune_with_autotvm** files:
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``) | 00:45.707 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``) | 00:45.628 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``) | 00:00.019 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``) | 00:00.022 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_cuda.py` (``tune_relay_cuda.py``) | 00:00.005 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_mobile_gpu.py` (``tune_relay_mobile_gpu.py``) | 00:00.005 | 0.0 MB |
-+--------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_arm.py` (``tune_relay_arm.py``) | 00:00.005 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_mobile_gpu.py` (``tune_relay_mobile_gpu.py``) | 00:00.005 | 0.0 MB |
++--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
index a29ccb28b6..48544d3568 100644
--- a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
@@ -1156,8 +1156,8 @@ for this template
TimeoutError
[('tile_f', [-1, 2, 1, 64]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4909501
- No: 9 GFLOPS: 176.29/176.29 result: MeasureResult(costs=(0.0013131999444444444,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.034900188446045, timestamp=1663630707.773062) [('tile_f', [-1, 1, 4, 8]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,5072689
- No: 10 GFLOPS: 0.00/176.29 result: Traceback (most recent call last):
+ No: 9 GFLOPS: 80.80/80.80 result: MeasureResult(costs=(0.002865221742857143,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.9253158569335938, timestamp=1663637124.214335) [('tile_f', [-1, 1, 4, 8]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,5072689
+ No: 10 GFLOPS: 0.00/80.80 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1280,8 +1280,8 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 4, 8]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 64, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,5092711
- No: 11 GFLOPS: 258.30/258.30 result: MeasureResult(costs=(0.0008962458603351956,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.663536548614502, timestamp=1663630708.6986022) [('tile_f', [-1, 8, 2, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4264713
- No: 12 GFLOPS: 0.00/258.30 result: Traceback (most recent call last):
+ No: 11 GFLOPS: 259.74/259.74 result: MeasureResult(costs=(0.0008912696243093924,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.7628161907196045, timestamp=1663637125.1207004) [('tile_f', [-1, 8, 2, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4264713
+ No: 12 GFLOPS: 0.00/259.74 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1404,7 +1404,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 128, 1, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 256]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,183542
- No: 13 GFLOPS: 0.00/258.30 result: Traceback (most recent call last):
+ No: 13 GFLOPS: 0.00/259.74 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1527,7 +1527,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 8, 8]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 64]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2482196
- No: 14 GFLOPS: 0.00/258.30 result: Traceback (most recent call last):
+ No: 14 GFLOPS: 0.00/259.74 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1650,9 +1650,9 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 64, 1, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10306226
- No: 15 GFLOPS: 5.44/258.30 result: MeasureResult(costs=(0.042549769499999994,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.817678451538086, timestamp=1663630713.2384973) [('tile_f', [-1, 2, 2, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5330964
- No: 16 GFLOPS: 3.33/258.30 result: MeasureResult(costs=(0.0694369725,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.536828517913818, timestamp=1663630714.4850295) [('tile_f', [-1, 8, 4, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2140058
- No: 17 GFLOPS: 0.00/258.30 result: Traceback (most recent call last):
+ No: 15 GFLOPS: 5.33/259.74 result: MeasureResult(costs=(0.04344266425,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.8446388244628906, timestamp=1663637129.6733298) [('tile_f', [-1, 2, 2, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5330964
+ No: 16 GFLOPS: 3.36/259.74 result: MeasureResult(costs=(0.06896940575,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.561822891235352, timestamp=1663637130.9030292) [('tile_f', [-1, 8, 4, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2140058
+ No: 17 GFLOPS: 0.00/259.74 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 142, in build
res = future.result()
File "/usr/lib/python3.7/concurrent/futures/_base.py", line 435, in result
@@ -1670,8 +1670,8 @@ for this template
TimeoutError
[('tile_f', [-1, 2, 2, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 16]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10195251
- No: 18 GFLOPS: 26.26/258.30 result: MeasureResult(costs=(0.008816739166666667,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.1723246574401855, timestamp=1663630725.408047) [('tile_f', [-1, 4, 8, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6068603
- No: 19 GFLOPS: 0.00/258.30 result: Traceback (most recent call last):
+ No: 18 GFLOPS: 28.28/259.74 result: MeasureResult(costs=(0.008187352642857143,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.2877285480499268, timestamp=1663637141.9079373) [('tile_f', [-1, 4, 8, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6068603
+ No: 19 GFLOPS: 0.00/259.74 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1794,7 +1794,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 16, 4, 8]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6956993
- No: 20 GFLOPS: 0.00/258.30 result: Traceback (most recent call last):
+ No: 20 GFLOPS: 0.00/259.74 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1973,7 +1973,7 @@ and measure running time.
Best config:
[('tile_f', [-1, 8, 2, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4264713
Finish loading 20 records
- Time cost of this operator: 0.001300
+ Time cost of this operator: 0.001274
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
index eec9f739da..280b233023 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
@@ -327,10 +327,10 @@ Timing the untuned program
########## Build without Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs Measurements(us)
--------- --- -------- ------- ----- ------ ------- ----------------
- tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 310.5 98.714 (1, 2, 10, 10, 3) 2 1 [310.5]
- tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.074 0.977 (1, 6, 10, 10) 1 1 [3.074]
- tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.97 0.308 (1, 1, 10, 10, 3) 1 1 [0.97]
- Total_time - 314.544 - - - - -
+ tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 309.8 98.729 (1, 2, 10, 10, 3) 2 1 [309.8]
+ tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.015 0.961 (1, 6, 10, 10) 1 1 [3.015]
+ tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.972 0.31 (1, 1, 10, 10, 3) 1 1 [0.972]
+ Total_time - 313.787 - - - - -
@@ -394,10 +394,10 @@ Timing the tuned program
########## Build with Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs Measurements(us)
--------- --- -------- ------- ----- ------ ------- ----------------
- tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 130.3 97.903 (1, 6, 10, 10, 1) 2 1 [130.3]
- tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.822 1.369 (1, 6, 10, 10) 1 1 [1.822]
- tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.969 0.728 (1, 1, 10, 10, 3) 1 1 [0.969]
- Total_time - 133.09 - - - - -
+ tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 79.75 96.645 (1, 6, 10, 10, 1) 2 1 [79.75]
+ tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.81 2.193 (1, 6, 10, 10) 1 1 [1.81]
+ tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.958 1.162 (1, 1, 10, 10, 3) 1 1 [0.958]
+ Total_time - 82.518 - - - - -
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
index e08a263d3a..eded85a536 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
@@ -225,7 +225,7 @@ take about **2 minutes** to download the Stanford Cars, while COCO 2017 validati
.. code-block:: none
- '/tmp/tmpbdi64p4l/images/random'
+ '/tmp/tmpplf60smu/images/random'
@@ -325,8 +325,8 @@ objects to other stuff? We can display some examples from our datasets using ``m
.. code-block:: none
- /tmp/tmpbdi64p4l/images/target contains 8144 images
- /tmp/tmpbdi64p4l/images/random contains 5000 images
+ /tmp/tmpplf60smu/images/target contains 8144 images
+ /tmp/tmpplf60smu/images/random contains 5000 images
@@ -501,13 +501,13 @@ the time on our validation set).
.. code-block:: none
Epoch 1/3
- 328/328 - 46s - loss: 0.2061 - accuracy: 0.9270 - val_loss: 0.1543 - val_accuracy: 0.9551 - 46s/epoch - 141ms/step
+ 328/328 - 47s - loss: 0.2218 - accuracy: 0.9240 - val_loss: 0.1319 - val_accuracy: 0.9588 - 47s/epoch - 142ms/step
Epoch 2/3
- 328/328 - 43s - loss: 0.1013 - accuracy: 0.9608 - val_loss: 0.1133 - val_accuracy: 0.9660 - 43s/epoch - 130ms/step
+ 328/328 - 43s - loss: 0.0911 - accuracy: 0.9662 - val_loss: 0.1058 - val_accuracy: 0.9683 - 43s/epoch - 132ms/step
Epoch 3/3
- 328/328 - 43s - loss: 0.0673 - accuracy: 0.9754 - val_loss: 0.1127 - val_accuracy: 0.9671 - 43s/epoch - 130ms/step
+ 328/328 - 43s - loss: 0.0585 - accuracy: 0.9781 - val_loss: 0.0925 - val_accuracy: 0.9687 - 43s/epoch - 132ms/step
- <keras.callbacks.History object at 0x7f8517c8f490>
+ <keras.callbacks.History object at 0x7fef75526ed0>
@@ -864,7 +864,7 @@ Arduino tutorial for how to do that `on GitHub <https://github.com/guberti/tvm-a
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 4 minutes 37.654 seconds)
+ **Total running time of the script:** ( 4 minutes 46.505 seconds)
.. _sphx_glr_download_how_to_work_with_microtvm_micro_train.py:
diff --git a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
index 249b5a199c..655083d801 100644
--- a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
@@ -5,16 +5,16 @@
Computation times
=================
-**05:31.630** total execution time for **how_to_work_with_microtvm** files:
+**05:39.964** total execution time for **how_to_work_with_microtvm** files:
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``) | 04:37.654 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``) | 04:46.505 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``) | 00:42.578 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``) | 00:41.931 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_aot.py` (``micro_aot.py``) | 00:08.040 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_aot.py` (``micro_aot.py``) | 00:08.242 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``) | 00:03.357 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``) | 00:03.284 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_microtvm_micro_ethosu.py` (``micro_ethosu.py``) | 00:00.001 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
index 57c9a366d4..7bb4bcb859 100644
--- a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
Computation times
=================
-**00:43.645** total execution time for **how_to_work_with_relay** files:
+**00:42.724** total execution time for **how_to_work_with_relay** files:
+----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_using_pipeline_executor.py` (``using_pipeline_executor.py``) | 00:31.864 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_using_pipeline_executor.py` (``using_pipeline_executor.py``) | 00:31.046 | 0.0 MB |
+----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``) | 00:10.090 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``) | 00:10.155 | 0.0 MB |
+----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``) | 00:01.684 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``) | 00:01.517 | 0.0 MB |
+----------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_relay_using_relay_viz.py` (``using_relay_viz.py``) | 00:00.007 | 0.0 MB |
+----------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
index 27d26f78eb..ec12b5e45b 100644
--- a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
@@ -261,7 +261,7 @@ The following example customizes CUDA lowering rule for :code:`exp`.
.. code-block:: none
- <function my_cuda_math_rule at 0x7f84b9106170>
+ <function my_cuda_math_rule at 0x7fef7023cdd0>
diff --git a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
index 11392a0810..61710f71c5 100644
--- a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
@@ -5,22 +5,22 @@
Computation times
=================
-**00:06.017** total execution time for **how_to_work_with_schedules** files:
+**00:07.992** total execution time for **how_to_work_with_schedules** files:
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``) | 00:03.728 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``) | 00:05.804 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``) | 00:01.034 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``) | 00:00.977 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``) | 00:00.547 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``) | 00:00.528 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``) | 00:00.527 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``) | 00:00.506 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``) | 00:00.100 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``) | 00:00.097 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_schedules_schedule_primitives.py` (``schedule_primitives.py``) | 00:00.039 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``) | 00:00.028 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``) | 00:00.027 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_schedules_tuple_inputs.py` (``tuple_inputs.py``) | 00:00.014 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
index 6d16992099..863130ebc9 100644
--- a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
@@ -347,7 +347,7 @@ The importing needs to happen before the tensorized GEMV being executed.
C: Buffer(C_2: Pointer(float32), float32, [524288], [])}
buffer_map = {A_1: A, B_1: B, C_1: C}
preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [1024, 64], []), B_1: B_3: Buffer(B_2, float32, [512, 64], []), C_1: C_3: Buffer(C_2, float32, [1024, 512], [])} {
- attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmptunj6_nu/input0.cc'\nsource_filename = \"/tmp/tmptunj6_nu/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = alloca float*, align 8\n %8 = alloca float*, align 8\n %9 = alloca floa [...]
+ attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpoj_ww8lo/input0.cc'\nsource_filename = \"/tmp/tmpoj_ww8lo/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = alloca float*, align 8\n %8 = alloca float*, align 8\n %9 = alloca floa [...]
for (i, 0, 1024) {
for (j.outer: int32, 0, 32) {
@tir.call_extern("gemv_update", @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), C_2, ((i*512) + (j.outer*16)), 16, 2, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), A_2, (i*64), 64, 1, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), B_2, (j.outer*1024), 1024, 1, dtype=handle), 16, 64, 64, dtype=int32)
diff --git a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
index 5706caa4cc..539522bdd2 100644
--- a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**00:21.311** total execution time for **topic_vta_tutorials_autotvm** files:
+**00:21.296** total execution time for **topic_vta_tutorials_autotvm** files:
+---------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:21.305 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:21.289 | 0.0 MB |
+---------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_alu_vta.py` (``tune_alu_vta.py``) | 00:00.006 | 0.0 MB |
+---------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
index cbe9288999..44ec3b0475 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
@@ -289,7 +289,7 @@ The compilation steps are:
DeprecationWarning,
/workspace/vta/tutorials/frontend/deploy_classification.py:213: DeprecationWarning: legacy graph executor behavior of producing json / lib / params will be removed in the next release. Please see documents of tvm.contrib.graph_executor.GraphModule for the new recommended usage.
relay_prog, target=tvm.target.Target(target, host=env.target_host), params=params
- resnet18_v1 inference graph built in 23.26s!
+ resnet18_v1 inference graph built in 22.50s!
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
index 15268cceb7..89a0e945fb 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
@@ -333,7 +333,7 @@ The compilation steps are:
/workspace/python/tvm/relay/build_module.py:348: DeprecationWarning: Please use input parameter mod (tvm.IRModule) instead of deprecated parameter mod (tvm.relay.function.Function)
DeprecationWarning,
- yolov3-tiny inference graph built in 16.25s!
+ yolov3-tiny inference graph built in 16.01s!
diff --git a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
index 02957b6ffe..cf14bcc38e 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**01:32.026** total execution time for **topic_vta_tutorials_frontend** files:
+**01:31.432** total execution time for **topic_vta_tutorials_frontend** files:
+------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``) | 00:48.429 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``) | 00:48.823 | 0.0 MB |
+------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:43.597 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:42.609 | 0.0 MB |
+------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
index 09fe7e3511..767aba7f74 100644
--- a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**00:02.969** total execution time for **topic_vta_tutorials_optimize** files:
+**00:02.995** total execution time for **topic_vta_tutorials_optimize** files:
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``) | 00:02.571 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``) | 00:02.617 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.397 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.378 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
index b45584a5a3..77f9e24572 100644
--- a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**00:00.724** total execution time for **topic_vta_tutorials** files:
+**00:00.705** total execution time for **topic_vta_tutorials** files:
+---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.384 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.379 | 0.0 MB |
+---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.341 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.326 | 0.0 MB |
+---------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
index ccd0185130..30b52d7103 100644
--- a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
@@ -326,7 +326,7 @@ We build the binary and check its correctness and performance.
.. code-block:: none
- Execution time of this operator: 93.842 ms
+ Execution time of this operator: 94.274 ms
@@ -426,7 +426,7 @@ resume the status and do more 5 trials.
Resume search:
/usr/local/lib/python3.7/dist-packages/xgboost/training.py:17: UserWarning: Old style callback is deprecated. See: https://xgboost.readthedocs.io/en/latest/python/callbacks.html
warnings.warn(f'Old style callback is deprecated. See: {link}', UserWarning)
- *E
+
@@ -442,11 +442,6 @@ Expression (TE) language that demonstrates how TVM can optimize computational
operations.
-.. rst-class:: sphx-glr-timing
-
- **Total running time of the script:** ( 1 minutes 12.747 seconds)
-
-
.. _sphx_glr_download_tutorial_auto_scheduler_matmul_x86.py:
.. only:: html
diff --git a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
index 4b2015baf4..a989391180 100644
--- a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
@@ -462,16 +462,16 @@ reduce variance, we take 5 measurements and average them.
waiting for device...
device available
Get devices for measurement successfully!
- No: 1 GFLOPS: 9.50/9.50 result: MeasureResult(costs=(0.028242411000000002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5851504802703857, timestamp=1663629452.2552645) [('tile_y', [-1, 1]), ('tile_x', [-1, 256])],None,80
- No: 2 GFLOPS: 2.62/9.50 result: MeasureResult(costs=(0.1024012224,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.7875950336456299, timestamp=1663629454.592761) [('tile_y', [-1, 4]), ('tile_x', [-1, 8])],None,32
- No: 3 GFLOPS: 11.79/11.79 result: MeasureResult(costs=(0.022766467000000002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6068992614746094, timestamp=1663629455.1700509) [('tile_y', [-1, 64]), ('tile_x', [-1, 32])],None,56
- No: 4 GFLOPS: 1.56/11.79 result: MeasureResult(costs=(0.1724456072,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.86409330368042, timestamp=1663629458.637208) [('tile_y', [-1, 1]), ('tile_x', [-1, 4])],None,20
- No: 5 GFLOPS: 3.58/11.79 result: MeasureResult(costs=(0.074882168,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.3394227027893066, timestamp=1663629460.1069984) [('tile_y', [-1, 256]), ('tile_x', [-1, 16])],None,48
- No: 6 GFLOPS: 1.44/11.79 result: MeasureResult(costs=(0.1870415584,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.1165707111358643, timestamp=1663629463.799769) [('tile_y', [-1, 512]), ('tile_x', [-1, 4])],None,29
- No: 7 GFLOPS: 0.81/11.79 result: MeasureResult(costs=(0.3296488846,), error_no=MeasureErrorNo.NO_ERROR, all_cost=5.391564130783081, timestamp=1663629469.237862) [('tile_y', [-1, 512]), ('tile_x', [-1, 2])],None,19
- No: 8 GFLOPS: 9.90/11.79 result: MeasureResult(costs=(0.027117936400000004,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5802202224731445, timestamp=1663629469.8347194) [('tile_y', [-1, 4]), ('tile_x', [-1, 64])],None,62
- No: 9 GFLOPS: 1.90/11.79 result: MeasureResult(costs=(0.1414774956,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.3974225521087646, timestamp=1663629472.3503067) [('tile_y', [-1, 2]), ('tile_x', [-1, 2])],None,11
- No: 10 GFLOPS: 2.52/11.79 result: MeasureResult(costs=(0.10633383200000002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.8096964359283447, timestamp=1663629474.2119386) [('tile_y', [-1, 4]), ('tile_x', [-1, 4])],None,22
+ No: 1 GFLOPS: 10.54/10.54 result: MeasureResult(costs=(0.025474914799999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5402207374572754, timestamp=1663635891.5455291) [('tile_y', [-1, 1]), ('tile_x', [-1, 256])],None,80
+ No: 2 GFLOPS: 2.93/10.54 result: MeasureResult(costs=(0.09150912759999999,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.6119022369384766, timestamp=1663635893.7134464) [('tile_y', [-1, 4]), ('tile_x', [-1, 8])],None,32
+ No: 3 GFLOPS: 11.87/11.87 result: MeasureResult(costs=(0.0226147316,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5953617095947266, timestamp=1663635894.2776968) [('tile_y', [-1, 64]), ('tile_x', [-1, 32])],None,56
+ No: 4 GFLOPS: 1.85/11.87 result: MeasureResult(costs=(0.145215664,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.4472649097442627, timestamp=1663635897.28952) [('tile_y', [-1, 1]), ('tile_x', [-1, 4])],None,20
+ No: 5 GFLOPS: 3.68/11.87 result: MeasureResult(costs=(0.0729050342,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.3002617359161377, timestamp=1663635898.7152534) [('tile_y', [-1, 256]), ('tile_x', [-1, 16])],None,48
+ No: 6 GFLOPS: 1.76/11.87 result: MeasureResult(costs=(0.1524555144,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.6087775230407715, timestamp=1663635901.365292) [('tile_y', [-1, 512]), ('tile_x', [-1, 4])],None,29
+ No: 7 GFLOPS: 0.85/11.87 result: MeasureResult(costs=(0.3159235116,), error_no=MeasureErrorNo.NO_ERROR, all_cost=5.186928987503052, timestamp=1663635907.1256719) [('tile_y', [-1, 512]), ('tile_x', [-1, 2])],None,19
+ No: 8 GFLOPS: 10.59/11.87 result: MeasureResult(costs=(0.0253505374,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5499153137207031, timestamp=1663635907.6947112) [('tile_y', [-1, 4]), ('tile_x', [-1, 64])],None,62
+ No: 9 GFLOPS: 1.76/11.87 result: MeasureResult(costs=(0.1520888326,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.529919147491455, timestamp=1663635910.3431559) [('tile_y', [-1, 2]), ('tile_x', [-1, 2])],None,11
+ No: 10 GFLOPS: 2.68/11.87 result: MeasureResult(costs=(0.10024115380000001,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.705235481262207, timestamp=1663635912.1068423) [('tile_y', [-1, 4]), ('tile_x', [-1, 4])],None,22
diff --git a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
index 3d9328995a..8dffbac55a 100644
--- a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
@@ -320,7 +320,7 @@ standard deviation.
.. code-block:: none
- {'mean': 515.597588089995, 'median': 515.9454214000107, 'std': 2.1741997976212493}
+ {'mean': 514.8496878599917, 'median': 514.2742650999935, 'std': 2.569928785174047}
@@ -554,30 +554,30 @@ the tuning data to.
.. code-block:: none
-
[Task 1/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 1/25] Current/Best: 17.48/ 17.48 GFLOPS | Progress: (4/20) | 6.48 s
[Task 1/25] Current/Best: 6.10/ 17.48 GFLOPS | Progress: (8/20) | 9.58 s
[Task 1/25] Current/Best: 11.22/ 21.78 GFLOPS | Progress: (12/20) | 12.13 s
[Task 1/25] Current/Best: 16.18/ 22.06 GFLOPS | Progress: (16/20) | 13.84 s
[Task 1/25] Current/Best: 11.29/ 23.52 GFLOPS | Progress: (20/20) | 15.62 s Done.
-
[Task 2/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 2/25] Current/Best: 12.18/ 12.25 GFLOPS | Progress: (4/20) | 3.96 s
[Task 2/25] Current/Best: 12.34/ 18.09 GFLOPS | Progress: (8/20) | 5.29 s
[Task 2/25] Current/Best: 20.97/ 20.97 GFLOPS | Progress: (12/20) | 6.64 s
[Task 2/25] Current/Best: 11.07/ 20.97 GFLOPS | Progress: (16/20) | 7.92 s
[Task 2/25] Current/Best: 18.26/ 20.97 GFLOPS | Progress: (20/20) | 9.56 s Done.
-
[Task 3/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 3/25] Current/Best: 1.63/ 10.13 GFLOPS | Progress: (4/20) | 5.94 s
[Task 3/25] Current/Best: 15.21/ 16.80 GFLOPS | Progress: (8/20) | 7.90 s
[Task 3/25] Current/Best: 14.99/ 16.80 GFLOPS | Progress: (12/20) | 9.65 s
[Task 3/25] Current/Best: 6.78/ 22.73 GFLOPS | Progress: (16/20) | 11.65 s
[Task 3/25] Current/Best: 11.02/ 22.73 GFLOPS | Progress: (20/20) | 16.29 s Done.
-
[Task 4/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 4/25] Current/Best: 9.13/ 18.05 GFLOPS | Progress: (4/20) | 2.48 s
[Task 4/25] Current/Best: 6.27/ 18.05 GFLOPS | Progress: (8/20) | 7.27 s
[Task 4/25] Current/Best: 20.68/ 20.68 GFLOPS | Progress: (12/20) | 12.33 s
[Task 4/25] Current/Best: 16.49/ 20.68 GFLOPS | Progress: (16/20) | 14.73 s
[Task 4/25] Current/Best: 12.78/ 20.68 GFLOPS | Progress: (20/20) | 16.86 s Done.
-
[Task 5/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 5/25] Current/Best: 9.01/ 9.78 GFLOPS | Progress: (4/20) | 2.68 s
[Task 5/25] Current/Best: 11.27/ 11.27 GFLOPS | Progress: (8/20) | 4.76 s
[Task 5/25] Current/Best: 9.65/ 17.98 GFLOPS | Progress: (12/20) | 8.02 s
[Task 5/25] Current/Best: 11.64/ 22.05 GFLOPS | Progress: (16/20) | 9.47 s
[Task 5/25] Current/Best: 11.75/ 22.05 GFLOPS | Progress: (20/20) | 11.40 s Done.
-
[Task 6/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 6/25] Current/Best: 12.04/ 19.76 GFLOPS | Progress: (4/20) | 4.22 s
[Task 6/25] Current/Best: 18.82/ 19.76 GFLOPS | Progress: (8/20) | 6.01 s
[Task 6/25] Current/Best: 13.08/ 19.76 GFLOPS | Progress: (12/20) | 8.06 s
[Task 6/25] Current/Best: 19.54/ 19.76 GFLOPS | Progress: (16/20) | 10.35 s
[Task 6/25] Current/Best: 3.75/ 19.76 GFLOPS | Progress: (20/20) | 12.96 s Done.
-
[Task 7/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 7/25] Current/Best: 9.69/ 12.12 GFLOPS | Progress: (4/20) | 3.72 s
[Task 7/25] Current/Best: 19.45/ 19.85 GFLOPS | Progress: (8/20) | 5.28 s
[Task 7/25] Current/Best: 15.97/ 19.85 GFLOPS | Progress: (12/20) | 7.24 s
[Task 7/25] Current/Best: 12.14/ 20.06 GFLOPS | Progress: (16/20) | 9.35 s
[Task 7/25] Current/Best: 6.04/ 20.40 GFLOPS | Progress: (20/20) | 11.88 s Done.
-
[Task 8/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 8/25] Current/Best: 10.30/ 14.02 GFLOPS | Progress: (4/20) | 2.97 s
[Task 8/25] Current/Best: 9.44/ 14.02 GFLOPS | Progress: (8/20) | 8.22 s
[Task 8/25] Current/Best: 12.84/ 14.02 GFLOPS | Progress: (12/20) | 14.79 s
[Task 8/25] Current/Best: 18.79/ 18.79 GFLOPS | Progress: (16/20) | 16.96 s
[Task 8/25] Current/Best: 18.47/ 18.79 GFLOPS | Progress: (20/20) | 24.15 s Done.
-
[Task 9/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 9/25] Current/Best: 14.31/ 14.31 GFLOPS | Progress: (4/20) | 12.01 s
[Task 9/25] Current/Best: 22.91/ 22.91 GFLOPS | Progress: (8/20) | 13.83 s
[Task 9/25] Current/Best: 7.87/ 22.91 GFLOPS | Progress: (12/20) | 16.39 s
[Task 9/25] Current/Best: 17.84/ 22.91 GFLOPS | Progress: (16/20) | 19.31 s
[Task 9/25] Current/Best: 8.92/ 22.91 GFLOPS | Progress: (20/20) | 28.06 s
[Task 10/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 10/25] Current/Best: 18.29/ 18.29 GFLOPS | Progress: (4/20) | 2.67 s
[Task 10/25] Current/Best: 15.70/ 18.29 GFLOPS | Progress: (8/20) | 4.35 s
[Task 10/25] Current/Best: 11.44/ 18.82 GFLOPS | Progress: (12/20) | 5.93 s
[Task 10/25] Current/Best: 19.08/ 20.63 GFLOPS | Progress: (16/20) | 7.06 s
[Task 10/25] Current/Best: 8.50/ 20.63 GFLOPS | Progress: (20/20
) | 8.65 s Done.
-
[Task 11/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 11/25] Current/Best: 10.79/ 18.09 GFLOPS | Progress: (4/20) | 3.45 s
[Task 11/25] Current/Best: 14.81/ 18.09 GFLOPS | Progress: (8/20) | 6.28 s
[Task 11/25] Current/Best: 15.93/ 18.09 GFLOPS | Progress: (12/20) | 8.39 s
[Task 11/25] Current/Best: 11.73/ 20.59 GFLOPS | Progress: (16/20) | 11.41 s
[Task 11/25] Current/Best: 18.46/ 20.59 GFLOPS | Progress: (20/20) | 13.56 s Done.
-
[Task 12/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 12/25] Current/Best: 7.74/ 17.82 GFLOPS | Progress: (4/20) | 5.86 s
[Task 12/25] Current/Best: 4.96/ 17.82 GFLOPS | Progress: (8/20) | 9.92 s
[Task 12/25] Current/Best: 19.03/ 19.03 GFLOPS | Progress: (12/20) | 11.95 s
[Task 12/25] Current/Best: 14.26/ 19.03 GFLOPS | Progress: (16/20) | 14.98 s
[Task 12/25] Current/Best: 15.16/ 19.03 GFLOPS | Progress: (20/20) | 16.94 s Done.
-
[Task 13/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 13/25] Current/Best: 8.33/ 17.27 GFLOPS | Progress: (4/20) | 3.91 s
[Task 13/25] Current/Best: 14.54/ 20.59 GFLOPS | Progress: (8/20) | 6.56 s
[Task 13/25] Current/Best: 18.57/ 21.09 GFLOPS | Progress: (12/20) | 9.69 s
[Task 13/25] Current/Best: 12.18/ 21.09 GFLOPS | Progress: (16/20) | 13.20 s
[Task 13/25] Current/Best: 17.68/ 21.09 GFLOPS | Progress: (20/20) | 15.57 s Done.
-
[Task 14/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 14/25] Current/Best: 12.11/ 13.21 GFLOPS | Progress: (4/20) | 3.50 s
[Task 14/25] Current/Best: 6.00/ 13.21 GFLOPS | Progress: (8/20) | 5.73 s
[Task 14/25] Current/Best: 19.44/ 19.44 GFLOPS | Progress: (12/20) | 8.44 s
[Task 14/25] Current/Best: 15.87/ 19.44 GFLOPS | Progress: (16/20) | 10.10 s Done.
-
[Task 14/25] Current/Best: 16.82/ 19.44 GFLOPS | Progress: (20/20) | 11.88 s
[Task 15/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 15/25] Current/Best: 15.44/ 17.27 GFLOPS | Progress: (4/20) | 2.78 s
[Task 15/25] Current/Best: 12.63/ 17.76 GFLOPS | Progress: (8/20) | 4.15 s
[Task 15/25] Current/Best: 9.87/ 21.63 GFLOPS | Progress: (12/20) | 6.41 s
[Task 15/25] Current/Best: 19.81/ 21.63 GFLOPS | Progress: (16/20) | 10.22 s
[Task 15/25] Current/Best: 9.49/ 21.63 GFLOPS | Progress: (20/20) | 11.25 s
[Task 16/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 16/25] Current/Best: 19.37/ 19.37 GFLOPS | Progress: (4/20) | 3.03 s
[Task 16/25] Current/Best: 3.03/ 19.37 GFLOPS | Progress: (8/20) | 4.66 s
[Task 16/25] Current/Best: 18.10/ 19.37 GFLOPS | Progress: (12/20) | 5.89 s
[Task 16/25] Current/Best: 17.76/ 19.37 GFLOPS | Progress: (16/20)
| 7.30 s
[Task 16/25] Current/Best: 9.81/ 20.51 GFLOPS | Progress: (20/20) | 9.48 s Done.
-
[Task 17/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 17/25] Current/Best: 13.23/ 16.06 GFLOPS | Progress: (4/20) | 4.91 s
[Task 17/25] Current/Best: 12.48/ 22.56 GFLOPS | Progress: (8/20) | 7.76 s
[Task 17/25] Current/Best: 16.43/ 22.56 GFLOPS | Progress: (12/20) | 9.89 s
[Task 17/25] Current/Best: 16.43/ 22.56 GFLOPS | Progress: (16/20) | 12.14 s
[Task 17/25] Current/Best: 9.96/ 22.56 GFLOPS | Progress: (20/20) | 14.33 s Done.
-
[Task 18/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 18/25] Current/Best: 10.09/ 16.80 GFLOPS | Progress: (4/20) | 3.92 s
[Task 18/25] Current/Best: 10.48/ 18.53 GFLOPS | Progress: (8/20) | 7.68 s
[Task 18/25] Current/Best: 18.22/ 18.53 GFLOPS | Progress: (12/20) | 9.66 s
[Task 18/25] Current/Best: 9.86/ 18.53 GFLOPS | Progress: (16/20) | 13.60 s
[Task 18/25] Current/Best: 20.52/ 20.52 GFLOPS | Progress: (20/20) | 15.16 s Done.
-
[Task 19/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 19/25] Current/Best: 6.98/ 19.68 GFLOPS | Progress: (4/20) | 6.35 s
[Task 19/25] Current/Best: 2.69/ 19.68 GFLOPS | Progress: (8/20) | 9.68 s
[Task 19/25] Current/Best: 18.35/ 20.20 GFLOPS | Progress: (12/20) | 12.66 s
[Task 19/25] Current/Best: 13.51/ 20.71 GFLOPS | Progress: (16/20) | 15.70 s
[Task 19/25] Current/Best: 2.69/ 22.09 GFLOPS | Progress: (20/20) | 18.55 s Done.
-
[Task 20/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 20/25] Current/Best: 9.03/ 15.12 GFLOPS | Progress: (4/20) | 3.39 s Done.
+
[Task 1/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 1/25] Current/Best: 17.58/ 17.58 GFLOPS | Progress: (4/20) | 6.30 s
[Task 1/25] Current/Best: 6.10/ 17.58 GFLOPS | Progress: (8/20) | 9.35 s
[Task 1/25] Current/Best: 11.25/ 21.79 GFLOPS | Progress: (12/20) | 11.85 s
[Task 1/25] Current/Best: 16.53/ 21.79 GFLOPS | Progress: (16/20) | 13.54 s
[Task 1/25] Current/Best: 11.20/ 23.62 GFLOPS | Progress: (20/20) | 15.32 s Done.
+
[Task 2/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 2/25] Current/Best: 12.21/ 12.50 GFLOPS | Progress: (4/20) | 3.89 s
[Task 2/25] Current/Best: 12.47/ 18.32 GFLOPS | Progress: (8/20) | 5.19 s
[Task 2/25] Current/Best: 20.66/ 20.66 GFLOPS | Progress: (12/20) | 6.50 s
[Task 2/25] Current/Best: 10.72/ 20.66 GFLOPS | Progress: (16/20) | 7.75 s
[Task 2/25] Current/Best: 18.86/ 20.66 GFLOPS | Progress: (20/20) | 9.34 s Done.
+
[Task 3/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 3/25] Current/Best: 1.63/ 10.18 GFLOPS | Progress: (4/20) | 5.88 s
[Task 3/25] Current/Best: 15.38/ 16.82 GFLOPS | Progress: (8/20) | 7.82 s
[Task 3/25] Current/Best: 14.99/ 16.82 GFLOPS | Progress: (12/20) | 9.55 s
[Task 3/25] Current/Best: 6.82/ 22.85 GFLOPS | Progress: (16/20) | 11.52 s
[Task 3/25] Current/Best: 11.08/ 22.85 GFLOPS | Progress: (20/20) | 16.16 s Done.
+
[Task 4/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 4/25] Current/Best: 8.87/ 17.07 GFLOPS | Progress: (4/20) | 2.44 s
[Task 4/25] Current/Best: 6.31/ 17.07 GFLOPS | Progress: (8/20) | 7.18 s
[Task 4/25] Current/Best: 20.72/ 20.72 GFLOPS | Progress: (12/20) | 12.07 s
[Task 4/25] Current/Best: 16.06/ 20.72 GFLOPS | Progress: (16/20) | 14.52 s
[Task 4/25] Current/Best: 12.82/ 20.72 GFLOPS | Progress: (20/20) | 16.48 s Done.
+
[Task 5/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 5/25] Current/Best: 8.95/ 9.67 GFLOPS | Progress: (4/20) | 2.62 s
[Task 5/25] Current/Best: 11.59/ 11.59 GFLOPS | Progress: (8/20) | 4.71 s
[Task 5/25] Current/Best: 11.38/ 18.04 GFLOPS | Progress: (12/20) | 7.93 s
[Task 5/25] Current/Best: 11.55/ 21.43 GFLOPS | Progress: (16/20) | 9.35 s
[Task 5/25] Current/Best: 12.17/ 21.43 GFLOPS | Progress: (20/20) | 11.25 s Done.
+
[Task 6/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 6/25] Current/Best: 11.60/ 20.01 GFLOPS | Progress: (4/20) | 4.17 s
[Task 6/25] Current/Best: 18.92/ 20.01 GFLOPS | Progress: (8/20) | 5.97 s
[Task 6/25] Current/Best: 13.27/ 20.01 GFLOPS | Progress: (12/20) | 7.98 s
[Task 6/25] Current/Best: 19.27/ 20.01 GFLOPS | Progress: (16/20) | 10.24 s
[Task 6/25] Current/Best: 3.72/ 20.01 GFLOPS | Progress: (20/20) | 12.84 s Done.
+
[Task 7/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 7/25] Current/Best: 9.78/ 12.13 GFLOPS | Progress: (4/20) | 3.71 s
[Task 7/25] Current/Best: 19.58/ 19.99 GFLOPS | Progress: (8/20) | 5.26 s
[Task 7/25] Current/Best: 15.68/ 19.99 GFLOPS | Progress: (12/20) | 7.20 s
[Task 7/25] Current/Best: 12.18/ 20.02 GFLOPS | Progress: (16/20) | 9.29 s
[Task 7/25] Current/Best: 6.08/ 20.44 GFLOPS | Progress: (20/20) | 11.79 s Done.
+
[Task 8/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 8/25] Current/Best: 9.71/ 13.59 GFLOPS | Progress: (4/20) | 2.97 s
[Task 8/25] Current/Best: 9.14/ 13.59 GFLOPS | Progress: (8/20) | 8.18 s
[Task 8/25] Current/Best: 12.77/ 13.59 GFLOPS | Progress: (12/20) | 14.70 s
[Task 8/25] Current/Best: 19.01/ 19.01 GFLOPS | Progress: (16/20) | 16.82 s
[Task 8/25] Current/Best: 19.22/ 19.22 GFLOPS | Progress: (20/20) | 23.98 s Done.
+
[Task 9/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 9/25] Current/Best: 14.28/ 14.28 GFLOPS | Progress: (4/20) | 11.98 s
[Task 9/25] Current/Best: 23.06/ 23.06 GFLOPS | Progress: (8/20) | 13.78 s
[Task 9/25] Current/Best: 8.00/ 23.06 GFLOPS | Progress: (12/20) | 16.36 s
[Task 9/25] Current/Best: 17.94/ 23.06 GFLOPS | Progress: (16/20) | 19.23 s
[Task 9/25] Current/Best: 9.10/ 23.06 GFLOPS | Progress: (20/20) | 27.90 s
[Task 10/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 10/25] Current/Best: 18.33/ 18.33 GFLOPS | Progress: (4/20) | 2.62 s
[Task 10/25] Current/Best: 15.68/ 18.33 GFLOPS | Progress: (8/20) | 4.28 s
[Task 10/25] Current/Best: 11.34/ 18.87 GFLOPS | Progress: (12/20) | 5.84 s
[Task 10/25] Current/Best: 18.86/ 20.39 GFLOPS | Progress: (16/20) | 6.95 s
[Task 10/25] Current/Best: 8.32/ 20.39 GFLOPS | Progress: (20/20
) | 8.50 s Done.
+
[Task 11/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 11/25] Current/Best: 11.02/ 18.31 GFLOPS | Progress: (4/20) | 3.41 s
[Task 11/25] Current/Best: 14.79/ 18.31 GFLOPS | Progress: (8/20) | 6.24 s
[Task 11/25] Current/Best: 15.95/ 18.31 GFLOPS | Progress: (12/20) | 8.36 s
[Task 11/25] Current/Best: 10.76/ 20.50 GFLOPS | Progress: (16/20) | 11.39 s
[Task 11/25] Current/Best: 18.08/ 20.50 GFLOPS | Progress: (20/20) | 13.53 s Done.
+
[Task 12/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 12/25] Current/Best: 7.75/ 17.96 GFLOPS | Progress: (4/20) | 5.80 s
[Task 12/25] Current/Best: 4.92/ 17.96 GFLOPS | Progress: (8/20) | 9.83 s
[Task 12/25] Current/Best: 18.79/ 18.79 GFLOPS | Progress: (12/20) | 11.84 s
[Task 12/25] Current/Best: 14.94/ 18.79 GFLOPS | Progress: (16/20) | 14.83 s
[Task 12/25] Current/Best: 15.07/ 18.79 GFLOPS | Progress: (20/20) | 16.79 s Done.
+
[Task 13/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 13/25] Current/Best: 8.52/ 17.36 GFLOPS | Progress: (4/20) | 3.80 s
[Task 13/25] Current/Best: 15.29/ 20.54 GFLOPS | Progress: (8/20) | 6.43 s
[Task 13/25] Current/Best: 18.83/ 21.44 GFLOPS | Progress: (12/20) | 9.49 s
[Task 13/25] Current/Best: 12.26/ 21.44 GFLOPS | Progress: (16/20) | 12.89 s
[Task 13/25] Current/Best: 17.74/ 21.44 GFLOPS | Progress: (20/20) | 15.30 s Done.
+
[Task 14/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 14/25] Current/Best: 12.14/ 13.35 GFLOPS | Progress: (4/20) | 3.42 s
[Task 14/25] Current/Best: 6.08/ 13.35 GFLOPS | Progress: (8/20) | 5.62 s
[Task 14/25] Current/Best: 19.78/ 19.78 GFLOPS | Progress: (12/20) | 8.34 s
[Task 14/25] Current/Best: 16.59/ 19.78 GFLOPS | Progress: (16/20) | 10.01 s Done.
+
[Task 14/25] Current/Best: 17.04/ 19.78 GFLOPS | Progress: (20/20) | 11.81 s
[Task 15/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 15/25] Current/Best: 15.60/ 17.25 GFLOPS | Progress: (4/20) | 2.72 s
[Task 15/25] Current/Best: 12.73/ 17.41 GFLOPS | Progress: (8/20) | 4.04 s
[Task 15/25] Current/Best: 10.03/ 21.28 GFLOPS | Progress: (12/20) | 6.30 s
[Task 15/25] Current/Best: 19.62/ 21.28 GFLOPS | Progress: (16/20) | 9.51 s
[Task 15/25] Current/Best: 9.35/ 21.28 GFLOPS | Progress: (20/20) | 10.54 s
[Task 16/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 16/25] Current/Best: 19.32/ 19.32 GFLOPS | Progress: (4/20) | 2.97 s
[Task 16/25] Current/Best: 3.03/ 19.32 GFLOPS | Progress: (8/20) | 4.59 s
[Task 16/25] Current/Best: 17.05/ 19.32 GFLOPS | Progress: (12/20) | 5.84 s
[Task 16/25] Current/Best: 18.03/ 19.32 GFLOPS | Progress: (16/20) |
7.21 s
[Task 16/25] Current/Best: 9.94/ 20.85 GFLOPS | Progress: (20/20) | 9.37 s Done.
+
[Task 17/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 17/25] Current/Best: 12.12/ 16.08 GFLOPS | Progress: (4/20) | 4.86 s
[Task 17/25] Current/Best: 12.67/ 22.92 GFLOPS | Progress: (8/20) | 7.78 s
[Task 17/25] Current/Best: 16.44/ 22.92 GFLOPS | Progress: (12/20) | 9.88 s
[Task 17/25] Current/Best: 16.50/ 22.92 GFLOPS | Progress: (16/20) | 12.11 s
[Task 17/25] Current/Best: 10.00/ 22.92 GFLOPS | Progress: (20/20) | 14.26 s Done.
+
[Task 18/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 18/25] Current/Best: 10.84/ 17.02 GFLOPS | Progress: (4/20) | 3.85 s
[Task 18/25] Current/Best: 10.57/ 18.74 GFLOPS | Progress: (8/20) | 7.58 s
[Task 18/25] Current/Best: 18.91/ 18.91 GFLOPS | Progress: (12/20) | 9.54 s
[Task 18/25] Current/Best: 10.22/ 18.91 GFLOPS | Progress: (16/20) | 13.45 s
[Task 18/25] Current/Best: 20.72/ 20.72 GFLOPS | Progress: (20/20) | 14.98 s Done.
+
[Task 19/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 19/25] Current/Best: 7.18/ 19.89 GFLOPS | Progress: (4/20) | 6.10 s
[Task 19/25] Current/Best: 2.69/ 19.89 GFLOPS | Progress: (8/20) | 9.45 s
[Task 19/25] Current/Best: 19.07/ 20.44 GFLOPS | Progress: (12/20) | 12.50 s
[Task 19/25] Current/Best: 12.80/ 21.00 GFLOPS | Progress: (16/20) | 15.60 s
[Task 19/25] Current/Best: 2.69/ 22.69 GFLOPS | Progress: (20/20) | 18.44 s Done.
+
[Task 20/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 20/25] Current/Best: 8.76/ 14.47 GFLOPS | Progress: (4/20) | 3.37 s Done.
Done.
-
[Task 20/25] Current/Best: 9.56/ 15.12 GFLOPS | Progress: (8/20) | 7.04 s
[Task 20/25] Current/Best: 2.33/ 15.12 GFLOPS | Progress: (12/20) | 11.05 s
[Task 20/25] Current/Best: 10.94/ 15.12 GFLOPS | Progress: (16/20) | 14.90 s
[Task 20/25] Current/Best: 11.87/ 21.38 GFLOPS | Progress: (20/20) | 17.02 s
[Task 21/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 21/25] Current/Best: 6.33/ 17.58 GFLOPS | Progress: (4/20) | 3.34 s
[Task 21/25] Current/Best: 14.59/ 17.58 GFLOPS | Progress: (8/20) | 4.99 s
[Task 21/25] Current/Best: 1.61/ 17.58 GFLOPS | Progress: (12/20) | 7.15 s
[Task 21/25] Current/Best: 15.99/ 17.58 GFLOPS | Progress: (16/20) | 10.71 s
[Task 21/25] Current/Best: 4.45/ 17.58 GFLOPS | Progress: (20/20) | 18.14 s
[Task 22/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 22/25] Current/Best: 2.70/ 16.24 GFLOPS | Progress: (4/20
) | 2.77 s
[Task 22/25] Current/Best: 9.06/ 20.08 GFLOPS | Progress: (8/20) | 4.83 s
[Task 22/25] Current/Best: 19.63/ 20.08 GFLOPS | Progress: (12/20) | 7.23 s
[Task 22/25] Current/Best: 14.90/ 20.08 GFLOPS | Progress: (16/20) | 9.36 s
[Task 22/25] Current/Best: 13.09/ 20.08 GFLOPS | Progress: (20/20) | 11.15 s Done.
-
[Task 23/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 23/25] Current/Best: 16.43/ 19.48 GFLOPS | Progress: (4/20) | 3.34 s
[Task 23/25] Current/Best: 14.09/ 19.84 GFLOPS | Progress: (8/20) | 6.86 s
[Task 23/25] Current/Best: 20.46/ 21.22 GFLOPS | Progress: (12/20) | 8.72 s
[Task 23/25] Current/Best: 6.53/ 21.22 GFLOPS | Progress: (16/20) | 15.83 s
[Task 23/25] Current/Best: 7.36/ 21.22 GFLOPS | Progress: (20/20) | 20.14 s Done.
-
[Task 24/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 24/25] Current/Best: 8.00/ 8.00 GFLOPS | Progress: (4/20) | 11.87 s
[Task 24/25] Current/Best: 3.03/ 8.00 GFLOPS | Progress: (8/20) | 23.17 s
[Task 24/25] Current/Best: 3.96/ 8.00 GFLOPS | Progress: (12/20) | 33.91 s Done.
-
[Task 24/25] Current/Best: 5.51/ 8.70 GFLOPS | Progress: (16/20) | 39.65 s
[Task 24/25] Current/Best: 2.95/ 8.70 GFLOPS | Progress: (20/20) | 45.66 s Done.
-
[Task 25/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 25/25] Current/Best: 1.55/ 2.75 GFLOPS | Progress: (4/20) | 11.65 s
[Task 25/25] Current/Best: 5.76/ 7.63 GFLOPS | Progress: (8/20) | 23.00 s
[Task 25/25] Current/Best: 5.91/ 7.63 GFLOPS | Progress: (12/20) | 34.33 s
[Task 25/25] Current/Best: 5.75/ 8.68 GFLOPS | Progress: (16/20) | 36.19 s
[Task 25/25] Current/Best: 2.89/ 8.68 GFLOPS | Progress: (20/20) | 46.87 s
+
[Task 20/25] Current/Best: 9.71/ 14.47 GFLOPS | Progress: (8/20) | 6.80 s
[Task 20/25] Current/Best: 2.32/ 14.49 GFLOPS | Progress: (12/20) | 10.75 s
[Task 20/25] Current/Best: 11.08/ 14.49 GFLOPS | Progress: (16/20) | 14.65 s
[Task 20/25] Current/Best: 11.31/ 21.98 GFLOPS | Progress: (20/20) | 16.77 s
[Task 21/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 21/25] Current/Best: 6.37/ 17.72 GFLOPS | Progress: (4/20) | 3.30 s
[Task 21/25] Current/Best: 14.60/ 17.72 GFLOPS | Progress: (8/20) | 4.92 s
[Task 21/25] Current/Best: 1.61/ 17.72 GFLOPS | Progress: (12/20) | 7.07 s
[Task 21/25] Current/Best: 16.06/ 17.72 GFLOPS | Progress: (16/20) | 10.59 s
[Task 21/25] Current/Best: 4.45/ 17.72 GFLOPS | Progress: (20/20) | 17.91 s
[Task 22/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 22/25] Current/Best: 2.70/ 16.87 GFLOPS | Progress: (4/20
) | 2.69 s
[Task 22/25] Current/Best: 8.83/ 21.34 GFLOPS | Progress: (8/20) | 4.73 s
[Task 22/25] Current/Best: 19.91/ 21.34 GFLOPS | Progress: (12/20) | 7.10 s
[Task 22/25] Current/Best: 15.45/ 21.34 GFLOPS | Progress: (16/20) | 9.22 s
[Task 22/25] Current/Best: 12.31/ 21.34 GFLOPS | Progress: (20/20) | 10.92 s Done.
+
[Task 23/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 23/25] Current/Best: 16.70/ 20.02 GFLOPS | Progress: (4/20) | 3.33 s
[Task 23/25] Current/Best: 13.08/ 20.02 GFLOPS | Progress: (8/20) | 6.71 s
[Task 23/25] Current/Best: 20.38/ 21.79 GFLOPS | Progress: (12/20) | 8.54 s
[Task 23/25] Current/Best: 6.59/ 21.79 GFLOPS | Progress: (16/20) | 15.64 s
[Task 23/25] Current/Best: 7.83/ 21.79 GFLOPS | Progress: (20/20) | 19.85 s Done.
+
[Task 24/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 24/25] Current/Best: 8.48/ 8.48 GFLOPS | Progress: (4/20) | 11.79 s
[Task 24/25] Current/Best: 3.40/ 8.48 GFLOPS | Progress: (8/20) | 23.04 s
[Task 24/25] Current/Best: 3.95/ 8.48 GFLOPS | Progress: (12/20) | 33.75 s Done.
+
[Task 24/25] Current/Best: 5.37/ 8.64 GFLOPS | Progress: (16/20) | 39.38 s
[Task 24/25] Current/Best: 3.03/ 8.64 GFLOPS | Progress: (20/20) | 45.34 s Done.
+
[Task 25/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 25/25] Current/Best: 1.55/ 2.69 GFLOPS | Progress: (4/20) | 11.59 s
[Task 25/25] Current/Best: 5.67/ 8.07 GFLOPS | Progress: (8/20) | 22.88 s
[Task 25/25] Current/Best: 6.01/ 8.07 GFLOPS | Progress: (12/20) | 34.17 s
[Task 25/25] Current/Best: 5.77/ 8.71 GFLOPS | Progress: (16/20) | 36.02 s
[Task 25/25] Current/Best: 2.81/ 8.81 GFLOPS | Progress: (20/20) | 46.70 s
@@ -679,8 +679,8 @@ Verify that the optimized model runs and produces the same results:
.. code-block:: none
- class='n02123045 tabby, tabby cat' with probability=0.621105
- class='n02123159 tiger cat' with probability=0.356377
+ class='n02123045 tabby, tabby cat' with probability=0.621104
+ class='n02123159 tiger cat' with probability=0.356378
class='n02124075 Egyptian cat' with probability=0.019712
class='n02129604 tiger, Panthera tigris' with probability=0.001215
class='n04040759 radiator' with probability=0.000262
@@ -737,8 +737,8 @@ improvement in comparing the optimized model to the unoptimized model.
.. code-block:: none
- optimized: {'mean': 409.10631044000183, 'median': 409.22197365000557, 'std': 0.5363085231038565}
- unoptimized: {'mean': 515.597588089995, 'median': 515.9454214000107, 'std': 2.1741997976212493}
+ optimized: {'mean': 409.8265123199917, 'median': 409.7755659999848, 'std': 0.7498528601946826}
+ unoptimized: {'mean': 514.8496878599917, 'median': 514.2742650999935, 'std': 2.569928785174047}
@@ -761,7 +761,7 @@ profiling/benchmarking.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 10 minutes 31.533 seconds)
+ **Total running time of the script:** ( 10 minutes 25.503 seconds)
.. _sphx_glr_download_tutorial_autotvm_relay_x86.py:
diff --git a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
index 6429926656..aad39ef639 100644
--- a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
+++ b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
@@ -282,7 +282,7 @@ device and returns the measured cost. Network overhead is excluded.
.. code-block:: none
- 1.254e-07 secs/op
+ 1.261e-07 secs/op
diff --git a/docs/_sources/tutorial/intro_topi.rst.txt b/docs/_sources/tutorial/intro_topi.rst.txt
index 8e1be39edf..208daec539 100644
--- a/docs/_sources/tutorial/intro_topi.rst.txt
+++ b/docs/_sources/tutorial/intro_topi.rst.txt
@@ -263,7 +263,7 @@ As you can see, scheduled stages of computation have been accumulated and we can
.. code-block:: none
- [stage(a, placeholder(a, 0xb0f4f10)), stage(b, placeholder(b, 0x166b3fa0)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min [...]
+ [stage(a, placeholder(a, 0x2026cf00)), stage(b, placeholder(b, 0xcbfdb50)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min [...]
diff --git a/docs/_sources/tutorial/sg_execution_times.rst.txt b/docs/_sources/tutorial/sg_execution_times.rst.txt
index 7cc556d857..989d2154da 100644
--- a/docs/_sources/tutorial/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorial/sg_execution_times.rst.txt
@@ -5,32 +5,32 @@
Computation times
=================
-**13:42.642** total execution time for **tutorial** files:
+**13:17.572** total execution time for **tutorial** files:
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``) | 10:31.533 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``) | 10:25.503 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 01:12.747 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``) | 01:00.936 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``) | 01:00.097 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 00:55.062 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``) | 00:31.354 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``) | 00:30.691 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``) | 00:25.523 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``) | 00:24.020 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``) | 00:00.702 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``) | 00:00.699 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``) | 00:00.512 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``) | 00:00.510 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.165 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.143 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorial_introduction.py` (``introduction.py``) | 00:00.005 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_uma.py` (``uma.py``) | 00:00.002 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_uma.py` (``uma.py``) | 00:00.001 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorial_tvmc_command_line_driver.py` (``tvmc_command_line_driver.py``) | 00:00.001 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_install.py` (``install.py``) | 00:00.001 | 0.0 MB |
-+------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorial_tvmc_python.py` (``tvmc_python.py``) | 00:00.001 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_tutorial_install.py` (``install.py``) | 00:00.001 | 0.0 MB |
++------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
index 7a875f35b7..1fa41dbfef 100644
--- a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
+++ b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
@@ -501,10 +501,10 @@ We can now compare the different schedules
.. code-block:: none
Operator Timing Performance
- numpy 7.388040003206697e-06 1.0
- naive 6.7988e-06 0.9202440697463815
- parallel 6.9268e-06 0.9375693684649108
- vector 2.45162e-05 3.318363190962558
+ numpy 7.242370002131793e-06 1.0
+ naive 6.6874e-06 0.9233717689142591
+ parallel 6.9318e-06 0.9571176283398423
+ vector 2.46326e-05 3.4011794471629297
@@ -925,7 +925,7 @@ matrix multiplication.
.. code-block:: none
- Numpy running time: 0.018607
+ Numpy running time: 0.018218
@@ -983,7 +983,7 @@ optimizations.
.. code-block:: none
- none: 3.355525
+ none: 3.426029
@@ -1086,7 +1086,7 @@ schedule.
.. code-block:: none
- blocking: 0.301127
+ blocking: 0.294858
@@ -1182,7 +1182,7 @@ already cache friendly from our previous optimizations.
.. code-block:: none
- vectorization: 0.339047
+ vectorization: 0.336071
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1256,7 +1256,7 @@ more cache friendly.
.. code-block:: none
- loop permutation: 0.116686
+ loop permutation: 0.116965
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1355,7 +1355,7 @@ optimized schedule.
.. code-block:: none
- array packing: 0.108153
+ array packing: 0.109596
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1448,7 +1448,7 @@ to `C` when all the block results are ready.
.. code-block:: none
- block caching: 0.109207
+ block caching: 0.110568
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1534,7 +1534,7 @@ of thread-level parallelization.
.. code-block:: none
- parallelization: 0.144737
+ parallelization: 0.146289
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1615,13 +1615,13 @@ working, we can compare the results.
.. code-block:: none
Operator Timing Performance
- none 3.3555254243 1.0
- blocking 0.30112654 0.08974050317703026
- vectorization 0.3390466889 0.10104131127861411
- loop permutation 0.11668559240000001 0.03477416429480397
- array packing 0.1081529775 0.03223130920623613
- block caching 0.10920717119999998 0.032545475712728894
- parallelization 0.14473724059999998 0.043134002070687266
+ none 3.4260294499000006 1.0
+ blocking 0.294857939 0.08606404098733196
+ vectorization 0.336071095 0.09809346356021817
+ loop permutation 0.11696527539999999 0.034140183880618416
+ array packing 0.1095958199 0.03198916457160019
+ block caching 0.11056752980000002 0.032272790242135044
+ parallelization 0.146289113 0.042699315677006196
@@ -1663,7 +1663,7 @@ the computation for specific platforms.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 0.097 seconds)
+ **Total running time of the script:** ( 1 minutes 0.936 seconds)
.. _sphx_glr_download_tutorial_tensor_expr_get_started.py:
diff --git a/docs/commit_hash b/docs/commit_hash
index 0a6abf34f1..ca60407eb2 100644
--- a/docs/commit_hash
+++ b/docs/commit_hash
@@ -1 +1 @@
-f9b692765adf19a2bd3e5cf7abab8c1c74714f81
+a75dcabd3f5306ed1c792c0877becab219004ed8
diff --git a/docs/how_to/compile_models/from_darknet.html b/docs/how_to/compile_models/from_darknet.html
index 46904af650..936773f127 100644
--- a/docs/how_to/compile_models/from_darknet.html
+++ b/docs/how_to/compile_models/from_darknet.html
@@ -572,7 +572,7 @@ class:['truck 0.9266'] left:471 top:83 right:689 bottom:169
class:['bicycle 0.9984'] left:111 top:113 right:577 bottom:447
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 4.869 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 4.971 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-darknet-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7716f96385bd5abb6e822041e285be54/from_darknet.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_darknet.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/from_keras.html b/docs/how_to/compile_models/from_keras.html
index 008fb6d422..04c9903bcc 100644
--- a/docs/how_to/compile_models/from_keras.html
+++ b/docs/how_to/compile_models/from_keras.html
@@ -493,7 +493,7 @@ pip install -U tensorflow --user
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Relay top-1 id: 285, class name: Egyptian cat
1/1 [==============================] - ETA: 0s
-1/1 [==============================] - 1s 960ms/step
+1/1 [==============================] - 1s 948ms/step
Keras top-1 id: 285, class name: Egyptian cat
</pre></div>
</div>
diff --git a/docs/how_to/compile_models/from_mxnet.html b/docs/how_to/compile_models/from_mxnet.html
index 5abdebedb0..15c3871590 100644
--- a/docs/how_to/compile_models/from_mxnet.html
+++ b/docs/how_to/compile_models/from_mxnet.html
@@ -427,7 +427,7 @@ to download the full example code</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"x"</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#tuple" title="builtins.tuple" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">x</span><span class="o">.</span><span class="n">shape</span></a><span class="p">)</span>
</pre></div>
</div>
-<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zipf6e85a27-255e-48c3-be6b-71655e6a85f9 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zipf899b4c2-08e5-4b21-98e5-f645fe0875be from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
x (1, 3, 224, 224)
</pre></div>
</div>
diff --git a/docs/how_to/compile_models/from_oneflow.html b/docs/how_to/compile_models/from_oneflow.html
index 912fb370c3..a4bb84cd72 100644
--- a/docs/how_to/compile_models/from_oneflow.html
+++ b/docs/how_to/compile_models/from_oneflow.html
@@ -435,13 +435,14 @@ Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdo
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip" to /workspace/.oneflow/flowvision_cache/resnet18.zip
0%| | 0.00/41.5M [00:00<?, ?B/s]
- 15%|#5 | 6.33M/41.5M [00:00<00:00, 45.5MB/s]
- 26%|##5 | 10.7M/41.5M [00:00<00:01, 28.2MB/s]
- 39%|###8 | 16.0M/41.5M [00:00<00:00, 33.4MB/s]
- 58%|#####7 | 24.0M/41.5M [00:00<00:00, 38.9MB/s]
- 77%|#######7 | 32.0M/41.5M [00:00<00:00, 45.4MB/s]
- 92%|#########2| 38.3M/41.5M [00:01<00:00, 35.8MB/s]
-100%|##########| 41.5M/41.5M [00:01<00:00, 36.8MB/s]
+ 15%|#5 | 6.33M/41.5M [00:00<00:01, 32.0MB/s]
+ 23%|##2 | 9.38M/41.5M [00:00<00:01, 27.7MB/s]
+ 36%|###6 | 15.0M/41.5M [00:00<00:00, 38.1MB/s]
+ 47%|####6 | 19.3M/41.5M [00:00<00:00, 40.4MB/s]
+ 58%|#####7 | 24.0M/41.5M [00:00<00:00, 32.7MB/s]
+ 77%|#######7 | 32.0M/41.5M [00:00<00:00, 40.7MB/s]
+ 92%|#########2| 38.3M/41.5M [00:01<00:00, 36.1MB/s]
+100%|##########| 41.5M/41.5M [00:01<00:00, 36.6MB/s]
</pre></div>
</div>
</div>
diff --git a/docs/how_to/compile_models/from_pytorch.html b/docs/how_to/compile_models/from_pytorch.html
index 298d017005..661b4f5ccb 100644
--- a/docs/how_to/compile_models/from_pytorch.html
+++ b/docs/how_to/compile_models/from_pytorch.html
@@ -414,8 +414,8 @@ be unstable.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
0%| | 0.00/44.7M [00:00<?, ?B/s]
- 32%|###1 | 14.1M/44.7M [00:00<00:00, 148MB/s]
- 89%|########9 | 39.9M/44.7M [00:00<00:00, 220MB/s]
+ 42%|####2 | 18.9M/44.7M [00:00<00:00, 198MB/s]
+ 89%|########8 | 39.6M/44.7M [00:00<00:00, 209MB/s]
100%|##########| 44.7M/44.7M [00:00<00:00, 208MB/s]
</pre></div>
</div>
diff --git a/docs/how_to/compile_models/from_tensorflow.html b/docs/how_to/compile_models/from_tensorflow.html
index 9f9b6194c8..c5f1f98230 100644
--- a/docs/how_to/compile_models/from_tensorflow.html
+++ b/docs/how_to/compile_models/from_tensorflow.html
@@ -632,7 +632,7 @@ banana (score = 0.00022)
desk (score = 0.00019)
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 4.131 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 8.186 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-tensorflow-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7f1d3d1b878694c201c614c807cdebc8/from_tensorflow.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_tensorflow.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/sg_execution_times.html b/docs/how_to/compile_models/sg_execution_times.html
index e87e30e0f9..99674b98e0 100644
--- a/docs/how_to/compile_models/sg_execution_times.html
+++ b/docs/how_to/compile_models/sg_execution_times.html
@@ -327,7 +327,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-compile-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>05:10.376</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
+<p><strong>05:13.283</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 81%" />
@@ -335,44 +335,44 @@
<col style="width: 8%" />
</colgroup>
<tbody>
-<tr class="row-odd"><td><p><a class="reference internal" href="from_darknet.html#sphx-glr-how-to-compile-models-from-darknet-py"><span class="std std-ref">Compile YOLO-V2 and YOLO-V3 in DarkNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_darknet.py</span></code>)</p></td>
-<td><p>01:04.869</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="from_tensorflow.html#sphx-glr-how-to-compile-models-from-tensorflow-py"><span class="std std-ref">Compile Tensorflow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tensorflow.py</span></code>)</p></td>
+<td><p>01:08.186</p></td>
<td><p>0.0 MB</p></td>
</tr>
-<tr class="row-even"><td><p><a class="reference internal" href="from_tensorflow.html#sphx-glr-how-to-compile-models-from-tensorflow-py"><span class="std std-ref">Compile Tensorflow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tensorflow.py</span></code>)</p></td>
-<td><p>01:04.131</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="from_darknet.html#sphx-glr-how-to-compile-models-from-darknet-py"><span class="std std-ref">Compile YOLO-V2 and YOLO-V3 in DarkNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_darknet.py</span></code>)</p></td>
+<td><p>01:04.971</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="from_paddle.html#sphx-glr-how-to-compile-models-from-paddle-py"><span class="std std-ref">Compile PaddlePaddle Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_paddle.py</span></code>)</p></td>
-<td><p>00:39.520</p></td>
+<td><p>00:39.256</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_oneflow.html#sphx-glr-how-to-compile-models-from-oneflow-py"><span class="std std-ref">Compile OneFlow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_oneflow.py</span></code>)</p></td>
-<td><p>00:28.548</p></td>
+<td><p>00:28.430</p></td>
<td><p>0.0 MB</p></td>
</tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="from_tflite.html#sphx-glr-how-to-compile-models-from-tflite-py"><span class="std std-ref">Compile TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tflite.py</span></code>)</p></td>
-<td><p>00:25.779</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="from_mxnet.html#sphx-glr-how-to-compile-models-from-mxnet-py"><span class="std std-ref">Compile MXNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_mxnet.py</span></code>)</p></td>
+<td><p>00:26.136</p></td>
<td><p>0.0 MB</p></td>
</tr>
-<tr class="row-even"><td><p><a class="reference internal" href="from_mxnet.html#sphx-glr-how-to-compile-models-from-mxnet-py"><span class="std std-ref">Compile MXNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_mxnet.py</span></code>)</p></td>
-<td><p>00:25.558</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="from_tflite.html#sphx-glr-how-to-compile-models-from-tflite-py"><span class="std std-ref">Compile TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tflite.py</span></code>)</p></td>
+<td><p>00:25.930</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="from_coreml.html#sphx-glr-how-to-compile-models-from-coreml-py"><span class="std std-ref">Compile CoreML Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_coreml.py</span></code>)</p></td>
-<td><p>00:22.119</p></td>
+<td><p>00:21.534</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_pytorch.html#sphx-glr-how-to-compile-models-from-pytorch-py"><span class="std std-ref">Compile PyTorch Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_pytorch.py</span></code>)</p></td>
-<td><p>00:20.047</p></td>
+<td><p>00:19.789</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="from_keras.html#sphx-glr-how-to-compile-models-from-keras-py"><span class="std std-ref">Compile Keras Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_keras.py</span></code>)</p></td>
-<td><p>00:17.303</p></td>
+<td><p>00:16.648</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_onnx.html#sphx-glr-how-to-compile-models-from-onnx-py"><span class="std std-ref">Compile ONNX Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_onnx.py</span></code>)</p></td>
-<td><p>00:02.502</p></td>
+<td><p>00:02.405</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/deploy_models/deploy_model_on_android.html b/docs/how_to/deploy_models/deploy_model_on_android.html
index a4cafcc92e..fc41c33d00 100644
--- a/docs/how_to/deploy_models/deploy_model_on_android.html
+++ b/docs/how_to/deploy_models/deploy_model_on_android.html
@@ -649,7 +649,7 @@ to the remote android device.</p>
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 15.8558 15.7675 16.3276 15.6436 0.2005
+ 15.5949 15.6058 15.7256 15.4827 0.0880
</pre></div>
</div>
</div>
diff --git a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
index 364c7823c8..b56f462fd7 100644
--- a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
+++ b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
@@ -436,17 +436,52 @@ be unstable.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
0%| | 0.00/170M [00:00<?, ?B/s]
- 6%|5 | 9.69M/170M [00:00<00:01, 101MB/s]
- 15%|#5 | 26.1M/170M [00:00<00:01, 143MB/s]
- 25%|##5 | 42.8M/170M [00:00<00:00, 158MB/s]
- 35%|###5 | 60.0M/170M [00:00<00:00, 166MB/s]
- 45%|####5 | 77.1M/170M [00:00<00:00, 171MB/s]
- 56%|#####5 | 94.7M/170M [00:00<00:00, 176MB/s]
- 66%|######6 | 112M/170M [00:00<00:00, 179MB/s]
- 77%|#######6 | 130M/170M [00:00<00:00, 182MB/s]
- 88%|########7 | 149M/170M [00:00<00:00, 185MB/s]
- 99%|#########8| 168M/170M [00:01<00:00, 189MB/s]
-100%|##########| 170M/170M [00:01<00:00, 176MB/s]
+ 2%|2 | 3.73M/170M [00:00<00:04, 39.2MB/s]
+ 5%|5 | 8.69M/170M [00:00<00:03, 45.4MB/s]
+ 8%|7 | 13.0M/170M [00:00<00:04, 36.0MB/s]
+ 10%|# | 17.1M/170M [00:00<00:04, 38.3MB/s]
+ 13%|#3 | 22.6M/170M [00:00<00:03, 44.3MB/s]
+ 16%|#5 | 26.9M/170M [00:00<00:03, 39.3MB/s]
+ 18%|#8 | 30.9M/170M [00:00<00:04, 30.0MB/s]
+ 20%|## | 34.1M/170M [00:01<00:04, 30.1MB/s]
+ 22%|##2 | 37.7M/170M [00:01<00:04, 31.9MB/s]
+ 25%|##4 | 41.7M/170M [00:01<00:03, 34.4MB/s]
+ 27%|##6 | 45.2M/170M [00:01<00:04, 30.0MB/s]
+ 28%|##8 | 48.2M/170M [00:01<00:04, 28.6MB/s]
+ 31%|### | 51.8M/170M [00:01<00:04, 30.6MB/s]
+ 33%|###3 | 56.1M/170M [00:01<00:03, 34.3MB/s]
+ 35%|###5 | 59.5M/170M [00:01<00:03, 32.8MB/s]
+ 38%|###7 | 63.9M/170M [00:01<00:03, 35.2MB/s]
+ 40%|#### | 68.0M/170M [00:02<00:02, 37.1MB/s]
+ 42%|####2 | 71.6M/170M [00:02<00:03, 28.3MB/s]
+ 44%|####4 | 75.2M/170M [00:02<00:03, 30.5MB/s]
+ 46%|####6 | 78.4M/170M [00:02<00:03, 30.8MB/s]
+ 48%|####8 | 81.6M/170M [00:02<00:02, 31.5MB/s]
+ 50%|####9 | 84.8M/170M [00:02<00:02, 31.6MB/s]
+ 52%|#####1 | 87.9M/170M [00:02<00:02, 31.7MB/s]
+ 54%|#####4 | 92.2M/170M [00:02<00:02, 35.4MB/s]
+ 56%|#####6 | 95.7M/170M [00:03<00:02, 32.7MB/s]
+ 59%|#####8 | 99.4M/170M [00:03<00:02, 34.4MB/s]
+ 61%|###### | 103M/170M [00:03<00:01, 36.3MB/s]
+ 63%|######2 | 107M/170M [00:03<00:02, 32.3MB/s]
+ 65%|######4 | 110M/170M [00:03<00:02, 29.6MB/s]
+ 68%|######7 | 115M/170M [00:03<00:01, 34.8MB/s]
+ 70%|######9 | 118M/170M [00:03<00:01, 34.2MB/s]
+ 73%|#######2 | 124M/170M [00:03<00:01, 40.3MB/s]
+ 75%|#######5 | 128M/170M [00:03<00:01, 38.3MB/s]
+ 77%|#######7 | 131M/170M [00:04<00:01, 34.6MB/s]
+ 79%|#######9 | 135M/170M [00:04<00:01, 28.5MB/s]
+ 82%|########1 | 139M/170M [00:04<00:01, 31.3MB/s]
+ 85%|########4 | 144M/170M [00:04<00:00, 37.9MB/s]
+ 87%|########7 | 148M/170M [00:04<00:00, 39.2MB/s]
+ 90%|########9 | 152M/170M [00:04<00:00, 34.9MB/s]
+ 92%|#########1| 156M/170M [00:04<00:00, 31.4MB/s]
+ 94%|#########3| 159M/170M [00:05<00:00, 26.7MB/s]
+ 95%|#########5| 162M/170M [00:05<00:00, 23.8MB/s]
+ 97%|#########6| 164M/170M [00:05<00:00, 23.9MB/s]
+ 98%|#########8| 167M/170M [00:05<00:00, 21.1MB/s]
+ 99%|#########9| 169M/170M [00:05<00:00, 20.3MB/s]
+100%|##########| 170M/170M [00:05<00:00, 31.5MB/s]
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:3878: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
for i in range(dim)
/usr/local/lib/python3.7/dist-packages/torchvision/models/detection/anchor_utils.py:127: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
@@ -540,7 +575,7 @@ torchvision rcnn models.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Get 9 valid boxes
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes 2.758 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 59.029 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-object-detection-pytorch-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7795da4b258c8feff986668b95ef57ad/deploy_object_detection_pytorch.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_object_detection_pytorch.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized.html b/docs/how_to/deploy_models/deploy_prequantized.html
index 2d01ca5b3c..9b9530014b 100644
--- a/docs/how_to/deploy_models/deploy_prequantized.html
+++ b/docs/how_to/deploy_models/deploy_prequantized.html
@@ -480,8 +480,9 @@ training. Other models require a full post training calibration.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
0%| | 0.00/13.6M [00:00<?, ?B/s]
- 67%|######6 | 9.06M/13.6M [00:00<00:00, 93.7MB/s]
-100%|##########| 13.6M/13.6M [00:00<00:00, 108MB/s]
+ 25%|##4 | 3.34M/13.6M [00:00<00:00, 34.8MB/s]
+ 49%|####9 | 6.66M/13.6M [00:00<00:00, 34.3MB/s]
+100%|##########| 13.6M/13.6M [00:00<00:00, 56.4MB/s]
</pre></div>
</div>
</div>
@@ -566,7 +567,7 @@ output values are identical out of 1000 outputs from mobilenet v2.</p>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 89.4734 89.3994 91.1405 89.0812 0.2872
+ 90.3495 90.2487 95.1589 90.0888 0.5181
</pre></div>
</div>
<div class="admonition note">
@@ -605,7 +606,7 @@ This includes support for the VNNI 8 bit dot product instruction (CascadeLake or
<div class="section" id="deploy-a-quantized-tflite-model">
<h2>Deploy a quantized TFLite Model<a class="headerlink" href="#deploy-a-quantized-tflite-model" title="Permalink to this headline">¶</a></h2>
<p>TODO</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 8.869 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 9.324 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/fb8217c13f4351224c6cf3aacf1a87fc/deploy_prequantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized_tflite.html b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
index 4e7603f681..28ee990f3d 100644
--- a/docs/how_to/deploy_models/deploy_prequantized_tflite.html
+++ b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
@@ -569,7 +569,7 @@ TFLite Top-5 labels: [387 102 386 341 349]
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 119.9012 119.8843 121.1959 119.0577 0.3285
+ 120.1116 120.0455 126.2227 119.3651 0.7084
</pre></div>
</div>
<div class="admonition note">
@@ -597,7 +597,7 @@ network for ARM CPU</span></a>.</p></li>
</ul>
</div></blockquote>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 57.841 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 57.140 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-tflite-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/56691c7a27d45da61d112276334640d3/deploy_prequantized_tflite.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized_tflite.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_quantized.html b/docs/how_to/deploy_models/deploy_quantized.html
index 15e46ca5e2..c93370d517 100644
--- a/docs/how_to/deploy_models/deploy_quantized.html
+++ b/docs/how_to/deploy_models/deploy_quantized.html
@@ -507,7 +507,7 @@ for calibration. But the accuracy might be impacted.</p>
DeprecationWarning,
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 30.867 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 34.522 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-quantized-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7810ecf51bfc05f7d5e8a400ac3e815d/deploy_quantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_quantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
index 6f3828700e..65af534307 100644
--- a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
+++ b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
@@ -441,24 +441,24 @@ to your device.</p>
Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
0%| | 0/132723 [00:00<?, ?KB/s]
- 4%|3 | 5107/132723 [00:00<00:02, 49152.04KB/s]
- 9%|8 | 11727/132723 [00:00<00:02, 59018.85KB/s]
- 15%|#4 | 19482/132723 [00:00<00:01, 67401.10KB/s]
- 20%|## | 27086/132723 [00:00<00:01, 70790.45KB/s]
- 26%|##6 | 34809/132723 [00:00<00:01, 73095.50KB/s]
- 32%|###1 | 42467/132723 [00:00<00:01, 74275.36KB/s]
- 38%|###7 | 50205/132723 [00:00<00:01, 75282.06KB/s]
- 44%|####3 | 57943/132723 [00:00<00:00, 75943.67KB/s]
- 50%|####9 | 65709/132723 [00:00<00:00, 76478.14KB/s]
- 55%|#####5 | 73378/132723 [00:01<00:00, 76540.54KB/s]
- 61%|######1 | 81233/132723 [00:01<00:00, 77150.06KB/s]
- 67%|######7 | 89026/132723 [00:01<00:00, 77380.47KB/s]
- 73%|#######2 | 96824/132723 [00:01<00:00, 77558.36KB/s]
- 79%|#######8 | 104645/132723 [00:01<00:00, 77751.58KB/s]
- 85%|########4 | 112421/132723 [00:01<00:00, 77618.17KB/s]
- 91%|######### | 120223/132723 [00:01<00:00, 77737.47KB/s]
- 96%|#########6| 128035/132723 [00:01<00:00, 77846.00KB/s]
-100%|##########| 132723/132723 [00:01<00:00, 75201.37KB/s]
+ 4%|4 | 5512/132723 [00:00<00:02, 55115.98KB/s]
+ 10%|9 | 13005/132723 [00:00<00:01, 66765.92KB/s]
+ 16%|#5 | 20649/132723 [00:00<00:01, 71176.96KB/s]
+ 21%|##1 | 28349/132723 [00:00<00:01, 73469.99KB/s]
+ 27%|##6 | 35696/132723 [00:00<00:01, 64000.96KB/s]
+ 33%|###2 | 43285/132723 [00:00<00:01, 67680.94KB/s]
+ 38%|###8 | 51024/132723 [00:00<00:01, 70652.13KB/s]
+ 44%|####4 | 58736/132723 [00:00<00:01, 72616.57KB/s]
+ 50%|##### | 66381/132723 [00:00<00:00, 73777.73KB/s]
+ 56%|#####5 | 73953/132723 [00:01<00:00, 74361.97KB/s]
+ 61%|######1 | 81570/132723 [00:01<00:00, 74898.88KB/s]
+ 67%|######7 | 89288/132723 [00:01<00:00, 75583.94KB/s]
+ 73%|#######3 | 96980/132723 [00:01<00:00, 75983.65KB/s]
+ 79%|#######8 | 104669/132723 [00:01<00:00, 76252.11KB/s]
+ 85%|########4 | 112419/132723 [00:01<00:00, 76617.44KB/s]
+ 91%|######### | 120132/132723 [00:01<00:00, 76766.23KB/s]
+ 96%|#########6| 127890/132723 [00:01<00:00, 77008.96KB/s]
+100%|##########| 132723/132723 [00:01<00:00, 73557.21KB/s]
</pre></div>
</div>
<p>Create TVM runtime and do inference
@@ -497,7 +497,7 @@ Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from h
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
-<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 39.288 seconds)</p>
+<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 35.447 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-ssd-gluoncv-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/cccb17d28e5e8b2e94ea8cd5ec59f6ed/deploy_ssd_gluoncv.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_ssd_gluoncv.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/sg_execution_times.html b/docs/how_to/deploy_models/sg_execution_times.html
index e734aed3dc..1e1d166375 100644
--- a/docs/how_to/deploy_models/sg_execution_times.html
+++ b/docs/how_to/deploy_models/sg_execution_times.html
@@ -327,7 +327,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-deploy-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>11:35.774</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
+<p><strong>11:30.316</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 86%" />
@@ -336,35 +336,35 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_object_detection_pytorch.html#sphx-glr-how-to-deploy-models-deploy-object-detection-pytorch-py"><span class="std std-ref">Compile PyTorch Object Detection Models</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_object_detection_pytorch.py</span></code>)</p></td>
-<td><p>03:02.758</p></td>
+<td><p>02:59.029</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_ssd_gluoncv.html#sphx-glr-how-to-deploy-models-deploy-ssd-gluoncv-py"><span class="std std-ref">Deploy Single Shot Multibox Detector(SSD) model</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_ssd_gluoncv.py</span></code>)</p></td>
-<td><p>02:39.288</p></td>
+<td><p>02:35.447</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized_tflite.html#sphx-glr-how-to-deploy-models-deploy-prequantized-tflite-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM - Part 3 (TFLite)</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized_tflite.py</span></code>)</p></td>
-<td><p>01:57.841</p></td>
+<td><p>01:57.140</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_quantized.html#sphx-glr-how-to-deploy-models-deploy-quantized-py"><span class="std std-ref">Deploy a Quantized Model on Cuda</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_quantized.py</span></code>)</p></td>
-<td><p>01:30.867</p></td>
+<td><p>01:34.522</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized.html#sphx-glr-how-to-deploy-models-deploy-prequantized-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized.py</span></code>)</p></td>
-<td><p>01:08.869</p></td>
+<td><p>01:09.324</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_model_on_android.html#sphx-glr-how-to-deploy-models-deploy-model-on-android-py"><span class="std std-ref">Deploy the Pretrained Model on Android</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_android.py</span></code>)</p></td>
-<td><p>00:30.106</p></td>
+<td><p>00:29.384</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_model_on_nano.html#sphx-glr-how-to-deploy-models-deploy-model-on-nano-py"><span class="std std-ref">Deploy the Pretrained Model on Jetson Nano</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_nano.py</span></code>)</p></td>
-<td><p>00:23.249</p></td>
+<td><p>00:22.941</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_model_on_rasp.html#sphx-glr-how-to-deploy-models-deploy-model-on-rasp-py"><span class="std std-ref">Deploy the Pretrained Model on Raspberry Pi</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_rasp.py</span></code>)</p></td>
-<td><p>00:22.791</p></td>
+<td><p>00:22.523</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_sparse.html#sphx-glr-how-to-deploy-models-deploy-sparse-py"><span class="std std-ref">Deploy a Hugging Face Pruned Model on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_sparse.py</span></code>)</p></td>
diff --git a/docs/how_to/extend_tvm/bring_your_own_datatypes.html b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
index 4a9fab6fe4..bc7fb3dc34 100644
--- a/docs/how_to/extend_tvm/bring_your_own_datatypes.html
+++ b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
@@ -608,7 +608,7 @@ In this alpha state of the Bring Your Own Datatypes framework, we have not imple
<span class="n">module</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#dict" title="builtins.dict" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">params</span></a> <span class="o">=</span> <span class="n">get_mobilenet</span><span class="p">()</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip0e9f1f9d-057f-4eca-8050-7a526bd1e91b from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip39987a78-f4fe-40bb-8181-17bfdba0e090 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
</pre></div>
</div>
<p>It’s easy to execute MobileNet with native TVM:</p>
diff --git a/docs/how_to/extend_tvm/sg_execution_times.html b/docs/how_to/extend_tvm/sg_execution_times.html
index 2533f95b6f..edc95f4bf9 100644
--- a/docs/how_to/extend_tvm/sg_execution_times.html
+++ b/docs/how_to/extend_tvm/sg_execution_times.html
@@ -327,7 +327,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-extend-tvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:41.853</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
+<p><strong>00:39.890</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 84%" />
@@ -336,19 +336,19 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="bring_your_own_datatypes.html#sphx-glr-how-to-extend-tvm-bring-your-own-datatypes-py"><span class="std std-ref">Bring Your Own Datatypes to TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">bring_your_own_datatypes.py</span></code>)</p></td>
-<td><p>00:38.613</p></td>
+<td><p>00:36.805</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="use_pass_instrument.html#sphx-glr-how-to-extend-tvm-use-pass-instrument-py"><span class="std std-ref">How to Use TVM Pass Instrument</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_instrument.py</span></code>)</p></td>
-<td><p>00:02.262</p></td>
+<td><p>00:02.161</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="use_pass_infra.html#sphx-glr-how-to-extend-tvm-use-pass-infra-py"><span class="std std-ref">How to Use TVM Pass Infra</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_infra.py</span></code>)</p></td>
-<td><p>00:00.971</p></td>
+<td><p>00:00.916</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="low_level_custom_pass.html#sphx-glr-how-to-extend-tvm-low-level-custom-pass-py"><span class="std std-ref">Writing a Customized Pass</span></a> (<code class="docutils literal notranslate"><span class="pre">low_level_custom_pass.py</span></code>)</p></td>
-<td><p>00:00.007</p></td>
+<td><p>00:00.008</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/extend_tvm/use_pass_instrument.html b/docs/how_to/extend_tvm/use_pass_instrument.html
index 59b3ee0acc..f462742448 100644
--- a/docs/how_to/extend_tvm/use_pass_instrument.html
+++ b/docs/how_to/extend_tvm/use_pass_instrument.html
@@ -512,10 +512,10 @@ profile the execution time of each passes.</p>
</pre></div>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 6983us [6983us] (46.28%; 46.28%)
-FoldScaleAxis: 8104us [7us] (53.72%; 53.72%)
- FoldConstant: 8097us [1650us] (53.67%; 99.91%)
- InferType: 6447us [6447us] (42.73%; 79.62%)
+InferType: 6731us [6731us] (45.99%; 45.99%)
+FoldScaleAxis: 7905us [5us] (54.01%; 54.01%)
+ FoldConstant: 7900us [1631us] (53.97%; 99.94%)
+ InferType: 6269us [6269us] (42.83%; 79.36%)
</pre></div>
</div>
</div>
@@ -537,10 +537,10 @@ Refer to following sections and <a class="reference internal" href="../../refere
</pre></div>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 6562us [6562us] (44.93%; 44.93%)
-FoldScaleAxis: 8043us [6us] (55.07%; 55.07%)
- FoldConstant: 8037us [1685us] (55.03%; 99.93%)
- InferType: 6352us [6352us] (43.49%; 79.03%)
+InferType: 6297us [6297us] (44.74%; 44.74%)
+FoldScaleAxis: 7777us [4us] (55.26%; 55.26%)
+ FoldConstant: 7773us [1623us] (55.23%; 99.94%)
+ InferType: 6150us [6150us] (43.70%; 79.12%)
</pre></div>
</div>
<p>Register empty list to clear existing instruments.</p>
diff --git a/docs/how_to/optimize_operators/opt_conv_cuda.html b/docs/how_to/optimize_operators/opt_conv_cuda.html
index 4dc99e26c7..75e9a8e5cf 100644
--- a/docs/how_to/optimize_operators/opt_conv_cuda.html
+++ b/docs/how_to/optimize_operators/opt_conv_cuda.html
@@ -564,7 +564,7 @@ latency of convolution.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Convolution: </span><span class="si">%f</span><span class="s2"> ms"</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">b</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">*</span> <span cl [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 35.264269 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 33.716032 ms
</pre></div>
</div>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-optimize-operators-opt-conv-cuda-py">
diff --git a/docs/how_to/optimize_operators/opt_conv_tensorcore.html b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
index b236846240..3b2230edc5 100644
--- a/docs/how_to/optimize_operators/opt_conv_tensorcore.html
+++ b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
@@ -906,7 +906,7 @@ be able to run on our build server</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"conv2d with tensor core: </span><span class="si">%f</span><span class="s2"> ms"</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">* [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 13.373336 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 8.031158 ms
</pre></div>
</div>
</div>
diff --git a/docs/how_to/optimize_operators/opt_gemm.html b/docs/how_to/optimize_operators/opt_gemm.html
index fa2b53718b..438377d58c 100644
--- a/docs/how_to/optimize_operators/opt_gemm.html
+++ b/docs/how_to/optimize_operators/opt_gemm.html
@@ -461,8 +461,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
<span class="nb">print</span><span class="p">(</span><span class="s2">"Baseline: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.019462
-Baseline: 3.449233
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.017948
+Baseline: 3.417880
</pre></div>
</div>
<p>In TVM, we can always inspect lower level IR to debug or optimize our schedule.
@@ -522,7 +522,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt1: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.304449
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.298977
</pre></div>
</div>
<p>Here is the generated IR after blocking.</p>
@@ -589,7 +589,7 @@ vastly.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt2: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.336338
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.336220
</pre></div>
</div>
<p>Here is the generated IR after vectorization.</p>
@@ -650,7 +650,7 @@ the access pattern for A matrix is more cache friendly.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt3: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.116521
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.116299
</pre></div>
</div>
<p>Here is the generated IR after loop permutation.</p>
@@ -733,7 +733,7 @@ flattening.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt4: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.108622
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.109688
</pre></div>
</div>
<p>Here is the generated IR after array packing.</p>
@@ -819,7 +819,7 @@ write to C when all the block results are ready.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt5: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.111192
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.110985
</pre></div>
</div>
<p>Here is the generated IR after blocking.</p>
@@ -909,7 +909,7 @@ write to C when all the block results are ready.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt6: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">opt6_time</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.147711
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.146757
</pre></div>
</div>
<p>Here is the generated IR after parallelization.</p>
diff --git a/docs/how_to/optimize_operators/sg_execution_times.html b/docs/how_to/optimize_operators/sg_execution_times.html
index 453d30af60..9b74c83f8b 100644
--- a/docs/how_to/optimize_operators/sg_execution_times.html
+++ b/docs/how_to/optimize_operators/sg_execution_times.html
@@ -327,7 +327,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-optimize-operators-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:35.024</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
+<p><strong>00:34.317</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 83%" />
@@ -336,15 +336,15 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="opt_gemm.html#sphx-glr-how-to-optimize-operators-opt-gemm-py"><span class="std std-ref">How to optimize GEMM on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_gemm.py</span></code>)</p></td>
-<td><p>00:32.552</p></td>
+<td><p>00:32.100</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="opt_conv_tensorcore.html#sphx-glr-how-to-optimize-operators-opt-conv-tensorcore-py"><span class="std std-ref">How to optimize convolution using TensorCores</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_tensorcore.py</span></code>)</p></td>
-<td><p>00:01.372</p></td>
+<td><p>00:01.234</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="opt_conv_cuda.html#sphx-glr-how-to-optimize-operators-opt-conv-cuda-py"><span class="std std-ref">How to optimize convolution on GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_cuda.py</span></code>)</p></td>
-<td><p>00:01.100</p></td>
+<td><p>00:00.983</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
index 83864b517d..3a158b4818 100644
--- a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
+++ b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
@@ -327,7 +327,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-tune-with-autoscheduler-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>06:25.780</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
+<p><strong>06:33.101</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 85%" />
@@ -336,27 +336,27 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_layer_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py"><span class="std std-ref">Auto-scheduling a Convolution Layer for GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_layer_cuda.py</span></code>)</p></td>
-<td><p>03:26.753</p></td>
+<td><p>03:38.193</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_network_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-x86-py"><span class="std std-ref">Auto-scheduling a Neural Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_x86.py</span></code>)</p></td>
-<td><p>01:24.039</p></td>
+<td><p>01:22.405</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_network_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-cuda-py"><span class="std std-ref">Auto-scheduling a Neural Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_cuda.py</span></code>)</p></td>
-<td><p>00:57.241</p></td>
+<td><p>00:56.348</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_sparse_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-sparse-x86-py"><span class="std std-ref">Auto-scheduling Sparse Matrix Multiplication on CPU with Custom Sketch Rule</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_sparse_x86.py</span></code>)</p></td>
-<td><p>00:19.789</p></td>
+<td><p>00:18.781</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_network_mali.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-mali-py"><span class="std std-ref">Auto-scheduling a Neural Network for mali GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_mali.py</span></code>)</p></td>
-<td><p>00:09.077</p></td>
+<td><p>00:08.746</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_network_arm.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-arm-py"><span class="std std-ref">Auto-scheduling a Neural Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_arm.py</span></code>)</p></td>
-<td><p>00:08.880</p></td>
+<td><p>00:08.627</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
index 9df7f0a20d..21056bc5d7 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
@@ -491,483 +491,414 @@ cooperative fetching, unrolling and operator fusion.</p>
compute: Buffer(compute_2: Pointer(float32), float32, [25088], [])}
buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute}
preflattened_buffer_map = {data_1: data_3: Buffer(data_2, float32, [1, 512, 7, 7], []), kernel_1: kernel_3: Buffer(kernel_2, float32, [512, 512, 3, 3], []), bias_1: bias_3: Buffer(bias_2, float32, [1, 512, 1, 1], []), compute_1: compute_3: Buffer(compute_2, float32, [1, 512, 7, 7], [])} {
- attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 28;
- allocate(conv2d_nchw: Pointer(local float32), float32, [14]), storage_scope = local;
- allocate(pad_temp.shared: Pointer(shared float32), float32, [72]), storage_scope = shared;
+ attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 16;
+ allocate(conv2d_nchw: Pointer(local float32), float32, [7]), storage_scope = local;
+ allocate(pad_temp.shared: Pointer(shared float32), float32, [2016]), storage_scope = shared;
allocate(kernel.shared: Pointer(shared float32), float32, [3072]), storage_scope = shared;
- attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
- conv2d_nchw_1: Buffer(conv2d_nchw, float32, [14], [], scope="local", align=32)[0] = 0f32
+ attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224 {
+ conv2d_nchw_1: Buffer(conv2d_nchw, float32, [7], [], scope="local", align=16)[0] = 0f32
conv2d_nchw_1[1] = 0f32
conv2d_nchw_1[2] = 0f32
conv2d_nchw_1[3] = 0f32
conv2d_nchw_1[4] = 0f32
conv2d_nchw_1[5] = 0f32
conv2d_nchw_1[6] = 0f32
- conv2d_nchw_1[7] = 0f32
- conv2d_nchw_1[8] = 0f32
- conv2d_nchw_1[9] = 0f32
- conv2d_nchw_1[10] = 0f32
- conv2d_nchw_1[11] = 0f32
- conv2d_nchw_1[12] = 0f32
- conv2d_nchw_1[13] = 0f32
- for (rc.outer.outer: int32, 0, 64) {
- for (ry.outer.outer: int32, 0, 3) {
- let cse_var_2: int32 = (rc.outer.outer*72)
- let cse_var_1: int32 = (ry.outer.outer*3)
+ for (rc.outer.outer: int32, 0, 16) {
+ for (rx.outer.outer: int32, 0, 3) {
+ let cse_var_2: int32 = (rc.outer.outer*1568)
+ let cse_var_1: int32 = (rc.outer.outer*288)
{
- attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64 {
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1: Buffer(pad_temp.shared, float32, [72], [], scope="shared")[(threadIdx.x_1*4)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod((threadIdx.x_1*4), 9))) && (floormod((threadIdx.x_1*4), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv((threadIdx.x_1*4), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + [...]
- }
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1[((threadIdx.x_1*4) + 1)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 1), 9))) && (floormod(((threadIdx.x_1*4) + 1), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 1), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 1), 9)) - 8)], 0 [...]
- }
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1[((threadIdx.x_1*4) + 2)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 2), 9))) && (floormod(((threadIdx.x_1*4) + 2), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 2), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 2), 9)) - 8)], 0 [...]
- }
- if @tir.likely((threadIdx.x_1 < 18), dtype=bool) {
- pad_temp.shared_1[((threadIdx.x_1*4) + 3)] = @tir.if_then_else(((((1 <= (ry.outer.outer + floormod(blockIdx.x, 7))) && ((ry.outer.outer + floormod(blockIdx.x, 7)) < 8)) && (1 <= floormod(((threadIdx.x_1*4) + 3), 9))) && (floormod(((threadIdx.x_1*4) + 3), 9) < 8)), data[((((((rc.outer.outer*392) + (floordiv(((threadIdx.x_1*4) + 3), 9)*49)) + (ry.outer.outer*7)) + (floormod(blockIdx.x, 7)*7)) + floormod(((threadIdx.x_1*4) + 3), 9)) - 8)], 0 [...]
- }
+ attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ pad_temp.shared_1: Buffer(pad_temp.shared, float32, [2016], [], scope="shared")[threadIdx.x_1] = @tir.if_then_else(((((7 <= floormod(threadIdx.x_1, 63)) && (floormod(threadIdx.x_1, 63) < 56)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[((((cse_var_2 + (floordiv(threadIdx.x_1, 63)*49)) + rx.outer.outer) + floormod(threadIdx.x_1, 63)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ pad_temp.shared_1[(threadIdx.x_1 + 224)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 5), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 224), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 5), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ pad_temp.shared_1[(threadIdx.x_1 + 448)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 1), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 448), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 1), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ pad_temp.shared_1[(threadIdx.x_1 + 672)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 6), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 672), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 6), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ pad_temp.shared_1[(threadIdx.x_1 + 896)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 2), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 896), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 2), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ pad_temp.shared_1[(threadIdx.x_1 + 1120)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 7), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1120), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 7), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ pad_temp.shared_1[(threadIdx.x_1 + 1344)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 3), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1344), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 3), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ pad_temp.shared_1[(threadIdx.x_1 + 1568)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 8), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1568), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 8), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ pad_temp.shared_1[(threadIdx.x_1 + 1792)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 4), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1792), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 4), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1: Buffer(kernel.shared, float32, [3072], [], scope="shared")[threadIdx.x_2] = kernel[(((((blockIdx.x*147456) + (floordiv(threadIdx.x_2, 96)*4608)) + cse_var_1) + (floormod(threadIdx.x_2, 96)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 224)] = kernel[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 224), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 96), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 448)] = kernel[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 448), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 96), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 672)] = kernel[((((((blockIdx.x*147456) + (floordiv(threadIdx.x_2, 96)*4608)) + cse_var_1) + (floormod(threadIdx.x_2, 96)*3)) + rx.outer.outer) + 32256)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 896)] = kernel[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 896), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 96), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 1120)] = kernel[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1120), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 96), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 1344)] = kernel[((((((blockIdx.x*147456) + (floordiv(threadIdx.x_2, 96)*4608)) + cse_var_1) + (floormod(threadIdx.x_2, 96)*3)) + rx.outer.outer) + 64512)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 1568)] = kernel[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1568), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 96), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 1792)] = kernel[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 1792), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 96), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 2016)] = kernel[((((((blockIdx.x*147456) + (floordiv(threadIdx.x_2, 96)*4608)) + cse_var_1) + (floormod(threadIdx.x_2, 96)*3)) + rx.outer.outer) + 96768)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 2240)] = kernel[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2240), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 96), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 2464)] = kernel[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2464), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 64), 96), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ kernel.shared_1[(threadIdx.x_2 + 2688)] = kernel[((((((blockIdx.x*147456) + (floordiv(threadIdx.x_2, 96)*4608)) + cse_var_1) + (floormod(threadIdx.x_2, 96)*3)) + rx.outer.outer) + 129024)]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 224;
+ if @tir.likely((threadIdx.x_2 < 160), dtype=bool) {
+ kernel.shared_1[(threadIdx.x_2 + 2912)] = kernel[((((((blockIdx.x*147456) + (floordiv((threadIdx.x_2 + 2912), 96)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2 + 32), 96), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ }
+ for (rc.outer.inner: int32, 0, 2) {
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((rc.outer.inner*1008) + floormod(threadIdx.x, 7))]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48))]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 63)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 3)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 126)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 6)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 189)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 9)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 252)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 12)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 315)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 15)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 378)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 18)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 441)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 21)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 504)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 24)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 567)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 27)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 630)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 30)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 693)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 33)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 756)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 36)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 819)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 39)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 882)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 42)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 945)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 45)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 7)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48))]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 70)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 3)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 133)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 6)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 196)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 9)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 259)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 12)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 322)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 15)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 385)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 18)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 448)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 21)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 511)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 24)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 574)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 27)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 637)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 30)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 700)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 33)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 763)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 36)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 826)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 39)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 889)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 42)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 952)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 45)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 14)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48))]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 77)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 3)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 140)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 6)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 203)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 9)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 266)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 12)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 329)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 15)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 392)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 18)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 455)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 21)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 518)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 24)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 581)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 27)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 644)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 30)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 707)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 33)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 770)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 36)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 833)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 39)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 896)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 42)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 959)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 45)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 21)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48))]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 3)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 147)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 6)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 210)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 9)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 273)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 12)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 336)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 15)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 399)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 18)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 462)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 21)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 525)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 24)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 588)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 27)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 651)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 30)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 714)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 33)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 777)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 36)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 840)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 39)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 903)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 42)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 966)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 45)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48))]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 3)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 154)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 6)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 217)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 9)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 280)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 12)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 343)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 15)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 406)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 18)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 469)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 21)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 532)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 24)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 595)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 27)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 658)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 30)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 721)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 33)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 784)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 36)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 847)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 39)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 910)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 42)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 973)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 45)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 35)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48))]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 98)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 3)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 161)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 6)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 224)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 9)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 287)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 12)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 350)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 15)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 413)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 18)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 476)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 21)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 539)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 24)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 602)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 27)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 665)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 30)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 728)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 33)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 791)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 36)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 854)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 39)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 917)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 42)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 980)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 45)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 42)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48))]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 3)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 168)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 6)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 231)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 9)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 294)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 12)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 357)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 15)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 420)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 18)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 483)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 21)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 546)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 24)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 609)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 27)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 672)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 30)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 735)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 33)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 798)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 36)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 861)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 39)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 924)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 42)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 987)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 45)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 7)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 1)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 70)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 4)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 133)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 7)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 196)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 10)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 259)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 13)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 322)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 16)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 385)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 19)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 448)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 22)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 511)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 25)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 574)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 28)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 637)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 31)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 700)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 34)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 763)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 37)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 826)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 40)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 889)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 43)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 952)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 46)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 14)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 1)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 77)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 4)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 140)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 7)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 203)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 10)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 266)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 13)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 329)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 16)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 392)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 19)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 455)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 22)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 518)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 25)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 581)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 28)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 644)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 31)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 707)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 34)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 770)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 37)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 833)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 40)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 896)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 43)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 959)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 46)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 21)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 1)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 4)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 147)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 7)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 210)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 10)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 273)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 13)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 336)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 16)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 399)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 19)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 462)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 22)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 525)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 25)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 588)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 28)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 651)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 31)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 714)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 34)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 777)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 37)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 840)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 40)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 903)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 43)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 966)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 46)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 1)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 4)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 154)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 7)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 217)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 10)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 280)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 13)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 343)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 16)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 406)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 19)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 469)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 22)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 532)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 25)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 595)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 28)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 658)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 31)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 721)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 34)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 784)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 37)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 847)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 40)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 910)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 43)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 973)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 46)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 35)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 1)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 98)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 4)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 161)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 7)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 224)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 10)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 287)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 13)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 350)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 16)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 413)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 19)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 476)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 22)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 539)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 25)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 602)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 28)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 665)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 31)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 728)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 34)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 791)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 37)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 854)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 40)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 917)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 43)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 980)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 46)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 42)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 1)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 4)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 168)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 7)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 231)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 10)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 294)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 13)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 357)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 16)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 420)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 19)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 483)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 22)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 546)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 25)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 609)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 28)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 672)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 31)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 735)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 34)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 798)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 37)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 861)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 40)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 924)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 43)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 987)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 46)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 49)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 1)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 112)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 4)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 175)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 7)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 238)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 10)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 301)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 13)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 364)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 16)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 427)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 19)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 490)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 22)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 553)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 25)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 616)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 28)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 679)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 31)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 742)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 34)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 805)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 37)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 868)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 40)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 931)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 43)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 994)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 46)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 14)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 2)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 77)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 5)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 140)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 8)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 203)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 11)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 266)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 14)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 329)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 17)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 392)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 20)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 455)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 23)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 518)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 26)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 581)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 29)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 644)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 32)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 707)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 35)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 770)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 38)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 833)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 41)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 896)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 44)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 959)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 47)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 21)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 2)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 5)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 147)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 8)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 210)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 11)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 273)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 14)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 336)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 17)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 399)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 20)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 462)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 23)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 525)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 26)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 588)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 29)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 651)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 32)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 714)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 35)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 777)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 38)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 840)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 41)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 903)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 44)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 966)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 47)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 28)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 2)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 5)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 154)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 8)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 217)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 11)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 280)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 14)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 343)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 17)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 406)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 20)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 469)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 23)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 532)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 26)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 595)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 29)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 658)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 32)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 721)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 35)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 784)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 38)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 847)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 41)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 910)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 44)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 973)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 47)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 35)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 2)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 98)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 5)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 161)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 8)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 224)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 11)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 287)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 14)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 350)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 17)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 413)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 20)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 476)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 23)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 539)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 26)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 602)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 29)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 665)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 32)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 728)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 35)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 791)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 38)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 854)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 41)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 917)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 44)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 980)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 47)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 42)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 2)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 5)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 168)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 8)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 231)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 11)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 294)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 14)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 357)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 17)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 420)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 20)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 483)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 23)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 546)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 26)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 609)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 29)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 672)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 32)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 735)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 35)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 798)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 38)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 861)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 41)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 924)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 44)]))
+ conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 987)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 47)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 49)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 2)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 112)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 5)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 175)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 8)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 238)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 11)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 301)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 14)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 364)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 17)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 427)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 20)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 490)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 23)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 553)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 26)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 616)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 29)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 679)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 32)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 742)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 35)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 805)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 38)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 868)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 41)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 931)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 44)]))
+ conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 994)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 47)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 56)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 2)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 119)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 5)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 182)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 8)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 245)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 11)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 308)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 14)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 371)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 17)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 434)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 20)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 497)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 23)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 560)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 26)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 623)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 29)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 686)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 32)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 749)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 35)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 812)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 38)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 875)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 41)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 938)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 44)]))
+ conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*1008) + floormod(threadIdx.x, 7)) + 1001)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*96) + (rc.outer.inner*48)) + 47)]))
}
- attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1: Buffer(kernel.shared, float32, [3072], [], scope="shared")[threadIdx.x_2] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 64)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 64), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 128)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 128), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 192)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 36864)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 256)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 256), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 320)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 320), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 384)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 73728)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 448)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 448), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 512)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 512), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 576)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 110592)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 640)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 640), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 704)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 704), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 768)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 147456)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 832)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 832), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 896)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 896), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 960)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 184320)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1024)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1024), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1088)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1088), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1152)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 221184)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1216)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1216), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1280)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1280), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1344)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 258048)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1408)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1408), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1472)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1472), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1536)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 294912)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1600)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1600), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1664)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1664), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1728)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 331776)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1792)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1792), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1856)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1856), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1920)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 368640)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 1984)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 1984), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2048)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2048), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2112)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 405504)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2176)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2176), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2240)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2240), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2304)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 442368)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2368)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2368), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2432)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2432), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2496)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 479232)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2560)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2560), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2624)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2624), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2688)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 516096)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2752)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2752), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2816)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2816), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2880)] = kernel[(((((((floordiv(blockIdx.x, 7)*589824) + (floordiv(threadIdx.x_2, 24)*4608)) + cse_var_2) + (floordiv(floormod(threadIdx.x_2, 24), 3)*9)) + cse_var_1) + floormod(threadIdx.x_2, 3)) + 552960)]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 2944)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 2944), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 16), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 1), 3))]
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 64;
- kernel.shared_1[(threadIdx.x_2 + 3008)] = kernel[((((((floordiv(blockIdx.x, 7)*589824) + (floordiv((threadIdx.x_2 + 3008), 24)*4608)) + cse_var_2) + (floordiv(floormod((threadIdx.x_2 + 8), 24), 3)*9)) + cse_var_1) + floormod((threadIdx.x_2 + 2), 3))]
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[0]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[1]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[2]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[3]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[4]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[5]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[6]*kernel.shared_1[(threadIdx.x*48)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 3)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[0]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[9]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 24)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 27)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 1)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 4)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[1]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[10]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 25)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 28)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 2)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 5)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[2]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[11]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[3]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[12]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[4]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[13]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[5]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[14]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[6]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[15]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[7]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[16]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[8]*kernel.shared_1[((threadIdx.x*48) + 26)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[17]*kernel.shared_1[((threadIdx.x*48) + 29)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 6)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 9)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[18]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[27]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 30)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 33)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 7)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 10)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[19]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[28]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 31)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 34)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 8)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 11)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[20]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[29]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[21]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[30]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[22]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[31]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[23]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[32]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[24]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[33]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[25]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[34]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[26]*kernel.shared_1[((threadIdx.x*48) + 32)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[35]*kernel.shared_1[((threadIdx.x*48) + 35)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 12)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 15)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[36]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[45]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 36)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 39)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 13)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 16)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[37]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[46]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 37)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 40)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 14)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 17)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[38]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[47]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[39]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[48]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[40]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[49]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[41]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[50]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[42]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[51]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[43]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[52]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[44]*kernel.shared_1[((threadIdx.x*48) + 38)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[53]*kernel.shared_1[((threadIdx.x*48) + 41)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 18)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 21)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[54]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[63]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 42)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 45)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 19)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 22)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[55]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[64]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 43)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 46)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 20)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 23)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[56]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[65]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[57]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[66]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[58]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[67]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[59]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[68]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[60]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[69]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[61]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[70]*kernel.shared_1[((threadIdx.x*48) + 47)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[62]*kernel.shared_1[((threadIdx.x*48) + 44)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[71]*kernel.shared_1[((threadIdx.x*48) + 47)]))
}
}
}
- for (i1.inner: int32, 0, 2) {
- for (i3.inner: int32, 0, 7) {
- compute[(((((floordiv(blockIdx.x, 7)*6272) + (threadIdx.x*98)) + (i1.inner*49)) + (floormod(blockIdx.x, 7)*7)) + i3.inner)] = max((conv2d_nchw_1[((i1.inner*7) + i3.inner)] + bias[(((floordiv(blockIdx.x, 7)*128) + (threadIdx.x*2)) + i1.inner)]), 0f32)
- }
+ for (i2.inner: int32, 0, 7) {
+ compute[((((blockIdx.x*1568) + (floordiv(threadIdx.x, 7)*49)) + (i2.inner*7)) + floormod(threadIdx.x, 7))] = max((conv2d_nchw_1[i2.inner] + bias[((blockIdx.x*32) + floordiv(threadIdx.x, 7))]), 0f32)
}
}
}
@@ -1004,7 +935,7 @@ cooperative fetching, unrolling and operator fusion.</p>
<span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.365 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.319 ms
</pre></div>
</div>
</div>
@@ -1034,35 +965,35 @@ conv2d_nchw_nn_o_o_i, conv2d_nchw_nn_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o
conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_i, factor=1)
conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
-conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=2)
-conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=64)
+conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=1)
+conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=32)
conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
-conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
+conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=7)
conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=1)
conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
-conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=7)
-conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
+conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
+conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7)
conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
-conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
-conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=4)
+conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=16)
+conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=2)
conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
-conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
+conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=3)
conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1)
-conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=3)
+conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=1)
s[conv2d_nchw].reorder(conv2d_nchw_nn_o_o_o_o, conv2d_nchw_ff_o_o_o_o, conv2d_nchw_yy_o_o_o_o, conv2d_nchw_xx_o_o_o_o, conv2d_nchw_nn_o_o_o_i, conv2d_nchw_ff_o_o_o_i, conv2d_nchw_yy_o_o_o_i, conv2d_nchw_xx_o_o_o_i, conv2d_nchw_nn_o_o_i, conv2d_nchw_ff_o_o_i, conv2d_nchw_yy_o_o_i, conv2d_nchw_xx_o_o_i, conv2d_nchw_rc_o_o, conv2d_nchw_ry_o_o, conv2d_nchw_rx_o_o, conv2d_nchw_rc_o_i, conv2d_nchw_ry_o_i, conv2d_nchw_rx_o_i, conv2d_nchw_nn_o_i, conv2d_nchw_ff_o_i, conv2d_nchw_yy_o_i, conv2d_nc [...]
compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
-compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
-compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=64)
+compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=1)
+compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=32)
compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
-compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
+compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=7)
compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
-compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
-compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
+compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
+compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7)
compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
@@ -1082,12 +1013,12 @@ s[compute].bind(compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused, te.thread
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=224)
s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
-pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
+pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=224)
s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 512)
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True)
@@ -1107,9 +1038,9 @@ CUDA source code:
#define int64_t long long
#define uint64_t unsigned long long
#endif
-extern "C" __global__ void __launch_bounds__(64) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
- float conv2d_nchw[14];
- __shared__ float pad_temp_shared[72];
+extern "C" __global__ void __launch_bounds__(224) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+ float conv2d_nchw[7];
+ __shared__ float pad_temp_shared[2016];
__shared__ float kernel_shared[3072];
conv2d_nchw[0] = 0.000000e+00f;
conv2d_nchw[1] = 0.000000e+00f;
@@ -1118,419 +1049,377 @@ extern "C" __global__ void __launch_bounds__(64) default_function_kern
conv2d_nchw[4] = 0.000000e+00f;
conv2d_nchw[5] = 0.000000e+00f;
conv2d_nchw[6] = 0.000000e+00f;
- conv2d_nchw[7] = 0.000000e+00f;
- conv2d_nchw[8] = 0.000000e+00f;
- conv2d_nchw[9] = 0.000000e+00f;
- conv2d_nchw[10] = 0.000000e+00f;
- conv2d_nchw[11] = 0.000000e+00f;
- conv2d_nchw[12] = 0.000000e+00f;
- conv2d_nchw[13] = 0.000000e+00f;
- for (int rc_outer_outer = 0; rc_outer_outer < 64; ++rc_outer_outer) {
- for (int ry_outer_outer = 0; ry_outer_outer < 3; ++ry_outer_outer) {
+ for (int rc_outer_outer = 0; rc_outer_outer < 16; ++rc_outer_outer) {
+ for (int rx_outer_outer = 0; rx_outer_outer < 3; ++rx_outer_outer) {
__syncthreads();
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[(((int)threadIdx.x) * 4)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) * 4) % 9))) && (((((int)threadIdx.x) * 4) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + (((((int)threadIdx.x) * 4) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) * 4) % 9)) - 8)] : 0.000000e+00f);
- }
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 1)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 1) % 9))) && ((((((int)threadIdx.x) * 4) + 1) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 1) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 1) % 9)) - 8)] : 0.000000e+00f);
- }
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 2)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 2) % 9))) && ((((((int)threadIdx.x) * 4) + 2) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 2) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 2) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[((int)threadIdx.x)] = (((((7 <= (((int)threadIdx.x) % 63)) && ((((int)threadIdx.x) % 63) < 56)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[(((((rc_outer_outer * 1568) + ((((int)threadIdx.x) / 63) * 49)) + rx_outer_outer) + (((int)threadIdx.x) % 63)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 224)] = (((((1 <= (((((int)threadIdx.x) / 7) + 5) % 9)) && ((((((int)threadIdx.x) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 224) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 5) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 448)] = (((((1 <= (((((int)threadIdx.x) / 7) + 1) % 9)) && ((((((int)threadIdx.x) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 448) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 1) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 672)] = (((((1 <= (((((int)threadIdx.x) / 7) + 6) % 9)) && ((((((int)threadIdx.x) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 672) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 6) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 896)] = (((((1 <= (((((int)threadIdx.x) / 7) + 2) % 9)) && ((((((int)threadIdx.x) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 896) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 2) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1120)] = (((((1 <= (((((int)threadIdx.x) / 7) + 7) % 9)) && ((((((int)threadIdx.x) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1120) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 7) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1344)] = (((((1 <= (((((int)threadIdx.x) / 7) + 3) % 9)) && ((((((int)threadIdx.x) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1344) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 3) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1568)] = (((((1 <= (((((int)threadIdx.x) / 7) + 8) % 9)) && ((((((int)threadIdx.x) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1568) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 8) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1792)] = (((((1 <= (((((int)threadIdx.x) / 7) + 4) % 9)) && ((((((int)threadIdx.x) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 1568) + (((((int)threadIdx.x) + 1792) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 4) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ kernel_shared[((int)threadIdx.x)] = kernel[(((((((int)blockIdx.x) * 147456) + ((((int)threadIdx.x) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) % 96) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 224)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 224) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 32) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 448)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 448) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 64) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 672)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((int)threadIdx.x) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) % 96) * 3)) + rx_outer_outer) + 32256)];
+ kernel_shared[(((int)threadIdx.x) + 896)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 896) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 32) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 1120)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1120) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 64) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((int)threadIdx.x) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) % 96) * 3)) + rx_outer_outer) + 64512)];
+ kernel_shared[(((int)threadIdx.x) + 1568)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1568) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 32) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 1792) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 64) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 2016)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((int)threadIdx.x) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) % 96) * 3)) + rx_outer_outer) + 96768)];
+ kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2240) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 32) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 2464)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2464) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 64) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[((((((((int)blockIdx.x) * 147456) + ((((int)threadIdx.x) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((int)threadIdx.x) % 96) * 3)) + rx_outer_outer) + 129024)];
+ if (((int)threadIdx.x) < 160) {
+ kernel_shared[(((int)threadIdx.x) + 2912)] = kernel[((((((((int)blockIdx.x) * 147456) + (((((int)threadIdx.x) + 2912) / 96) * 4608)) + (rc_outer_outer * 288)) + ((((((int)threadIdx.x) + 32) % 96) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
}
- if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 3)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 3) % 9))) && ((((((int)threadIdx.x) * 4) + 3) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 3) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 3) % 9)) - 8)] : 0.000000e+00f);
- }
- kernel_shared[((int)threadIdx.x)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3))];
- kernel_shared[(((int)threadIdx.x) + 64)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 64) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 128)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 128) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 192)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 36864)];
- kernel_shared[(((int)threadIdx.x) + 256)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 256) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 320)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 320) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 384)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 73728)];
- kernel_shared[(((int)threadIdx.x) + 448)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 448) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 512)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 512) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 576)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 110592)];
- kernel_shared[(((int)threadIdx.x) + 640)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 640) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 704)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 704) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 768)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 147456)];
- kernel_shared[(((int)threadIdx.x) + 832)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 832) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 896)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 896) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 960)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 184320)];
- kernel_shared[(((int)threadIdx.x) + 1024)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1024) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1088)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1088) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1152)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 221184)];
- kernel_shared[(((int)threadIdx.x) + 1216)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1216) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1280)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1280) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 258048)];
- kernel_shared[(((int)threadIdx.x) + 1408)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1408) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1472)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1472) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1536)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 294912)];
- kernel_shared[(((int)threadIdx.x) + 1600)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1600) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1664)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1664) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1728)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 331776)];
- kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1792) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1856)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1856) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1920)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 368640)];
- kernel_shared[(((int)threadIdx.x) + 1984)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1984) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2048)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2048) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2112)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 405504)];
- kernel_shared[(((int)threadIdx.x) + 2176)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2176) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2240) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2304)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 442368)];
- kernel_shared[(((int)threadIdx.x) + 2368)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2368) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2432)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2432) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2496)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 479232)];
- kernel_shared[(((int)threadIdx.x) + 2560)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2560) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2624)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2624) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 516096)];
- kernel_shared[(((int)threadIdx.x) + 2752)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2752) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2816)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2816) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2880)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 552960)];
- kernel_shared[(((int)threadIdx.x) + 2944)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2944) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 3008)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3008) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
__syncthreads();
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[0] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[1] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[2] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[3] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[4] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[5] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[6] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[0] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+ for (int rc_outer_inner = 0; rc_outer_inner < 2; ++rc_outer_inner) {
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7))] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48))]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 3)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 126)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 6)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 189)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 9)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 252)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 12)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 315)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 15)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 378)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 18)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 441)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 21)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 504)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 24)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 567)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 27)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 630)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 30)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 693)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 33)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 756)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 36)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 819)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 39)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 882)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 42)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 945)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 45)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 7)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48))]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 70)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 3)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 133)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 6)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 196)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 9)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 259)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 12)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 322)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 15)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 385)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 18)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 448)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 21)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 511)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 24)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 574)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 27)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 637)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 30)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 700)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 33)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 763)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 36)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 826)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 39)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 889)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 42)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 952)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 45)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 14)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48))]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 77)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 3)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 140)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 6)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 203)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 9)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 266)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 12)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 329)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 15)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 392)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 18)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 455)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 21)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 518)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 24)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 581)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 27)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 644)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 30)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 707)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 33)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 770)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 36)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 833)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 39)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 896)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 42)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 959)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 45)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 21)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48))]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 3)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 147)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 6)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 210)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 9)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 273)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 12)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 336)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 15)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 399)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 18)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 462)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 21)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 525)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 24)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 588)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 27)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 651)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 30)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 714)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 33)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 777)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 36)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 840)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 39)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 903)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 42)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 966)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 45)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48))]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 3)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 154)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 6)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 217)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 9)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 280)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 12)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 343)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 15)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 406)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 18)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 469)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 21)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 532)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 24)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 595)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 27)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 658)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 30)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 721)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 33)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 784)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 36)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 847)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 39)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 910)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 42)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 973)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 45)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 35)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48))]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 98)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 3)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 161)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 6)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 224)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 9)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 287)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 12)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 350)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 15)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 413)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 18)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 476)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 21)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 539)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 24)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 602)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 27)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 665)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 30)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 728)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 33)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 791)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 36)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 854)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 39)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 917)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 42)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 980)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 45)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 42)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48))]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 3)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 168)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 6)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 231)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 9)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 294)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 12)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 357)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 15)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 420)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 18)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 483)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 21)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 546)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 24)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 609)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 27)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 672)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 30)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 735)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 33)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 798)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 36)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 861)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 39)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 924)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 42)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 987)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 45)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 7)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 1)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 70)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 4)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 133)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 7)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 196)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 10)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 259)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 13)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 322)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 16)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 385)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 19)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 448)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 22)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 511)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 25)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 574)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 28)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 637)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 31)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 700)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 34)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 763)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 37)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 826)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 40)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 889)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 43)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 952)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 46)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 1)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 77)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 4)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 140)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 7)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 203)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 10)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 266)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 13)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 329)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 16)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 392)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 19)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 455)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 22)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 518)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 25)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 581)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 28)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 644)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 31)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 707)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 34)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 770)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 37)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 833)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 40)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 896)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 43)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 959)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 46)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 21)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 1)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 4)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 147)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 7)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 210)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 10)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 273)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 13)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 336)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 16)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 399)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 19)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 462)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 22)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 525)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 25)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 588)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 28)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 651)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 31)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 714)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 34)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 777)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 37)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 840)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 40)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 903)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 43)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 966)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 46)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 1)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 4)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 154)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 7)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 217)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 10)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 280)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 13)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 343)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 16)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 406)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 19)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 469)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 22)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 532)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 25)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 595)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 28)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 658)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 31)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 721)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 34)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 784)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 37)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 847)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 40)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 910)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 43)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 973)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 46)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 35)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 1)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 98)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 4)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 161)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 7)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 224)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 10)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 287)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 13)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 350)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 16)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 413)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 19)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 476)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 22)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 539)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 25)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 602)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 28)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 665)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 31)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 728)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 34)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 791)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 37)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 854)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 40)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 917)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 43)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 980)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 46)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 42)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 1)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 4)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 168)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 7)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 231)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 10)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 294)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 13)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 357)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 16)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 420)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 19)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 483)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 22)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 546)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 25)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 609)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 28)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 672)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 31)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 735)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 34)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 798)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 37)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 861)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 40)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 924)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 43)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 987)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 46)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 49)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 1)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 112)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 4)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 175)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 7)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 238)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 10)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 301)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 13)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 364)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 16)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 427)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 19)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 490)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 22)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 553)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 25)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 616)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 28)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 679)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 31)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 742)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 34)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 805)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 37)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 868)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 40)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 931)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 43)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 994)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 46)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 2)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 77)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 5)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 140)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 8)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 203)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 11)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 266)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 14)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 329)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 17)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 392)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 20)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 455)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 23)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 518)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 26)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 581)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 29)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 644)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 32)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 707)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 35)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 770)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 38)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 833)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 41)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 896)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 44)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 959)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 47)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 21)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 2)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 5)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 147)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 8)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 210)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 11)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 273)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 14)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 336)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 17)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 399)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 20)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 462)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 23)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 525)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 26)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 588)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 29)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 651)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 32)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 714)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 35)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 777)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 38)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 840)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 41)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 903)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 44)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 966)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 47)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 28)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 2)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 5)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 154)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 8)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 217)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 11)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 280)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 14)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 343)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 17)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 406)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 20)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 469)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 23)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 532)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 26)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 595)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 29)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 658)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 32)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 721)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 35)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 784)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 38)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 847)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 41)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 910)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 44)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 973)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 47)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 35)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 2)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 98)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 5)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 161)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 8)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 224)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 11)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 287)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 14)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 350)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 17)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 413)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 20)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 476)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 23)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 539)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 26)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 602)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 29)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 665)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 32)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 728)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 35)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 791)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 38)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 854)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 41)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 917)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 44)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 980)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 47)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 42)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 2)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 5)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 168)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 8)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 231)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 11)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 294)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 14)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 357)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 17)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 420)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 20)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 483)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 23)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 546)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 26)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 609)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 29)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 672)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 32)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 735)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 35)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 798)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 38)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 861)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 41)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 924)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 44)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 987)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 47)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 49)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 2)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 112)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 5)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 175)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 8)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 238)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 11)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 301)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 14)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 364)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 17)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 427)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 20)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 490)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 23)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 553)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 26)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 616)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 29)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 679)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 32)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 742)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 35)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 805)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 38)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 868)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 41)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 931)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 44)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 994)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 47)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 56)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 2)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 119)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 5)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 182)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 8)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 245)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 11)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 308)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 14)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 371)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 17)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 434)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 20)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 497)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 23)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 560)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 26)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 623)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 29)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 686)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 32)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 749)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 35)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 812)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 38)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 875)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 41)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 938)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 44)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 1008) + (((int)threadIdx.x) % 7)) + 1001)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 48)) + 47)]));
+ }
}
}
- for (int i1_inner = 0; i1_inner < 2; ++i1_inner) {
- for (int i3_inner = 0; i3_inner < 7; ++i3_inner) {
- compute[((((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 98)) + (i1_inner * 49)) + ((((int)blockIdx.x) % 7) * 7)) + i3_inner)] = max((conv2d_nchw[((i1_inner * 7) + i3_inner)] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) * 2)) + i1_inner)]), 0.000000e+00f);
- }
+ for (int i2_inner = 0; i2_inner < 7; ++i2_inner) {
+ compute[((((((int)blockIdx.x) * 1568) + ((((int)threadIdx.x) / 7) * 49)) + (i2_inner * 7)) + (((int)threadIdx.x) % 7))] = max((conv2d_nchw[i2_inner] + bias[((((int)blockIdx.x) * 32) + (((int)threadIdx.x) / 7))]), 0.000000e+00f);
}
}
</pre></div>
@@ -1567,7 +1456,7 @@ In the example below we resume the status and do more 5 trials.</p>
Get devices for measurement successfully!
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes 26.753 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes 38.193 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/e3e540f3b477c0c52d8eb73e674e8ffd/tune_conv2d_layer_cuda.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_conv2d_layer_cuda.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
index e56c127413..1d057cebd8 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
@@ -902,7 +902,7 @@ so we can read the log file and load the best schedules.</p>
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 8.2227 8.2245 8.2248 8.2189 0.0027
+ 8.2273 8.2258 8.2365 8.2195 0.0070
</pre></div>
</div>
</div>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
index e15acd25fb..7e0e00bf13 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
@@ -921,7 +921,7 @@ so we can read the log file and load the best schedules.</p>
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 758.5054 757.6919 760.7528 757.0715 1.6092
+ 760.0965 759.7724 760.7544 759.7626 0.4652
</pre></div>
</div>
</div>
@@ -943,7 +943,7 @@ to learn how to use the RPC Tracker and RPC Server.
To use the RPC Tracker in auto-scheduler, replace the runner in <code class="code docutils literal notranslate"><span class="pre">TuningOptions</span></code>
with <a class="reference internal" href="../../reference/api/python/auto_scheduler.html#tvm.auto_scheduler.RPCRunner" title="tvm.auto_scheduler.RPCRunner"><code class="xref any py py-class docutils literal notranslate"><span class="pre">auto_scheduler.RPCRunner</span></code></a>.</p></li>
</ol>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 24.039 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 22.405 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-network-x86-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/e416b94ca1090b0897c0f6e0df95b911/tune_network_x86.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_network_x86.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
index 513514ebdf..73108cba18 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
@@ -625,103 +625,28 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
placeholder_4: Buffer(placeholder_14: Pointer(float32), float32, [65536], []),
compute: Buffer(compute_2: Pointer(float32), float32, [65536], [])}
buffer_map = {placeholder_5: placeholder, placeholder_6: placeholder_1, placeholder_7: placeholder_2, placeholder_8: placeholder_3, placeholder_9: placeholder_4, compute_1: compute}
- preflattened_buffer_map = {compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_6: placeholder_15: Buffer(placeholder_11, float32, [4916, 16, 1], []), placeholder_7: placeholder_16: Buffer(placeholder_12, int32, [4916], []), placeholder_5: placeholder_17: Buffer(placeholder_10, float32, [128, 256], []), placeholder_8: placeholder_18: Buffer(placeholder_13, int32, [33], []), placeholder_9: placeholder_19: Buffer(placeholder_14, float32, [128, 512], [])} {
+ preflattened_buffer_map = {placeholder_8: placeholder_15: Buffer(placeholder_13, int32, [33], []), placeholder_6: placeholder_16: Buffer(placeholder_11, float32, [4916, 16, 1], []), placeholder_7: placeholder_17: Buffer(placeholder_12, int32, [4916], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_5: placeholder_18: Buffer(placeholder_10, float32, [128, 256], []), placeholder_9: placeholder_19: Buffer(placeholder_14, float32, [128, 512], [])} {
for (i0.outer.i1.outer.fused: int32, 0, 32) "parallel" {
allocate(compute_4: Pointer(global float32), float32, [2048]), storage_scope = global {
- for (i.outer.inner: int32, 0, 2) {
+ for (nb_j.inner: int32, 0, 2) {
for (i.inner.init: int32, 0, 64) {
- let cse_var_1: int32 = ((i.outer.inner*1024) + (i.inner.init*16))
- {
- compute_5: Buffer(compute_4, float32, [2048], [])[cse_var_1] = 0f32
- compute_5[(cse_var_1 + 1)] = 0f32
- compute_5[(cse_var_1 + 2)] = 0f32
- compute_5[(cse_var_1 + 3)] = 0f32
- compute_5[(cse_var_1 + 4)] = 0f32
- compute_5[(cse_var_1 + 5)] = 0f32
- compute_5[(cse_var_1 + 6)] = 0f32
- compute_5[(cse_var_1 + 7)] = 0f32
- compute_5[(cse_var_1 + 8)] = 0f32
- compute_5[(cse_var_1 + 9)] = 0f32
- compute_5[(cse_var_1 + 10)] = 0f32
- compute_5[(cse_var_1 + 11)] = 0f32
- compute_5[(cse_var_1 + 12)] = 0f32
- compute_5[(cse_var_1 + 13)] = 0f32
- compute_5[(cse_var_1 + 14)] = 0f32
- compute_5[(cse_var_1 + 15)] = 0f32
+ for (j.init: int32, 0, 16) {
+ compute_5: Buffer(compute_4, float32, [2048], [])[(((i.inner.init*32) + (nb_j.inner*16)) + j.init)] = 0f32
}
}
- for (elem_idx: int32, 0, (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])) {
+ for (elem_idx: int32, 0, let cse_var_1: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner) in (placeholder_3[(cse_var_1 + 1)] - placeholder_3[cse_var_1])) {
for (i.inner: int32, 0, 64) {
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_2: int32 = ((i.outer.inner*1024) + (i.inner*16))
- compute_5[cse_var_2] = (compute_5[cse_var_2] + (placeholder_1[((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16))]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_3: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 1)
- compute_5[cse_var_3] = (compute_5[cse_var_3] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 1)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_4: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 2)
- compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 2)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_5: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 3)
- compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 3)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_6: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 4)
- compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 4)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_7: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 5)
- compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 5)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_8: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 6)
- compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 6)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_9: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 7)
- compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 7)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_10: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 8)
- compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 8)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_11: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 9)
- compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 9)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_12: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 10)
- compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 10)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_13: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 11)
- compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 11)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_14: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 12)
- compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 12)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_15: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 13)
- compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 13)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_16: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 14)
- compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 14)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
- let cse_var_17: int32 = (((i.outer.inner*1024) + (i.inner*16)) + 15)
- compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 15)]*max(placeholder[(((i.outer.inner*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ for (j: int32, 0, 16) {
+ let cse_var_3: int32 = ((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)
+ let cse_var_2: int32 = (((i.inner*32) + (nb_j.inner*16)) + j)
+ compute_5[cse_var_2] = (compute_5[cse_var_2] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + j)]*max(placeholder[(((floordiv(i0.outer.i1.outer.fused, 16)*16384) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
}
}
}
}
- for (i0.inner: int32, 0, 128) {
- let cse_var_18: int32 = ((i0.inner*512) + (i0.outer.i1.outer.fused*16))
- compute[ramp(cse_var_18, 1, 16)] = max((compute_5[ramp((i0.inner*16), 1, 16)] + placeholder_4[ramp(cse_var_18, 1, 16)]), broadcast(0f32, 16))
+ for (i0.inner: int32, 0, 64) {
+ let cse_var_4: int32 = (((floordiv(i0.outer.i1.outer.fused, 16)*32768) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32))
+ compute[ramp(cse_var_4, 1, 32)] = max((compute_5[ramp((i0.inner*32), 1, 32)] + placeholder_4[ramp(cse_var_4, 1, 32)]), broadcast(0f32, 32))
}
}
}
@@ -759,7 +684,7 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
<span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.811 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.806 ms
</pre></div>
</div>
<div class="admonition note">
diff --git a/docs/how_to/tune_with_autotvm/sg_execution_times.html b/docs/how_to/tune_with_autotvm/sg_execution_times.html
index d200f6b67f..9fd7964a47 100644
--- a/docs/how_to/tune_with_autotvm/sg_execution_times.html
+++ b/docs/how_to/tune_with_autotvm/sg_execution_times.html
@@ -327,7 +327,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-tune-with-autotvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:45.742</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
+<p><strong>00:45.665</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 84%" />
@@ -336,22 +336,22 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-conv2d-cuda-py"><span class="std std-ref">Tuning High Performance Convolution on NVIDIA GPUs</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_cuda.py</span></code>)</p></td>
-<td><p>00:45.707</p></td>
+<td><p>00:45.628</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_relay_x86.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-x86-py"><span class="std std-ref">Auto-tuning a Convolutional Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_x86.py</span></code>)</p></td>
-<td><p>00:00.019</p></td>
+<td><p>00:00.022</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_relay_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-cuda-py"><span class="std std-ref">Auto-tuning a Convolutional Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_cuda.py</span></code>)</p></td>
<td><p>00:00.005</p></td>
<td><p>0.0 MB</p></td>
</tr>
-<tr class="row-even"><td><p><a class="reference internal" href="tune_relay_mobile_gpu.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-mobile-gpu-py"><span class="std std-ref">Auto-tuning a Convolutional Network for Mobile GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_mobile_gpu.py</span></code>)</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="tune_relay_arm.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-arm-py"><span class="std std-ref">Auto-tuning a Convolutional Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_arm.py</span></code>)</p></td>
<td><p>00:00.005</p></td>
<td><p>0.0 MB</p></td>
</tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="tune_relay_arm.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-arm-py"><span class="std std-ref">Auto-tuning a Convolutional Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_arm.py</span></code>)</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="tune_relay_mobile_gpu.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-mobile-gpu-py"><span class="std std-ref">Auto-tuning a Convolutional Network for Mobile GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_mobile_gpu.py</span></code>)</p></td>
<td><p>00:00.005</p></td>
<td><p>0.0 MB</p></td>
</tr>
diff --git a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
index 9d7571d59a..00af779c89 100644
--- a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
+++ b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
@@ -1436,8 +1436,8 @@ No: 8 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
TimeoutError
[('tile_f', [-1, 2, 1, 64]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4909501
-No: 9 GFLOPS: 176.29/176.29 result: MeasureResult(costs=(0.0013131999444444444,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.034900188446045, timestamp=1663630707.773062) [('tile_f', [-1, 1, 4, 8]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,5072689
-No: 10 GFLOPS: 0.00/176.29 result: Traceback (most recent call last):
+No: 9 GFLOPS: 80.80/80.80 result: MeasureResult(costs=(0.002865221742857143,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.9253158569335938, timestamp=1663637124.214335) [('tile_f', [-1, 1, 4, 8]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,5072689
+No: 10 GFLOPS: 0.00/80.80 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1560,8 +1560,8 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 4, 8]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 64, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,5092711
-No: 11 GFLOPS: 258.30/258.30 result: MeasureResult(costs=(0.0008962458603351956,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.663536548614502, timestamp=1663630708.6986022) [('tile_f', [-1, 8, 2, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4264713
-No: 12 GFLOPS: 0.00/258.30 result: Traceback (most recent call last):
+No: 11 GFLOPS: 259.74/259.74 result: MeasureResult(costs=(0.0008912696243093924,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.7628161907196045, timestamp=1663637125.1207004) [('tile_f', [-1, 8, 2, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4264713
+No: 12 GFLOPS: 0.00/259.74 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1684,7 +1684,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 128, 1, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 256]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,183542
-No: 13 GFLOPS: 0.00/258.30 result: Traceback (most recent call last):
+No: 13 GFLOPS: 0.00/259.74 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1807,7 +1807,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 8, 8]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 64]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2482196
-No: 14 GFLOPS: 0.00/258.30 result: Traceback (most recent call last):
+No: 14 GFLOPS: 0.00/259.74 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1930,9 +1930,9 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 64, 1, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 2]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10306226
-No: 15 GFLOPS: 5.44/258.30 result: MeasureResult(costs=(0.042549769499999994,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.817678451538086, timestamp=1663630713.2384973) [('tile_f', [-1, 2, 2, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5330964
-No: 16 GFLOPS: 3.33/258.30 result: MeasureResult(costs=(0.0694369725,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.536828517913818, timestamp=1663630714.4850295) [('tile_f', [-1, 8, 4, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2140058
-No: 17 GFLOPS: 0.00/258.30 result: Traceback (most recent call last):
+No: 15 GFLOPS: 5.33/259.74 result: MeasureResult(costs=(0.04344266425,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.8446388244628906, timestamp=1663637129.6733298) [('tile_f', [-1, 2, 2, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5330964
+No: 16 GFLOPS: 3.36/259.74 result: MeasureResult(costs=(0.06896940575,), error_no=MeasureErrorNo.NO_ERROR, all_cost=4.561822891235352, timestamp=1663637130.9030292) [('tile_f', [-1, 8, 4, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2140058
+No: 17 GFLOPS: 0.00/259.74 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 142, in build
res = future.result()
File "/usr/lib/python3.7/concurrent/futures/_base.py", line 435, in result
@@ -1950,8 +1950,8 @@ No: 17 GFLOPS: 0.00/258.30 result: Traceback (most recent call last):
TimeoutError
[('tile_f', [-1, 2, 2, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 16]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10195251
-No: 18 GFLOPS: 26.26/258.30 result: MeasureResult(costs=(0.008816739166666667,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.1723246574401855, timestamp=1663630725.408047) [('tile_f', [-1, 4, 8, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6068603
-No: 19 GFLOPS: 0.00/258.30 result: Traceback (most recent call last):
+No: 18 GFLOPS: 28.28/259.74 result: MeasureResult(costs=(0.008187352642857143,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.2877285480499268, timestamp=1663637141.9079373) [('tile_f', [-1, 4, 8, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 1, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6068603
+No: 19 GFLOPS: 0.00/259.74 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2074,7 +2074,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 16, 4, 8]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6956993
-No: 20 GFLOPS: 0.00/258.30 result: Traceback (most recent call last):
+No: 20 GFLOPS: 0.00/259.74 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2237,7 +2237,7 @@ and measure running time.</p>
Best config:
[('tile_f', [-1, 8, 2, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4264713
Finish loading 20 records
-Time cost of this operator: 0.001300
+Time cost of this operator: 0.001274
</pre></div>
</div>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autotvm-tune-conv2d-cuda-py">
diff --git a/docs/how_to/work_with_microtvm/micro_autotune.html b/docs/how_to/work_with_microtvm/micro_autotune.html
index 3787ea8b06..d3827ad917 100644
--- a/docs/how_to/work_with_microtvm/micro_autotune.html
+++ b/docs/how_to/work_with_microtvm/micro_autotune.html
@@ -582,10 +582,10 @@ the tuned operator.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>########## Build without Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs Measurements(us)
--------- --- -------- ------- ----- ------ ------- ----------------
-tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 310.5 98.714 (1, 2, 10, 10, 3) 2 1 [310.5]
-tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.074 0.977 (1, 6, 10, 10) 1 1 [3.074]
-tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.97 0.308 (1, 1, 10, 10, 3) 1 1 [0.97]
-Total_time - 314.544 - - - - -
+tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 309.8 98.729 (1, 2, 10, 10, 3) 2 1 [309.8]
+tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.015 0.961 (1, 6, 10, 10) 1 1 [3.015]
+tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.972 0.31 (1, 1, 10, 10, 3) 1 1 [0.972]
+Total_time - 313.787 - - - - -
</pre></div>
</div>
</div>
@@ -636,10 +636,10 @@ Total_time -
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>########## Build with Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs Measurements(us)
--------- --- -------- ------- ----- ------ ------- ----------------
-tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 130.3 97.903 (1, 6, 10, 10, 1) 2 1 [130.3]
-tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.822 1.369 (1, 6, 10, 10) 1 1 [1.822]
-tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.969 0.728 (1, 1, 10, 10, 3) 1 1 [0.969]
-Total_time - 133.09 - - - - -
+tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 79.75 96.645 (1, 6, 10, 10, 1) 2 1 [79.75]
+tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.81 2.193 (1, 6, 10, 10) 1 1 [1.81]
+tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.958 1.162 (1, 1, 10, 10, 3) 1 1 [0.958]
+Total_time - 82.518 - - - - -
</pre></div>
</div>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-autotune-py">
diff --git a/docs/how_to/work_with_microtvm/micro_train.html b/docs/how_to/work_with_microtvm/micro_train.html
index d24d7d2d4a..849f474215 100644
--- a/docs/how_to/work_with_microtvm/micro_train.html
+++ b/docs/how_to/work_with_microtvm/micro_train.html
@@ -516,7 +516,7 @@ take about <strong>2 minutes</strong> to download the Stanford Cars, while COCO
<a href="https://docs.python.org/3/library/shutil.html#shutil.move" title="shutil.move" class="sphx-glr-backref-module-shutil sphx-glr-backref-type-py-function"><span class="n">shutil</span><span class="o">.</span><span class="n">move</span></a><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><a href="https://docs.python.org/3/library/stdtypes.html#str" title="builtins.str" class="sphx-glr-backref-module-builtins sphx-glr-backref-typ [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>'/tmp/tmpbdi64p4l/images/random'
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>'/tmp/tmpplf60smu/images/random'
</pre></div>
</div>
</div>
@@ -576,8 +576,8 @@ objects to other stuff? We can display some examples from our datasets using <co
<span class="n">plt</span><span class="o">.</span><span class="n">axis</span><span class="p">(</span><span class="s2">"off"</span><span class="p">)</span>
</pre></div>
</div>
-<img src="../../_images/sphx_glr_micro_train_001.png" srcset="../../_images/sphx_glr_micro_train_001.png" alt="[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/tmp/tmpbdi64p4l/images/target contains 8144 images
-/tmp/tmpbdi64p4l/images/random contains 5000 images
+<img src="../../_images/sphx_glr_micro_train_001.png" srcset="../../_images/sphx_glr_micro_train_001.png" alt="[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/tmp/tmpplf60smu/images/target contains 8144 images
+/tmp/tmpplf60smu/images/random contains 5000 images
</pre></div>
</div>
</div>
@@ -689,13 +689,13 @@ the time on our validation set).</p>
</pre></div>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Epoch 1/3
-328/328 - 46s - loss: 0.2061 - accuracy: 0.9270 - val_loss: 0.1543 - val_accuracy: 0.9551 - 46s/epoch - 141ms/step
+328/328 - 47s - loss: 0.2218 - accuracy: 0.9240 - val_loss: 0.1319 - val_accuracy: 0.9588 - 47s/epoch - 142ms/step
Epoch 2/3
-328/328 - 43s - loss: 0.1013 - accuracy: 0.9608 - val_loss: 0.1133 - val_accuracy: 0.9660 - 43s/epoch - 130ms/step
+328/328 - 43s - loss: 0.0911 - accuracy: 0.9662 - val_loss: 0.1058 - val_accuracy: 0.9683 - 43s/epoch - 132ms/step
Epoch 3/3
-328/328 - 43s - loss: 0.0673 - accuracy: 0.9754 - val_loss: 0.1127 - val_accuracy: 0.9671 - 43s/epoch - 130ms/step
+328/328 - 43s - loss: 0.0585 - accuracy: 0.9781 - val_loss: 0.0925 - val_accuracy: 0.9687 - 43s/epoch - 132ms/step
-<keras.callbacks.History object at 0x7f8517c8f490>
+<keras.callbacks.History object at 0x7fef75526ed0>
</pre></div>
</div>
</div>
@@ -957,7 +957,7 @@ as intended.</p>
<p>From here, we could modify the model to read live images from the camera - we have another
Arduino tutorial for how to do that <a class="reference external" href="https://github.com/guberti/tvm-arduino-demos/tree/master/examples/person_detection">on GitHub</a>. Alternatively, we could also
<a class="reference external" href="https://tvm.apache.org/docs/how_to/work_with_microtvm/micro_autotune.html">use TVM’s autotuning capabilities</a> to dramatically improve the model’s performance.</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 4 minutes 37.654 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 4 minutes 46.505 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-train-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/b52cec46baf4f78d6bcd94cbe269c8a6/micro_train.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">micro_train.py</span></code></a></p>
diff --git a/docs/how_to/work_with_microtvm/sg_execution_times.html b/docs/how_to/work_with_microtvm/sg_execution_times.html
index ade56191ea..2c4045d8f3 100644
--- a/docs/how_to/work_with_microtvm/sg_execution_times.html
+++ b/docs/how_to/work_with_microtvm/sg_execution_times.html
@@ -327,7 +327,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-work-with-microtvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>05:31.630</strong> total execution time for <strong>how_to_work_with_microtvm</strong> files:</p>
+<p><strong>05:39.964</strong> total execution time for <strong>how_to_work_with_microtvm</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 83%" />
@@ -336,19 +336,19 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="micro_train.html#sphx-glr-how-to-work-with-microtvm-micro-train-py"><span class="std std-ref">Training Vision Models for microTVM on Arduino</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_train.py</span></code>)</p></td>
-<td><p>04:37.654</p></td>
+<td><p>04:46.505</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="micro_autotune.html#sphx-glr-how-to-work-with-microtvm-micro-autotune-py"><span class="std std-ref">Autotuning with microTVM</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_autotune.py</span></code>)</p></td>
-<td><p>00:42.578</p></td>
+<td><p>00:41.931</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="micro_aot.html#sphx-glr-how-to-work-with-microtvm-micro-aot-py"><span class="std std-ref">microTVM Host-Driven AoT</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_aot.py</span></code>)</p></td>
-<td><p>00:08.040</p></td>
+<td><p>00:08.242</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="micro_tflite.html#sphx-glr-how-to-work-with-microtvm-micro-tflite-py"><span class="std std-ref">microTVM with TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_tflite.py</span></code>)</p></td>
-<td><p>00:03.357</p></td>
+<td><p>00:03.284</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="micro_ethosu.html#sphx-glr-how-to-work-with-microtvm-micro-ethosu-py"><span class="std std-ref">Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_ethosu.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_relay/sg_execution_times.html b/docs/how_to/work_with_relay/sg_execution_times.html
index c7b78e441e..9b0f91d8b6 100644
--- a/docs/how_to/work_with_relay/sg_execution_times.html
+++ b/docs/how_to/work_with_relay/sg_execution_times.html
@@ -327,7 +327,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-work-with-relay-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:43.645</strong> total execution time for <strong>how_to_work_with_relay</strong> files:</p>
+<p><strong>00:42.724</strong> total execution time for <strong>how_to_work_with_relay</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 84%" />
@@ -336,15 +336,15 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="using_pipeline_executor.html#sphx-glr-how-to-work-with-relay-using-pipeline-executor-py"><span class="std std-ref">Using Pipeline Executor in Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_pipeline_executor.py</span></code>)</p></td>
-<td><p>00:31.864</p></td>
+<td><p>00:31.046</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="using_external_lib.html#sphx-glr-how-to-work-with-relay-using-external-lib-py"><span class="std std-ref">Using External Libraries in Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_external_lib.py</span></code>)</p></td>
-<td><p>00:10.090</p></td>
+<td><p>00:10.155</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="build_gcn.html#sphx-glr-how-to-work-with-relay-build-gcn-py"><span class="std std-ref">Building a Graph Convolutional Network</span></a> (<code class="docutils literal notranslate"><span class="pre">build_gcn.py</span></code>)</p></td>
-<td><p>00:01.684</p></td>
+<td><p>00:01.517</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="using_relay_viz.html#sphx-glr-how-to-work-with-relay-using-relay-viz-py"><span class="std std-ref">Use Relay Visualizer to Visualize Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_relay_viz.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_schedules/intrin_math.html b/docs/how_to/work_with_schedules/intrin_math.html
index 5b3c311539..08d32712af 100644
--- a/docs/how_to/work_with_schedules/intrin_math.html
+++ b/docs/how_to/work_with_schedules/intrin_math.html
@@ -522,7 +522,7 @@ The following example customizes CUDA lowering rule for <code class="code docuti
<a href="../../reference/api/python/ir.html#tvm.ir.register_intrin_lowering" title="tvm.ir.register_intrin_lowering" class="sphx-glr-backref-module-tvm-ir sphx-glr-backref-type-py-function"><span class="n">register_intrin_lowering</span></a><span class="p">(</span><span class="s2">"tir.exp"</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">"cuda"</span><span class="p">,</span> <span class="n">f</span><span class="o">= [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span><function my_cuda_math_rule at 0x7f84b9106170>
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span><function my_cuda_math_rule at 0x7fef7023cdd0>
</pre></div>
</div>
<p>Register the rule to TVM with override option to override existing rule.
diff --git a/docs/how_to/work_with_schedules/sg_execution_times.html b/docs/how_to/work_with_schedules/sg_execution_times.html
index 889a5f003e..b8a3147725 100644
--- a/docs/how_to/work_with_schedules/sg_execution_times.html
+++ b/docs/how_to/work_with_schedules/sg_execution_times.html
@@ -327,7 +327,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-work-with-schedules-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:06.017</strong> total execution time for <strong>how_to_work_with_schedules</strong> files:</p>
+<p><strong>00:07.992</strong> total execution time for <strong>how_to_work_with_schedules</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 83%" />
@@ -336,23 +336,23 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="intrin_math.html#sphx-glr-how-to-work-with-schedules-intrin-math-py"><span class="std std-ref">Intrinsics and Math Functions</span></a> (<code class="docutils literal notranslate"><span class="pre">intrin_math.py</span></code>)</p></td>
-<td><p>00:03.728</p></td>
+<td><p>00:05.804</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tensorize.html#sphx-glr-how-to-work-with-schedules-tensorize-py"><span class="std std-ref">Use Tensorize to Leverage Hardware Intrinsics</span></a> (<code class="docutils literal notranslate"><span class="pre">tensorize.py</span></code>)</p></td>
-<td><p>00:01.034</p></td>
+<td><p>00:00.977</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="reduction.html#sphx-glr-how-to-work-with-schedules-reduction-py"><span class="std std-ref">Reduction</span></a> (<code class="docutils literal notranslate"><span class="pre">reduction.py</span></code>)</p></td>
-<td><p>00:00.547</p></td>
+<td><p>00:00.528</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="scan.html#sphx-glr-how-to-work-with-schedules-scan-py"><span class="std std-ref">Scan and Recurrent Kernel</span></a> (<code class="docutils literal notranslate"><span class="pre">scan.py</span></code>)</p></td>
-<td><p>00:00.527</p></td>
+<td><p>00:00.506</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="extern_op.html#sphx-glr-how-to-work-with-schedules-extern-op-py"><span class="std std-ref">External Tensor Functions</span></a> (<code class="docutils literal notranslate"><span class="pre">extern_op.py</span></code>)</p></td>
-<td><p>00:00.100</p></td>
+<td><p>00:00.097</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="schedule_primitives.html#sphx-glr-how-to-work-with-schedules-schedule-primitives-py"><span class="std std-ref">Schedule Primitives in TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">schedule_primitives.py</span></code>)</p></td>
@@ -360,7 +360,7 @@
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tedd.html#sphx-glr-how-to-work-with-schedules-tedd-py"><span class="std std-ref">Use Tensor Expression Debug Display (TEDD) for Visualization</span></a> (<code class="docutils literal notranslate"><span class="pre">tedd.py</span></code>)</p></td>
-<td><p>00:00.028</p></td>
+<td><p>00:00.027</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tuple_inputs.html#sphx-glr-how-to-work-with-schedules-tuple-inputs-py"><span class="std std-ref">Compute and Reduce with Tuple Inputs</span></a> (<code class="docutils literal notranslate"><span class="pre">tuple_inputs.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_schedules/tensorize.html b/docs/how_to/work_with_schedules/tensorize.html
index 43718db912..576368e031 100644
--- a/docs/how_to/work_with_schedules/tensorize.html
+++ b/docs/how_to/work_with_schedules/tensorize.html
@@ -577,7 +577,7 @@ The importing needs to happen before the tensorized GEMV being executed.</p>
C: Buffer(C_2: Pointer(float32), float32, [524288], [])}
buffer_map = {A_1: A, B_1: B, C_1: C}
preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [1024, 64], []), B_1: B_3: Buffer(B_2, float32, [512, 64], []), C_1: C_3: Buffer(C_2, float32, [1024, 512], [])} {
- attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmptunj6_nu/input0.cc'\nsource_filename = \"/tmp/tmptunj6_nu/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = allo [...]
+ attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpoj_ww8lo/input0.cc'\nsource_filename = \"/tmp/tmpoj_ww8lo/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = allo [...]
for (i, 0, 1024) {
for (j.outer: int32, 0, 32) {
@tir.call_extern("gemv_update", @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), C_2, ((i*512) + (j.outer*16)), 16, 2, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), A_2, (i*64), 64, 1, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), B_2, (j.outer*1024), 1024, 1, dtype=handle), 16, 64, 64, dtype=int32)
diff --git a/docs/reference/api/doxygen/block__scope_8h.html b/docs/reference/api/doxygen/block__scope_8h.html
index 85d986e07f..163ed3cdd5 100644
--- a/docs/reference/api/doxygen/block__scope_8h.html
+++ b/docs/reference/api/doxygen/block__scope_8h.html
@@ -84,7 +84,7 @@ Include dependency graph for block_scope.h:</div>
</div><div class="textblock"><div class="dynheader">
This graph shows which files directly or indirectly include this file:</div>
<div class="dyncontent">
-<div class="center"><iframe scrolling="no" frameborder="0" src="block__scope_8h__dep__incl.svg" width="1222" height="767"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
+<div class="center"><iframe scrolling="no" frameborder="0" src="block__scope_8h__dep__incl.svg" width="1374" height="767"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
</div>
</div>
</div>
diff --git a/docs/reference/api/doxygen/block__scope_8h__dep__incl.svg b/docs/reference/api/doxygen/block__scope_8h__dep__incl.svg
index 355f1b284a..52387078ec 100644
--- a/docs/reference/api/doxygen/block__scope_8h__dep__incl.svg
+++ b/docs/reference/api/doxygen/block__scope_8h__dep__incl.svg
@@ -4,291 +4,319 @@
<!-- Generated by graphviz version 2.40.1 (20161225.0304)
-->
<!-- Title: include/tvm/tir/schedule/block_scope.h Pages: 1 -->
-<svg width="916pt" height="575pt"
- viewBox="0.00 0.00 916.00 575.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<svg width="1030pt" height="575pt"
+ viewBox="0.00 0.00 1030.00 575.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 571)">
<title>include/tvm/tir/schedule/block_scope.h</title>
-<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-571 912,-571 912,4 -4,4"/>
+<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-571 1026,-571 1026,4 -4,4"/>
<!-- Node54 -->
<g id="node1" class="node">
<title>Node54</title>
-<polygon fill="#bfbfbf" stroke="#000000" points="463,-536.5 463,-566.5 597,-566.5 597,-536.5 463,-536.5"/>
-<text text-anchor="start" x="471" y="-554.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/schedule</text>
-<text text-anchor="middle" x="530" y="-543.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/block_scope.h</text>
+<polygon fill="#bfbfbf" stroke="#000000" points="548,-536.5 548,-566.5 682,-566.5 682,-536.5 548,-536.5"/>
+<text text-anchor="start" x="556" y="-554.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/schedule</text>
+<text text-anchor="middle" x="615" y="-543.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/block_scope.h</text>
</g>
<!-- Node55 -->
<g id="node2" class="node">
<title>Node55</title>
<g id="a_node2"><a xlink:href="state_8h.html" target="_top" xlink:title="This file defines ScheduleState, the core data structure of TensorIR scheduling. ">
-<polygon fill="#ffffff" stroke="#000000" points="463,-469.5 463,-499.5 597,-499.5 597,-469.5 463,-469.5"/>
-<text text-anchor="start" x="471" y="-487.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/schedule</text>
-<text text-anchor="middle" x="530" y="-476.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/state.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="548,-469.5 548,-499.5 682,-499.5 682,-469.5 548,-469.5"/>
+<text text-anchor="start" x="556" y="-487.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/schedule</text>
+<text text-anchor="middle" x="615" y="-476.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/state.h</text>
</a>
</g>
</g>
<!-- Node54->Node55 -->
<g id="edge1" class="edge">
<title>Node54->Node55</title>
-<path fill="none" stroke="#191970" d="M530,-526.0249C530,-517.128 530,-507.4287 530,-499.6432"/>
-<polygon fill="#191970" stroke="#191970" points="526.5001,-526.2966 530,-536.2967 533.5001,-526.2967 526.5001,-526.2966"/>
+<path fill="none" stroke="#191970" d="M615,-526.0249C615,-517.128 615,-507.4287 615,-499.6432"/>
+<polygon fill="#191970" stroke="#191970" points="611.5001,-526.2966 615,-536.2967 618.5001,-526.2967 611.5001,-526.2966"/>
</g>
<!-- Node56 -->
<g id="node3" class="node">
<title>Node56</title>
<g id="a_node3"><a xlink:href="tir_2schedule_2schedule_8h.html" target="_top" xlink:title="include/tvm/tir/schedule\l/schedule.h">
-<polygon fill="#ffffff" stroke="#000000" points="463,-402.5 463,-432.5 597,-432.5 597,-402.5 463,-402.5"/>
-<text text-anchor="start" x="471" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/schedule</text>
-<text text-anchor="middle" x="530" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/schedule.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="548,-402.5 548,-432.5 682,-432.5 682,-402.5 548,-402.5"/>
+<text text-anchor="start" x="556" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/schedule</text>
+<text text-anchor="middle" x="615" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/schedule.h</text>
</a>
</g>
</g>
<!-- Node55->Node56 -->
<g id="edge2" class="edge">
<title>Node55->Node56</title>
-<path fill="none" stroke="#191970" d="M530,-459.0249C530,-450.128 530,-440.4287 530,-432.6432"/>
-<polygon fill="#191970" stroke="#191970" points="526.5001,-459.2966 530,-469.2967 533.5001,-459.2967 526.5001,-459.2966"/>
+<path fill="none" stroke="#191970" d="M615,-459.0249C615,-450.128 615,-440.4287 615,-432.6432"/>
+<polygon fill="#191970" stroke="#191970" points="611.5001,-459.2966 615,-469.2967 618.5001,-459.2967 611.5001,-459.2966"/>
</g>
<!-- Node57 -->
<g id="node4" class="node">
<title>Node57</title>
<g id="a_node4"><a xlink:href="meta__schedule_2cost__model_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/cost_model.h">
-<polygon fill="#ffffff" stroke="#000000" points="0,-268.5 0,-298.5 152,-298.5 152,-268.5 0,-268.5"/>
-<text text-anchor="start" x="8" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="76" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/cost_model.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="170,-268.5 170,-298.5 322,-298.5 322,-268.5 170,-268.5"/>
+<text text-anchor="start" x="178" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="246" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/cost_model.h</text>
</a>
</g>
</g>
<!-- Node56->Node57 -->
<g id="edge3" class="edge">
<title>Node56->Node57</title>
-<path fill="none" stroke="#191970" d="M452.8138,-409.4076C369.8983,-400.0236 243.9781,-383.5791 199,-366 157.5689,-349.8072 115.9628,-317.8239 93.3391,-298.7868"/>
-<polygon fill="#191970" stroke="#191970" points="452.6277,-412.9086 462.9554,-410.5445 453.4076,-405.9521 452.6277,-412.9086"/>
+<path fill="none" stroke="#191970" d="M537.8945,-409.0998C488.2308,-401.8341 423.0813,-388.7754 369,-366 328.241,-348.835 286.7277,-317.4937 263.873,-298.774"/>
+<polygon fill="#191970" stroke="#191970" points="537.538,-412.5841 547.9299,-410.5202 538.5191,-405.6532 537.538,-412.5841"/>
</g>
<!-- Node58 -->
<g id="node5" class="node">
<title>Node58</title>
<g id="a_node5"><a xlink:href="search__strategy_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/search_strategy.h">
-<polygon fill="#ffffff" stroke="#000000" points="331,-201.5 331,-231.5 483,-231.5 483,-201.5 331,-201.5"/>
-<text text-anchor="start" x="339" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="407" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/search_strategy.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="378,-201.5 378,-231.5 530,-231.5 530,-201.5 378,-201.5"/>
+<text text-anchor="start" x="386" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="454" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/search_strategy.h</text>
</a>
</g>
</g>
<!-- Node56->Node58 -->
-<g id="edge22" class="edge">
+<g id="edge25" class="edge">
<title>Node56->Node58</title>
-<path fill="none" stroke="#191970" d="M511.5019,-394.5251C493.2789,-371.2784 465.2981,-333.8876 445,-299 431.8358,-276.374 419.766,-248.4022 412.9065,-231.531"/>
-<polygon fill="#191970" stroke="#191970" points="508.7947,-396.7444 517.7419,-402.4189 514.2862,-392.4034 508.7947,-396.7444"/>
+<path fill="none" stroke="#191970" d="M610.6154,-392.5318C603.804,-360.2688 587.8853,-303.9822 555,-268 540.1281,-251.7276 518.9371,-239.8361 499.8028,-231.6069"/>
+<polygon fill="#191970" stroke="#191970" points="607.2111,-393.3588 612.5876,-402.4881 614.0777,-391.9986 607.2111,-393.3588"/>
</g>
<!-- Node62 -->
<g id="node9" class="node">
<title>Node62</title>
-<g id="a_node9"><a xlink:href="measure__candidate_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/measure_candidate.h">
-<polygon fill="#ffffff" stroke="#000000" points="208,-335.5 208,-365.5 360,-365.5 360,-335.5 208,-335.5"/>
-<text text-anchor="start" x="216" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="284" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/measure_candidate.h</text>
+<g id="a_node9"><a xlink:href="database_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/database.h">
+<polygon fill="#ffffff" stroke="#000000" points="0,-268.5 0,-298.5 152,-298.5 152,-268.5 0,-268.5"/>
+<text text-anchor="start" x="8" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="76" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/database.h</text>
</a>
</g>
</g>
<!-- Node56->Node62 -->
<g id="edge11" class="edge">
<title>Node56->Node62</title>
-<path fill="none" stroke="#191970" d="M465.0845,-399.8198C425.984,-389.1704 376.7183,-375.7525 339.448,-365.6017"/>
-<polygon fill="#191970" stroke="#191970" points="464.2245,-403.213 474.7928,-402.4639 466.064,-396.459 464.2245,-403.213"/>
+<path fill="none" stroke="#191970" d="M537.7756,-408.0208C476.0913,-399.578 387.7412,-385.5278 312,-366 240.1302,-347.4703 158.9749,-316.8851 113.0061,-298.6174"/>
+<polygon fill="#191970" stroke="#191970" points="537.4441,-411.5077 547.8228,-409.3787 538.3817,-404.5708 537.4441,-411.5077"/>
</g>
-<!-- Node64 -->
-<g id="node11" class="node">
-<title>Node64</title>
-<g id="a_node11"><a xlink:href="mutator_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/mutator.h">
-<polygon fill="#ffffff" stroke="#000000" points="756,-201.5 756,-231.5 908,-231.5 908,-201.5 756,-201.5"/>
-<text text-anchor="start" x="764" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="832" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/mutator.h</text>
+<!-- Node63 -->
+<g id="node10" class="node">
+<title>Node63</title>
+<g id="a_node10"><a xlink:href="measure__candidate_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/measure_candidate.h">
+<polygon fill="#ffffff" stroke="#000000" points="378,-335.5 378,-365.5 530,-365.5 530,-335.5 378,-335.5"/>
+<text text-anchor="start" x="386" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="454" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/measure_candidate.h</text>
</a>
</g>
</g>
-<!-- Node56->Node64 -->
-<g id="edge16" class="edge">
-<title>Node56->Node64</title>
-<path fill="none" stroke="#191970" d="M607.3217,-413.4851C681.2533,-407.9373 785.4396,-395.0786 813,-366 848.0963,-328.9704 840.7497,-261.6109 835.2559,-231.6123"/>
-<polygon fill="#191970" stroke="#191970" points="606.8246,-410.0118 597.1015,-414.2204 607.3269,-416.9937 606.8246,-410.0118"/>
+<!-- Node56->Node63 -->
+<g id="edge14" class="edge">
+<title>Node56->Node63</title>
+<path fill="none" stroke="#191970" d="M569.5587,-398.5897C544.4767,-388.1518 513.7512,-375.3654 490.2891,-365.6017"/>
+<polygon fill="#191970" stroke="#191970" points="568.2912,-401.8531 578.8685,-402.4639 570.9807,-395.3904 568.2912,-401.8531"/>
</g>
<!-- Node65 -->
<g id="node12" class="node">
<title>Node65</title>
-<g id="a_node12"><a xlink:href="postproc_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/postproc.h">
-<polygon fill="#ffffff" stroke="#000000" points="454,-268.5 454,-298.5 606,-298.5 606,-268.5 454,-268.5"/>
-<text text-anchor="start" x="462" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="530" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/postproc.h</text>
+<g id="a_node12"><a xlink:href="mutator_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/mutator.h">
+<polygon fill="#ffffff" stroke="#000000" points="548,-201.5 548,-231.5 700,-231.5 700,-201.5 548,-201.5"/>
+<text text-anchor="start" x="556" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="624" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/mutator.h</text>
</a>
</g>
</g>
<!-- Node56->Node65 -->
-<g id="edge18" class="edge">
+<g id="edge19" class="edge">
<title>Node56->Node65</title>
-<path fill="none" stroke="#191970" d="M530,-392.3415C530,-364.8131 530,-321.5714 530,-298.7614"/>
-<polygon fill="#191970" stroke="#191970" points="526.5001,-392.3889 530,-402.389 533.5001,-392.389 526.5001,-392.3889"/>
+<path fill="none" stroke="#191970" d="M616.1262,-392.348C618.0221,-350.0061 621.7942,-265.7637 623.328,-231.5088"/>
+<polygon fill="#191970" stroke="#191970" points="612.6274,-392.2457 615.6765,-402.3923 619.6204,-392.5589 612.6274,-392.2457"/>
</g>
<!-- Node66 -->
<g id="node13" class="node">
<title>Node66</title>
-<g id="a_node13"><a xlink:href="schedule__rule_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/schedule_rule.h">
-<polygon fill="#ffffff" stroke="#000000" points="548,-201.5 548,-231.5 700,-231.5 700,-201.5 548,-201.5"/>
-<text text-anchor="start" x="556" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="624" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/schedule_rule.h</text>
+<g id="a_node13"><a xlink:href="postproc_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/postproc.h">
+<polygon fill="#ffffff" stroke="#000000" points="718,-201.5 718,-231.5 870,-231.5 870,-201.5 718,-201.5"/>
+<text text-anchor="start" x="726" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="794" y="-208.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/postproc.h</text>
</a>
</g>
</g>
<!-- Node56->Node66 -->
-<g id="edge20" class="edge">
+<g id="edge21" class="edge">
<title>Node56->Node66</title>
-<path fill="none" stroke="#191970" d="M552.0266,-394.5902C572.1349,-372.1679 600.8173,-336.0655 615,-299 623.5473,-276.6623 624.7941,-248.605 624.6143,-231.638"/>
-<polygon fill="#191970" stroke="#191970" points="549.2222,-392.4701 545.049,-402.2085 554.3842,-397.1981 549.2222,-392.4701"/>
+<path fill="none" stroke="#191970" d="M635.2789,-394.7287C672.2232,-353.2438 749.5454,-266.4183 780.634,-231.5088"/>
+<polygon fill="#191970" stroke="#191970" points="632.4909,-392.5967 628.4541,-402.3923 637.7185,-397.2521 632.4909,-392.5967"/>
</g>
<!-- Node67 -->
<g id="node14" class="node">
<title>Node67</title>
-<g id="a_node14"><a xlink:href="space__generator_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/space_generator.h">
-<polygon fill="#ffffff" stroke="#000000" points="652,-335.5 652,-365.5 804,-365.5 804,-335.5 652,-335.5"/>
-<text text-anchor="start" x="660" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="728" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/space_generator.h</text>
+<g id="a_node14"><a xlink:href="schedule__rule_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/schedule_rule.h">
+<polygon fill="#ffffff" stroke="#000000" points="700,-335.5 700,-365.5 852,-365.5 852,-335.5 700,-335.5"/>
+<text text-anchor="start" x="708" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="776" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/schedule_rule.h</text>
</a>
</g>
</g>
<!-- Node56->Node67 -->
<g id="edge23" class="edge">
<title>Node56->Node67</title>
-<path fill="none" stroke="#191970" d="M584.2602,-399.1392C615.3961,-388.6033 654.0087,-375.5375 683.3711,-365.6017"/>
-<polygon fill="#191970" stroke="#191970" points="582.7856,-395.9432 574.4351,-402.4639 585.0293,-402.5739 582.7856,-395.9432"/>
+<path fill="none" stroke="#191970" d="M660.4413,-398.5897C685.5233,-388.1518 716.2488,-375.3654 739.7109,-365.6017"/>
+<polygon fill="#191970" stroke="#191970" points="659.0193,-395.3904 651.1315,-402.4639 661.7088,-401.8531 659.0193,-395.3904"/>
+</g>
+<!-- Node68 -->
+<g id="node15" class="node">
+<title>Node68</title>
+<g id="a_node15"><a xlink:href="space__generator_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/space_generator.h">
+<polygon fill="#ffffff" stroke="#000000" points="870,-335.5 870,-365.5 1022,-365.5 1022,-335.5 870,-335.5"/>
+<text text-anchor="start" x="878" y="-353.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="946" y="-342.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/space_generator.h</text>
+</a>
+</g>
+</g>
+<!-- Node56->Node68 -->
+<g id="edge26" class="edge">
+<title>Node56->Node68</title>
+<path fill="none" stroke="#191970" d="M692.275,-401.8582C746.4137,-390.8996 818.3508,-376.3383 871.7985,-365.5196"/>
+<polygon fill="#191970" stroke="#191970" points="691.4886,-398.4464 682.3818,-403.8608 692.8774,-405.3072 691.4886,-398.4464"/>
</g>
<!-- Node57->Node58 -->
<g id="edge4" class="edge">
<title>Node57->Node58</title>
-<path fill="none" stroke="#191970" d="M160.3693,-266.4222C213.5372,-255.6601 281.548,-241.8936 332.6793,-231.5438"/>
-<polygon fill="#191970" stroke="#191970" points="159.3897,-263.0495 150.2829,-268.4639 160.7785,-269.9103 159.3897,-263.0495"/>
+<path fill="none" stroke="#191970" d="M302.5763,-265.2759C335.3577,-254.7165 376.1382,-241.5805 407.1171,-231.6017"/>
+<polygon fill="#191970" stroke="#191970" points="301.1245,-262.0664 292.6793,-268.4639 303.2708,-268.7292 301.1245,-262.0664"/>
</g>
<!-- Node60 -->
<g id="node7" class="node">
<title>Node60</title>
<g id="a_node7"><a xlink:href="task__scheduler_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/task_scheduler.h">
-<polygon fill="#ffffff" stroke="#000000" points="331,-.5 331,-30.5 483,-30.5 483,-.5 331,-.5"/>
-<text text-anchor="start" x="339" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="407" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/task_scheduler.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="255,-.5 255,-30.5 407,-30.5 407,-.5 255,-.5"/>
+<text text-anchor="start" x="263" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="331" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/task_scheduler.h</text>
</a>
</g>
</g>
<!-- Node57->Node60 -->
<g id="edge10" class="edge">
<title>Node57->Node60</title>
-<path fill="none" stroke="#191970" d="M98.5128,-261.2196C140.4758,-220.2673 234.3538,-131.337 322,-67 340.36,-53.5227 362.3852,-40.2517 379.4144,-30.5573"/>
-<polygon fill="#191970" stroke="#191970" points="95.9885,-258.7929 91.2947,-268.2912 100.8873,-263.7931 95.9885,-258.7929"/>
+<path fill="none" stroke="#191970" d="M253.8541,-258.7364C271.1809,-204.106 312.2521,-74.6111 326.2393,-30.5103"/>
+<polygon fill="#191970" stroke="#191970" points="250.5006,-257.7331 250.8135,-268.3233 257.173,-259.8494 250.5006,-257.7331"/>
</g>
<!-- Node59 -->
<g id="node6" class="node">
<title>Node59</title>
<g id="a_node6"><a xlink:href="measure__callback_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/measure_callback.h">
-<polygon fill="#ffffff" stroke="#000000" points="331,-67.5 331,-97.5 483,-97.5 483,-67.5 331,-67.5"/>
-<text text-anchor="start" x="339" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="407" y="-74.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/measure_callback.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="326,-67.5 326,-97.5 478,-97.5 478,-67.5 326,-67.5"/>
+<text text-anchor="start" x="334" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="402" y="-74.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/measure_callback.h</text>
</a>
</g>
</g>
<!-- Node58->Node59 -->
<g id="edge5" class="edge">
<title>Node58->Node59</title>
-<path fill="none" stroke="#191970" d="M407,-191.3415C407,-163.8131 407,-120.5714 407,-97.7614"/>
-<polygon fill="#191970" stroke="#191970" points="403.5001,-191.3889 407,-201.389 410.5001,-191.389 403.5001,-191.3889"/>
+<path fill="none" stroke="#191970" d="M444.3785,-191.706C433.703,-164.1962 416.8131,-120.6723 407.9223,-97.7614"/>
+<polygon fill="#191970" stroke="#191970" points="441.2553,-193.3325 448.136,-201.389 447.7811,-190.8001 441.2553,-193.3325"/>
</g>
<!-- Node61 -->
<g id="node8" class="node">
<title>Node61</title>
<g id="a_node8"><a xlink:href="tune__context_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/tune_context.h">
-<polygon fill="#ffffff" stroke="#000000" points="444,-134.5 444,-164.5 596,-164.5 596,-134.5 444,-134.5"/>
-<text text-anchor="start" x="452" y="-152.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="520" y="-141.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/tune_context.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="548,-134.5 548,-164.5 700,-164.5 700,-134.5 548,-134.5"/>
+<text text-anchor="start" x="556" y="-152.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="624" y="-141.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/tune_context.h</text>
</a>
</g>
</g>
<!-- Node58->Node61 -->
<g id="edge7" class="edge">
<title>Node58->Node61</title>
-<path fill="none" stroke="#191970" d="M440.974,-196.3561C458.1038,-186.1995 478.537,-174.0843 494.3343,-164.7177"/>
-<polygon fill="#191970" stroke="#191970" points="439.176,-193.3531 432.3594,-201.4639 442.7462,-199.3743 439.176,-193.3531"/>
+<path fill="none" stroke="#191970" d="M501.6318,-197.7275C528.1796,-187.2645 560.7995,-174.4084 585.6823,-164.6017"/>
+<polygon fill="#191970" stroke="#191970" points="500.1715,-194.5409 492.1513,-201.4639 502.7382,-201.0534 500.1715,-194.5409"/>
</g>
<!-- Node59->Node60 -->
<g id="edge6" class="edge">
<title>Node59->Node60</title>
-<path fill="none" stroke="#191970" d="M407,-57.0249C407,-48.128 407,-38.4287 407,-30.6432"/>
-<polygon fill="#191970" stroke="#191970" points="403.5001,-57.2966 407,-67.2967 410.5001,-57.2967 403.5001,-57.2966"/>
+<path fill="none" stroke="#191970" d="M378.3806,-60.2113C368.0905,-50.5009 356.2949,-39.3698 347.0472,-30.6432"/>
+<polygon fill="#191970" stroke="#191970" points="376.2139,-62.979 385.889,-67.2967 381.0181,-57.8879 376.2139,-62.979"/>
</g>
<!-- Node61->Node59 -->
<g id="edge8" class="edge">
<title>Node61->Node59</title>
-<path fill="none" stroke="#191970" d="M486.026,-129.3561C468.8962,-119.1995 448.463,-107.0843 432.6657,-97.7177"/>
-<polygon fill="#191970" stroke="#191970" points="484.2538,-132.3743 494.6406,-134.4639 487.824,-126.3531 484.2538,-132.3743"/>
+<path fill="none" stroke="#191970" d="M564.5186,-131.5484C529.3791,-120.9432 485.3875,-107.6665 452.0384,-97.6017"/>
+<polygon fill="#191970" stroke="#191970" points="563.5941,-134.9252 574.1789,-134.4639 565.6166,-128.2238 563.5941,-134.9252"/>
</g>
<!-- Node61->Node60 -->
<g id="edge9" class="edge">
<title>Node61->Node60</title>
-<path fill="none" stroke="#191970" d="M516.1266,-124.1859C512.3856,-106.6204 505.255,-83.5048 492,-67 479.3593,-51.2601 460.5184,-39.1502 443.785,-30.6401"/>
-<polygon fill="#191970" stroke="#191970" points="512.7418,-125.1233 518.0423,-134.297 519.6195,-123.8202 512.7418,-125.1233"/>
+<path fill="none" stroke="#191970" d="M593.92,-128.6955C566.8562,-110.6418 525.5374,-84.6949 487,-67 454.7304,-52.1831 416.9517,-39.5974 386.6546,-30.5772"/>
+<polygon fill="#191970" stroke="#191970" points="592.2776,-131.8091 602.5271,-134.4921 596.1879,-126.003 592.2776,-131.8091"/>
</g>
-<!-- Node62->Node57 -->
+<!-- Node62->Node58 -->
<g id="edge12" class="edge">
-<title>Node62->Node57</title>
-<path fill="none" stroke="#191970" d="M227.4237,-332.2759C194.6423,-321.7165 153.8618,-308.5805 122.8829,-298.6017"/>
-<polygon fill="#191970" stroke="#191970" points="226.7292,-335.7292 237.3207,-335.4639 228.8755,-329.0664 226.7292,-335.7292"/>
+<title>Node62->Node58</title>
+<path fill="none" stroke="#191970" d="M162.5661,-267.725C241.4423,-253.6213 321.0612,-239.6589 377.7848,-229.759"/>
+<polygon fill="#191970" stroke="#191970" points="161.6785,-264.3281 152.4515,-269.5351 162.9117,-271.2187 161.6785,-264.3281"/>
</g>
-<!-- Node62->Node58 -->
+<!-- Node62->Node60 -->
+<g id="edge13" class="edge">
+<title>Node62->Node60</title>
+<path fill="none" stroke="#191970" d="M97.3387,-261.0735C147.9393,-207.8932 274.1456,-75.2529 316.7178,-30.5103"/>
+<polygon fill="#191970" stroke="#191970" points="94.7981,-258.666 90.4405,-268.3233 99.8694,-263.4913 94.7981,-258.666"/>
+</g>
+<!-- Node63->Node57 -->
<g id="edge15" class="edge">
-<title>Node62->Node58</title>
-<path fill="none" stroke="#191970" d="M327.6951,-330.619C342.2181,-322.4108 357.5803,-311.7902 369,-299 386.7493,-279.1207 397.6246,-249.5342 402.9665,-231.7207"/>
-<polygon fill="#191970" stroke="#191970" points="325.9427,-327.5875 318.8036,-335.4158 329.2664,-333.7481 325.9427,-327.5875"/>
+<title>Node63->Node57</title>
+<path fill="none" stroke="#191970" d="M397.4237,-332.2759C364.6423,-321.7165 323.8618,-308.5805 292.8829,-298.6017"/>
+<polygon fill="#191970" stroke="#191970" points="396.7292,-335.7292 407.3207,-335.4639 398.8755,-329.0664 396.7292,-335.7292"/>
</g>
-<!-- Node62->Node59 -->
-<g id="edge14" class="edge">
-<title>Node62->Node59</title>
-<path fill="none" stroke="#191970" d="M232.8489,-330.7472C219.4568,-322.9655 206.6417,-312.5649 199,-299 192.2376,-286.996 193.3662,-280.5733 199,-268 235.9985,-185.4292 329.9301,-124.5092 377.8773,-97.6681"/>
-<polygon fill="#191970" stroke="#191970" points="231.2179,-333.8441 241.6841,-335.4897 234.5285,-327.6764 231.2179,-333.8441"/>
+<!-- Node63->Node58 -->
+<g id="edge18" class="edge">
+<title>Node63->Node58</title>
+<path fill="none" stroke="#191970" d="M505.1511,-330.7472C518.5432,-322.9655 531.3583,-312.5649 539,-299 545.7624,-286.996 545.7624,-280.004 539,-268 529.6844,-251.4638 512.6807,-239.6299 496.3159,-231.5103"/>
+<polygon fill="#191970" stroke="#191970" points="503.4715,-327.6764 496.3159,-335.4897 506.7821,-333.8441 503.4715,-327.6764"/>
</g>
-<!-- Node63 -->
-<g id="node10" class="node">
-<title>Node63</title>
-<g id="a_node10"><a xlink:href="feature__extractor_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/feature_extractor.h">
-<polygon fill="#ffffff" stroke="#000000" points="208,-268.5 208,-298.5 360,-298.5 360,-268.5 208,-268.5"/>
-<text text-anchor="start" x="216" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="284" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/feature_extractor.h</text>
-</a>
+<!-- Node63->Node59 -->
+<g id="edge17" class="edge">
+<title>Node63->Node59</title>
+<path fill="none" stroke="#191970" d="M402.8489,-330.7472C389.4568,-322.9655 376.6417,-312.5649 369,-299 331.0239,-231.5883 374.2766,-134.5441 393.6141,-97.576"/>
+<polygon fill="#191970" stroke="#191970" points="401.2179,-333.8441 411.6841,-335.4897 404.5285,-327.6764 401.2179,-333.8441"/>
</g>
+<!-- Node64 -->
+<g id="node11" class="node">
+<title>Node64</title>
+<g id="a_node11"><a xlink:href="feature__extractor_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/feature_extractor.h">
+<polygon fill="#ffffff" stroke="#000000" points="378,-268.5 378,-298.5 530,-298.5 530,-268.5 378,-268.5"/>
+<text text-anchor="start" x="386" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="454" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/feature_extractor.h</text>
+</a>
</g>
-<!-- Node62->Node63 -->
-<g id="edge13" class="edge">
-<title>Node62->Node63</title>
-<path fill="none" stroke="#191970" d="M284,-325.0249C284,-316.128 284,-306.4287 284,-298.6432"/>
-<polygon fill="#191970" stroke="#191970" points="280.5001,-325.2966 284,-335.2967 287.5001,-325.2967 280.5001,-325.2966"/>
</g>
-<!-- Node64->Node61 -->
-<g id="edge17" class="edge">
-<title>Node64->Node61</title>
-<path fill="none" stroke="#191970" d="M752.1609,-199.3551C702.0912,-188.6029 638.1506,-174.8721 590.0546,-164.5438"/>
-<polygon fill="#191970" stroke="#191970" points="751.4691,-202.7862 761.9811,-201.4639 752.9389,-195.9423 751.4691,-202.7862"/>
+<!-- Node63->Node64 -->
+<g id="edge16" class="edge">
+<title>Node63->Node64</title>
+<path fill="none" stroke="#191970" d="M454,-325.0249C454,-316.128 454,-306.4287 454,-298.6432"/>
+<polygon fill="#191970" stroke="#191970" points="450.5001,-325.2966 454,-335.2967 457.5001,-325.2967 450.5001,-325.2966"/>
</g>
<!-- Node65->Node61 -->
-<g id="edge19" class="edge">
+<g id="edge20" class="edge">
<title>Node65->Node61</title>
-<path fill="none" stroke="#191970" d="M528.1225,-258.3415C526.0681,-230.8131 522.8411,-187.5714 521.1389,-164.7614"/>
-<polygon fill="#191970" stroke="#191970" points="524.6377,-258.6772 528.8723,-268.389 531.6183,-258.1562 524.6377,-258.6772"/>
+<path fill="none" stroke="#191970" d="M624,-191.0249C624,-182.128 624,-172.4287 624,-164.6432"/>
+<polygon fill="#191970" stroke="#191970" points="620.5001,-191.2966 624,-201.2967 627.5001,-191.2967 620.5001,-191.2966"/>
</g>
<!-- Node66->Node61 -->
-<g id="edge21" class="edge">
+<g id="edge22" class="edge">
<title>Node66->Node61</title>
-<path fill="none" stroke="#191970" d="M591.861,-195.7951C576.2777,-185.7558 557.8939,-173.9124 543.6216,-164.7177"/>
-<polygon fill="#191970" stroke="#191970" points="590.3583,-198.9904 600.6604,-201.4639 594.1494,-193.1058 590.3583,-198.9904"/>
+<path fill="none" stroke="#191970" d="M746.3682,-197.7275C719.8204,-187.2645 687.2005,-174.4084 662.3177,-164.6017"/>
+<polygon fill="#191970" stroke="#191970" points="745.2618,-201.0534 755.8487,-201.4639 747.8285,-194.5409 745.2618,-201.0534"/>
</g>
<!-- Node67->Node61 -->
<g id="edge24" class="edge">
<title>Node67->Node61</title>
-<path fill="none" stroke="#191970" d="M732.5716,-325.0533C736.9044,-292.2693 738.918,-235.4164 709,-201 693.8837,-183.6108 641.8201,-170.2017 596.1562,-161.4311"/>
-<polygon fill="#191970" stroke="#191970" points="729.0746,-324.7735 731.0561,-335.1814 735.9975,-325.8095 729.0746,-324.7735"/>
+<path fill="none" stroke="#191970" d="M808.2654,-329.7575C850.3587,-300.1651 914.6654,-244.5885 879,-201 856.6912,-173.7353 766.8801,-160.5756 700.1192,-154.4548"/>
+<polygon fill="#191970" stroke="#191970" points="806.1811,-326.943 799.9218,-335.4913 810.1457,-332.7121 806.1811,-326.943"/>
+</g>
+<!-- Node68->Node61 -->
+<g id="edge27" class="edge">
+<title>Node68->Node61</title>
+<path fill="none" stroke="#191970" d="M945.3759,-324.6626C943.0878,-290.9641 933.8115,-232.5969 898,-201 869.1911,-175.5816 770.7465,-161.8839 700.1092,-155.145"/>
+<polygon fill="#191970" stroke="#191970" points="941.9022,-325.2641 945.9122,-335.0707 948.8929,-324.9038 941.9022,-325.2641"/>
</g>
</g>
</svg>
diff --git a/docs/reference/api/doxygen/classes.html b/docs/reference/api/doxygen/classes.html
index 98b50bdc15..2e2a188672 100644
--- a/docs/reference/api/doxygen/classes.html
+++ b/docs/reference/api/doxygen/classes.html
@@ -65,8 +65,8 @@ $(function() {
<div class="qindex"><a class="qindex" href="#letter_a">a</a> | <a class="qindex" href="#letter_b">b</a> | <a class="qindex" href="#letter_c">c</a> | <a class="qindex" href="#letter_d">d</a> | <a class="qindex" href="#letter_e">e</a> | <a class="qindex" href="#letter_f">f</a> | <a class="qindex" href="#letter_g">g</a> | <a class="qindex" href="#letter_h">h</a> | <a class="qindex" href="#letter_i">i</a> |& [...]
<table class="classindex">
<tr><td rowspan="2" valign="bottom"><a name="letter_a"></a><table border="0" cellspacing="0" cellpadding="0"><tr><td><div class="ah">  a  </div></td></tr></table>
-</td><td valign="top"><a class="el" href="structtvm_1_1relay_1_1Conv3DWinogradAttrs.html">Conv3DWinogradAttrs</a> (<a class="el" href="namespacetvm_1_1relay.html">tvm::relay</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1printer_1_1IRDocsifier.html">IRDocsifier</a> (<a class="el" href="namespacetvm_1_1script_1_1printer.html">tvm::script::printer</a>)   </td><td valign="top"><a class="el" href="structtvm_1_1tir_1_1usmp_1_1PoolAllocatio [...]
-<tr><td valign="top"><a class="el" href="structtvm_1_1relay_1_1ConvGemmWeightTransformAttrs.html">ConvGemmWeightTransformAttrs</a> (<a class="el" href="namespacetvm_1_1relay.html">tvm::relay</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1printer_1_1IRDocsifierNode.html">IRDocsifierNode</a> (<a class="el" href="namespacetvm_1_1script_1_1printer.html">tvm::script::printer</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1Pool [...]
+</td><td valign="top"><a class="el" href="structtvm_1_1relay_1_1Conv3DWinogradAttrs.html">Conv3DWinogradAttrs</a> (<a class="el" href="namespacetvm_1_1relay.html">tvm::relay</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1printer_1_1IRDocsifier.html">IRDocsifier</a> (<a class="el" href="namespacetvm_1_1script_1_1printer.html">tvm::script::printer</a>)   </td><td valign="top"><a class="el" href="structtvm_1_1tir_1_1usmp_1_1PoolAllocatio [...]
+<tr><td valign="top"><a class="el" href="structtvm_1_1relay_1_1ConvGemmWeightTransformAttrs.html">ConvGemmWeightTransformAttrs</a> (<a class="el" href="namespacetvm_1_1relay.html">tvm::relay</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1printer_1_1IRDocsifierNode.html">IRDocsifierNode</a> (<a class="el" href="namespacetvm_1_1script_1_1printer.html">tvm::script::printer</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1Pool [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1auto__scheduler_1_1AccessAnalyzer.html">AccessAnalyzer</a> (<a class="el" href="namespacetvm_1_1auto__scheduler.html">tvm::auto_scheduler</a>)   </td><td valign="top"><a class="el" href="structtvm_1_1relay_1_1ConvWinogradWeightTransformAttrs.html">ConvWinogradWeightTransformAttrs</a> (<a class="el" href="namespacetvm_1_1relay.html">tvm::relay</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1IRM [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1auto__scheduler_1_1AccessAnalyzerNode.html">AccessAnalyzerNode</a> (<a class="el" href="namespacetvm_1_1auto__scheduler.html">tvm::auto_scheduler</a>)   </td><td valign="top"><a class="el" href="structtvm_1_1relay_1_1CorrelationAttrs.html">CorrelationAttrs</a> (<a class="el" href="namespacetvm_1_1relay.html">tvm::relay</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1ir__builder_1_1ir [...]
<tr><td valign="top"><a class="el" href="structtvm_1_1relay_1_1AdaptivePool1DAttrs.html">AdaptivePool1DAttrs</a> (<a class="el" href="namespacetvm_1_1relay.html">tvm::relay</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1meta__schedule_1_1CostModel.html">CostModel</a> (<a class="el" href="namespacetvm_1_1meta__schedule.html">tvm::meta_schedule</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1ir__builder_1_1ir_1_1IRModuleFra [...]
@@ -114,8 +114,8 @@ $(function() {
<tr><td valign="top"><a class="el" href="classtvm_1_1runtime_1_1metadata_1_1ArrayAccessor.html">ArrayAccessor</a> (<a class="el" href="namespacetvm_1_1runtime_1_1metadata.html">tvm::runtime::metadata</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1DiagnosticBuilder.html">DiagnosticBuilder</a> (<a class="el" href="namespacetvm.html">tvm</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1tir_1_1LE.html">LE</a> (<a class="el" href="namesp [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1runtime_1_1metadata_1_1ArrayAccessor_3_01const_01char_01_5_00_01_1_1tvm_1_1runtime_1_1String_01_4.html">ArrayAccessor< const char *, ::tvm::runtime::String ></a> (<a class="el" href="namespacetvm_1_1runtime_1_1metadata.html">tvm::runtime::metadata</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1DiagnosticContext.html">DiagnosticContext</a> (<a class="el" href="namespacetvm.html">tvm</a>)   [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1runtime_1_1SimpleObjAllocator_1_1ArrayHandler.html">SimpleObjAllocator::ArrayHandler</a> (<a class="el" href="namespacetvm_1_1runtime.html">tvm::runtime</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1DiagnosticContextNode.html">DiagnosticContextNode</a> (<a class="el" href="namespacetvm.html">tvm</a>)   </td><td valign="top"><a class="el" href="structtvm_1_1tir_1_1LENode.html">LENode</a> (<a [...]
-<tr><td valign="top"><a class="el" href="classtvm_1_1ArrayIndexPath.html">ArrayIndexPath</a> (<a class="el" href="namespacetvm.html">tvm</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1DiagnosticNode.html">DiagnosticNode</a> (<a class="el" href="namespacetvm.html">tvm</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1tir_1_1Let.html">Let</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)   </td><td valign= [...]
-<tr><td valign="top"><a class="el" href="classtvm_1_1ArrayIndexPathNode.html">ArrayIndexPathNode</a> (<a class="el" href="namespacetvm.html">tvm</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1DiagnosticRenderer.html">DiagnosticRenderer</a> (<a class="el" href="namespacetvm.html">tvm</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1relay_1_1Let.html">Let</a> (<a class="el" href="namespacetvm_1_1relay.html">tvm::relay</a>)   [...]
+<tr><td valign="top"><a class="el" href="classtvm_1_1ArrayIndexPath.html">ArrayIndexPath</a> (<a class="el" href="namespacetvm.html">tvm</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1DiagnosticNode.html">DiagnosticNode</a> (<a class="el" href="namespacetvm.html">tvm</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1relay_1_1Let.html">Let</a> (<a class="el" href="namespacetvm_1_1relay.html">tvm::relay</a>)   </td><td v [...]
+<tr><td valign="top"><a class="el" href="classtvm_1_1ArrayIndexPathNode.html">ArrayIndexPathNode</a> (<a class="el" href="namespacetvm.html">tvm</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1DiagnosticRenderer.html">DiagnosticRenderer</a> (<a class="el" href="namespacetvm.html">tvm</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1tir_1_1Let.html">Let</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)    [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1runtime_1_1metadata_1_1ArrayIterator.html">ArrayIterator</a> (<a class="el" href="namespacetvm_1_1runtime_1_1metadata.html">tvm::runtime::metadata</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1DiagnosticRendererNode.html">DiagnosticRendererNode</a> (<a class="el" href="namespacetvm.html">tvm</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1ir__builder_1_1tir_1_1LetFrame [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1runtime_1_1ArrayNode.html">ArrayNode</a> (<a class="el" href="namespacetvm_1_1runtime.html">tvm::runtime</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1DictAttrs.html">DictAttrs</a> (<a class="el" href="namespacetvm.html">tvm</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1ir__builder_1_1tir_1_1LetFrameNode.html">LetFrameNode</a> (<a class="el" href="namespacetvm_1_1scr [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1script_1_1printer_1_1AssertDoc.html">AssertDoc</a> (<a class="el" href="namespacetvm_1_1script_1_1printer.html">tvm::script::printer</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1DictAttrsNode.html">DictAttrsNode</a> (<a class="el" href="namespacetvm.html">tvm</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1tir_1_1LetNode.html">LetNode</a> (<a class="el" href="namespacetvm_1_1ti [...]
@@ -198,15 +198,15 @@ $(function() {
<tr><td valign="top"><a class="el" href="classtvm_1_1script_1_1ir__builder_1_1tir_1_1BlockInitFrameNode.html">BlockInitFrameNode</a> (<a class="el" href="namespacetvm_1_1script_1_1ir__builder_1_1tir.html">tvm::script::ir_builder::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1tir_1_1For.html">For</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1MissingMapEntryPathNode.h [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1tir_1_1BlockNode.html">BlockNode</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1printer_1_1ForDoc.html">ForDoc</a> (<a class="el" href="namespacetvm_1_1script_1_1printer.html">tvm::script::printer</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1relay_1_1MixedModeMutator.html">MixedModeMutator</a> (<a class="el" [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1tir_1_1BlockRealize.html">BlockRealize</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1printer_1_1ForDocNode.html">ForDocNode</a> (<a class="el" href="namespacetvm_1_1script_1_1printer.html">tvm::script::printer</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1relay_1_1MixedModeVisitor.html">MixedModeVisitor</a> [...]
-<tr><td valign="top"><a class="el" href="classtvm_1_1tir_1_1BlockRealizeNode.html">BlockRealizeNode</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1ir__builder_1_1tir_1_1ForFrame.html">ForFrame</a> (<a class="el" href="namespacetvm_1_1script_1_1ir__builder_1_1tir.html">tvm::script::ir_builder::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1tir_1_1Mod.html">Mo [...]
-<tr><td valign="top"><a class="el" href="classtvm_1_1tir_1_1BlockRV.html">BlockRV</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1ir__builder_1_1tir_1_1ForFrameNode.html">ForFrameNode</a> (<a class="el" href="namespacetvm_1_1script_1_1ir__builder_1_1tir.html">tvm::script::ir_builder::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1tir_1_1ModNode.html">ModNode< [...]
+<tr><td valign="top"><a class="el" href="classtvm_1_1tir_1_1BlockRealizeNode.html">BlockRealizeNode</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1ir__builder_1_1tir_1_1ForFrame.html">ForFrame</a> (<a class="el" href="namespacetvm_1_1script_1_1ir__builder_1_1tir.html">tvm::script::ir_builder::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1tir_1_1Mod.html">Mo [...]
+<tr><td valign="top"><a class="el" href="classtvm_1_1tir_1_1BlockRV.html">BlockRV</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1ir__builder_1_1tir_1_1ForFrameNode.html">ForFrameNode</a> (<a class="el" href="namespacetvm_1_1script_1_1ir__builder_1_1tir.html">tvm::script::ir_builder::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1tir_1_1ModNode.html">ModNode< [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1tir_1_1BlockRVNode.html">BlockRVNode</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1tir_1_1ForNode.html">ForNode</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1arith_1_1ModularSet.html">ModularSet</a> (<a class="el" href="namespacetvm_1_1arith.html">tvm::arith< [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1tir_1_1BlockScope.html">BlockScope</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1printer_1_1Frame.html">Frame</a> (<a class="el" href="namespacetvm_1_1script_1_1printer.html">tvm::script::printer</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1arith_1_1ModularSetAnalyzer.html">ModularSetAnalyzer</a> (<a class= [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1tir_1_1BlockScopeNode.html">BlockScopeNode</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1runtime_1_1micro__rpc_1_1FrameBuffer.html">FrameBuffer</a> (<a class="el" href="namespacetvm_1_1runtime_1_1micro__rpc.html">tvm::runtime::micro_rpc</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1arith_1_1ModularSetNode.html">Modula [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1Bool.html">Bool</a> (<a class="el" href="namespacetvm.html">tvm</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1printer_1_1FrameNode.html">FrameNode</a> (<a class="el" href="namespacetvm_1_1script_1_1printer.html">tvm::script::printer</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1runtime_1_1Module.html">Module</a> (<a class="el" href="namespacetvm_1_1runtime.html">tvm: [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1tir_1_1Broadcast.html">Broadcast</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1runtime_1_1micro__rpc_1_1Framer.html">Framer</a> (<a class="el" href="namespacetvm_1_1runtime_1_1micro__rpc.html">tvm::runtime::micro_rpc</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1runtime_1_1ModuleNode.html">ModuleNode</a> (<a class="el [...]
-<tr><td valign="top"><a class="el" href="structtvm_1_1relay_1_1qnn_1_1BroadcastAttrs.html">BroadcastAttrs</a> (<a class="el" href="namespacetvm_1_1relay_1_1qnn.html">tvm::relay::qnn</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1runtime_1_1StringObj_1_1FromStd.html">StringObj::FromStd</a> (<a class="el" href="namespacetvm_1_1runtime.html">tvm::runtime</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1tir_1_1Mul.html">Mul</a> (<a clas [...]
-<tr><td valign="top"><a class="el" href="classtvm_1_1tir_1_1BroadcastNode.html">BroadcastNode</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1runtime_1_1ShapeTupleObj_1_1FromStd.html">ShapeTupleObj::FromStd</a> (<a class="el" href="namespacetvm_1_1runtime.html">tvm::runtime</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1tir_1_1MulNode.html">MulNode</a> (<a class="el" href= [...]
+<tr><td valign="top"><a class="el" href="structtvm_1_1relay_1_1qnn_1_1BroadcastAttrs.html">BroadcastAttrs</a> (<a class="el" href="namespacetvm_1_1relay_1_1qnn.html">tvm::relay::qnn</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1runtime_1_1ShapeTupleObj_1_1FromStd.html">ShapeTupleObj::FromStd</a> (<a class="el" href="namespacetvm_1_1runtime.html">tvm::runtime</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1tir_1_1Mul.html">Mul</a> [...]
+<tr><td valign="top"><a class="el" href="classtvm_1_1tir_1_1BroadcastNode.html">BroadcastNode</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1runtime_1_1StringObj_1_1FromStd.html">StringObj::FromStd</a> (<a class="el" href="namespacetvm_1_1runtime.html">tvm::runtime</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1tir_1_1MulNode.html">MulNode</a> (<a class="el" href="namespa [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1tir_1_1Buffer.html">Buffer</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1relay_1_1Function.html">Function</a> (<a class="el" href="namespacetvm_1_1relay.html">tvm::relay</a>)   </td><td valign="top"><a class="el" href="structtvm_1_1relay_1_1MultiBoxPriorAttrs.html">MultiBoxPriorAttrs</a> (<a class="el" href="namespacetvm_1_1relay.ht [...]
<tr><td valign="top"><a class="el" href="structtvm_1_1runtime_1_1vm_1_1Buffer.html">Buffer</a> (<a class="el" href="namespacetvm_1_1runtime_1_1vm.html">tvm::runtime::vm</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1printer_1_1FunctionDoc.html">FunctionDoc</a> (<a class="el" href="namespacetvm_1_1script_1_1printer.html">tvm::script::printer</a>)   </td><td valign="top"><a class="el" href="structtvm_1_1relay_1_1MultiBoxTransformLocAttr [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1tir_1_1usmp_1_1BufferInfo.html">BufferInfo</a> (<a class="el" href="namespacetvm_1_1tir_1_1usmp.html">tvm::tir::usmp</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1printer_1_1FunctionDocNode.html">FunctionDocNode</a> (<a class="el" href="namespacetvm_1_1script_1_1printer.html">tvm::script::printer</a>)   </td><td valign="top"><a class="el" href="structtvm_1_1relay_1_1MultinomialAttr [...]
@@ -280,8 +280,8 @@ $(function() {
<tr><td valign="top"><a class="el" href="classtvm_1_1auto__scheduler_1_1ComputeDAG.html">ComputeDAG</a> (<a class="el" href="namespacetvm_1_1auto__scheduler.html">tvm::auto_scheduler</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1tir_1_1IfThenElseNode.html">IfThenElseNode</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)   </td><td valign="top"><a class="el" href="structtvm_1_1relay_1_1qnn_1_1SimulatedQuantizeAttrs.html">Simulate [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1auto__scheduler_1_1ComputeDAGNode.html">ComputeDAGNode</a> (<a class="el" href="namespacetvm_1_1auto__scheduler.html">tvm::auto_scheduler</a>)   </td><td valign="top"><a class="el" href="structtvm_1_1detail_1_1ImplSEqualReduce.html">ImplSEqualReduce</a> (<a class="el" href="namespacetvm_1_1detail.html">tvm::detail</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1runtime_1_1PackedFunc.html">Pack [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1auto__scheduler_1_1ComputeInlineStep.html">ComputeInlineStep</a> (<a class="el" href="namespacetvm_1_1auto__scheduler.html">tvm::auto_scheduler</a>)   </td><td valign="top"><a class="el" href="structtvm_1_1detail_1_1ImplSEqualReduce_3_01T_00_01true_01_4.html">ImplSEqualReduce< T, true ></a> (<a class="el" href="namespacetvm_1_1detail.html">tvm::detail</a>)   </td><td valign="top"><a class="el" href=" [...]
-<tr><td valign="top"><a class="el" href="classtvm_1_1auto__scheduler_1_1ComputeInlineStepNode.html">ComputeInlineStepNode</a> (<a class="el" href="namespacetvm_1_1auto__scheduler.html">tvm::auto_scheduler</a>)   </td><td valign="top"><a class="el" href="structtvm_1_1detail_1_1ImplSHashReduce.html">ImplSHashReduce</a> (<a class="el" href="namespacetvm_1_1detail.html">tvm::detail</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1runtime_1_1PackedFun [...]
-<tr><td valign="top"><a class="el" href="classtvm_1_1te_1_1ComputeOp.html">ComputeOp</a> (<a class="el" href="namespacetvm_1_1te.html">tvm::te</a>)   </td><td valign="top"><a class="el" href="structtvm_1_1detail_1_1ImplSHashReduce_3_01T_00_01true_01_4.html">ImplSHashReduce< T, true ></a> (<a class="el" href="namespacetvm_1_1detail.html">tvm::detail</a>)   </td><td valign="top"><a class="el" href="structtvm_1_1runtime_1_1PackedFuncValueConverter.html">P [...]
+<tr><td valign="top"><a class="el" href="classtvm_1_1auto__scheduler_1_1ComputeInlineStepNode.html">ComputeInlineStepNode</a> (<a class="el" href="namespacetvm_1_1auto__scheduler.html">tvm::auto_scheduler</a>)   </td><td valign="top"><a class="el" href="structtvm_1_1detail_1_1ImplSHashReduce.html">ImplSHashReduce</a> (<a class="el" href="namespacetvm_1_1detail.html">tvm::detail</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1runtime_1_1PackedFun [...]
+<tr><td valign="top"><a class="el" href="classtvm_1_1te_1_1ComputeOp.html">ComputeOp</a> (<a class="el" href="namespacetvm_1_1te.html">tvm::te</a>)   </td><td valign="top"><a class="el" href="structtvm_1_1detail_1_1ImplSHashReduce_3_01T_00_01true_01_4.html">ImplSHashReduce< T, true ></a> (<a class="el" href="namespacetvm_1_1detail.html">tvm::detail</a>)   </td><td valign="top"><a class="el" href="structtvm_1_1runtime_1_1PackedFuncValueConverter.html">P [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1te_1_1ComputeOpNode.html">ComputeOpNode</a> (<a class="el" href="namespacetvm_1_1te.html">tvm::te</a>)   </td><td valign="top"><a class="el" href="structtvm_1_1detail_1_1ImplVisitAttrs.html">ImplVisitAttrs</a> (<a class="el" href="namespacetvm_1_1detail.html">tvm::detail</a>)   </td><td valign="top"><a class="el" href="structtvm_1_1runtime_1_1PackedFuncValueConverter_3_01Optional_3_01T_01_4_01_4.html">Pack [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1auto__scheduler_1_1ComputeRootStep.html">ComputeRootStep</a> (<a class="el" href="namespacetvm_1_1auto__scheduler.html">tvm::auto_scheduler</a>)   </td><td valign="top"><a class="el" href="structtvm_1_1detail_1_1ImplVisitAttrs_3_01T_00_01true_01_4.html">ImplVisitAttrs< T, true ></a> (<a class="el" href="namespacetvm_1_1detail.html">tvm::detail</a>)   </td><td valign="top"><a class="el" href="structtv [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1auto__scheduler_1_1ComputeRootStepNode.html">ComputeRootStepNode</a> (<a class="el" href="namespacetvm_1_1auto__scheduler.html">tvm::auto_scheduler</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1IncompleteType.html">IncompleteType</a> (<a class="el" href="namespacetvm.html">tvm</a>)   </td><td valign="top"><a class="el" href="structtvm_1_1runtime_1_1PackedFuncValueConverter_3_01tvm_1_1Bool_01 [...]
@@ -299,8 +299,8 @@ $(function() {
<tr><td valign="top"><a class="el" href="classtvm_1_1relay_1_1ConstantPatternNode.html">ConstantPatternNode</a> (<a class="el" href="namespacetvm_1_1relay.html">tvm::relay</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1tir_1_1InstructionKind.html">InstructionKind</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1instrument_1_1PassInstrumentNode.html">PassInstrumentNode</a> ( [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1ConstantPoolInfo.html">ConstantPoolInfo</a> (<a class="el" href="namespacetvm.html">tvm</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1tir_1_1InstructionKindNode.html">InstructionKindNode</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1transform_1_1PassNode.html">PassNode</a> (<a class="el" href="namespacetvm_1_1transfor [...]
<tr><td valign="top"><a class="el" href="structtvm_1_1ConstantPoolInfoNode.html">ConstantPoolInfoNode</a> (<a class="el" href="namespacetvm.html">tvm</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1tir_1_1InstructionKindRegEntry.html">InstructionKindRegEntry</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1relay_1_1Pattern.html">Pattern</a> (<a class="el" href="namespacetvm_ [...]
-<tr><td valign="top"><a class="el" href="classtvm_1_1arith_1_1ConstIntBound.html">ConstIntBound</a> (<a class="el" href="namespacetvm_1_1arith.html">tvm::arith</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1tir_1_1InstructionNode.html">InstructionNode</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1relay_1_1PatternConstructor.html">PatternConstructor</a> (<a class="el" hre [...]
-<tr><td valign="top"><a class="el" href="classtvm_1_1arith_1_1ConstIntBoundAnalyzer.html">ConstIntBoundAnalyzer</a> (<a class="el" href="namespacetvm_1_1arith.html">tvm::arith</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1arith_1_1IntConstraints.html">IntConstraints</a> (<a class="el" href="namespacetvm_1_1arith.html">tvm::arith</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1relay_1_1PatternConstructorNode.html">PatternConstructo [...]
+<tr><td valign="top"><a class="el" href="classtvm_1_1arith_1_1ConstIntBound.html">ConstIntBound</a> (<a class="el" href="namespacetvm_1_1arith.html">tvm::arith</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1tir_1_1InstructionNode.html">InstructionNode</a> (<a class="el" href="namespacetvm_1_1tir.html">tvm::tir</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1relay_1_1PatternConstructor.html">PatternConstructor</a> (<a class="el" hre [...]
+<tr><td valign="top"><a class="el" href="classtvm_1_1arith_1_1ConstIntBoundAnalyzer.html">ConstIntBoundAnalyzer</a> (<a class="el" href="namespacetvm_1_1arith.html">tvm::arith</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1arith_1_1IntConstraints.html">IntConstraints</a> (<a class="el" href="namespacetvm_1_1arith.html">tvm::arith</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1relay_1_1PatternConstructorNode.html">PatternConstructo [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1arith_1_1ConstIntBoundNode.html">ConstIntBoundNode</a> (<a class="el" href="namespacetvm_1_1arith.html">tvm::arith</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1arith_1_1IntConstraintsNode.html">IntConstraintsNode</a> (<a class="el" href="namespacetvm_1_1arith.html">tvm::arith</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1relay_1_1PatternFunctor.html">PatternFunctor</a> (<a cl [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1arith_1_1ConstraintContext.html">ConstraintContext</a> (<a class="el" href="namespacetvm_1_1arith.html">tvm::arith</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1arith_1_1IntConstraintsTransform.html">IntConstraintsTransform</a> (<a class="el" href="namespacetvm_1_1arith.html">tvm::arith</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1relay_1_1PatternFunctor_3_01R_07const_01Patte [...]
<tr><td valign="top"><a class="el" href="classtvm_1_1Constructor.html">Constructor</a> (<a class="el" href="namespacetvm.html">tvm</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1arith_1_1IntConstraintsTransformNode.html">IntConstraintsTransformNode</a> (<a class="el" href="namespacetvm_1_1arith.html">tvm::arith</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1relay_1_1PatternMutator.html">PatternMutator</a> (<a class="el" href="name [...]
@@ -315,9 +315,9 @@ $(function() {
<tr><td valign="top"><a class="el" href="structtvm_1_1relay_1_1Conv2DAttrs.html">Conv2DAttrs</a> (<a class="el" href="namespacetvm_1_1relay.html">tvm::relay</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1arith_1_1IntSetAnalyzer.html">IntSetAnalyzer</a> (<a class="el" href="namespacetvm_1_1arith.html">tvm::arith</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1runtime_1_1profiling_1_1PercentNode.html">PercentNode</a> (<a class="el" h [...]
<tr><td valign="top"><a class="el" href="structtvm_1_1relay_1_1Conv2DTransposeAttrs.html">Conv2DTransposeAttrs</a> (<a class="el" href="namespacetvm_1_1relay.html">tvm::relay</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1arith_1_1IntSetNode.html">IntSetNode</a> (<a class="el" href="namespacetvm_1_1arith.html">tvm::arith</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1te_1_1PlaceholderOp.html">PlaceholderOp</a> (<a class="el" href= [...]
<tr><td valign="top"><a class="el" href="structtvm_1_1relay_1_1Conv2DWinogradAttrs.html">Conv2DWinogradAttrs</a> (<a class="el" href="namespacetvm_1_1relay.html">tvm::relay</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1ir__builder_1_1IRBuilder.html">IRBuilder</a> (<a class="el" href="namespacetvm_1_1script_1_1ir__builder.html">tvm::script::ir_builder</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1te_1_1PlaceholderOpNode [...]
-<tr><td valign="top"><a class="el" href="structtvm_1_1relay_1_1Conv2DWinogradNNPACKWeightTransformAttrs.html">Conv2DWinogradNNPACKWeightTransformAttrs</a> (<a class="el" href="namespacetvm_1_1relay.html">tvm::relay</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1ir__builder_1_1IRBuilderFrame.html">IRBuilderFrame</a> (<a class="el" href="namespacetvm_1_1script_1_1ir__builder.html">tvm::script::ir_builder</a>)   </td><td valign="top"><a [...]
+<tr><td valign="top"><a class="el" href="structtvm_1_1relay_1_1Conv2DWinogradNNPACKWeightTransformAttrs.html">Conv2DWinogradNNPACKWeightTransformAttrs</a> (<a class="el" href="namespacetvm_1_1relay.html">tvm::relay</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1ir__builder_1_1IRBuilderFrame.html">IRBuilderFrame</a> (<a class="el" href="namespacetvm_1_1script_1_1ir__builder.html">tvm::script::ir_builder</a>)   </td><td valign="top"><a [...]
</td></tr>
-<tr><td valign="top"><a class="el" href="structtvm_1_1relay_1_1Conv3DAttrs.html">Conv3DAttrs</a> (<a class="el" href="namespacetvm_1_1relay.html">tvm::relay</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1ir__builder_1_1IRBuilderFrameNode.html">IRBuilderFrameNode</a> (<a class="el" href="namespacetvm_1_1script_1_1ir__builder.html">tvm::script::ir_builder</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1PointerTypeNode.html" [...]
+<tr><td valign="top"><a class="el" href="structtvm_1_1relay_1_1Conv3DAttrs.html">Conv3DAttrs</a> (<a class="el" href="namespacetvm_1_1relay.html">tvm::relay</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1ir__builder_1_1IRBuilderFrameNode.html">IRBuilderFrameNode</a> (<a class="el" href="namespacetvm_1_1script_1_1ir__builder.html">tvm::script::ir_builder</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1PointerTypeNode.html" [...]
<tr><td valign="top"><a class="el" href="structtvm_1_1relay_1_1Conv3DTransposeAttrs.html">Conv3DTransposeAttrs</a> (<a class="el" href="namespacetvm_1_1relay.html">tvm::relay</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1script_1_1ir__builder_1_1IRBuilderNode.html">IRBuilderNode</a> (<a class="el" href="namespacetvm_1_1script_1_1ir__builder.html">tvm::script::ir_builder</a>)   </td><td valign="top"><a class="el" href="classtvm_1_1tir_1_1usmp_1 [...]
<tr><td></td><td></td><td></td><td></td><td></td></tr>
</table>
diff --git a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1Database-members.html b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1Database-members.html
index 91ec6a0047..8bfabf89ba 100644
--- a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1Database-members.html
+++ b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1Database-members.html
@@ -91,7 +91,7 @@ $(function() {
<tr><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html#a4744bf4a1b48f202d41b51dc5e08e6ee">operator<</a>(const ObjectRef &other) const</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html">tvm::runtime::ObjectRef</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
<tr class="even"><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html#affdf1b8cdb36e140de7b3ad7064e4617">operator==</a>(const ObjectRef &other) const</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html">tvm::runtime::ObjectRef</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
<tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1Database.html#ae4f6e0e951be446d2ab836eb8a9bcc83">OrderedUnionDatabase</a>(Array< Database, void > databases)</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1Database.html">tvm::meta_schedule::Database</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
- <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1Database.html#ae81f559342e4628ea1bffce6db36e547">PyDatabase</a>(PyDatabaseNode::FHasWorkload f_has_workload, PyDatabaseNode::FCommitWorkload f_commit_workload, PyDatabaseNode::FCommitTuningRecord f_commit_tuning_record, PyDatabaseNode::FGetTopK f_get_top_k, PyDatabaseNode::FGetAllTuningRecords f_get_all_tuning_records, PyDatabaseNode::FSize f_size)</td><td class="entry"><a class="el" href="classtvm_1_1 [...]
+ <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1Database.html#a093004b24893fba8c6855aacd8cc46e2">PyDatabase</a>(PyDatabaseNode::FHasWorkload f_has_workload, PyDatabaseNode::FCommitWorkload f_commit_workload, PyDatabaseNode::FCommitTuningRecord f_commit_tuning_record, PyDatabaseNode::FGetTopK f_get_top_k, PyDatabaseNode::FGetAllTuningRecords f_get_all_tuning_records, PyDatabaseNode::FQueryTuningRecord f_query_tuning_record, PyDatabaseNode::FQuerySche [...]
<tr><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html#ae31a5b9f40781d60a2901994ead700e8">same_as</a>(const ObjectRef &other) const</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1ObjectRef.html">tvm::runtime::ObjectRef</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
<tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1Database.html#afd3cddb62e6fad7974e457b708c895a4">ScheduleFnDatabase</a>(runtime::TypedPackedFunc< bool(tir::Schedule)> schedule_fn)</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1Database.html">tvm::meta_schedule::Database</a></td><td class="entry"><span class="mlabel">static</span></td></tr>
<tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1Database.html#afb40a32e35f299ee0c6cd6f99f1ed44a">TVM_DEFINE_MUTABLE_NOTNULLABLE_OBJECT_REF_METHODS</a>(Database, runtime::ObjectRef, DatabaseNode)</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1Database.html">tvm::meta_schedule::Database</a></td><td class="entry"></td></tr>
diff --git a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1Database.html b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1Database.html
index 8dd3a439cc..636d45ccdc 100644
--- a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1Database.html
+++ b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1Database.html
@@ -148,9 +148,9 @@ Static Public Member Functions</h2></td></tr>
<tr class="memitem:ae4f6e0e951be446d2ab836eb8a9bcc83"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="classtvm_1_1meta__schedule_1_1Database.html">Database</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1Database.html#ae4f6e0e951be446d2ab836eb8a9bcc83">OrderedUnionDatabase</a> (<a class="el" href="classtvm_1_1runtime_1_1Array.html">Array</a>< <a class="el" href="classtvm_1_1meta__schedule_1_1Databas [...]
<tr class="memdesc:ae4f6e0e951be446d2ab836eb8a9bcc83"><td class="mdescLeft"> </td><td class="mdescRight">A database composed of multiple databases, allowing users to guide IR rewriting using combined knowledge of those databases. To each query, it returns the record from the first database that responds to the query. <a href="#ae4f6e0e951be446d2ab836eb8a9bcc83">More...</a><br /></td></tr>
<tr class="separator:ae4f6e0e951be446d2ab836eb8a9bcc83"><td class="memSeparator" colspan="2"> </td></tr>
-<tr class="memitem:ae81f559342e4628ea1bffce6db36e547"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="classtvm_1_1meta__schedule_1_1Database.html">Database</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1Database.html#ae81f559342e4628ea1bffce6db36e547">PyDatabase</a> (<a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#afb177ebca29227e94c3b3036b4908bad">PyDatabaseNode::FHasWorkload</a [...]
-<tr class="memdesc:ae81f559342e4628ea1bffce6db36e547"><td class="mdescLeft"> </td><td class="mdescRight">Create a database with customized methods on the python-side. <a href="#ae81f559342e4628ea1bffce6db36e547">More...</a><br /></td></tr>
-<tr class="separator:ae81f559342e4628ea1bffce6db36e547"><td class="memSeparator" colspan="2"> </td></tr>
+<tr class="memitem:a093004b24893fba8c6855aacd8cc46e2"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="classtvm_1_1meta__schedule_1_1Database.html">Database</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1Database.html#a093004b24893fba8c6855aacd8cc46e2">PyDatabase</a> (<a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#afb177ebca29227e94c3b3036b4908bad">PyDatabaseNode::FHasWorkload</a [...]
+<tr class="memdesc:a093004b24893fba8c6855aacd8cc46e2"><td class="mdescLeft"> </td><td class="mdescRight">Create a database with customized methods on the python-side. <a href="#a093004b24893fba8c6855aacd8cc46e2">More...</a><br /></td></tr>
+<tr class="separator:a093004b24893fba8c6855aacd8cc46e2"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a4b338c39afa925bc556b067b333e27a0"><td class="memItemLeft" align="right" valign="top">static <a class="el" href="classtvm_1_1runtime_1_1Optional.html">Optional</a>< <a class="el" href="classtvm_1_1meta__schedule_1_1Database.html">Database</a> > </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1Database.html#a4b338c39afa925bc556b067b333e27a0">Current</a> ()</td></tr>
<tr class="separator:a4b338c39afa925bc556b067b333e27a0"><td class="memSeparator" colspan="2"> </td></tr>
</table><table class="memberdecls">
@@ -364,8 +364,8 @@ Additional Inherited Members</h2></td></tr>
</div>
</div>
-<a id="ae81f559342e4628ea1bffce6db36e547"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ae81f559342e4628ea1bffce6db36e547">◆ </a></span>PyDatabase()</h2>
+<a id="a093004b24893fba8c6855aacd8cc46e2"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a093004b24893fba8c6855aacd8cc46e2">◆ </a></span>PyDatabase()</h2>
<div class="memitem">
<div class="memproto">
@@ -403,6 +403,24 @@ Additional Inherited Members</h2></td></tr>
<td class="paramtype"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a574d90736eda21019540d4a26c155b28">PyDatabaseNode::FGetAllTuningRecords</a> </td>
<td class="paramname"><em>f_get_all_tuning_records</em>, </td>
</tr>
+ <tr>
+ <td class="paramkey"></td>
+ <td></td>
+ <td class="paramtype"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#acd7fb3619d530c0ae85fb1d6e94f6e7d">PyDatabaseNode::FQueryTuningRecord</a> </td>
+ <td class="paramname"><em>f_query_tuning_record</em>, </td>
+ </tr>
+ <tr>
+ <td class="paramkey"></td>
+ <td></td>
+ <td class="paramtype"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a16c17595db4a845b3511d6d7fa0f741d">PyDatabaseNode::FQuerySchedule</a> </td>
+ <td class="paramname"><em>f_query_schedule</em>, </td>
+ </tr>
+ <tr>
+ <td class="paramkey"></td>
+ <td></td>
+ <td class="paramtype"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a713ae7e8634c0aedc366dffda2c899df">PyDatabaseNode::FQueryIRModule</a> </td>
+ <td class="paramname"><em>f_query_ir_module</em>, </td>
+ </tr>
<tr>
<td class="paramkey"></td>
<td></td>
@@ -430,6 +448,9 @@ Additional Inherited Members</h2></td></tr>
<tr><td class="paramname">f_commit_tuning_record</td><td>The packed function of <code>CommitTuningRecord</code>. </td></tr>
<tr><td class="paramname">f_get_top_k</td><td>The packed function of <code>GetTopK</code>. </td></tr>
<tr><td class="paramname">f_get_all_tuning_records</td><td>The packed function of <code>GetAllTuningRecords</code>. </td></tr>
+ <tr><td class="paramname">f_query_tuning_record</td><td>The packed function of <code>QueryTuningRecord</code>. </td></tr>
+ <tr><td class="paramname">f_query_schedule</td><td>The packed function of <code>QuerySchedule</code>. </td></tr>
+ <tr><td class="paramname">f_query_ir_module</td><td>The packed function of <code>QueryIRModule</code>. </td></tr>
<tr><td class="paramname">f_size</td><td>The packed function of <code>Size</code>. </td></tr>
</table>
</dd>
diff --git a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1DatabaseNode.html b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1DatabaseNode.html
index 4d6e3e4c93..a08b1d2e8c 100644
--- a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1DatabaseNode.html
+++ b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1DatabaseNode.html
@@ -75,7 +75,7 @@ $(function() {
<div class="dynheader">
Inheritance diagram for tvm::meta_schedule::DatabaseNode:</div>
<div class="dyncontent">
-<div class="center"><iframe scrolling="no" frameborder="0" src="classtvm_1_1meta__schedule_1_1DatabaseNode__inherit__graph.svg" width="290" height="1160"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
+<div class="center"><iframe scrolling="no" frameborder="0" src="classtvm_1_1meta__schedule_1_1DatabaseNode__inherit__graph.svg" width="290" height="1248"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
</div>
</div>
<div class="dynheader">
@@ -476,6 +476,8 @@ Additional Inherited Members</h2></td></tr>
</dl>
<dl class="section return"><dt>Returns</dt><dd>The <a class="el" href="classtvm_1_1IRModule.html" title="Managed reference class to IRModuleNode. ">IRModule</a> in the best <a class="el" href="classtvm_1_1IRModule.html" title="Managed reference class to IRModuleNode. ">IRModule</a> of the given workload; NullOpt if not found. </dd></dl>
+<p>Reimplemented in <a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a4a21df0e4369b208e8d0332c0dcdfee3">tvm::meta_schedule::PyDatabaseNode</a>.</p>
+
</div>
</div>
<a id="a638febf77b9cb7590d6babb28a97a020"></a>
@@ -529,6 +531,8 @@ Additional Inherited Members</h2></td></tr>
</dl>
<dl class="section return"><dt>Returns</dt><dd>The schedule in the best schedule of the given workload; NullOpt if not found. </dd></dl>
+<p>Reimplemented in <a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a340ce2715f3f9be3ded8a4560a45f5d3">tvm::meta_schedule::PyDatabaseNode</a>.</p>
+
</div>
</div>
<a id="adb5dd2d61af2ac335d68b402c057d612"></a>
@@ -582,6 +586,8 @@ Additional Inherited Members</h2></td></tr>
</dl>
<dl class="section return"><dt>Returns</dt><dd>The best record of the given workload; NullOpt if not found. </dd></dl>
+<p>Reimplemented in <a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a76186192f9e7e52d8c9f1e3b53fe0e60">tvm::meta_schedule::PyDatabaseNode</a>.</p>
+
</div>
</div>
<a id="aae5b9ab9f7e497654b90c23a2159a5cc"></a>
diff --git a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1DatabaseNode__inherit__graph.svg b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1DatabaseNode__inherit__graph.svg
index f52d7061cc..781234be73 100644
--- a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1DatabaseNode__inherit__graph.svg
+++ b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1DatabaseNode__inherit__graph.svg
@@ -4,54 +4,60 @@
<!-- Generated by graphviz version 2.40.1 (20161225.0304)
-->
<!-- Title: tvm::meta_schedule::DatabaseNode Pages: 1 -->
-<svg width="217pt" height="870pt"
- viewBox="0.00 0.00 217.00 870.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
-<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 866)">
+<svg width="217pt" height="936pt"
+ viewBox="0.00 0.00 217.00 936.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 932)">
<title>tvm::meta_schedule::DatabaseNode</title>
-<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-866 213,-866 213,4 -4,4"/>
+<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-932 213,-932 213,4 -4,4"/>
<!-- Node0 -->
<g id="node1" class="node">
<title>Node0</title>
-<polygon fill="#bfbfbf" stroke="#000000" points="1,-248.5 1,-426.5 208,-426.5 208,-248.5 1,-248.5"/>
-<text text-anchor="start" x="9" y="-414.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::meta_schedule</text>
-<text text-anchor="middle" x="104.5" y="-403.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">::DatabaseNode</text>
-<polyline fill="none" stroke="#000000" points="1,-396.5 208,-396.5 "/>
-<text text-anchor="start" x="9" y="-384.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_key</text>
-<polyline fill="none" stroke="#000000" points="1,-377.5 208,-377.5 "/>
-<text text-anchor="start" x="9" y="-365.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ~DatabaseNode()</text>
-<text text-anchor="start" x="9" y="-354.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ HasWorkload()</text>
-<text text-anchor="start" x="9" y="-343.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitWorkload()</text>
-<text text-anchor="start" x="9" y="-332.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitTuningRecord()</text>
-<text text-anchor="start" x="9" y="-321.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTopK()</text>
-<text text-anchor="start" x="9" y="-310.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetAllTuningRecords()</text>
-<text text-anchor="start" x="9" y="-299.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Size()</text>
-<text text-anchor="start" x="9" y="-288.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QueryTuningRecord()</text>
-<text text-anchor="start" x="9" y="-277.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QuerySchedule()</text>
-<text text-anchor="start" x="9" y="-266.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QueryIRModule()</text>
-<text text-anchor="start" x="9" y="-255.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TVM_DECLARE_BASE_OBJECT_INFO()</text>
+<polygon fill="#bfbfbf" stroke="#000000" points="1,-314.5 1,-492.5 208,-492.5 208,-314.5 1,-314.5"/>
+<text text-anchor="start" x="9" y="-480.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::meta_schedule</text>
+<text text-anchor="middle" x="104.5" y="-469.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">::DatabaseNode</text>
+<polyline fill="none" stroke="#000000" points="1,-462.5 208,-462.5 "/>
+<text text-anchor="start" x="9" y="-450.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_key</text>
+<polyline fill="none" stroke="#000000" points="1,-443.5 208,-443.5 "/>
+<text text-anchor="start" x="9" y="-431.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ~DatabaseNode()</text>
+<text text-anchor="start" x="9" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ HasWorkload()</text>
+<text text-anchor="start" x="9" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitWorkload()</text>
+<text text-anchor="start" x="9" y="-398.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitTuningRecord()</text>
+<text text-anchor="start" x="9" y="-387.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTopK()</text>
+<text text-anchor="start" x="9" y="-376.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetAllTuningRecords()</text>
+<text text-anchor="start" x="9" y="-365.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Size()</text>
+<text text-anchor="start" x="9" y="-354.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QueryTuningRecord()</text>
+<text text-anchor="start" x="9" y="-343.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QuerySchedule()</text>
+<text text-anchor="start" x="9" y="-332.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QueryIRModule()</text>
+<text text-anchor="start" x="9" y="-321.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TVM_DECLARE_BASE_OBJECT_INFO()</text>
</g>
<!-- Node2 -->
<g id="node3" class="node">
<title>Node2</title>
<g id="a_node3"><a xlink:href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html" target="_top" xlink:title="The database with customized methods on the python-side. ">
-<polygon fill="#ffffff" stroke="#000000" points="0,-.5 0,-211.5 209,-211.5 209,-.5 0,-.5"/>
-<text text-anchor="start" x="8" y="-199.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::meta_schedule</text>
-<text text-anchor="middle" x="104.5" y="-188.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">::PyDatabaseNode</text>
-<polyline fill="none" stroke="#000000" points="0,-181.5 209,-181.5 "/>
-<text text-anchor="start" x="8" y="-169.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_has_workload</text>
-<text text-anchor="start" x="8" y="-158.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_commit_workload</text>
-<text text-anchor="start" x="8" y="-147.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_commit_tuning_record</text>
-<text text-anchor="start" x="8" y="-136.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_get_top_k</text>
-<text text-anchor="start" x="8" y="-125.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_get_all_tuning_records</text>
-<text text-anchor="start" x="8" y="-114.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_size</text>
-<text text-anchor="start" x="8" y="-103.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_key</text>
-<polyline fill="none" stroke="#000000" points="0,-96.5 209,-96.5 "/>
-<text text-anchor="start" x="8" y="-84.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ VisitAttrs()</text>
-<text text-anchor="start" x="8" y="-73.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ HasWorkload()</text>
-<text text-anchor="start" x="8" y="-62.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitWorkload()</text>
-<text text-anchor="start" x="8" y="-51.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitTuningRecord()</text>
-<text text-anchor="start" x="8" y="-40.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTopK()</text>
-<text text-anchor="start" x="8" y="-29.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetAllTuningRecords()</text>
+<polygon fill="#ffffff" stroke="#000000" points="0,-.5 0,-277.5 209,-277.5 209,-.5 0,-.5"/>
+<text text-anchor="start" x="8" y="-265.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::meta_schedule</text>
+<text text-anchor="middle" x="104.5" y="-254.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">::PyDatabaseNode</text>
+<polyline fill="none" stroke="#000000" points="0,-247.5 209,-247.5 "/>
+<text text-anchor="start" x="8" y="-235.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_has_workload</text>
+<text text-anchor="start" x="8" y="-224.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_commit_workload</text>
+<text text-anchor="start" x="8" y="-213.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_commit_tuning_record</text>
+<text text-anchor="start" x="8" y="-202.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_get_top_k</text>
+<text text-anchor="start" x="8" y="-191.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_get_all_tuning_records</text>
+<text text-anchor="start" x="8" y="-180.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_query_tuning_record</text>
+<text text-anchor="start" x="8" y="-169.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_query_schedule</text>
+<text text-anchor="start" x="8" y="-158.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_query_ir_module</text>
+<text text-anchor="start" x="8" y="-147.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_size</text>
+<text text-anchor="start" x="8" y="-136.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_key</text>
+<polyline fill="none" stroke="#000000" points="0,-129.5 209,-129.5 "/>
+<text text-anchor="start" x="8" y="-117.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ VisitAttrs()</text>
+<text text-anchor="start" x="8" y="-106.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ HasWorkload()</text>
+<text text-anchor="start" x="8" y="-95.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitWorkload()</text>
+<text text-anchor="start" x="8" y="-84.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitTuningRecord()</text>
+<text text-anchor="start" x="8" y="-73.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTopK()</text>
+<text text-anchor="start" x="8" y="-62.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetAllTuningRecords()</text>
+<text text-anchor="start" x="8" y="-51.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QueryTuningRecord()</text>
+<text text-anchor="start" x="8" y="-40.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QuerySchedule()</text>
+<text text-anchor="start" x="8" y="-29.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QueryIRModule()</text>
<text text-anchor="start" x="8" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Size()</text>
<text text-anchor="start" x="8" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TVM_DECLARE_FINAL_OBJECT_INFO()</text>
</a>
@@ -60,58 +66,58 @@
<!-- Node0->Node2 -->
<g id="edge2" class="edge">
<title>Node0->Node2</title>
-<path fill="none" stroke="#191970" d="M104.5,-238.1421C104.5,-229.4057 104.5,-220.5421 104.5,-211.756"/>
-<polygon fill="none" stroke="#191970" points="101.0001,-238.3272 104.5,-248.3272 108.0001,-238.3272 101.0001,-238.3272"/>
+<path fill="none" stroke="#191970" d="M104.5,-304.2113C104.5,-295.5113 104.5,-286.6081 104.5,-277.6657"/>
+<polygon fill="none" stroke="#191970" points="101.0001,-304.3211 104.5,-314.3211 108.0001,-304.3211 101.0001,-304.3211"/>
</g>
<!-- Node1 -->
<g id="node2" class="node">
<title>Node1</title>
<g id="a_node2"><a xlink:href="classtvm_1_1runtime_1_1Object.html" target="_top" xlink:title="base class of all object containers. ">
-<polygon fill="#ffffff" stroke="#000000" points="13,-463.5 13,-861.5 196,-861.5 196,-463.5 13,-463.5"/>
-<text text-anchor="middle" x="104.5" y="-849.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::Object</text>
-<polyline fill="none" stroke="#000000" points="13,-842.5 196,-842.5 "/>
-<text text-anchor="start" x="21" y="-830.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_key</text>
-<text text-anchor="start" x="21" y="-819.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_final</text>
-<text text-anchor="start" x="21" y="-808.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_child_slots</text>
-<text text-anchor="start" x="21" y="-797.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_child_slots_can</text>
-<text text-anchor="start" x="21" y="-786.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_overflow</text>
-<text text-anchor="start" x="21" y="-775.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_has_method_visit</text>
-<text text-anchor="start" x="21" y="-764.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_attrs</text>
-<text text-anchor="start" x="21" y="-753.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_has_method_sequal</text>
-<text text-anchor="start" x="21" y="-742.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_reduce</text>
-<text text-anchor="start" x="21" y="-731.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_has_method_shash</text>
-<text text-anchor="start" x="21" y="-720.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_reduce</text>
-<text text-anchor="start" x="21" y="-709.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_index</text>
-<text text-anchor="start" x="21" y="-698.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># type_index_</text>
-<text text-anchor="start" x="21" y="-687.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># ref_counter_</text>
-<text text-anchor="start" x="21" y="-676.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># deleter_</text>
-<polyline fill="none" stroke="#000000" points="13,-669.5 196,-669.5 "/>
-<text text-anchor="start" x="21" y="-657.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ type_index()</text>
-<text text-anchor="start" x="21" y="-646.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTypeKey()</text>
-<text text-anchor="start" x="21" y="-635.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTypeKeyHash()</text>
-<text text-anchor="start" x="21" y="-624.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ IsInstance()</text>
-<text text-anchor="start" x="21" y="-613.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ unique()</text>
-<text text-anchor="start" x="21" y="-602.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Object()</text>
-<text text-anchor="start" x="21" y="-591.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Object()</text>
-<text text-anchor="start" x="21" y="-580.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Object()</text>
-<text text-anchor="start" x="21" y="-569.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator=()</text>
-<text text-anchor="start" x="21" y="-558.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator=()</text>
-<text text-anchor="start" x="21" y="-547.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TypeIndex2Key()</text>
-<text text-anchor="start" x="21" y="-536.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TypeIndex2KeyHash()</text>
-<text text-anchor="start" x="21" y="-525.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TypeKey2Index()</text>
-<text text-anchor="start" x="21" y="-514.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _GetOrAllocRuntimeTypeIndex()</text>
-<text text-anchor="start" x="21" y="-503.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ RuntimeTypeIndex()</text>
-<text text-anchor="start" x="21" y="-492.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># IncRef()</text>
-<text text-anchor="start" x="21" y="-481.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># DecRef()</text>
-<text text-anchor="start" x="21" y="-470.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># GetOrAllocRuntimeTypeIndex()</text>
+<polygon fill="#ffffff" stroke="#000000" points="13,-529.5 13,-927.5 196,-927.5 196,-529.5 13,-529.5"/>
+<text text-anchor="middle" x="104.5" y="-915.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::Object</text>
+<polyline fill="none" stroke="#000000" points="13,-908.5 196,-908.5 "/>
+<text text-anchor="start" x="21" y="-896.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_key</text>
+<text text-anchor="start" x="21" y="-885.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_final</text>
+<text text-anchor="start" x="21" y="-874.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_child_slots</text>
+<text text-anchor="start" x="21" y="-863.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_child_slots_can</text>
+<text text-anchor="start" x="21" y="-852.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_overflow</text>
+<text text-anchor="start" x="21" y="-841.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_has_method_visit</text>
+<text text-anchor="start" x="21" y="-830.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_attrs</text>
+<text text-anchor="start" x="21" y="-819.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_has_method_sequal</text>
+<text text-anchor="start" x="21" y="-808.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_reduce</text>
+<text text-anchor="start" x="21" y="-797.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_has_method_shash</text>
+<text text-anchor="start" x="21" y="-786.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_reduce</text>
+<text text-anchor="start" x="21" y="-775.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_index</text>
+<text text-anchor="start" x="21" y="-764.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># type_index_</text>
+<text text-anchor="start" x="21" y="-753.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># ref_counter_</text>
+<text text-anchor="start" x="21" y="-742.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># deleter_</text>
+<polyline fill="none" stroke="#000000" points="13,-735.5 196,-735.5 "/>
+<text text-anchor="start" x="21" y="-723.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ type_index()</text>
+<text text-anchor="start" x="21" y="-712.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTypeKey()</text>
+<text text-anchor="start" x="21" y="-701.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTypeKeyHash()</text>
+<text text-anchor="start" x="21" y="-690.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ IsInstance()</text>
+<text text-anchor="start" x="21" y="-679.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ unique()</text>
+<text text-anchor="start" x="21" y="-668.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Object()</text>
+<text text-anchor="start" x="21" y="-657.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Object()</text>
+<text text-anchor="start" x="21" y="-646.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Object()</text>
+<text text-anchor="start" x="21" y="-635.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator=()</text>
+<text text-anchor="start" x="21" y="-624.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator=()</text>
+<text text-anchor="start" x="21" y="-613.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TypeIndex2Key()</text>
+<text text-anchor="start" x="21" y="-602.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TypeIndex2KeyHash()</text>
+<text text-anchor="start" x="21" y="-591.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TypeKey2Index()</text>
+<text text-anchor="start" x="21" y="-580.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _GetOrAllocRuntimeTypeIndex()</text>
+<text text-anchor="start" x="21" y="-569.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ RuntimeTypeIndex()</text>
+<text text-anchor="start" x="21" y="-558.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># IncRef()</text>
+<text text-anchor="start" x="21" y="-547.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># DecRef()</text>
+<text text-anchor="start" x="21" y="-536.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># GetOrAllocRuntimeTypeIndex()</text>
</a>
</g>
</g>
<!-- Node1->Node0 -->
<g id="edge1" class="edge">
<title>Node1->Node0</title>
-<path fill="none" stroke="#191970" d="M104.5,-452.883C104.5,-443.8603 104.5,-435.0496 104.5,-426.5763"/>
-<polygon fill="none" stroke="#191970" points="101.0001,-453.1535 104.5,-463.1535 108.0001,-453.1535 101.0001,-453.1535"/>
+<path fill="none" stroke="#191970" d="M104.5,-518.883C104.5,-509.8603 104.5,-501.0496 104.5,-492.5763"/>
+<polygon fill="none" stroke="#191970" points="101.0001,-519.1535 104.5,-529.1535 108.0001,-519.1535 101.0001,-519.1535"/>
</g>
</g>
</svg>
diff --git a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1PyDatabaseNode-members.html b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1PyDatabaseNode-members.html
index 8d0532c070..0d17cee104 100644
--- a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1PyDatabaseNode-members.html
+++ b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1PyDatabaseNode-members.html
@@ -87,13 +87,19 @@ $(function() {
<tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#aafdd0874be052072521b2aa8a6c56d5f">f_get_all_tuning_records</a></td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"></td></tr>
<tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a52fb1116090619e95fb6b28352308eed">f_get_top_k</a></td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"></td></tr>
<tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#add146bf1e2006f72ed1534b2004bcb06">f_has_workload</a></td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"></td></tr>
- <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#ac7ae1a05fe5c7858f5860133a82bc7b7">f_size</a></td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"></td></tr>
- <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a653d04c0c6349350489c0ea5f68563f1">FCommitTuningRecord</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"></td></tr>
- <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a44b8d5e2721f12bdaf1a457b85f23124">FCommitWorkload</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"></td></tr>
- <tr><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1Object.html#a9e84841ca982bff376a978ade0132631">FDeleter</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1Object.html">tvm::runtime::Object</a></td><td class="entry"></td></tr>
- <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a574d90736eda21019540d4a26c155b28">FGetAllTuningRecords</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"></td></tr>
- <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#ad5e04e950cd2a63f439d95285b5674b6">FGetTopK</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"></td></tr>
- <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#afb177ebca29227e94c3b3036b4908bad">FHasWorkload</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"></td></tr>
+ <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#abd9fc8fc83bc6c252465ffdbcb310bfc">f_query_ir_module</a></td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"></td></tr>
+ <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a4a03c70569c9a18059861dfb5c90e845">f_query_schedule</a></td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"></td></tr>
+ <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a65fcb9b59b8ce6e685fb62c4459c57ba">f_query_tuning_record</a></td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"></td></tr>
+ <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#ac7ae1a05fe5c7858f5860133a82bc7b7">f_size</a></td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"></td></tr>
+ <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a653d04c0c6349350489c0ea5f68563f1">FCommitTuningRecord</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"></td></tr>
+ <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a44b8d5e2721f12bdaf1a457b85f23124">FCommitWorkload</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"></td></tr>
+ <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1Object.html#a9e84841ca982bff376a978ade0132631">FDeleter</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1Object.html">tvm::runtime::Object</a></td><td class="entry"></td></tr>
+ <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a574d90736eda21019540d4a26c155b28">FGetAllTuningRecords</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"></td></tr>
+ <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#ad5e04e950cd2a63f439d95285b5674b6">FGetTopK</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"></td></tr>
+ <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#afb177ebca29227e94c3b3036b4908bad">FHasWorkload</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"></td></tr>
+ <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a713ae7e8634c0aedc366dffda2c899df">FQueryIRModule</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"></td></tr>
+ <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a16c17595db4a845b3511d6d7fa0f741d">FQuerySchedule</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"></td></tr>
+ <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#acd7fb3619d530c0ae85fb1d6e94f6e7d">FQueryTuningRecord</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"></td></tr>
<tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a34efc3d18473d179b13332abe5c63324">FSize</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"></td></tr>
<tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#ad07d7d9e78771eaa2e6e65f84e032401">GetAllTuningRecords</a>() final</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">virtual</span></td></tr>
<tr><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1Object.html#a726972ff315c446192df94027ddea032">GetOrAllocRuntimeTypeIndex</a>(const std::string &key, uint32_t static_tindex, uint32_t parent_tindex, uint32_t type_child_slots, bool type_child_slots_can_overflow)</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1Object.html">tvm::runtime::Object</a></td><td class="entry"><span class="mlabel">protected</span><span class="mlabel">static</span></td></tr>
@@ -108,9 +114,9 @@ $(function() {
<tr class="even"><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1Object.html#aa1612f69ea5b4225d4cda759cd517323">Object</a>(Object &&other)</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1Object.html">tvm::runtime::Object</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
<tr><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1Object.html#a69c32fbd96181f5c21d2c878ab285e4f">operator=</a>(const Object &other)</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1Object.html">tvm::runtime::Object</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
<tr class="even"><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1Object.html#ae341e561272ff43cdcbc927bc29ac50d">operator=</a>(Object &&other)</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1Object.html">tvm::runtime::Object</a></td><td class="entry"><span class="mlabel">inline</span></td></tr>
- <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1DatabaseNode.html#aeb4101db551afa93ea144b9b173783a0">QueryIRModule</a>(const IRModule &mod, const Target &target, const String &workload_name)</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1DatabaseNode.html">tvm::meta_schedule::DatabaseNode</a></td><td class="entry"><span class="mlabel">virtual</span></td></tr>
- <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1DatabaseNode.html#a638febf77b9cb7590d6babb28a97a020">QuerySchedule</a>(const IRModule &mod, const Target &target, const String &workload_name)</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1DatabaseNode.html">tvm::meta_schedule::DatabaseNode</a></td><td class="entry"><span class="mlabel">virtual</span></td></tr>
- <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1DatabaseNode.html#adb5dd2d61af2ac335d68b402c057d612">QueryTuningRecord</a>(const IRModule &mod, const Target &target, const String &workload_name)</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1DatabaseNode.html">tvm::meta_schedule::DatabaseNode</a></td><td class="entry"><span class="mlabel">virtual</span></td></tr>
+ <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a4a21df0e4369b208e8d0332c0dcdfee3">QueryIRModule</a>(const IRModule &mod, const Target &target, const String &workload_name) final</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">virtual</span></td></tr>
+ <tr class="even"><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a340ce2715f3f9be3ded8a4560a45f5d3">QuerySchedule</a>(const IRModule &mod, const Target &target, const String &workload_name) final</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">virtual</span></td></tr>
+ <tr><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a76186192f9e7e52d8c9f1e3b53fe0e60">QueryTuningRecord</a>(const IRModule &mod, const Target &target, const String &workload_name) final</td><td class="entry"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html">tvm::meta_schedule::PyDatabaseNode</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">virtual</span></td></tr>
<tr class="even"><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1Object.html#a0d492efee331e2239a093f4b2017c10f">ref_counter_</a></td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1Object.html">tvm::runtime::Object</a></td><td class="entry"><span class="mlabel">protected</span></td></tr>
<tr><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1Object.html#a55549a6c23987890246248682560a03d">RefCounterType</a> typedef</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1Object.html">tvm::runtime::Object</a></td><td class="entry"></td></tr>
<tr class="even"><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1Object.html#ad94d79729ac85aa7c976e23d39066383">RuntimeTypeIndex</a>()</td><td class="entry"><a class="el" href="classtvm_1_1runtime_1_1Object.html">tvm::runtime::Object</a></td><td class="entry"><span class="mlabel">inline</span><span class="mlabel">static</span></td></tr>
diff --git a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1PyDatabaseNode.html b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1PyDatabaseNode.html
index 7eefff8f05..233aeddfe0 100644
--- a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1PyDatabaseNode.html
+++ b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1PyDatabaseNode.html
@@ -80,13 +80,13 @@ $(function() {
<div class="dynheader">
Inheritance diagram for tvm::meta_schedule::PyDatabaseNode:</div>
<div class="dyncontent">
-<div class="center"><iframe scrolling="no" frameborder="0" src="classtvm_1_1meta__schedule_1_1PyDatabaseNode__inherit__graph.svg" width="290" height="1160"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
+<div class="center"><iframe scrolling="no" frameborder="0" src="classtvm_1_1meta__schedule_1_1PyDatabaseNode__inherit__graph.svg" width="290" height="1248"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
</div>
</div>
<div class="dynheader">
Collaboration diagram for tvm::meta_schedule::PyDatabaseNode:</div>
<div class="dyncontent">
-<div class="center"><iframe scrolling="no" frameborder="0" src="classtvm_1_1meta__schedule_1_1PyDatabaseNode__coll__graph.svg" width="1816" height="1074"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
+<div class="center"><iframe scrolling="no" frameborder="0" src="classtvm_1_1meta__schedule_1_1PyDatabaseNode__coll__graph.svg" width="2527" height="1118"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
</div>
</div>
<table class="memberdecls">
@@ -107,6 +107,15 @@ Public Types</h2></td></tr>
<tr class="memitem:a574d90736eda21019540d4a26c155b28"><td class="memItemLeft" align="right" valign="top">using </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a574d90736eda21019540d4a26c155b28">FGetAllTuningRecords</a> = <a class="el" href="classtvm_1_1runtime_1_1TypedPackedFunc.html">runtime::TypedPackedFunc</a>< <a class="el" href="classtvm_1_1runtime_1_1Array.html">Array</a>< <a class="el" href="classtvm_1_ [...]
<tr class="memdesc:a574d90736eda21019540d4a26c155b28"><td class="mdescLeft"> </td><td class="mdescRight">The function type of <code>GetAllTuningRecords</code> method. <a href="#a574d90736eda21019540d4a26c155b28">More...</a><br /></td></tr>
<tr class="separator:a574d90736eda21019540d4a26c155b28"><td class="memSeparator" colspan="2"> </td></tr>
+<tr class="memitem:acd7fb3619d530c0ae85fb1d6e94f6e7d"><td class="memItemLeft" align="right" valign="top">using </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#acd7fb3619d530c0ae85fb1d6e94f6e7d">FQueryTuningRecord</a> = <a class="el" href="classtvm_1_1runtime_1_1TypedPackedFunc.html">runtime::TypedPackedFunc</a>< <a class="el" href="classtvm_1_1runtime_1_1Optional.html">Optional</a>< <a class="el" href="classtv [...]
+<tr class="memdesc:acd7fb3619d530c0ae85fb1d6e94f6e7d"><td class="mdescLeft"> </td><td class="mdescRight">The function type of <code>QueryTuningRecord</code> method. <a href="#acd7fb3619d530c0ae85fb1d6e94f6e7d">More...</a><br /></td></tr>
+<tr class="separator:acd7fb3619d530c0ae85fb1d6e94f6e7d"><td class="memSeparator" colspan="2"> </td></tr>
+<tr class="memitem:a16c17595db4a845b3511d6d7fa0f741d"><td class="memItemLeft" align="right" valign="top">using </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a16c17595db4a845b3511d6d7fa0f741d">FQuerySchedule</a> = <a class="el" href="classtvm_1_1runtime_1_1TypedPackedFunc.html">runtime::TypedPackedFunc</a>< <a class="el" href="classtvm_1_1runtime_1_1Optional.html">Optional</a>< <a class="el" href="classtvm_1_ [...]
+<tr class="memdesc:a16c17595db4a845b3511d6d7fa0f741d"><td class="mdescLeft"> </td><td class="mdescRight">The function type of <code>QuerySchedule</code> method. <a href="#a16c17595db4a845b3511d6d7fa0f741d">More...</a><br /></td></tr>
+<tr class="separator:a16c17595db4a845b3511d6d7fa0f741d"><td class="memSeparator" colspan="2"> </td></tr>
+<tr class="memitem:a713ae7e8634c0aedc366dffda2c899df"><td class="memItemLeft" align="right" valign="top">using </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a713ae7e8634c0aedc366dffda2c899df">FQueryIRModule</a> = <a class="el" href="classtvm_1_1runtime_1_1TypedPackedFunc.html">runtime::TypedPackedFunc</a>< <a class="el" href="classtvm_1_1runtime_1_1Optional.html">Optional</a>< <a class="el" href="classtvm_1_ [...]
+<tr class="memdesc:a713ae7e8634c0aedc366dffda2c899df"><td class="mdescLeft"> </td><td class="mdescRight">The function type of <code>QueryIRModule</code> method. <a href="#a713ae7e8634c0aedc366dffda2c899df">More...</a><br /></td></tr>
+<tr class="separator:a713ae7e8634c0aedc366dffda2c899df"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a34efc3d18473d179b13332abe5c63324"><td class="memItemLeft" align="right" valign="top">using </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a34efc3d18473d179b13332abe5c63324">FSize</a> = <a class="el" href="classtvm_1_1runtime_1_1TypedPackedFunc.html">runtime::TypedPackedFunc</a>< int64_t()></td></tr>
<tr class="memdesc:a34efc3d18473d179b13332abe5c63324"><td class="mdescLeft"> </td><td class="mdescRight">The function type of <code>Size</code> method. <a href="#a34efc3d18473d179b13332abe5c63324">More...</a><br /></td></tr>
<tr class="separator:a34efc3d18473d179b13332abe5c63324"><td class="memSeparator" colspan="2"> </td></tr>
@@ -136,6 +145,15 @@ Public Member Functions</h2></td></tr>
<tr class="memitem:ad07d7d9e78771eaa2e6e65f84e032401"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classtvm_1_1runtime_1_1Array.html">Array</a>< <a class="el" href="classtvm_1_1meta__schedule_1_1TuningRecord.html">TuningRecord</a> > </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#ad07d7d9e78771eaa2e6e65f84e032401">GetAllTuningRecords</a> () final</td></tr>
<tr class="memdesc:ad07d7d9e78771eaa2e6e65f84e032401"><td class="mdescLeft"> </td><td class="mdescRight">Get all tuning records from the database. <a href="#ad07d7d9e78771eaa2e6e65f84e032401">More...</a><br /></td></tr>
<tr class="separator:ad07d7d9e78771eaa2e6e65f84e032401"><td class="memSeparator" colspan="2"> </td></tr>
+<tr class="memitem:a76186192f9e7e52d8c9f1e3b53fe0e60"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classtvm_1_1runtime_1_1Optional.html">Optional</a>< <a class="el" href="classtvm_1_1meta__schedule_1_1TuningRecord.html">TuningRecord</a> > </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a76186192f9e7e52d8c9f1e3b53fe0e60">QueryTuningRecord</a> (const <a class="el" href="classtvm_1_1IRMo [...]
+<tr class="memdesc:a76186192f9e7e52d8c9f1e3b53fe0e60"><td class="mdescLeft"> </td><td class="mdescRight">Query the best record of the given workload from the database. <a href="#a76186192f9e7e52d8c9f1e3b53fe0e60">More...</a><br /></td></tr>
+<tr class="separator:a76186192f9e7e52d8c9f1e3b53fe0e60"><td class="memSeparator" colspan="2"> </td></tr>
+<tr class="memitem:a340ce2715f3f9be3ded8a4560a45f5d3"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classtvm_1_1runtime_1_1Optional.html">Optional</a>< <a class="el" href="classtvm_1_1tir_1_1Schedule.html">tir::Schedule</a> > </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a340ce2715f3f9be3ded8a4560a45f5d3">QuerySchedule</a> (const <a class="el" href="classtvm_1_1IRModule.html">IRModul [...]
+<tr class="memdesc:a340ce2715f3f9be3ded8a4560a45f5d3"><td class="mdescLeft"> </td><td class="mdescRight">Query the best schedule of the given workload from the database. <a href="#a340ce2715f3f9be3ded8a4560a45f5d3">More...</a><br /></td></tr>
+<tr class="separator:a340ce2715f3f9be3ded8a4560a45f5d3"><td class="memSeparator" colspan="2"> </td></tr>
+<tr class="memitem:a4a21df0e4369b208e8d0332c0dcdfee3"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classtvm_1_1runtime_1_1Optional.html">Optional</a>< <a class="el" href="classtvm_1_1IRModule.html">IRModule</a> > </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a4a21df0e4369b208e8d0332c0dcdfee3">QueryIRModule</a> (const <a class="el" href="classtvm_1_1IRModule.html">IRModule</a> &m [...]
+<tr class="memdesc:a4a21df0e4369b208e8d0332c0dcdfee3"><td class="mdescLeft"> </td><td class="mdescRight">Query the best <a class="el" href="classtvm_1_1IRModule.html" title="Managed reference class to IRModuleNode. ">IRModule</a> of the given workload from the database. <a href="#a4a21df0e4369b208e8d0332c0dcdfee3">More...</a><br /></td></tr>
+<tr class="separator:a4a21df0e4369b208e8d0332c0dcdfee3"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:a36817d04978253571fef7d01427ce9c0"><td class="memItemLeft" align="right" valign="top">int64_t </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a36817d04978253571fef7d01427ce9c0">Size</a> () final</td></tr>
<tr class="memdesc:a36817d04978253571fef7d01427ce9c0"><td class="mdescLeft"> </td><td class="mdescRight">Get the size of the database. <a href="#a36817d04978253571fef7d01427ce9c0">More...</a><br /></td></tr>
<tr class="separator:a36817d04978253571fef7d01427ce9c0"><td class="memSeparator" colspan="2"> </td></tr>
@@ -145,15 +163,6 @@ Public Member Functions</h2></td></tr>
<tr class="memitem:a776359f44ac6b51e337d4a1efc3f04a9 inherit pub_methods_classtvm_1_1meta__schedule_1_1DatabaseNode"><td class="memItemLeft" align="right" valign="top">virtual </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1DatabaseNode.html#a776359f44ac6b51e337d4a1efc3f04a9">~DatabaseNode</a> ()=default</td></tr>
<tr class="memdesc:a776359f44ac6b51e337d4a1efc3f04a9 inherit pub_methods_classtvm_1_1meta__schedule_1_1DatabaseNode"><td class="mdescLeft"> </td><td class="mdescRight">Default destructor. <a href="classtvm_1_1meta__schedule_1_1DatabaseNode.html#a776359f44ac6b51e337d4a1efc3f04a9">More...</a><br /></td></tr>
<tr class="separator:a776359f44ac6b51e337d4a1efc3f04a9 inherit pub_methods_classtvm_1_1meta__schedule_1_1DatabaseNode"><td class="memSeparator" colspan="2"> </td></tr>
-<tr class="memitem:adb5dd2d61af2ac335d68b402c057d612 inherit pub_methods_classtvm_1_1meta__schedule_1_1DatabaseNode"><td class="memItemLeft" align="right" valign="top">virtual <a class="el" href="classtvm_1_1runtime_1_1Optional.html">Optional</a>< <a class="el" href="classtvm_1_1meta__schedule_1_1TuningRecord.html">TuningRecord</a> > </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1DatabaseNode.html#adb5dd2d61af2ac335d68b402c057d6 [...]
-<tr class="memdesc:adb5dd2d61af2ac335d68b402c057d612 inherit pub_methods_classtvm_1_1meta__schedule_1_1DatabaseNode"><td class="mdescLeft"> </td><td class="mdescRight">Query the best record of the given workload from the database. <a href="classtvm_1_1meta__schedule_1_1DatabaseNode.html#adb5dd2d61af2ac335d68b402c057d612">More...</a><br /></td></tr>
-<tr class="separator:adb5dd2d61af2ac335d68b402c057d612 inherit pub_methods_classtvm_1_1meta__schedule_1_1DatabaseNode"><td class="memSeparator" colspan="2"> </td></tr>
-<tr class="memitem:a638febf77b9cb7590d6babb28a97a020 inherit pub_methods_classtvm_1_1meta__schedule_1_1DatabaseNode"><td class="memItemLeft" align="right" valign="top">virtual <a class="el" href="classtvm_1_1runtime_1_1Optional.html">Optional</a>< <a class="el" href="classtvm_1_1tir_1_1Schedule.html">tir::Schedule</a> > </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1DatabaseNode.html#a638febf77b9cb7590d6babb28a97a020">QuerySched [...]
-<tr class="memdesc:a638febf77b9cb7590d6babb28a97a020 inherit pub_methods_classtvm_1_1meta__schedule_1_1DatabaseNode"><td class="mdescLeft"> </td><td class="mdescRight">Query the best schedule of the given workload from the database. <a href="classtvm_1_1meta__schedule_1_1DatabaseNode.html#a638febf77b9cb7590d6babb28a97a020">More...</a><br /></td></tr>
-<tr class="separator:a638febf77b9cb7590d6babb28a97a020 inherit pub_methods_classtvm_1_1meta__schedule_1_1DatabaseNode"><td class="memSeparator" colspan="2"> </td></tr>
-<tr class="memitem:aeb4101db551afa93ea144b9b173783a0 inherit pub_methods_classtvm_1_1meta__schedule_1_1DatabaseNode"><td class="memItemLeft" align="right" valign="top">virtual <a class="el" href="classtvm_1_1runtime_1_1Optional.html">Optional</a>< <a class="el" href="classtvm_1_1IRModule.html">IRModule</a> > </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1DatabaseNode.html#aeb4101db551afa93ea144b9b173783a0">QueryIRModule</a> (con [...]
-<tr class="memdesc:aeb4101db551afa93ea144b9b173783a0 inherit pub_methods_classtvm_1_1meta__schedule_1_1DatabaseNode"><td class="mdescLeft"> </td><td class="mdescRight">Query the best <a class="el" href="classtvm_1_1IRModule.html" title="Managed reference class to IRModuleNode. ">IRModule</a> of the given workload from the database. <a href="classtvm_1_1meta__schedule_1_1DatabaseNode.html#aeb4101db551afa93ea144b9b173783a0">More...</a><br /></td></tr>
-<tr class="separator:aeb4101db551afa93ea144b9b173783a0 inherit pub_methods_classtvm_1_1meta__schedule_1_1DatabaseNode"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:ae7ea55bfa3703dfb6452573afc31a45e inherit pub_methods_classtvm_1_1meta__schedule_1_1DatabaseNode"><td class="memItemLeft" align="right" valign="top"> </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1DatabaseNode.html#ae7ea55bfa3703dfb6452573afc31a45e">TVM_DECLARE_BASE_OBJECT_INFO</a> (<a class="el" href="classtvm_1_1meta__schedule_1_1DatabaseNode.html">DatabaseNode</a>, <a class="el" href="classtvm_1_1runtime_1_1Objec [...]
<tr class="separator:ae7ea55bfa3703dfb6452573afc31a45e inherit pub_methods_classtvm_1_1meta__schedule_1_1DatabaseNode"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="inherit_header pub_methods_classtvm_1_1runtime_1_1Object"><td colspan="2" onclick="javascript:toggleInherit('pub_methods_classtvm_1_1runtime_1_1Object')"><img src="closed.png" alt="-"/> Public Member Functions inherited from <a class="el" href="classtvm_1_1runtime_1_1Object.html">tvm::runtime::Object</a></td></tr>
@@ -196,6 +205,15 @@ Public Attributes</h2></td></tr>
<tr class="memitem:aafdd0874be052072521b2aa8a6c56d5f"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a574d90736eda21019540d4a26c155b28">FGetAllTuningRecords</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#aafdd0874be052072521b2aa8a6c56d5f">f_get_all_tuning_records</a></td></tr>
<tr class="memdesc:aafdd0874be052072521b2aa8a6c56d5f"><td class="mdescLeft"> </td><td class="mdescRight">The packed function to the <code>GetAllTuningRecords</code> function. <a href="#aafdd0874be052072521b2aa8a6c56d5f">More...</a><br /></td></tr>
<tr class="separator:aafdd0874be052072521b2aa8a6c56d5f"><td class="memSeparator" colspan="2"> </td></tr>
+<tr class="memitem:a65fcb9b59b8ce6e685fb62c4459c57ba"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#acd7fb3619d530c0ae85fb1d6e94f6e7d">FQueryTuningRecord</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a65fcb9b59b8ce6e685fb62c4459c57ba">f_query_tuning_record</a></td></tr>
+<tr class="memdesc:a65fcb9b59b8ce6e685fb62c4459c57ba"><td class="mdescLeft"> </td><td class="mdescRight">The packed function to the <code>QueryTuningRecord</code> function. <a href="#a65fcb9b59b8ce6e685fb62c4459c57ba">More...</a><br /></td></tr>
+<tr class="separator:a65fcb9b59b8ce6e685fb62c4459c57ba"><td class="memSeparator" colspan="2"> </td></tr>
+<tr class="memitem:a4a03c70569c9a18059861dfb5c90e845"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a16c17595db4a845b3511d6d7fa0f741d">FQuerySchedule</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a4a03c70569c9a18059861dfb5c90e845">f_query_schedule</a></td></tr>
+<tr class="memdesc:a4a03c70569c9a18059861dfb5c90e845"><td class="mdescLeft"> </td><td class="mdescRight">The packed function to the <code>QuerySchedule</code> function. <a href="#a4a03c70569c9a18059861dfb5c90e845">More...</a><br /></td></tr>
+<tr class="separator:a4a03c70569c9a18059861dfb5c90e845"><td class="memSeparator" colspan="2"> </td></tr>
+<tr class="memitem:abd9fc8fc83bc6c252465ffdbcb310bfc"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a713ae7e8634c0aedc366dffda2c899df">FQueryIRModule</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#abd9fc8fc83bc6c252465ffdbcb310bfc">f_query_ir_module</a></td></tr>
+<tr class="memdesc:abd9fc8fc83bc6c252465ffdbcb310bfc"><td class="mdescLeft"> </td><td class="mdescRight">The packed function to the <code>QueryIRModule</code> function. <a href="#abd9fc8fc83bc6c252465ffdbcb310bfc">More...</a><br /></td></tr>
+<tr class="separator:abd9fc8fc83bc6c252465ffdbcb310bfc"><td class="memSeparator" colspan="2"> </td></tr>
<tr class="memitem:ac7ae1a05fe5c7858f5860133a82bc7b7"><td class="memItemLeft" align="right" valign="top"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a34efc3d18473d179b13332abe5c63324">FSize</a> </td><td class="memItemRight" valign="bottom"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#ac7ae1a05fe5c7858f5860133a82bc7b7">f_size</a></td></tr>
<tr class="memdesc:ac7ae1a05fe5c7858f5860133a82bc7b7"><td class="mdescLeft"> </td><td class="mdescRight">The packed function to the <code>Size</code> function. <a href="#ac7ae1a05fe5c7858f5860133a82bc7b7">More...</a><br /></td></tr>
<tr class="separator:ac7ae1a05fe5c7858f5860133a82bc7b7"><td class="memSeparator" colspan="2"> </td></tr>
@@ -373,6 +391,81 @@ Additional Inherited Members</h2></td></tr>
</dl>
<dl class="section return"><dt>Returns</dt><dd>Whether the database has the given workload. </dd></dl>
+</div>
+</div>
+<a id="a713ae7e8634c0aedc366dffda2c899df"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a713ae7e8634c0aedc366dffda2c899df">◆ </a></span>FQueryIRModule</h2>
+
+<div class="memitem">
+<div class="memproto">
+ <table class="memname">
+ <tr>
+ <td class="memname">using <a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a713ae7e8634c0aedc366dffda2c899df">tvm::meta_schedule::PyDatabaseNode::FQueryIRModule</a> = <a class="el" href="classtvm_1_1runtime_1_1TypedPackedFunc.html">runtime::TypedPackedFunc</a><<a class="el" href="classtvm_1_1runtime_1_1Optional.html">Optional</a><<a class="el" href="classtvm_1_1IRModule.html">IRModule</a>>(const <a class="el" href="classtvm_1_1IRModule.html">IRMod [...]
+ </tr>
+ </table>
+</div><div class="memdoc">
+
+<p>The function type of <code>QueryIRModule</code> method. </p>
+<dl class="params"><dt>Parameters</dt><dd>
+ <table class="params">
+ <tr><td class="paramname">mod</td><td>The <a class="el" href="classtvm_1_1IRModule.html" title="Managed reference class to IRModuleNode. ">IRModule</a> to be searched for. </td></tr>
+ <tr><td class="paramname">target</td><td>The target to be searched for. </td></tr>
+ <tr><td class="paramname">workload_name</td><td>The name of the workload to be searched for. </td></tr>
+ </table>
+ </dd>
+</dl>
+<dl class="section return"><dt>Returns</dt><dd>The <a class="el" href="classtvm_1_1IRModule.html" title="Managed reference class to IRModuleNode. ">IRModule</a> in the best <a class="el" href="classtvm_1_1IRModule.html" title="Managed reference class to IRModuleNode. ">IRModule</a> of the given workload; NullOpt if not found. </dd></dl>
+
+</div>
+</div>
+<a id="a16c17595db4a845b3511d6d7fa0f741d"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a16c17595db4a845b3511d6d7fa0f741d">◆ </a></span>FQuerySchedule</h2>
+
+<div class="memitem">
+<div class="memproto">
+ <table class="memname">
+ <tr>
+ <td class="memname">using <a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a16c17595db4a845b3511d6d7fa0f741d">tvm::meta_schedule::PyDatabaseNode::FQuerySchedule</a> = <a class="el" href="classtvm_1_1runtime_1_1TypedPackedFunc.html">runtime::TypedPackedFunc</a><<a class="el" href="classtvm_1_1runtime_1_1Optional.html">Optional</a><<a class="el" href="classtvm_1_1tir_1_1Schedule.html">tir::Schedule</a>>( const <a class="el" href="classtvm_1_1IRModul [...]
+ </tr>
+ </table>
+</div><div class="memdoc">
+
+<p>The function type of <code>QuerySchedule</code> method. </p>
+<dl class="params"><dt>Parameters</dt><dd>
+ <table class="params">
+ <tr><td class="paramname">mod</td><td>The <a class="el" href="classtvm_1_1IRModule.html" title="Managed reference class to IRModuleNode. ">IRModule</a> to be searched for. </td></tr>
+ <tr><td class="paramname">target</td><td>The target to be searched for. </td></tr>
+ <tr><td class="paramname">workload_name</td><td>The name of the workload to be searched for. </td></tr>
+ </table>
+ </dd>
+</dl>
+<dl class="section return"><dt>Returns</dt><dd>The schedule in the best schedule of the given workload; NullOpt if not found. </dd></dl>
+
+</div>
+</div>
+<a id="acd7fb3619d530c0ae85fb1d6e94f6e7d"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#acd7fb3619d530c0ae85fb1d6e94f6e7d">◆ </a></span>FQueryTuningRecord</h2>
+
+<div class="memitem">
+<div class="memproto">
+ <table class="memname">
+ <tr>
+ <td class="memname">using <a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#acd7fb3619d530c0ae85fb1d6e94f6e7d">tvm::meta_schedule::PyDatabaseNode::FQueryTuningRecord</a> = <a class="el" href="classtvm_1_1runtime_1_1TypedPackedFunc.html">runtime::TypedPackedFunc</a><<a class="el" href="classtvm_1_1runtime_1_1Optional.html">Optional</a><<a class="el" href="classtvm_1_1meta__schedule_1_1TuningRecord.html">TuningRecord</a>>( const <a class="el" href="c [...]
+ </tr>
+ </table>
+</div><div class="memdoc">
+
+<p>The function type of <code>QueryTuningRecord</code> method. </p>
+<dl class="params"><dt>Parameters</dt><dd>
+ <table class="params">
+ <tr><td class="paramname">mod</td><td>The <a class="el" href="classtvm_1_1IRModule.html" title="Managed reference class to IRModuleNode. ">IRModule</a> to be searched for. </td></tr>
+ <tr><td class="paramname">target</td><td>The target to be searched for. </td></tr>
+ <tr><td class="paramname">workload_name</td><td>The name of the workload to be searched for. </td></tr>
+ </table>
+ </dd>
+</dl>
+<dl class="section return"><dt>Returns</dt><dd>The best record of the given workload; NullOpt if not found. </dd></dl>
+
</div>
</div>
<a id="a34efc3d18473d179b13332abe5c63324"></a>
@@ -579,6 +672,171 @@ Additional Inherited Members</h2></td></tr>
<p>Implements <a class="el" href="classtvm_1_1meta__schedule_1_1DatabaseNode.html#a04b2ddf6acb509d5cc848c8636f9619d">tvm::meta_schedule::DatabaseNode</a>.</p>
+</div>
+</div>
+<a id="a4a21df0e4369b208e8d0332c0dcdfee3"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a4a21df0e4369b208e8d0332c0dcdfee3">◆ </a></span>QueryIRModule()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+ <tr>
+ <td class="mlabels-left">
+ <table class="memname">
+ <tr>
+ <td class="memname"><a class="el" href="classtvm_1_1runtime_1_1Optional.html">Optional</a><<a class="el" href="classtvm_1_1IRModule.html">IRModule</a>> tvm::meta_schedule::PyDatabaseNode::QueryIRModule </td>
+ <td>(</td>
+ <td class="paramtype">const <a class="el" href="classtvm_1_1IRModule.html">IRModule</a> & </td>
+ <td class="paramname"><em>mod</em>, </td>
+ </tr>
+ <tr>
+ <td class="paramkey"></td>
+ <td></td>
+ <td class="paramtype">const <a class="el" href="classtvm_1_1Target.html">Target</a> & </td>
+ <td class="paramname"><em>target</em>, </td>
+ </tr>
+ <tr>
+ <td class="paramkey"></td>
+ <td></td>
+ <td class="paramtype">const <a class="el" href="classtvm_1_1runtime_1_1String.html">String</a> & </td>
+ <td class="paramname"><em>workload_name</em> </td>
+ </tr>
+ <tr>
+ <td></td>
+ <td>)</td>
+ <td></td><td></td>
+ </tr>
+ </table>
+ </td>
+ <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">final</span><span class="mlabel">virtual</span></span> </td>
+ </tr>
+</table>
+</div><div class="memdoc">
+
+<p>Query the best <a class="el" href="classtvm_1_1IRModule.html" title="Managed reference class to IRModuleNode. ">IRModule</a> of the given workload from the database. </p>
+<dl class="params"><dt>Parameters</dt><dd>
+ <table class="params">
+ <tr><td class="paramname">mod</td><td>The <a class="el" href="classtvm_1_1IRModule.html" title="Managed reference class to IRModuleNode. ">IRModule</a> to be searched for. </td></tr>
+ <tr><td class="paramname">target</td><td>The target to be searched for. </td></tr>
+ <tr><td class="paramname">workload_name</td><td>The name of the workload to be searched for. </td></tr>
+ </table>
+ </dd>
+</dl>
+<dl class="section return"><dt>Returns</dt><dd>The <a class="el" href="classtvm_1_1IRModule.html" title="Managed reference class to IRModuleNode. ">IRModule</a> in the best <a class="el" href="classtvm_1_1IRModule.html" title="Managed reference class to IRModuleNode. ">IRModule</a> of the given workload; NullOpt if not found. </dd></dl>
+
+<p>Reimplemented from <a class="el" href="classtvm_1_1meta__schedule_1_1DatabaseNode.html#aeb4101db551afa93ea144b9b173783a0">tvm::meta_schedule::DatabaseNode</a>.</p>
+
+</div>
+</div>
+<a id="a340ce2715f3f9be3ded8a4560a45f5d3"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a340ce2715f3f9be3ded8a4560a45f5d3">◆ </a></span>QuerySchedule()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+ <tr>
+ <td class="mlabels-left">
+ <table class="memname">
+ <tr>
+ <td class="memname"><a class="el" href="classtvm_1_1runtime_1_1Optional.html">Optional</a><<a class="el" href="classtvm_1_1tir_1_1Schedule.html">tir::Schedule</a>> tvm::meta_schedule::PyDatabaseNode::QuerySchedule </td>
+ <td>(</td>
+ <td class="paramtype">const <a class="el" href="classtvm_1_1IRModule.html">IRModule</a> & </td>
+ <td class="paramname"><em>mod</em>, </td>
+ </tr>
+ <tr>
+ <td class="paramkey"></td>
+ <td></td>
+ <td class="paramtype">const <a class="el" href="classtvm_1_1Target.html">Target</a> & </td>
+ <td class="paramname"><em>target</em>, </td>
+ </tr>
+ <tr>
+ <td class="paramkey"></td>
+ <td></td>
+ <td class="paramtype">const <a class="el" href="classtvm_1_1runtime_1_1String.html">String</a> & </td>
+ <td class="paramname"><em>workload_name</em> </td>
+ </tr>
+ <tr>
+ <td></td>
+ <td>)</td>
+ <td></td><td></td>
+ </tr>
+ </table>
+ </td>
+ <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">final</span><span class="mlabel">virtual</span></span> </td>
+ </tr>
+</table>
+</div><div class="memdoc">
+
+<p>Query the best schedule of the given workload from the database. </p>
+<dl class="params"><dt>Parameters</dt><dd>
+ <table class="params">
+ <tr><td class="paramname">mod</td><td>The <a class="el" href="classtvm_1_1IRModule.html" title="Managed reference class to IRModuleNode. ">IRModule</a> to be searched for. </td></tr>
+ <tr><td class="paramname">target</td><td>The target to be searched for. </td></tr>
+ <tr><td class="paramname">workload_name</td><td>The name of the workload to be searched for. </td></tr>
+ </table>
+ </dd>
+</dl>
+<dl class="section return"><dt>Returns</dt><dd>The best schedule of the given workload; NullOpt if not found. </dd></dl>
+
+<p>Reimplemented from <a class="el" href="classtvm_1_1meta__schedule_1_1DatabaseNode.html#a638febf77b9cb7590d6babb28a97a020">tvm::meta_schedule::DatabaseNode</a>.</p>
+
+</div>
+</div>
+<a id="a76186192f9e7e52d8c9f1e3b53fe0e60"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a76186192f9e7e52d8c9f1e3b53fe0e60">◆ </a></span>QueryTuningRecord()</h2>
+
+<div class="memitem">
+<div class="memproto">
+<table class="mlabels">
+ <tr>
+ <td class="mlabels-left">
+ <table class="memname">
+ <tr>
+ <td class="memname"><a class="el" href="classtvm_1_1runtime_1_1Optional.html">Optional</a>&lt;<a class="el" href="classtvm_1_1meta__schedule_1_1TuningRecord.html">TuningRecord</a>&gt; tvm::meta_schedule::PyDatabaseNode::QueryTuningRecord </td>
+ <td>(</td>
+ <td class="paramtype">const <a class="el" href="classtvm_1_1IRModule.html">IRModule</a> & </td>
+ <td class="paramname"><em>mod</em>, </td>
+ </tr>
+ <tr>
+ <td class="paramkey"></td>
+ <td></td>
+ <td class="paramtype">const <a class="el" href="classtvm_1_1Target.html">Target</a> & </td>
+ <td class="paramname"><em>target</em>, </td>
+ </tr>
+ <tr>
+ <td class="paramkey"></td>
+ <td></td>
+ <td class="paramtype">const <a class="el" href="classtvm_1_1runtime_1_1String.html">String</a> & </td>
+ <td class="paramname"><em>workload_name</em> </td>
+ </tr>
+ <tr>
+ <td></td>
+ <td>)</td>
+ <td></td><td></td>
+ </tr>
+ </table>
+ </td>
+ <td class="mlabels-right">
+<span class="mlabels"><span class="mlabel">inline</span><span class="mlabel">final</span><span class="mlabel">virtual</span></span> </td>
+ </tr>
+</table>
+</div><div class="memdoc">
+
+<p>Query the best record of the given workload from the database. </p>
+<dl class="params"><dt>Parameters</dt><dd>
+ <table class="params">
+ <tr><td class="paramname">mod</td><td>The <a class="el" href="classtvm_1_1IRModule.html" title="Managed reference class to IRModuleNode. ">IRModule</a> to be searched for. </td></tr>
+ <tr><td class="paramname">target</td><td>The target to be searched for. </td></tr>
+ <tr><td class="paramname">workload_name</td><td>The name of the workload to be searched for. </td></tr>
+ </table>
+ </dd>
+</dl>
+<dl class="section return"><dt>Returns</dt><dd>The best record of the given workload; NullOpt if not found. </dd></dl>
+
+<p>Reimplemented from <a class="el" href="classtvm_1_1meta__schedule_1_1DatabaseNode.html#adb5dd2d61af2ac335d68b402c057d612">tvm::meta_schedule::DatabaseNode</a>.</p>
+
</div>
</div>
<a id="a36817d04978253571fef7d01427ce9c0"></a>
@@ -766,6 +1024,54 @@ Additional Inherited Members</h2></td></tr>
<p>The packed function to the <code>HasWorkload</code> function. </p>
+</div>
+</div>
+<a id="abd9fc8fc83bc6c252465ffdbcb310bfc"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#abd9fc8fc83bc6c252465ffdbcb310bfc">◆ </a></span>f_query_ir_module</h2>
+
+<div class="memitem">
+<div class="memproto">
+ <table class="memname">
+ <tr>
+ <td class="memname"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a713ae7e8634c0aedc366dffda2c899df">FQueryIRModule</a> tvm::meta_schedule::PyDatabaseNode::f_query_ir_module</td>
+ </tr>
+ </table>
+</div><div class="memdoc">
+
+<p>The packed function to the <code>QueryIRModule</code> function. </p>
+
+</div>
+</div>
+<a id="a4a03c70569c9a18059861dfb5c90e845"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a4a03c70569c9a18059861dfb5c90e845">◆ </a></span>f_query_schedule</h2>
+
+<div class="memitem">
+<div class="memproto">
+ <table class="memname">
+ <tr>
+ <td class="memname"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#a16c17595db4a845b3511d6d7fa0f741d">FQuerySchedule</a> tvm::meta_schedule::PyDatabaseNode::f_query_schedule</td>
+ </tr>
+ </table>
+</div><div class="memdoc">
+
+<p>The packed function to the <code>QuerySchedule</code> function. </p>
+
+</div>
+</div>
+<a id="a65fcb9b59b8ce6e685fb62c4459c57ba"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#a65fcb9b59b8ce6e685fb62c4459c57ba">◆ </a></span>f_query_tuning_record</h2>
+
+<div class="memitem">
+<div class="memproto">
+ <table class="memname">
+ <tr>
+ <td class="memname"><a class="el" href="classtvm_1_1meta__schedule_1_1PyDatabaseNode.html#acd7fb3619d530c0ae85fb1d6e94f6e7d">FQueryTuningRecord</a> tvm::meta_schedule::PyDatabaseNode::f_query_tuning_record</td>
+ </tr>
+ </table>
+</div><div class="memdoc">
+
+<p>The packed function to the <code>QueryTuningRecord</code> function. </p>
+
</div>
</div>
<a id="ac7ae1a05fe5c7858f5860133a82bc7b7"></a>
diff --git a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1PyDatabaseNode__coll__graph.svg b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1PyDatabaseNode__coll__graph.svg
index 8b5caf632e..e6d6349bc5 100644
--- a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1PyDatabaseNode__coll__graph.svg
+++ b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1PyDatabaseNode__coll__graph.svg
@@ -4,241 +4,313 @@
<!-- Generated by graphviz version 2.40.1 (20161225.0304)
-->
<!-- Title: tvm::meta_schedule::PyDatabaseNode Pages: 1 -->
-<svg width="1362pt" height="805pt"
- viewBox="0.00 0.00 1362.00 805.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
-<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 801)">
+<svg width="1895pt" height="838pt"
+ viewBox="0.00 0.00 1895.00 838.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 834)">
<title>tvm::meta_schedule::PyDatabaseNode</title>
-<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-801 1358,-801 1358,4 -4,4"/>
+<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-834 1891,-834 1891,4 -4,4"/>
<!-- Node3 -->
<g id="node1" class="node">
<title>Node3</title>
-<polygon fill="#bfbfbf" stroke="#000000" points="566,-.5 566,-145.5 775,-145.5 775,-.5 566,-.5"/>
-<text text-anchor="start" x="574" y="-133.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::meta_schedule</text>
-<text text-anchor="middle" x="670.5" y="-122.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">::PyDatabaseNode</text>
-<polyline fill="none" stroke="#000000" points="566,-115.5 775,-115.5 "/>
-<text text-anchor="start" x="574" y="-103.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_key</text>
-<polyline fill="none" stroke="#000000" points="566,-96.5 775,-96.5 "/>
-<text text-anchor="start" x="574" y="-84.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ VisitAttrs()</text>
-<text text-anchor="start" x="574" y="-73.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ HasWorkload()</text>
-<text text-anchor="start" x="574" y="-62.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitWorkload()</text>
-<text text-anchor="start" x="574" y="-51.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitTuningRecord()</text>
-<text text-anchor="start" x="574" y="-40.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTopK()</text>
-<text text-anchor="start" x="574" y="-29.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetAllTuningRecords()</text>
-<text text-anchor="start" x="574" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Size()</text>
-<text text-anchor="start" x="574" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TVM_DECLARE_FINAL_OBJECT_INFO()</text>
+<polygon fill="#bfbfbf" stroke="#000000" points="827,-.5 827,-178.5 1036,-178.5 1036,-.5 827,-.5"/>
+<text text-anchor="start" x="835" y="-166.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::meta_schedule</text>
+<text text-anchor="middle" x="931.5" y="-155.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">::PyDatabaseNode</text>
+<polyline fill="none" stroke="#000000" points="827,-148.5 1036,-148.5 "/>
+<text text-anchor="start" x="835" y="-136.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_key</text>
+<polyline fill="none" stroke="#000000" points="827,-129.5 1036,-129.5 "/>
+<text text-anchor="start" x="835" y="-117.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ VisitAttrs()</text>
+<text text-anchor="start" x="835" y="-106.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ HasWorkload()</text>
+<text text-anchor="start" x="835" y="-95.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitWorkload()</text>
+<text text-anchor="start" x="835" y="-84.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitTuningRecord()</text>
+<text text-anchor="start" x="835" y="-73.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTopK()</text>
+<text text-anchor="start" x="835" y="-62.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetAllTuningRecords()</text>
+<text text-anchor="start" x="835" y="-51.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QueryTuningRecord()</text>
+<text text-anchor="start" x="835" y="-40.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QuerySchedule()</text>
+<text text-anchor="start" x="835" y="-29.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QueryIRModule()</text>
+<text text-anchor="start" x="835" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Size()</text>
+<text text-anchor="start" x="835" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TVM_DECLARE_FINAL_OBJECT_INFO()</text>
</g>
<!-- Node4 -->
<g id="node2" class="node">
<title>Node4</title>
<g id="a_node2"><a xlink:href="classtvm_1_1meta__schedule_1_1DatabaseNode.html" target="_top" xlink:title="{tvm::meta_schedule\l::DatabaseNode\n|+ _type_key\l|+ ~DatabaseNode()\l+ HasWorkload()\l+ CommitWorkload()\l+ CommitTuningRecord()\l+ GetTopK()\l+ GetAllTuningRecords()\l+ Size()\l+ QueryTuningRecord()\l+ QuerySchedule()\l+ QueryIRModule()\l+ TVM_DECLARE_BASE_OBJECT_INFO()\l}">
-<polygon fill="#ffffff" stroke="#000000" points="0,-193.5 0,-371.5 207,-371.5 207,-193.5 0,-193.5"/>
-<text text-anchor="start" x="8" y="-359.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::meta_schedule</text>
-<text text-anchor="middle" x="103.5" y="-348.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">::DatabaseNode</text>
-<polyline fill="none" stroke="#000000" points="0,-341.5 207,-341.5 "/>
-<text text-anchor="start" x="8" y="-329.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_key</text>
-<polyline fill="none" stroke="#000000" points="0,-322.5 207,-322.5 "/>
-<text text-anchor="start" x="8" y="-310.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ~DatabaseNode()</text>
-<text text-anchor="start" x="8" y="-299.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ HasWorkload()</text>
-<text text-anchor="start" x="8" y="-288.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitWorkload()</text>
-<text text-anchor="start" x="8" y="-277.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitTuningRecord()</text>
-<text text-anchor="start" x="8" y="-266.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTopK()</text>
-<text text-anchor="start" x="8" y="-255.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetAllTuningRecords()</text>
-<text text-anchor="start" x="8" y="-244.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Size()</text>
-<text text-anchor="start" x="8" y="-233.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QueryTuningRecord()</text>
-<text text-anchor="start" x="8" y="-222.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QuerySchedule()</text>
-<text text-anchor="start" x="8" y="-211.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QueryIRModule()</text>
-<text text-anchor="start" x="8" y="-200.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TVM_DECLARE_BASE_OBJECT_INFO()</text>
+<polygon fill="#ffffff" stroke="#000000" points="0,-226.5 0,-404.5 207,-404.5 207,-226.5 0,-226.5"/>
+<text text-anchor="start" x="8" y="-392.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::meta_schedule</text>
+<text text-anchor="middle" x="103.5" y="-381.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">::DatabaseNode</text>
+<polyline fill="none" stroke="#000000" points="0,-374.5 207,-374.5 "/>
+<text text-anchor="start" x="8" y="-362.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_key</text>
+<polyline fill="none" stroke="#000000" points="0,-355.5 207,-355.5 "/>
+<text text-anchor="start" x="8" y="-343.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ~DatabaseNode()</text>
+<text text-anchor="start" x="8" y="-332.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ HasWorkload()</text>
+<text text-anchor="start" x="8" y="-321.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitWorkload()</text>
+<text text-anchor="start" x="8" y="-310.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitTuningRecord()</text>
+<text text-anchor="start" x="8" y="-299.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTopK()</text>
+<text text-anchor="start" x="8" y="-288.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetAllTuningRecords()</text>
+<text text-anchor="start" x="8" y="-277.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Size()</text>
+<text text-anchor="start" x="8" y="-266.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QueryTuningRecord()</text>
+<text text-anchor="start" x="8" y="-255.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QuerySchedule()</text>
+<text text-anchor="start" x="8" y="-244.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QueryIRModule()</text>
+<text text-anchor="start" x="8" y="-233.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TVM_DECLARE_BASE_OBJECT_INFO()</text>
</a>
</g>
</g>
<!-- Node4->Node3 -->
<g id="edge1" class="edge">
<title>Node4->Node3</title>
-<path fill="none" stroke="#191970" d="M216.4437,-192.8137C335.4573,-134.6718 473.0549,-103.9018 565.8164,-87.8866"/>
-<polygon fill="none" stroke="#191970" points="214.5805,-189.8311 207.181,-197.4138 217.6941,-196.1005 214.5805,-189.8311"/>
+<path fill="none" stroke="#191970" d="M216.3934,-225.8991C426.5422,-131.5736 685.0934,-102.8631 826.8539,-93.7431"/>
+<polygon fill="none" stroke="#191970" points="214.7057,-222.8217 207.0537,-230.1495 217.6053,-229.1929 214.7057,-222.8217"/>
</g>
<!-- Node5 -->
<g id="node3" class="node">
<title>Node5</title>
<g id="a_node3"><a xlink:href="classtvm_1_1runtime_1_1Object.html" target="_top" xlink:title="base class of all object containers. ">
-<polygon fill="#ffffff" stroke="#000000" points="12,-409.5 12,-796.5 195,-796.5 195,-409.5 12,-409.5"/>
-<text text-anchor="middle" x="103.5" y="-784.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::Object</text>
-<polyline fill="none" stroke="#000000" points="12,-777.5 195,-777.5 "/>
-<text text-anchor="start" x="20" y="-765.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_key</text>
-<text text-anchor="start" x="20" y="-754.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_final</text>
-<text text-anchor="start" x="20" y="-743.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_child_slots</text>
-<text text-anchor="start" x="20" y="-732.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_child_slots_can</text>
-<text text-anchor="start" x="20" y="-721.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_overflow</text>
-<text text-anchor="start" x="20" y="-710.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_has_method_visit</text>
-<text text-anchor="start" x="20" y="-699.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_attrs</text>
-<text text-anchor="start" x="20" y="-688.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_has_method_sequal</text>
-<text text-anchor="start" x="20" y="-677.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_reduce</text>
-<text text-anchor="start" x="20" y="-666.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_has_method_shash</text>
-<text text-anchor="start" x="20" y="-655.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_reduce</text>
-<text text-anchor="start" x="20" y="-644.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_index</text>
-<text text-anchor="start" x="20" y="-633.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># type_index_</text>
-<text text-anchor="start" x="20" y="-622.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># ref_counter_</text>
-<polyline fill="none" stroke="#000000" points="12,-615.5 195,-615.5 "/>
-<text text-anchor="start" x="20" y="-603.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ type_index()</text>
-<text text-anchor="start" x="20" y="-592.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTypeKey()</text>
-<text text-anchor="start" x="20" y="-581.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTypeKeyHash()</text>
-<text text-anchor="start" x="20" y="-570.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ IsInstance()</text>
-<text text-anchor="start" x="20" y="-559.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ unique()</text>
-<text text-anchor="start" x="20" y="-548.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Object()</text>
-<text text-anchor="start" x="20" y="-537.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Object()</text>
-<text text-anchor="start" x="20" y="-526.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Object()</text>
-<text text-anchor="start" x="20" y="-515.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator=()</text>
-<text text-anchor="start" x="20" y="-504.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator=()</text>
-<text text-anchor="start" x="20" y="-493.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TypeIndex2Key()</text>
-<text text-anchor="start" x="20" y="-482.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TypeIndex2KeyHash()</text>
-<text text-anchor="start" x="20" y="-471.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TypeKey2Index()</text>
-<text text-anchor="start" x="20" y="-460.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _GetOrAllocRuntimeTypeIndex()</text>
-<text text-anchor="start" x="20" y="-449.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ RuntimeTypeIndex()</text>
-<text text-anchor="start" x="20" y="-438.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># IncRef()</text>
-<text text-anchor="start" x="20" y="-427.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># DecRef()</text>
-<text text-anchor="start" x="20" y="-416.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># GetOrAllocRuntimeTypeIndex()</text>
+<polygon fill="#ffffff" stroke="#000000" points="12,-442.5 12,-829.5 195,-829.5 195,-442.5 12,-442.5"/>
+<text text-anchor="middle" x="103.5" y="-817.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::Object</text>
+<polyline fill="none" stroke="#000000" points="12,-810.5 195,-810.5 "/>
+<text text-anchor="start" x="20" y="-798.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_key</text>
+<text text-anchor="start" x="20" y="-787.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_final</text>
+<text text-anchor="start" x="20" y="-776.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_child_slots</text>
+<text text-anchor="start" x="20" y="-765.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_child_slots_can</text>
+<text text-anchor="start" x="20" y="-754.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_overflow</text>
+<text text-anchor="start" x="20" y="-743.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_has_method_visit</text>
+<text text-anchor="start" x="20" y="-732.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_attrs</text>
+<text text-anchor="start" x="20" y="-721.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_has_method_sequal</text>
+<text text-anchor="start" x="20" y="-710.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_reduce</text>
+<text text-anchor="start" x="20" y="-699.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_has_method_shash</text>
+<text text-anchor="start" x="20" y="-688.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_reduce</text>
+<text text-anchor="start" x="20" y="-677.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_index</text>
+<text text-anchor="start" x="20" y="-666.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># type_index_</text>
+<text text-anchor="start" x="20" y="-655.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># ref_counter_</text>
+<polyline fill="none" stroke="#000000" points="12,-648.5 195,-648.5 "/>
+<text text-anchor="start" x="20" y="-636.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ type_index()</text>
+<text text-anchor="start" x="20" y="-625.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTypeKey()</text>
+<text text-anchor="start" x="20" y="-614.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTypeKeyHash()</text>
+<text text-anchor="start" x="20" y="-603.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ IsInstance()</text>
+<text text-anchor="start" x="20" y="-592.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ unique()</text>
+<text text-anchor="start" x="20" y="-581.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Object()</text>
+<text text-anchor="start" x="20" y="-570.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Object()</text>
+<text text-anchor="start" x="20" y="-559.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Object()</text>
+<text text-anchor="start" x="20" y="-548.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator=()</text>
+<text text-anchor="start" x="20" y="-537.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator=()</text>
+<text text-anchor="start" x="20" y="-526.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TypeIndex2Key()</text>
+<text text-anchor="start" x="20" y="-515.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TypeIndex2KeyHash()</text>
+<text text-anchor="start" x="20" y="-504.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TypeKey2Index()</text>
+<text text-anchor="start" x="20" y="-493.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _GetOrAllocRuntimeTypeIndex()</text>
+<text text-anchor="start" x="20" y="-482.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ RuntimeTypeIndex()</text>
+<text text-anchor="start" x="20" y="-471.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># IncRef()</text>
+<text text-anchor="start" x="20" y="-460.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># DecRef()</text>
+<text text-anchor="start" x="20" y="-449.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># GetOrAllocRuntimeTypeIndex()</text>
</a>
</g>
</g>
<!-- Node5->Node4 -->
<g id="edge2" class="edge">
<title>Node5->Node4</title>
-<path fill="none" stroke="#191970" d="M103.5,-398.9464C103.5,-389.5963 103.5,-380.4618 103.5,-371.684"/>
-<polygon fill="none" stroke="#191970" points="100.0001,-399.1701 103.5,-409.1701 107.0001,-399.1701 100.0001,-399.1701"/>
+<path fill="none" stroke="#191970" d="M103.5,-431.9464C103.5,-422.5963 103.5,-413.4618 103.5,-404.684"/>
+<polygon fill="none" stroke="#191970" points="100.0001,-432.1701 103.5,-442.1701 107.0001,-432.1701 100.0001,-432.1701"/>
</g>
<!-- Node5->Node5 -->
<g id="edge3" class="edge">
<title>Node5->Node5</title>
-<path fill="none" stroke="#404040" d="M195.3625,-636.9248C206.0482,-630.6637 213,-619.3555 213,-603 213,-592.0112 209.8618,-583.3007 204.5615,-576.8687"/>
-<polygon fill="none" stroke="#404040" points="204.5184,-576.8322 197.3548,-576.0056 195.3625,-569.0752 202.5261,-569.9017 204.5184,-576.8322"/>
-<text text-anchor="middle" x="239" y="-600.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> #deleter_</text>
+<path fill="none" stroke="#404040" d="M195.3625,-669.9248C206.0482,-663.6637 213,-652.3555 213,-636 213,-625.0112 209.8618,-616.3007 204.5615,-609.8687"/>
+<polygon fill="none" stroke="#404040" points="204.5184,-609.8322 197.3548,-609.0056 195.3625,-602.0752 202.5261,-602.9017 204.5184,-609.8322"/>
+<text text-anchor="middle" x="239" y="-633.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> #deleter_</text>
</g>
<!-- Node6 -->
<g id="node4" class="node">
<title>Node6</title>
-<g id="a_node4"><a xlink:href="classtvm_1_1runtime_1_1TypedPackedFunc.html" target="_top" xlink:title="{tvm::runtime::TypedPacked\lFunc\< int64_t()\>\n||}">
-<polygon fill="#ffffff" stroke="#000000" points="225,-248.5 225,-316.5 374,-316.5 374,-248.5 225,-248.5"/>
-<text text-anchor="start" x="233" y="-304.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::TypedPacked</text>
-<text text-anchor="middle" x="299.5" y="-293.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">Func< int64_t()></text>
-<polyline fill="none" stroke="#000000" points="225,-286.5 374,-286.5 "/>
-<text text-anchor="middle" x="299.5" y="-274.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
-<polyline fill="none" stroke="#000000" points="225,-267.5 374,-267.5 "/>
-<text text-anchor="middle" x="299.5" y="-255.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
+<g id="a_node4"><a xlink:href="classtvm_1_1runtime_1_1TypedPackedFunc.html" target="_top" xlink:title="{tvm::runtime::TypedPacked\lFunc\&lt; Optional\&lt; TuningRecord\l \&gt;(const IRModule &amp;, const\l Target &amp;, const String &amp;)\&gt;\n||}">
+<polygon fill="#ffffff" stroke="#000000" points="225,-270.5 225,-360.5 392,-360.5 392,-270.5 225,-270.5"/>
+<text text-anchor="start" x="233" y="-348.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::TypedPacked</text>
+<text text-anchor="start" x="233" y="-337.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">Func&lt; Optional&lt; TuningRecord</text>
+<text text-anchor="start" x="233" y="-326.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> &gt;(const IRModule &amp;, const</text>
+<text text-anchor="middle" x="308.5" y="-315.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> Target &amp;, const String &amp;)&gt;</text>
+<polyline fill="none" stroke="#000000" points="225,-308.5 392,-308.5 "/>
+<text text-anchor="middle" x="308.5" y="-296.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
+<polyline fill="none" stroke="#000000" points="225,-289.5 392,-289.5 "/>
+<text text-anchor="middle" x="308.5" y="-277.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
</a>
</g>
</g>
<!-- Node6->Node3 -->
<g id="edge4" class="edge">
<title>Node6->Node3</title>
-<path fill="none" stroke="#404040" d="M325.3898,-248.0796C340.4739,-229.8451 360.7719,-208.1515 382.5,-193 434.5579,-156.6989 499.2917,-128.4898 554.1938,-108.5698"/>
-<polygon fill="none" stroke="#404040" points="554.4501,-108.4783 558.7565,-102.6943 565.7519,-104.4449 561.4455,-110.2289 554.4501,-108.4783"/>
-<text text-anchor="middle" x="446.5" y="-167" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> +f_size</text>
+<path fill="none" stroke="#404040" d="M347.4664,-270.4891C362.7811,-254.8218 381.2886,-238.1463 400.5,-226 528.0961,-145.3282 701.3945,-112.2226 814.7972,-98.709"/>
+<polygon fill="none" stroke="#404040" points="815.0286,-98.6824 820.5312,-94.0218 826.9497,-97.3085 821.4472,-101.9692 815.0286,-98.6824"/>
+<text text-anchor="middle" x="513" y="-200" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> +f_query_tuning_record</text>
</g>
<!-- Node7 -->
<g id="node5" class="node">
<title>Node7</title>
-<g id="a_node5"><a xlink:href="classtvm_1_1runtime_1_1TypedPackedFunc.html" target="_top" xlink:title="{tvm::runtime::TypedPacked\lFunc\< bool(const IRModule &)\>\n||}">
-<polygon fill="#ffffff" stroke="#000000" points="392,-248.5 392,-316.5 563,-316.5 563,-248.5 392,-248.5"/>
-<text text-anchor="start" x="400" y="-304.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::TypedPacked</text>
-<text text-anchor="middle" x="477.5" y="-293.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">Func< bool(const IRModule &)></text>
-<polyline fill="none" stroke="#000000" points="392,-286.5 563,-286.5 "/>
-<text text-anchor="middle" x="477.5" y="-274.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
-<polyline fill="none" stroke="#000000" points="392,-267.5 563,-267.5 "/>
-<text text-anchor="middle" x="477.5" y="-255.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
+<g id="a_node5"><a xlink:href="classtvm_1_1runtime_1_1TypedPackedFunc.html" target="_top" xlink:title="{tvm::runtime::TypedPacked\lFunc\&lt; int64_t()\&gt;\n||}">
+<polygon fill="#ffffff" stroke="#000000" points="410,-281.5 410,-349.5 559,-349.5 559,-281.5 410,-281.5"/>
+<text text-anchor="start" x="418" y="-337.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::TypedPacked</text>
+<text text-anchor="middle" x="484.5" y="-326.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">Func< int64_t()></text>
+<polyline fill="none" stroke="#000000" points="410,-319.5 559,-319.5 "/>
+<text text-anchor="middle" x="484.5" y="-307.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
+<polyline fill="none" stroke="#000000" points="410,-300.5 559,-300.5 "/>
+<text text-anchor="middle" x="484.5" y="-288.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
</a>
</g>
</g>
<!-- Node7->Node3 -->
<g id="edge5" class="edge">
<title>Node7->Node3</title>
-<path fill="none" stroke="#404040" d="M505.2713,-248.3819C525.1558,-224.3384 552.8415,-191.6337 578.5,-164 581.3769,-160.9016 584.3281,-157.7669 587.327,-154.6181"/>
-<polygon fill="none" stroke="#404040" points="587.5633,-154.3725 588.8407,-147.2754 595.8831,-145.7248 594.6058,-152.8219 587.5633,-154.3725"/>
-<text text-anchor="middle" x="622.5" y="-167" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> +f_has_workload</text>
+<path fill="none" stroke="#404040" d="M509.8187,-281.2095C524.8734,-262.7895 545.3265,-240.8758 567.5,-226 643.1785,-175.2286 740.289,-140.0413 815.1694,-118.1232"/>
+<polygon fill="none" stroke="#404040" points="815.4583,-118.04 820.117,-112.5357 826.9896,-114.719 822.331,-120.2233 815.4583,-118.04"/>
+<text text-anchor="middle" x="633.5" y="-200" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> +f_size</text>
</g>
<!-- Node8 -->
<g id="node6" class="node">
<title>Node8</title>
-<g id="a_node6"><a xlink:href="classtvm_1_1runtime_1_1TypedPackedFunc.html" target="_top" xlink:title="{tvm::runtime::TypedPacked\lFunc\< Array\< TuningRecord \>()\>\n||}">
-<polygon fill="#ffffff" stroke="#000000" points="581,-248.5 581,-316.5 760,-316.5 760,-248.5 581,-248.5"/>
-<text text-anchor="start" x="589" y="-304.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::TypedPacked</text>
-<text text-anchor="middle" x="670.5" y="-293.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">Func< Array< TuningRecord >()></text>
-<polyline fill="none" stroke="#000000" points="581,-286.5 760,-286.5 "/>
-<text text-anchor="middle" x="670.5" y="-274.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
-<polyline fill="none" stroke="#000000" points="581,-267.5 760,-267.5 "/>
-<text text-anchor="middle" x="670.5" y="-255.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
+<g id="a_node6"><a xlink:href="classtvm_1_1runtime_1_1TypedPackedFunc.html" target="_top" xlink:title="{tvm::runtime::TypedPacked\lFunc\< bool(const IRModule &)\>\n||}">
+<polygon fill="#ffffff" stroke="#000000" points="577,-281.5 577,-349.5 748,-349.5 748,-281.5 577,-281.5"/>
+<text text-anchor="start" x="585" y="-337.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::TypedPacked</text>
+<text text-anchor="middle" x="662.5" y="-326.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">Func< bool(const IRModule &)></text>
+<polyline fill="none" stroke="#000000" points="577,-319.5 748,-319.5 "/>
+<text text-anchor="middle" x="662.5" y="-307.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
+<polyline fill="none" stroke="#000000" points="577,-300.5 748,-300.5 "/>
+<text text-anchor="middle" x="662.5" y="-288.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
</a>
</g>
</g>
<!-- Node8->Node3 -->
<g id="edge6" class="edge">
<title>Node8->Node3</title>
-<path fill="none" stroke="#404040" d="M670.5,-248.3739C670.5,-223.802 670.5,-189.5252 670.5,-157.7873"/>
-<polygon fill="none" stroke="#404040" points="670.5001,-157.7733 666.5,-151.7734 670.5,-145.7733 674.5,-151.7733 670.5001,-157.7733"/>
-<text text-anchor="middle" x="736" y="-167" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> +f_get_all_tuning_records</text>
+<path fill="none" stroke="#404040" d="M697.5361,-281.2596C715.215,-264.3013 737.1719,-243.7165 757.5,-226 776.3654,-209.5583 796.9035,-192.6407 817.0396,-176.5551"/>
+<polygon fill="none" stroke="#404040" points="817.1745,-176.4479 819.3821,-169.583 826.568,-168.9805 824.3604,-175.8454 817.1745,-176.4479"/>
+<text text-anchor="middle" x="834.5" y="-200" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> +f_has_workload</text>
</g>
<!-- Node9 -->
<g id="node7" class="node">
<title>Node9</title>
-<g id="a_node7"><a xlink:href="classtvm_1_1runtime_1_1TypedPackedFunc.html" target="_top" xlink:title="{tvm::runtime::TypedPacked\lFunc\< Array\< TuningRecord\l \>(const Workload &, int)\>\n||}">
-<polygon fill="#ffffff" stroke="#000000" points="778.5,-243 778.5,-322 930.5,-322 930.5,-243 778.5,-243"/>
-<text text-anchor="start" x="786.5" y="-310" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::TypedPacked</text>
-<text text-anchor="start" x="786.5" y="-299" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">Func< Array< TuningRecord</text>
-<text text-anchor="middle" x="854.5" y="-288" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> >(const Workload &, int)></text>
-<polyline fill="none" stroke="#000000" points="778.5,-281 930.5,-281 "/>
-<text text-anchor="middle" x="854.5" y="-269" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
-<polyline fill="none" stroke="#000000" points="778.5,-262 930.5,-262 "/>
-<text text-anchor="middle" x="854.5" y="-250" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
+<g id="a_node7"><a xlink:href="classtvm_1_1runtime_1_1TypedPackedFunc.html" target="_top" xlink:title="{tvm::runtime::TypedPacked\lFunc\< Optional\< IRModule\l \>(const IRModule &, const\l Target &, const String &)\>\n||}">
+<polygon fill="#ffffff" stroke="#000000" points="766,-270.5 766,-360.5 915,-360.5 915,-270.5 766,-270.5"/>
+<text text-anchor="start" x="774" y="-348.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::TypedPacked</text>
+<text text-anchor="start" x="774" y="-337.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">Func< Optional< IRModule</text>
+<text text-anchor="start" x="774" y="-326.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> >(const IRModule &, const</text>
+<text text-anchor="middle" x="840.5" y="-315.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> Target &, const String &)></text>
+<polyline fill="none" stroke="#000000" points="766,-308.5 915,-308.5 "/>
+<text text-anchor="middle" x="840.5" y="-296.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
+<polyline fill="none" stroke="#000000" points="766,-289.5 915,-289.5 "/>
+<text text-anchor="middle" x="840.5" y="-277.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
</a>
</g>
</g>
<!-- Node9->Node3 -->
<g id="edge7" class="edge">
<title>Node9->Node3</title>
-<path fill="none" stroke="#404040" d="M844.4929,-242.8783C836.8181,-218.2546 824.3006,-187.0181 805.5,-164 799.1386,-156.2116 791.9379,-148.8304 784.2541,-141.8936"/>
-<polygon fill="none" stroke="#404040" points="784.1282,-141.7852 776.9713,-140.9026 775.0333,-133.9568 782.1902,-134.8394 784.1282,-141.7852"/>
-<text text-anchor="middle" x="846" y="-167" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> +f_get_top_k</text>
+<path fill="none" stroke="#404040" d="M858.6965,-270.3087C868.0375,-247.1102 879.8148,-217.8611 891.1588,-189.6881"/>
+<polygon fill="none" stroke="#404040" points="891.1644,-189.6739 889.695,-182.6141 895.6467,-178.5424 897.116,-185.6022 891.1644,-189.6739"/>
+<text text-anchor="middle" x="937" y="-200" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> +f_query_ir_module</text>
</g>
<!-- Node10 -->
<g id="node8" class="node">
<title>Node10</title>
-<g id="a_node8"><a xlink:href="classtvm_1_1runtime_1_1TypedPackedFunc.html" target="_top" xlink:title="{tvm::runtime::TypedPacked\lFunc\< Workload(const IRModule &)\>\n||}">
-<polygon fill="#ffffff" stroke="#000000" points="948.5,-248.5 948.5,-316.5 1144.5,-316.5 1144.5,-248.5 948.5,-248.5"/>
-<text text-anchor="start" x="956.5" y="-304.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::TypedPacked</text>
-<text text-anchor="middle" x="1046.5" y="-293.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">Func< Workload(const IRModule &)></text>
-<polyline fill="none" stroke="#000000" points="948.5,-286.5 1144.5,-286.5 "/>
-<text text-anchor="middle" x="1046.5" y="-274.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
-<polyline fill="none" stroke="#000000" points="948.5,-267.5 1144.5,-267.5 "/>
-<text text-anchor="middle" x="1046.5" y="-255.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
+<g id="a_node8"><a xlink:href="classtvm_1_1runtime_1_1TypedPackedFunc.html" target="_top" xlink:title="{tvm::runtime::TypedPacked\lFunc\< Array\< TuningRecord \>()\>\n||}">
+<polygon fill="#ffffff" stroke="#000000" points="933,-281.5 933,-349.5 1112,-349.5 1112,-281.5 933,-281.5"/>
+<text text-anchor="start" x="941" y="-337.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::TypedPacked</text>
+<text text-anchor="middle" x="1022.5" y="-326.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">Func< Array< TuningRecord >()></text>
+<polyline fill="none" stroke="#000000" points="933,-319.5 1112,-319.5 "/>
+<text text-anchor="middle" x="1022.5" y="-307.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
+<polyline fill="none" stroke="#000000" points="933,-300.5 1112,-300.5 "/>
+<text text-anchor="middle" x="1022.5" y="-288.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
</a>
</g>
</g>
<!-- Node10->Node3 -->
<g id="edge8" class="edge">
<title>Node10->Node3</title>
-<path fill="none" stroke="#404040" d="M1009.4089,-248.1356C989.4874,-230.5585 964.0363,-209.4604 939.5,-193 892.4197,-161.4157 835.6676,-134.6821 786.4503,-114.5214"/>
-<polygon fill="none" stroke="#404040" points="786.2196,-114.4282 779.1577,-115.8878 775.0943,-109.9306 782.1561,-108.4709 786.2196,-114.4282"/>
-<text text-anchor="middle" x="960" y="-167" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> +f_commit_workload</text>
+<path fill="none" stroke="#404040" d="M1016.3733,-281.2803C1011.4537,-257.1896 1003.5372,-224.4703 992.5,-197 991.5469,-194.6277 990.5503,-192.2434 989.5165,-189.8534"/>
+<polygon fill="none" stroke="#404040" points="989.3905,-189.5758 983.2676,-185.7665 984.4289,-178.6495 990.5518,-182.4588 989.3905,-189.5758"/>
+<text text-anchor="middle" x="1061" y="-200" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> +f_get_all_tuning_records</text>
</g>
<!-- Node11 -->
<g id="node9" class="node">
<title>Node11</title>
-<g id="a_node9"><a xlink:href="classtvm_1_1runtime_1_1TypedPackedFunc.html" target="_top" xlink:title="{tvm::runtime::TypedPacked\lFunc\< void(const TuningRecord &)\>\n||}">
-<polygon fill="#ffffff" stroke="#000000" points="1163,-248.5 1163,-316.5 1354,-316.5 1354,-248.5 1163,-248.5"/>
-<text text-anchor="start" x="1171" y="-304.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::TypedPacked</text>
-<text text-anchor="middle" x="1258.5" y="-293.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">Func< void(const TuningRecord &)></text>
-<polyline fill="none" stroke="#000000" points="1163,-286.5 1354,-286.5 "/>
-<text text-anchor="middle" x="1258.5" y="-274.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
-<polyline fill="none" stroke="#000000" points="1163,-267.5 1354,-267.5 "/>
-<text text-anchor="middle" x="1258.5" y="-255.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
+<g id="a_node9"><a xlink:href="classtvm_1_1runtime_1_1TypedPackedFunc.html" target="_top" xlink:title="{tvm::runtime::TypedPacked\lFunc\< Array\< TuningRecord\l \>(const Workload &, int)\>\n||}">
+<polygon fill="#ffffff" stroke="#000000" points="1130.5,-276 1130.5,-355 1282.5,-355 1282.5,-276 1130.5,-276"/>
+<text text-anchor="start" x="1138.5" y="-343" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::TypedPacked</text>
+<text text-anchor="start" x="1138.5" y="-332" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">Func< Array< TuningRecord</text>
+<text text-anchor="middle" x="1206.5" y="-321" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> >(const Workload &, int)></text>
+<polyline fill="none" stroke="#000000" points="1130.5,-314 1282.5,-314 "/>
+<text text-anchor="middle" x="1206.5" y="-302" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
+<polyline fill="none" stroke="#000000" points="1130.5,-295 1282.5,-295 "/>
+<text text-anchor="middle" x="1206.5" y="-283" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
</a>
</g>
</g>
<!-- Node11->Node3 -->
<g id="edge9" class="edge">
<title>Node11->Node3</title>
-<path fill="none" stroke="#404040" d="M1226.4522,-248.2601C1207.0196,-229.3885 1180.7696,-207.0135 1153.5,-193 1036.8202,-133.0397 888.6495,-101.9927 787.3822,-86.6538"/>
-<polygon fill="none" stroke="#404040" points="787.1409,-86.6182 780.6196,-89.696 775.2704,-84.8602 781.7916,-81.7823 787.1409,-86.6182"/>
-<text text-anchor="middle" x="1169.5" y="-167" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> +f_commit_tuning_record</text>
+<path fill="none" stroke="#404040" d="M1187.6774,-275.8672C1174.3542,-250.9317 1154.5993,-219.3568 1130.5,-197 1106.1432,-174.4043 1076.1489,-155.0019 1046.9043,-139.1503"/>
+<polygon fill="none" stroke="#404040" points="1046.7816,-139.0854 1039.6073,-139.8138 1036.1757,-133.4716 1043.3499,-132.7432 1046.7816,-139.0854"/>
+<text text-anchor="middle" x="1175" y="-200" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> +f_get_top_k</text>
+</g>
+<!-- Node12 -->
+<g id="node10" class="node">
+<title>Node12</title>
+<g id="a_node10"><a xlink:href="classtvm_1_1runtime_1_1TypedPackedFunc.html" target="_top" xlink:title="{tvm::runtime::TypedPacked\lFunc\< Workload(const IRModule &)\>\n||}">
+<polygon fill="#ffffff" stroke="#000000" points="1300.5,-281.5 1300.5,-349.5 1496.5,-349.5 1496.5,-281.5 1300.5,-281.5"/>
+<text text-anchor="start" x="1308.5" y="-337.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::TypedPacked</text>
+<text text-anchor="middle" x="1398.5" y="-326.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">Func< Workload(const IRModule &)></text>
+<polyline fill="none" stroke="#000000" points="1300.5,-319.5 1496.5,-319.5 "/>
+<text text-anchor="middle" x="1398.5" y="-307.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
+<polyline fill="none" stroke="#000000" points="1300.5,-300.5 1496.5,-300.5 "/>
+<text text-anchor="middle" x="1398.5" y="-288.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
+</a>
+</g>
+</g>
+<!-- Node12->Node3 -->
+<g id="edge10" class="edge">
+<title>Node12->Node3</title>
+<path fill="none" stroke="#404040" d="M1363.6442,-281.1889C1343.7503,-262.9875 1317.6272,-241.2907 1291.5,-226 1214.7002,-181.0537 1120.4214,-145.9403 1047.6119,-122.588"/>
+<polygon fill="none" stroke="#404040" points="1047.555,-122.57 1040.6263,-124.5684 1036.1168,-118.9413 1043.0454,-116.9429 1047.555,-122.57"/>
+<text text-anchor="middle" x="1309" y="-200" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> +f_commit_workload</text>
+</g>
+<!-- Node13 -->
+<g id="node11" class="node">
+<title>Node13</title>
+<g id="a_node11"><a xlink:href="classtvm_1_1runtime_1_1TypedPackedFunc.html" target="_top" xlink:title="{tvm::runtime::TypedPacked\lFunc\< void(const TuningRecord &)\>\n||}">
+<polygon fill="#ffffff" stroke="#000000" points="1515,-281.5 1515,-349.5 1706,-349.5 1706,-281.5 1515,-281.5"/>
+<text text-anchor="start" x="1523" y="-337.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::TypedPacked</text>
+<text text-anchor="middle" x="1610.5" y="-326.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">Func< void(const TuningRecord &)></text>
+<polyline fill="none" stroke="#000000" points="1515,-319.5 1706,-319.5 "/>
+<text text-anchor="middle" x="1610.5" y="-307.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
+<polyline fill="none" stroke="#000000" points="1515,-300.5 1706,-300.5 "/>
+<text text-anchor="middle" x="1610.5" y="-288.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
+</a>
+</g>
+</g>
+<!-- Node13->Node3 -->
+<g id="edge11" class="edge">
+<title>Node13->Node3</title>
+<path fill="none" stroke="#404040" d="M1578.5048,-281.1573C1559.0897,-262.2513 1532.8397,-239.8763 1505.5,-226 1358.0267,-151.1495 1168.256,-116.4185 1048.1997,-100.9742"/>
+<polygon fill="none" stroke="#404040" points="1047.9878,-100.9476 1041.5356,-104.1678 1036.0815,-99.4504 1042.5337,-96.2303 1047.9878,-100.9476"/>
+<text text-anchor="middle" x="1526.5" y="-200" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> +f_commit_tuning_record</text>
+</g>
+<!-- Node14 -->
+<g id="node12" class="node">
+<title>Node14</title>
+<g id="a_node12"><a xlink:href="classtvm_1_1runtime_1_1TypedPackedFunc.html" target="_top" xlink:title="{tvm::runtime::TypedPacked\lFunc\< Optional\< tir::Schedule\l \>(const IRModule &, const\l Target &, const String &)\>\n||}">
+<polygon fill="#ffffff" stroke="#000000" points="1724,-270.5 1724,-360.5 1887,-360.5 1887,-270.5 1724,-270.5"/>
+<text text-anchor="start" x="1732" y="-348.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::TypedPacked</text>
+<text text-anchor="start" x="1732" y="-337.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">Func< Optional< tir::Schedule</text>
+<text text-anchor="start" x="1732" y="-326.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> >(const IRModule &, const</text>
+<text text-anchor="middle" x="1805.5" y="-315.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> Target &, const String &)></text>
+<polyline fill="none" stroke="#000000" points="1724,-308.5 1887,-308.5 "/>
+<text text-anchor="middle" x="1805.5" y="-296.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
+<polyline fill="none" stroke="#000000" points="1724,-289.5 1887,-289.5 "/>
+<text text-anchor="middle" x="1805.5" y="-277.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> </text>
+</a>
+</g>
+</g>
+<!-- Node14->Node3 -->
+<g id="edge12" class="edge">
+<title>Node14->Node3</title>
+<path fill="none" stroke="#404040" d="M1769.1469,-270.141C1754.1063,-254.0532 1735.5024,-237.185 1715.5,-226 1667.6101,-199.2208 1649.2878,-207.8369 1595.5,-197 1404.8356,-158.586 1181.2113,-124.6544 1048.2153,-105.6321"/>
+<polygon fill="none" stroke="#404040" points="1047.9081,-105.5884 1041.4035,-108.7014 1036.0282,-103.8945 1042.5328,-100.7815 1047.9081,-105.5884"/>
+<text text-anchor="middle" x="1718" y="-200" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"> +f_query_schedule</text>
</g>
</g>
</svg>
diff --git a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1PyDatabaseNode__inherit__graph.svg b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1PyDatabaseNode__inherit__graph.svg
index ce4c658d1c..67997f9697 100644
--- a/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1PyDatabaseNode__inherit__graph.svg
+++ b/docs/reference/api/doxygen/classtvm_1_1meta__schedule_1_1PyDatabaseNode__inherit__graph.svg
@@ -4,32 +4,38 @@
<!-- Generated by graphviz version 2.40.1 (20161225.0304)
-->
<!-- Title: tvm::meta_schedule::PyDatabaseNode Pages: 1 -->
-<svg width="217pt" height="870pt"
- viewBox="0.00 0.00 217.00 870.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
-<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 866)">
+<svg width="217pt" height="936pt"
+ viewBox="0.00 0.00 217.00 936.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 932)">
<title>tvm::meta_schedule::PyDatabaseNode</title>
-<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-866 213,-866 213,4 -4,4"/>
+<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-932 213,-932 213,4 -4,4"/>
<!-- Node0 -->
<g id="node1" class="node">
<title>Node0</title>
-<polygon fill="#bfbfbf" stroke="#000000" points="0,-.5 0,-211.5 209,-211.5 209,-.5 0,-.5"/>
-<text text-anchor="start" x="8" y="-199.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::meta_schedule</text>
-<text text-anchor="middle" x="104.5" y="-188.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">::PyDatabaseNode</text>
-<polyline fill="none" stroke="#000000" points="0,-181.5 209,-181.5 "/>
-<text text-anchor="start" x="8" y="-169.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_has_workload</text>
-<text text-anchor="start" x="8" y="-158.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_commit_workload</text>
-<text text-anchor="start" x="8" y="-147.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_commit_tuning_record</text>
-<text text-anchor="start" x="8" y="-136.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_get_top_k</text>
-<text text-anchor="start" x="8" y="-125.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_get_all_tuning_records</text>
-<text text-anchor="start" x="8" y="-114.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_size</text>
-<text text-anchor="start" x="8" y="-103.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_key</text>
-<polyline fill="none" stroke="#000000" points="0,-96.5 209,-96.5 "/>
-<text text-anchor="start" x="8" y="-84.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ VisitAttrs()</text>
-<text text-anchor="start" x="8" y="-73.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ HasWorkload()</text>
-<text text-anchor="start" x="8" y="-62.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitWorkload()</text>
-<text text-anchor="start" x="8" y="-51.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitTuningRecord()</text>
-<text text-anchor="start" x="8" y="-40.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTopK()</text>
-<text text-anchor="start" x="8" y="-29.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetAllTuningRecords()</text>
+<polygon fill="#bfbfbf" stroke="#000000" points="0,-.5 0,-277.5 209,-277.5 209,-.5 0,-.5"/>
+<text text-anchor="start" x="8" y="-265.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::meta_schedule</text>
+<text text-anchor="middle" x="104.5" y="-254.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">::PyDatabaseNode</text>
+<polyline fill="none" stroke="#000000" points="0,-247.5 209,-247.5 "/>
+<text text-anchor="start" x="8" y="-235.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_has_workload</text>
+<text text-anchor="start" x="8" y="-224.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_commit_workload</text>
+<text text-anchor="start" x="8" y="-213.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_commit_tuning_record</text>
+<text text-anchor="start" x="8" y="-202.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_get_top_k</text>
+<text text-anchor="start" x="8" y="-191.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_get_all_tuning_records</text>
+<text text-anchor="start" x="8" y="-180.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_query_tuning_record</text>
+<text text-anchor="start" x="8" y="-169.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_query_schedule</text>
+<text text-anchor="start" x="8" y="-158.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_query_ir_module</text>
+<text text-anchor="start" x="8" y="-147.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ f_size</text>
+<text text-anchor="start" x="8" y="-136.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_key</text>
+<polyline fill="none" stroke="#000000" points="0,-129.5 209,-129.5 "/>
+<text text-anchor="start" x="8" y="-117.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ VisitAttrs()</text>
+<text text-anchor="start" x="8" y="-106.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ HasWorkload()</text>
+<text text-anchor="start" x="8" y="-95.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitWorkload()</text>
+<text text-anchor="start" x="8" y="-84.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitTuningRecord()</text>
+<text text-anchor="start" x="8" y="-73.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTopK()</text>
+<text text-anchor="start" x="8" y="-62.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetAllTuningRecords()</text>
+<text text-anchor="start" x="8" y="-51.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QueryTuningRecord()</text>
+<text text-anchor="start" x="8" y="-40.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QuerySchedule()</text>
+<text text-anchor="start" x="8" y="-29.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QueryIRModule()</text>
<text text-anchor="start" x="8" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Size()</text>
<text text-anchor="start" x="8" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TVM_DECLARE_FINAL_OBJECT_INFO()</text>
</g>
@@ -37,81 +43,81 @@
<g id="node2" class="node">
<title>Node1</title>
<g id="a_node2"><a xlink:href="classtvm_1_1meta__schedule_1_1DatabaseNode.html" target="_top" xlink:title="{tvm::meta_schedule\l::DatabaseNode\n|+ _type_key\l|+ ~DatabaseNode()\l+ HasWorkload()\l+ CommitWorkload()\l+ CommitTuningRecord()\l+ GetTopK()\l+ GetAllTuningRecords()\l+ Size()\l+ QueryTuningRecord()\l+ QuerySchedule()\l+ QueryIRModule()\l+ TVM_DECLARE_BASE_OBJECT_INFO()\l}">
-<polygon fill="#ffffff" stroke="#000000" points="1,-248.5 1,-426.5 208,-426.5 208,-248.5 1,-248.5"/>
-<text text-anchor="start" x="9" y="-414.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::meta_schedule</text>
-<text text-anchor="middle" x="104.5" y="-403.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">::DatabaseNode</text>
-<polyline fill="none" stroke="#000000" points="1,-396.5 208,-396.5 "/>
-<text text-anchor="start" x="9" y="-384.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_key</text>
-<polyline fill="none" stroke="#000000" points="1,-377.5 208,-377.5 "/>
-<text text-anchor="start" x="9" y="-365.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ~DatabaseNode()</text>
-<text text-anchor="start" x="9" y="-354.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ HasWorkload()</text>
-<text text-anchor="start" x="9" y="-343.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitWorkload()</text>
-<text text-anchor="start" x="9" y="-332.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitTuningRecord()</text>
-<text text-anchor="start" x="9" y="-321.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTopK()</text>
-<text text-anchor="start" x="9" y="-310.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetAllTuningRecords()</text>
-<text text-anchor="start" x="9" y="-299.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Size()</text>
-<text text-anchor="start" x="9" y="-288.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QueryTuningRecord()</text>
-<text text-anchor="start" x="9" y="-277.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QuerySchedule()</text>
-<text text-anchor="start" x="9" y="-266.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QueryIRModule()</text>
-<text text-anchor="start" x="9" y="-255.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TVM_DECLARE_BASE_OBJECT_INFO()</text>
+<polygon fill="#ffffff" stroke="#000000" points="1,-314.5 1,-492.5 208,-492.5 208,-314.5 1,-314.5"/>
+<text text-anchor="start" x="9" y="-480.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::meta_schedule</text>
+<text text-anchor="middle" x="104.5" y="-469.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">::DatabaseNode</text>
+<polyline fill="none" stroke="#000000" points="1,-462.5 208,-462.5 "/>
+<text text-anchor="start" x="9" y="-450.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_key</text>
+<polyline fill="none" stroke="#000000" points="1,-443.5 208,-443.5 "/>
+<text text-anchor="start" x="9" y="-431.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ ~DatabaseNode()</text>
+<text text-anchor="start" x="9" y="-420.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ HasWorkload()</text>
+<text text-anchor="start" x="9" y="-409.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitWorkload()</text>
+<text text-anchor="start" x="9" y="-398.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ CommitTuningRecord()</text>
+<text text-anchor="start" x="9" y="-387.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTopK()</text>
+<text text-anchor="start" x="9" y="-376.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetAllTuningRecords()</text>
+<text text-anchor="start" x="9" y="-365.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Size()</text>
+<text text-anchor="start" x="9" y="-354.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QueryTuningRecord()</text>
+<text text-anchor="start" x="9" y="-343.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QuerySchedule()</text>
+<text text-anchor="start" x="9" y="-332.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ QueryIRModule()</text>
+<text text-anchor="start" x="9" y="-321.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TVM_DECLARE_BASE_OBJECT_INFO()</text>
</a>
</g>
</g>
<!-- Node1->Node0 -->
<g id="edge1" class="edge">
<title>Node1->Node0</title>
-<path fill="none" stroke="#191970" d="M104.5,-238.1421C104.5,-229.4057 104.5,-220.5421 104.5,-211.756"/>
-<polygon fill="none" stroke="#191970" points="101.0001,-238.3272 104.5,-248.3272 108.0001,-238.3272 101.0001,-238.3272"/>
+<path fill="none" stroke="#191970" d="M104.5,-304.2113C104.5,-295.5113 104.5,-286.6081 104.5,-277.6657"/>
+<polygon fill="none" stroke="#191970" points="101.0001,-304.3211 104.5,-314.3211 108.0001,-304.3211 101.0001,-304.3211"/>
</g>
<!-- Node2 -->
<g id="node3" class="node">
<title>Node2</title>
<g id="a_node3"><a xlink:href="classtvm_1_1runtime_1_1Object.html" target="_top" xlink:title="base class of all object containers. ">
-<polygon fill="#ffffff" stroke="#000000" points="13,-463.5 13,-861.5 196,-861.5 196,-463.5 13,-463.5"/>
-<text text-anchor="middle" x="104.5" y="-849.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::Object</text>
-<polyline fill="none" stroke="#000000" points="13,-842.5 196,-842.5 "/>
-<text text-anchor="start" x="21" y="-830.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_key</text>
-<text text-anchor="start" x="21" y="-819.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_final</text>
-<text text-anchor="start" x="21" y="-808.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_child_slots</text>
-<text text-anchor="start" x="21" y="-797.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_child_slots_can</text>
-<text text-anchor="start" x="21" y="-786.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_overflow</text>
-<text text-anchor="start" x="21" y="-775.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_has_method_visit</text>
-<text text-anchor="start" x="21" y="-764.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_attrs</text>
-<text text-anchor="start" x="21" y="-753.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_has_method_sequal</text>
-<text text-anchor="start" x="21" y="-742.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_reduce</text>
-<text text-anchor="start" x="21" y="-731.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_has_method_shash</text>
-<text text-anchor="start" x="21" y="-720.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_reduce</text>
-<text text-anchor="start" x="21" y="-709.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_index</text>
-<text text-anchor="start" x="21" y="-698.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># type_index_</text>
-<text text-anchor="start" x="21" y="-687.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># ref_counter_</text>
-<text text-anchor="start" x="21" y="-676.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># deleter_</text>
-<polyline fill="none" stroke="#000000" points="13,-669.5 196,-669.5 "/>
-<text text-anchor="start" x="21" y="-657.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ type_index()</text>
-<text text-anchor="start" x="21" y="-646.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTypeKey()</text>
-<text text-anchor="start" x="21" y="-635.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTypeKeyHash()</text>
-<text text-anchor="start" x="21" y="-624.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ IsInstance()</text>
-<text text-anchor="start" x="21" y="-613.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ unique()</text>
-<text text-anchor="start" x="21" y="-602.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Object()</text>
-<text text-anchor="start" x="21" y="-591.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Object()</text>
-<text text-anchor="start" x="21" y="-580.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Object()</text>
-<text text-anchor="start" x="21" y="-569.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator=()</text>
-<text text-anchor="start" x="21" y="-558.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator=()</text>
-<text text-anchor="start" x="21" y="-547.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TypeIndex2Key()</text>
-<text text-anchor="start" x="21" y="-536.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TypeIndex2KeyHash()</text>
-<text text-anchor="start" x="21" y="-525.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TypeKey2Index()</text>
-<text text-anchor="start" x="21" y="-514.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _GetOrAllocRuntimeTypeIndex()</text>
-<text text-anchor="start" x="21" y="-503.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ RuntimeTypeIndex()</text>
-<text text-anchor="start" x="21" y="-492.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># IncRef()</text>
-<text text-anchor="start" x="21" y="-481.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># DecRef()</text>
-<text text-anchor="start" x="21" y="-470.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># GetOrAllocRuntimeTypeIndex()</text>
+<polygon fill="#ffffff" stroke="#000000" points="13,-529.5 13,-927.5 196,-927.5 196,-529.5 13,-529.5"/>
+<text text-anchor="middle" x="104.5" y="-915.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm::runtime::Object</text>
+<polyline fill="none" stroke="#000000" points="13,-908.5 196,-908.5 "/>
+<text text-anchor="start" x="21" y="-896.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_key</text>
+<text text-anchor="start" x="21" y="-885.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_final</text>
+<text text-anchor="start" x="21" y="-874.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_child_slots</text>
+<text text-anchor="start" x="21" y="-863.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_child_slots_can</text>
+<text text-anchor="start" x="21" y="-852.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_overflow</text>
+<text text-anchor="start" x="21" y="-841.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_has_method_visit</text>
+<text text-anchor="start" x="21" y="-830.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_attrs</text>
+<text text-anchor="start" x="21" y="-819.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_has_method_sequal</text>
+<text text-anchor="start" x="21" y="-808.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_reduce</text>
+<text text-anchor="start" x="21" y="-797.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_has_method_shash</text>
+<text text-anchor="start" x="21" y="-786.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_reduce</text>
+<text text-anchor="start" x="21" y="-775.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _type_index</text>
+<text text-anchor="start" x="21" y="-764.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># type_index_</text>
+<text text-anchor="start" x="21" y="-753.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># ref_counter_</text>
+<text text-anchor="start" x="21" y="-742.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># deleter_</text>
+<polyline fill="none" stroke="#000000" points="13,-735.5 196,-735.5 "/>
+<text text-anchor="start" x="21" y="-723.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ type_index()</text>
+<text text-anchor="start" x="21" y="-712.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTypeKey()</text>
+<text text-anchor="start" x="21" y="-701.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ GetTypeKeyHash()</text>
+<text text-anchor="start" x="21" y="-690.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ IsInstance()</text>
+<text text-anchor="start" x="21" y="-679.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ unique()</text>
+<text text-anchor="start" x="21" y="-668.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Object()</text>
+<text text-anchor="start" x="21" y="-657.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Object()</text>
+<text text-anchor="start" x="21" y="-646.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ Object()</text>
+<text text-anchor="start" x="21" y="-635.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator=()</text>
+<text text-anchor="start" x="21" y="-624.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ operator=()</text>
+<text text-anchor="start" x="21" y="-613.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TypeIndex2Key()</text>
+<text text-anchor="start" x="21" y="-602.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TypeIndex2KeyHash()</text>
+<text text-anchor="start" x="21" y="-591.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ TypeKey2Index()</text>
+<text text-anchor="start" x="21" y="-580.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ _GetOrAllocRuntimeTypeIndex()</text>
+<text text-anchor="start" x="21" y="-569.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">+ RuntimeTypeIndex()</text>
+<text text-anchor="start" x="21" y="-558.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># IncRef()</text>
+<text text-anchor="start" x="21" y="-547.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># DecRef()</text>
+<text text-anchor="start" x="21" y="-536.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000"># GetOrAllocRuntimeTypeIndex()</text>
</a>
</g>
</g>
<!-- Node2->Node1 -->
<g id="edge2" class="edge">
<title>Node2->Node1</title>
-<path fill="none" stroke="#191970" d="M104.5,-452.883C104.5,-443.8603 104.5,-435.0496 104.5,-426.5763"/>
-<polygon fill="none" stroke="#191970" points="101.0001,-453.1535 104.5,-463.1535 108.0001,-453.1535 101.0001,-453.1535"/>
+<path fill="none" stroke="#191970" d="M104.5,-518.883C104.5,-509.8603 104.5,-501.0496 104.5,-492.5763"/>
+<polygon fill="none" stroke="#191970" points="101.0001,-519.1535 104.5,-529.1535 108.0001,-519.1535 101.0001,-519.1535"/>
</g>
</g>
</svg>
diff --git a/docs/reference/api/doxygen/database_8h.html b/docs/reference/api/doxygen/database_8h.html
index 8df2360bc7..d38e3b4e15 100644
--- a/docs/reference/api/doxygen/database_8h.html
+++ b/docs/reference/api/doxygen/database_8h.html
@@ -78,11 +78,12 @@ $(function() {
<code>#include <<a class="el" href="object_8h_source.html">tvm/runtime/object.h</a>></code><br />
<code>#include <<a class="el" href="packed__func_8h_source.html">tvm/runtime/packed_func.h</a>></code><br />
<code>#include <<a class="el" href="target_8h_source.html">tvm/target/target.h</a>></code><br />
+<code>#include <<a class="el" href="tir_2schedule_2schedule_8h_source.html">tvm/tir/schedule/schedule.h</a>></code><br />
<code>#include <<a class="el" href="trace_8h_source.html">tvm/tir/schedule/trace.h</a>></code><br />
</div><div class="textblock"><div class="dynheader">
Include dependency graph for database.h:</div>
<div class="dyncontent">
-<div class="center"><iframe scrolling="no" frameborder="0" src="database_8h__incl.svg" width="4382" height="1246"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
+<div class="center"><iframe scrolling="no" frameborder="0" src="database_8h__incl.svg" width="4518" height="1260"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
</div>
</div>
</div><div class="textblock"><div class="dynheader">
diff --git a/docs/reference/api/doxygen/database_8h__dep__incl.svg b/docs/reference/api/doxygen/database_8h__dep__incl.svg
index 5390b70cbb..5cbf1aceb4 100644
--- a/docs/reference/api/doxygen/database_8h__dep__incl.svg
+++ b/docs/reference/api/doxygen/database_8h__dep__incl.svg
@@ -9,16 +9,16 @@
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 303)">
<title>include/tvm/meta_schedule/database.h</title>
<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-303 279,-303 279,4 -4,4"/>
-<!-- Node77 -->
+<!-- Node84 -->
<g id="node1" class="node">
-<title>Node77</title>
+<title>Node84</title>
<polygon fill="#bfbfbf" stroke="#000000" points="123,-268.5 123,-298.5 275,-298.5 275,-268.5 123,-268.5"/>
<text text-anchor="start" x="131" y="-286.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
<text text-anchor="middle" x="199" y="-275.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/database.h</text>
</g>
-<!-- Node78 -->
+<!-- Node85 -->
<g id="node2" class="node">
-<title>Node78</title>
+<title>Node85</title>
<g id="a_node2"><a xlink:href="search__strategy_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/search_strategy.h">
<polygon fill="#ffffff" stroke="#000000" points="71,-201.5 71,-231.5 223,-231.5 223,-201.5 71,-201.5"/>
<text text-anchor="start" x="79" y="-219.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
@@ -26,15 +26,15 @@
</a>
</g>
</g>
-<!-- Node77->Node78 -->
+<!-- Node84->Node85 -->
<g id="edge1" class="edge">
-<title>Node77->Node78</title>
+<title>Node84->Node85</title>
<path fill="none" stroke="#191970" d="M181.0335,-260.3509C173.6583,-250.8482 165.3266,-240.1132 158.7529,-231.6432"/>
<polygon fill="#191970" stroke="#191970" points="178.3042,-262.5427 187.2004,-268.2967 183.8341,-258.2508 178.3042,-262.5427"/>
</g>
-<!-- Node80 -->
+<!-- Node87 -->
<g id="node4" class="node">
-<title>Node80</title>
+<title>Node87</title>
<g id="a_node4"><a xlink:href="task__scheduler_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/task_scheduler.h">
<polygon fill="#ffffff" stroke="#000000" points="104,-.5 104,-30.5 256,-30.5 256,-.5 104,-.5"/>
<text text-anchor="start" x="112" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
@@ -42,15 +42,15 @@
</a>
</g>
</g>
-<!-- Node77->Node80 -->
+<!-- Node84->Node87 -->
<g id="edge7" class="edge">
-<title>Node77->Node80</title>
+<title>Node84->Node87</title>
<path fill="none" stroke="#191970" d="M217.8572,-260.2678C223.5567,-251.8686 229.0757,-241.9839 232,-232 244.2431,-190.2006 240.9414,-176.6279 232,-134 223.7443,-94.6414 201.6105,-52.3879 189.0426,-30.5305"/>
<polygon fill="#191970" stroke="#191970" points="214.9812,-258.2719 211.9535,-268.4249 220.6519,-262.376 214.9812,-258.2719"/>
</g>
-<!-- Node79 -->
+<!-- Node86 -->
<g id="node3" class="node">
-<title>Node79</title>
+<title>Node86</title>
<g id="a_node3"><a xlink:href="measure__callback_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/measure_callback.h">
<polygon fill="#ffffff" stroke="#000000" points="0,-67.5 0,-97.5 152,-97.5 152,-67.5 0,-67.5"/>
<text text-anchor="start" x="8" y="-85.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
@@ -58,15 +58,15 @@
</a>
</g>
</g>
-<!-- Node78->Node79 -->
+<!-- Node85->Node86 -->
<g id="edge2" class="edge">
-<title>Node78->Node79</title>
+<title>Node85->Node86</title>
<path fill="none" stroke="#191970" d="M95.8489,-196.7472C82.4568,-188.9655 69.6417,-178.5649 62,-165 50.0208,-143.7356 59.7936,-114.8509 67.9279,-97.5054"/>
<polygon fill="#191970" stroke="#191970" points="94.2179,-199.8441 104.6841,-201.4897 97.5285,-193.6764 94.2179,-199.8441"/>
</g>
-<!-- Node81 -->
+<!-- Node88 -->
<g id="node5" class="node">
-<title>Node81</title>
+<title>Node88</title>
<g id="a_node5"><a xlink:href="tune__context_8h.html" target="_top" xlink:title="include/tvm/meta_schedule\l/tune_context.h">
<polygon fill="#ffffff" stroke="#000000" points="71,-134.5 71,-164.5 223,-164.5 223,-134.5 71,-134.5"/>
<text text-anchor="start" x="79" y="-152.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
@@ -74,27 +74,27 @@
</a>
</g>
</g>
-<!-- Node78->Node81 -->
+<!-- Node85->Node88 -->
<g id="edge4" class="edge">
-<title>Node78->Node81</title>
+<title>Node85->Node88</title>
<path fill="none" stroke="#191970" d="M147,-191.0249C147,-182.128 147,-172.4287 147,-164.6432"/>
<polygon fill="#191970" stroke="#191970" points="143.5001,-191.2966 147,-201.2967 150.5001,-191.2967 143.5001,-191.2966"/>
</g>
-<!-- Node79->Node80 -->
+<!-- Node86->Node87 -->
<g id="edge3" class="edge">
-<title>Node79->Node80</title>
+<title>Node86->Node87</title>
<path fill="none" stroke="#191970" d="M108.139,-61.7951C123.7223,-51.7558 142.1061,-39.9124 156.3784,-30.7177"/>
<polygon fill="#191970" stroke="#191970" points="105.8506,-59.1058 99.3396,-67.4639 109.6417,-64.9904 105.8506,-59.1058"/>
</g>
-<!-- Node81->Node79 -->
+<!-- Node88->Node86 -->
<g id="edge5" class="edge">
-<title>Node81->Node79</title>
+<title>Node88->Node86</title>
<path fill="none" stroke="#191970" d="M123.3806,-127.2113C113.0905,-117.5009 101.2949,-106.3698 92.0472,-97.6432"/>
<polygon fill="#191970" stroke="#191970" points="121.2139,-129.979 130.889,-134.2967 126.0181,-124.8879 121.2139,-129.979"/>
</g>
-<!-- Node81->Node80 -->
+<!-- Node88->Node87 -->
<g id="edge6" class="edge">
-<title>Node81->Node80</title>
+<title>Node88->Node87</title>
<path fill="none" stroke="#191970" d="M154.0598,-124.3068C156.3443,-115.9529 158.8433,-106.5936 161,-98 166.8676,-74.6206 173.0149,-47.338 176.6755,-30.7481"/>
<polygon fill="#191970" stroke="#191970" points="150.6112,-123.6465 151.3227,-134.2174 157.3586,-125.5101 150.6112,-123.6465"/>
</g>
diff --git a/docs/reference/api/doxygen/database_8h__incl.svg b/docs/reference/api/doxygen/database_8h__incl.svg
index 90eec1d3a9..8215663697 100644
--- a/docs/reference/api/doxygen/database_8h__incl.svg
+++ b/docs/reference/api/doxygen/database_8h__incl.svg
@@ -4,1601 +4,1602 @@
<!-- Generated by graphviz version 2.40.1 (20161225.0304)
-->
<!-- Title: include/tvm/meta_schedule/database.h Pages: 1 -->
-<svg width="3286pt" height="934pt"
- viewBox="0.00 0.00 3286.48 934.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
-<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 930)">
+<svg width="3388pt" height="945pt"
+ viewBox="0.00 0.00 3388.00 945.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 941)">
<title>include/tvm/meta_schedule/database.h</title>
-<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-930 3282.4804,-930 3282.4804,4 -4,4"/>
+<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-941 3384,-941 3384,4 -4,4"/>
<!-- Node0 -->
<g id="node1" class="node">
<title>Node0</title>
-<polygon fill="#bfbfbf" stroke="#000000" points="1000.4804,-895.5 1000.4804,-925.5 1152.4804,-925.5 1152.4804,-895.5 1000.4804,-895.5"/>
-<text text-anchor="start" x="1008.4804" y="-913.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
-<text text-anchor="middle" x="1076.4804" y="-902.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/database.h</text>
+<polygon fill="#bfbfbf" stroke="#000000" points="608,-906.5 608,-936.5 760,-936.5 760,-906.5 608,-906.5"/>
+<text text-anchor="start" x="616" y="-924.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/meta_schedule</text>
+<text text-anchor="middle" x="684" y="-913.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/database.h</text>
</g>
<!-- Node1 -->
<g id="node2" class="node">
<title>Node1</title>
<g id="a_node2"><a xlink:href="ir_2expr_8h.html" target="_top" xlink:title="Base expr nodes in TVM. ">
-<polygon fill="#ffffff" stroke="#000000" points="1516.9804,-660.5 1516.9804,-679.5 1595.9804,-679.5 1595.9804,-660.5 1516.9804,-660.5"/>
-<text text-anchor="middle" x="1556.4804" y="-667.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/ir/expr.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1504.5,-666 1504.5,-685 1583.5,-685 1583.5,-666 1504.5,-666"/>
+<text text-anchor="middle" x="1544" y="-673" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/ir/expr.h</text>
</a>
</g>
</g>
<!-- Node0->Node1 -->
<g id="edge1" class="edge">
<title>Node0->Node1</title>
-<path fill="none" stroke="#191970" d="M1106.5066,-895.4556C1193.3629,-851.937 1444.0488,-726.3329 1528.4017,-684.0686"/>
-<polygon fill="#191970" stroke="#191970" points="1530.0217,-687.1718 1537.3943,-679.5629 1526.8859,-680.9134 1530.0217,-687.1718"/>
+<path fill="none" stroke="#191970" d="M736.5023,-906.4819C893.3809,-861.6073 1356.6689,-729.0854 1500.7411,-687.8741"/>
+<polygon fill="#191970" stroke="#191970" points="1501.9022,-691.1824 1510.554,-685.0671 1499.977,-684.4523 1501.9022,-691.1824"/>
</g>
<!-- Node4 -->
<g id="node5" class="node">
<title>Node4</title>
<g id="a_node5"><a xlink:href="reflection_8h.html" target="_top" xlink:title="Reflection and serialization of compiler IR/AST nodes. ">
-<polygon fill="#ffffff" stroke="#000000" points="736.9804,-436.5 736.9804,-455.5 857.9804,-455.5 857.9804,-436.5 736.9804,-436.5"/>
-<text text-anchor="middle" x="797.4804" y="-443.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/reflection.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="826.5,-436.5 826.5,-455.5 947.5,-455.5 947.5,-436.5 826.5,-436.5"/>
+<text text-anchor="middle" x="887" y="-443.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/reflection.h</text>
</a>
</g>
</g>
<!-- Node0->Node4 -->
-<g id="edge174" class="edge">
+<g id="edge173" class="edge">
<title>Node0->Node4</title>
-<path fill="none" stroke="#191970" d="M1000.2925,-907.7396C781.1026,-898.7928 164.4804,-866.1459 164.4804,-782 164.4804,-782 164.4804,-782 164.4804,-614 164.4804,-498.7334 561.5498,-460.6683 726.7218,-449.8205"/>
-<polygon fill="#191970" stroke="#191970" points="727.0305,-453.308 736.7857,-449.1745 726.582,-446.3223 727.0305,-453.308"/>
+<path fill="none" stroke="#191970" d="M607.716,-913.2353C529.3664,-903.9807 413.2258,-887.9123 372,-870 320.553,-847.6466 272,-849.0934 272,-793 272,-793 272,-793 272,-675.5 272,-497.8905 458.7504,-548.8964 627,-492 689.2124,-470.9619 763.2221,-459.0643 816.3403,-452.6655"/>
+<polygon fill="#191970" stroke="#191970" points="816.8158,-456.1338 826.3415,-451.4958 816.0026,-449.1812 816.8158,-456.1338"/>
</g>
<!-- Node32 -->
<g id="node7" class="node">
<title>Node32</title>
<g id="a_node7"><a xlink:href="array_8h.html" target="_top" xlink:title="Runtime Array container types. ">
-<polygon fill="#ffffff" stroke="#000000" points="2008.4804,-302.5 2008.4804,-332.5 2134.4804,-332.5 2134.4804,-302.5 2008.4804,-302.5"/>
-<text text-anchor="start" x="2016.4804" y="-320.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/container</text>
-<text text-anchor="middle" x="2071.4804" y="-309.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/array.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2110,-302.5 2110,-332.5 2236,-332.5 2236,-302.5 2110,-302.5"/>
+<text text-anchor="start" x="2118" y="-320.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/container</text>
+<text text-anchor="middle" x="2173" y="-309.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/array.h</text>
</a>
</g>
</g>
<!-- Node0->Node32 -->
-<g id="edge175" class="edge">
+<g id="edge174" class="edge">
<title>Node0->Node32</title>
-<path fill="none" stroke="#191970" d="M1152.5274,-909.8801C1483.0865,-906.9885 2786.5519,-893.4813 2966.4804,-859 3064.1134,-840.2897 3176.4804,-881.4096 3176.4804,-782 3176.4804,-782 3176.4804,-782 3176.4804,-670 3176.4804,-615.2226 2377.9407,-398.8393 2138.5295,-335.1961"/>
-<polygon fill="#191970" stroke="#191970" points="2139.1266,-331.7334 2128.5632,-332.5492 2137.3297,-338.4989 2139.1266,-331.7334"/>
+<path fill="none" stroke="#191970" d="M760.1449,-920.5328C1090.3836,-916.1875 2390.215,-897.4757 2571,-870 2770.4784,-839.6833 2886.1791,-907.8427 3008,-747 3114.3333,-606.6059 2446.3354,-397.2378 2235.3661,-335.3408"/>
+<polygon fill="#191970" stroke="#191970" points="2236.2508,-331.953 2225.6703,-332.5057 2234.2861,-338.6716 2236.2508,-331.953"/>
</g>
<!-- Node8 -->
<g id="node16" class="node">
<title>Node8</title>
<g id="a_node16"><a xlink:href="object_8h.html" target="_top" xlink:title="A managed object in the TVM runtime. ">
-<polygon fill="#ffffff" stroke="#000000" points="658.9804,-67.5 658.9804,-86.5 777.9804,-86.5 777.9804,-67.5 658.9804,-67.5"/>
-<text text-anchor="middle" x="718.4804" y="-74.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/object.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="618.5,-67.5 618.5,-86.5 737.5,-86.5 737.5,-67.5 618.5,-67.5"/>
+<text text-anchor="middle" x="678" y="-74.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/object.h</text>
</a>
</g>
</g>
<!-- Node0->Node8 -->
-<g id="edge177" class="edge">
+<g id="edge176" class="edge">
<title>Node0->Node8</title>
-<path fill="none" stroke="#191970" d="M1000.3872,-909.3052C756.8634,-904.4845 12.4804,-881.6026 12.4804,-782 12.4804,-782 12.4804,-782 12.4804,-384.5 12.4804,-192.8052 171.238,-175.9304 355.4804,-123 454.9032,-94.4372 574.7178,-83.5993 648.7534,-79.4939"/>
-<polygon fill="#191970" stroke="#191970" points="649.0882,-82.9812 658.8895,-78.9581 648.7186,-75.9909 649.0882,-82.9812"/>
+<path fill="none" stroke="#191970" d="M607.888,-915.8315C500.5557,-907.1932 315.0429,-889.6506 291,-870 117.3189,-728.0481 226.6532,-593.2861 230,-369 231.2745,-283.5871 206.8331,-246.6432 259,-179 288.0012,-141.395 305.3172,-139.0828 350,-123 396.1595,-106.3857 526.5055,-91.4831 608.3042,-83.3915"/>
+<polygon fill="#191970" stroke="#191970" points="608.6652,-86.873 618.2764,-82.415 607.9829,-79.9063 608.6652,-86.873"/>
</g>
<!-- Node26 -->
<g id="node26" class="node">
<title>Node26</title>
<g id="a_node26"><a xlink:href="string_8h.html" target="_top" xlink:title="Runtime String container types. ">
-<polygon fill="#ffffff" stroke="#000000" points="1338.4804,-235.5 1338.4804,-265.5 1464.4804,-265.5 1464.4804,-235.5 1338.4804,-235.5"/>
-<text text-anchor="start" x="1346.4804" y="-253.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/container</text>
-<text text-anchor="middle" x="1401.4804" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/string.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1646,-235.5 1646,-265.5 1772,-265.5 1772,-235.5 1646,-235.5"/>
+<text text-anchor="start" x="1654" y="-253.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/container</text>
+<text text-anchor="middle" x="1709" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/string.h</text>
</a>
</g>
</g>
<!-- Node0->Node26 -->
-<g id="edge176" class="edge">
+<g id="edge175" class="edge">
<title>Node0->Node26</title>
-<path fill="none" stroke="#191970" d="M1063.4055,-895.3311C1055.7477,-885.5948 1046.6023,-872.3188 1041.4804,-859 1021.6721,-807.4911 1028.2671,-791.0563 1024.4804,-736 1015.0927,-599.5079 1036.7733,-560.5091 1093.4804,-436 1129.7159,-356.4396 1158.4023,-339.2762 1237.4804,-302 1266.8261,-288.1669 1300.8224,-276.7953 1330.2277,-268.2892"/>
-<polygon fill="#191970" stroke="#191970" points="1331.3448,-271.6105 1340.0105,-265.515 1329.435,-264.8761 1331.3448,-271.6105"/>
+<path fill="none" stroke="#191970" d="M760.16,-919.7957C1090.8929,-911.9033 2386,-875.7169 2386,-793 2386,-793 2386,-793 2386,-737 2386,-472.7468 2131.2154,-526.4622 1919,-369 1877.2156,-337.9963 1868.235,-327.7091 1823,-302 1801.7145,-289.9025 1777.0685,-278.4726 1756.1496,-269.4683"/>
+<polygon fill="#191970" stroke="#191970" points="1757.3984,-266.1961 1746.8259,-265.5082 1754.6618,-272.6391 1757.3984,-266.1961"/>
</g>
<!-- Node41 -->
<g id="node32" class="node">
<title>Node41</title>
<g id="a_node32"><a xlink:href="packed__func_8h.html" target="_top" xlink:title="Type-erased function used across TVM API. ">
-<polygon fill="#ffffff" stroke="#000000" points="1191.4804,-369.5 1191.4804,-399.5 1307.4804,-399.5 1307.4804,-369.5 1191.4804,-369.5"/>
-<text text-anchor="start" x="1199.4804" y="-387.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/packed</text>
-<text text-anchor="middle" x="1249.4804" y="-376.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_func.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1167,-369.5 1167,-399.5 1283,-399.5 1283,-369.5 1167,-369.5"/>
+<text text-anchor="start" x="1175" y="-387.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/packed</text>
+<text text-anchor="middle" x="1225" y="-376.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_func.h</text>
</a>
</g>
</g>
<!-- Node0->Node41 -->
-<g id="edge178" class="edge">
+<g id="edge177" class="edge">
<title>Node0->Node41</title>
-<path fill="none" stroke="#191970" d="M1068.9728,-895.4979C1057.8985,-871.9437 1038.4804,-824.5732 1038.4804,-782 1038.4804,-782 1038.4804,-782 1038.4804,-670 1038.4804,-550.4551 1159.6047,-448.1681 1218.3112,-405.6123"/>
-<polygon fill="#191970" stroke="#191970" points="1220.3898,-408.4289 1226.4981,-399.7721 1216.3246,-402.7303 1220.3898,-408.4289"/>
+<path fill="none" stroke="#191970" d="M647.6206,-906.4692C582.0125,-876.985 454.3816,-807.3867 496,-727 562.8295,-597.9175 1001.8931,-452.6103 1164.535,-402.5609"/>
+<polygon fill="#191970" stroke="#191970" points="1165.8457,-405.82 1174.3813,-399.5435 1163.7947,-399.1272 1165.8457,-405.82"/>
</g>
<!-- Node50 -->
-<g id="node39" class="node">
+<g id="node38" class="node">
<title>Node50</title>
-<g id="a_node39"><a xlink:href="ir_2module_8h.html" target="_top" xlink:title="IRModule that holds the functions and type definitions. ">
-<polygon fill="#ffffff" stroke="#000000" points="1982.9804,-772.5 1982.9804,-791.5 2077.9804,-791.5 2077.9804,-772.5 1982.9804,-772.5"/>
-<text text-anchor="middle" x="2030.4804" y="-779.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/ir/module.h</text>
+<g id="a_node38"><a xlink:href="ir_2module_8h.html" target="_top" xlink:title="IRModule that holds the functions and type definitions. ">
+<polygon fill="#ffffff" stroke="#000000" points="2414.5,-783.5 2414.5,-802.5 2509.5,-802.5 2509.5,-783.5 2414.5,-783.5"/>
+<text text-anchor="middle" x="2462" y="-790.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/ir/module.h</text>
</a>
</g>
</g>
<!-- Node0->Node50 -->
-<g id="edge131" class="edge">
+<g id="edge130" class="edge">
<title>Node0->Node50</title>
-<path fill="none" stroke="#191970" d="M1152.5367,-900.2555C1337.5172,-875.3394 1807.8089,-811.993 1972.4699,-789.8138"/>
-<polygon fill="#191970" stroke="#191970" points="1973.4243,-793.2169 1982.8675,-788.4133 1972.4898,-786.2796 1973.4243,-793.2169"/>
+<path fill="none" stroke="#191970" d="M760.1069,-920.9853C1082.9029,-918.5762 2325.7331,-906.9296 2400,-870 2424.8809,-857.6279 2443.089,-830.1072 2453.1128,-811.5472"/>
+<polygon fill="#191970" stroke="#191970" points="2456.2778,-813.0453 2457.7156,-802.5485 2450.0457,-809.8577 2456.2778,-813.0453"/>
</g>
<!-- Node60 -->
-<g id="node44" class="node">
+<g id="node43" class="node">
<title>Node60</title>
-<g id="a_node44"><a xlink:href="arg__info_8h.html" target="_top" xlink:title="tvm/meta_schedule/arg\l_info.h">
-<polygon fill="#ffffff" stroke="#000000" points="787.4804,-828.5 787.4804,-858.5 919.4804,-858.5 919.4804,-828.5 787.4804,-828.5"/>
-<text text-anchor="start" x="795.4804" y="-846.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/meta_schedule/arg</text>
-<text text-anchor="middle" x="853.4804" y="-835.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_info.h</text>
+<g id="a_node43"><a xlink:href="arg__info_8h.html" target="_top" xlink:title="tvm/meta_schedule/arg\l_info.h">
+<polygon fill="#ffffff" stroke="#000000" points="618,-839.5 618,-869.5 750,-869.5 750,-839.5 618,-839.5"/>
+<text text-anchor="start" x="626" y="-857.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/meta_schedule/arg</text>
+<text text-anchor="middle" x="684" y="-846.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_info.h</text>
</a>
</g>
</g>
<!-- Node0->Node60 -->
-<g id="edge163" class="edge">
+<g id="edge162" class="edge">
<title>Node0->Node60</title>
-<path fill="none" stroke="#191970" d="M1026.4349,-895.4639C992.9821,-885.4131 948.7964,-872.1375 913.46,-861.5208"/>
-<polygon fill="#191970" stroke="#191970" points="914.3285,-858.1272 903.7443,-858.6017 912.3142,-864.8312 914.3285,-858.1272"/>
+<path fill="none" stroke="#191970" d="M684,-906.2967C684,-898.5013 684,-888.7991 684,-879.9064"/>
+<polygon fill="#191970" stroke="#191970" points="687.5001,-879.6431 684,-869.6432 680.5001,-879.6432 687.5001,-879.6431"/>
</g>
<!-- Node66 -->
-<g id="node46" class="node">
+<g id="node45" class="node">
<title>Node66</title>
-<g id="a_node46"><a xlink:href="target_8h.html" target="_top" xlink:title="Compilation target object. ">
-<polygon fill="#ffffff" stroke="#000000" points="2822.4804,-834 2822.4804,-853 2932.4804,-853 2932.4804,-834 2822.4804,-834"/>
-<text text-anchor="middle" x="2877.4804" y="-841" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/target/target.h</text>
+<g id="a_node45"><a xlink:href="target_8h.html" target="_top" xlink:title="Compilation target object. ">
+<polygon fill="#ffffff" stroke="#000000" points="2452,-845 2452,-864 2562,-864 2562,-845 2452,-845"/>
+<text text-anchor="middle" x="2507" y="-852" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/target/target.h</text>
</a>
</g>
</g>
<!-- Node0->Node66 -->
-<g id="edge179" class="edge">
+<g id="edge178" class="edge">
<title>Node0->Node66</title>
-<path fill="none" stroke="#191970" d="M1152.7168,-907.6639C1450.6709,-896.5795 2533.7222,-856.2883 2811.9623,-845.9374"/>
-<polygon fill="#191970" stroke="#191970" points="2812.4301,-849.4225 2822.293,-845.5531 2812.1698,-842.4273 2812.4301,-849.4225"/>
+<path fill="none" stroke="#191970" d="M760.0698,-921.2057C1052.7134,-919.7724 2106.0785,-911.762 2438,-870 2445.4536,-869.0622 2453.286,-867.6872 2460.8903,-866.1376"/>
+<polygon fill="#191970" stroke="#191970" points="2461.6666,-869.5507 2470.7045,-864.022 2460.1915,-862.7079 2461.6666,-869.5507"/>
</g>
<!-- Node75 -->
-<g id="node49" class="node">
+<g id="node48" class="node">
<title>Node75</title>
-<g id="a_node49"><a xlink:href="trace_8h.html" target="_top" xlink:title="tvm/tir/schedule/trace.h">
-<polygon fill="#ffffff" stroke="#000000" points="2483.9804,-660.5 2483.9804,-679.5 2616.9804,-679.5 2616.9804,-660.5 2483.9804,-660.5"/>
-<text text-anchor="middle" x="2550.4804" y="-667.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/tir/schedule/trace.h</text>
+<g id="a_node48"><a xlink:href="tir_2schedule_2schedule_8h.html" target="_top" xlink:title="tvm/tir/schedule/schedule.h">
+<polygon fill="#ffffff" stroke="#ff0000" points="0,-845 0,-864 152,-864 152,-845 0,-845"/>
+<text text-anchor="middle" x="76" y="-852" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/tir/schedule/schedule.h</text>
</a>
</g>
</g>
<!-- Node0->Node75 -->
-<g id="edge195" class="edge">
+<g id="edge194" class="edge">
<title>Node0->Node75</title>
-<path fill="none" stroke="#191970" d="M1152.5141,-909.8351C1498.624,-906.6314 2909.7814,-891.5864 2941.4804,-859 2966.6516,-833.1242 3003.474,-791.0403 2929.4804,-716 2908.6785,-694.9038 2732.0597,-680.8372 2627.4232,-674.2715"/>
-<polygon fill="#191970" stroke="#191970" points="2627.4149,-670.7644 2617.2179,-673.6406 2626.9829,-677.7511 2627.4149,-670.7644"/>
+<path fill="none" stroke="#191970" d="M607.8764,-915.6448C506.7083,-907.4719 322.4953,-891.2278 166,-870 156.8851,-868.7636 147.2792,-867.2916 137.8731,-865.7541"/>
+<polygon fill="#191970" stroke="#191970" points="138.2174,-862.2633 127.7775,-864.0683 137.0644,-869.1677 138.2174,-862.2633"/>
+</g>
+<!-- Node82 -->
+<g id="node50" class="node">
+<title>Node82</title>
+<g id="a_node50"><a xlink:href="trace_8h.html" target="_top" xlink:title="tvm/tir/schedule/trace.h">
+<polygon fill="#ffffff" stroke="#ff0000" points="56.5,-783.5 56.5,-802.5 189.5,-802.5 189.5,-783.5 56.5,-783.5"/>
+<text text-anchor="middle" x="123" y="-790.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/tir/schedule/trace.h</text>
+</a>
+</g>
+</g>
+<!-- Node0->Node82 -->
+<g id="edge198" class="edge">
+<title>Node0->Node82</title>
+<path fill="none" stroke="#191970" d="M607.5802,-919.2379C490.7393,-914.7892 273.8798,-902.3436 204,-870 175.2495,-856.693 150.2225,-829.0364 135.8651,-810.7485"/>
+<polygon fill="#191970" stroke="#191970" points="138.5367,-808.4791 129.7045,-802.6273 132.9597,-812.7097 138.5367,-808.4791"/>
</g>
<!-- Node2 -->
<g id="node3" class="node">
<title>Node2</title>
<g id="a_node3"><a xlink:href="ir_2span_8h.html" target="_top" xlink:title="Span information for debugging purposes. ">
-<polygon fill="#ffffff" stroke="#000000" points="1516.9804,-548.5 1516.9804,-567.5 1597.9804,-567.5 1597.9804,-548.5 1516.9804,-548.5"/>
-<text text-anchor="middle" x="1557.4804" y="-555.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/ir/span.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1487.5,-548.5 1487.5,-567.5 1568.5,-567.5 1568.5,-548.5 1487.5,-548.5"/>
+<text text-anchor="middle" x="1528" y="-555.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/ir/span.h</text>
</a>
</g>
</g>
<!-- Node1->Node2 -->
<g id="edge2" class="edge">
<title>Node1->Node2</title>
-<path fill="none" stroke="#191970" d="M1556.5657,-660.4509C1556.7288,-642.184 1557.088,-601.9553 1557.3046,-577.6976"/>
-<polygon fill="#191970" stroke="#191970" points="1560.8059,-577.5558 1557.3954,-567.5249 1553.8061,-577.4932 1560.8059,-577.5558"/>
+<path fill="none" stroke="#191970" d="M1531.3213,-665.6407C1520.4047,-656.2782 1505.4254,-641.1188 1499,-624 1492.7225,-607.2752 1502.4603,-588.683 1512.306,-575.5051"/>
+<polygon fill="#191970" stroke="#191970" points="1515.052,-577.6756 1518.6244,-567.7012 1509.6116,-573.2708 1515.052,-577.6756"/>
</g>
<!-- Node3 -->
<g id="node4" class="node">
<title>Node3</title>
<g id="a_node4"><a xlink:href="node_8h.html" target="_top" xlink:title="Definitions and helper macros for IR/AST nodes. ">
-<polygon fill="#ffffff" stroke="#000000" points="1422.9804,-492.5 1422.9804,-511.5 1521.9804,-511.5 1521.9804,-492.5 1422.9804,-492.5"/>
-<text text-anchor="middle" x="1472.4804" y="-499.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/node.h</text>
+<polygon fill="#ffffff" stroke="#ff0000" points="1478.5,-492.5 1478.5,-511.5 1577.5,-511.5 1577.5,-492.5 1478.5,-492.5"/>
+<text text-anchor="middle" x="1528" y="-499.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/node.h</text>
</a>
</g>
</g>
<!-- Node1->Node3 -->
-<g id="edge124" class="edge">
+<g id="edge123" class="edge">
<title>Node1->Node3</title>
-<path fill="none" stroke="#191970" d="M1519.767,-660.44C1489.0988,-651.6241 1448.9127,-637.8435 1439.4804,-624 1417.4735,-591.7009 1442.068,-545.2844 1459.0677,-520.0335"/>
-<polygon fill="#191970" stroke="#191970" points="1462.0004,-521.9476 1464.8868,-511.7535 1456.2733,-517.9226 1462.0004,-521.9476"/>
+<path fill="none" stroke="#191970" d="M1532.0148,-665.9974C1520.6365,-656.4355 1503.7227,-640.742 1493,-624 1478.6638,-601.6161 1476.7428,-594.1548 1472,-568 1470.414,-559.2537 1468.2013,-556.0363 1472,-548 1478.0246,-535.2547 1489.5032,-524.8542 1500.5074,-517.1874"/>
+<polygon fill="#191970" stroke="#191970" points="1502.6121,-519.9948 1509.1176,-511.6324 1498.8172,-514.1127 1502.6121,-519.9948"/>
</g>
<!-- Node24 -->
<g id="node8" class="node">
<title>Node24</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="1973.4804,-123.5 1973.4804,-142.5 2037.4804,-142.5 2037.4804,-123.5 1973.4804,-123.5"/>
-<text text-anchor="middle" x="2005.4804" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">algorithm</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="2055,-123.5 2055,-142.5 2119,-142.5 2119,-123.5 2055,-123.5"/>
+<text text-anchor="middle" x="2087" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">algorithm</text>
</g>
<!-- Node1->Node24 -->
-<g id="edge127" class="edge">
+<g id="edge126" class="edge">
<title>Node1->Node24</title>
-<path fill="none" stroke="#191970" d="M1596.0697,-665.0553C1690.8391,-653.3297 1938.2751,-623.3727 2145.4804,-604 2198.7313,-599.0213 2589.1038,-607.207 2625.4804,-568 2669.5649,-520.4853 2614.8883,-475.4755 2563.4804,-436 2538.5122,-416.8271 2527.5082,-419.0951 2502.4804,-400 2452.7539,-362.061 2448.5449,-343.1951 2401.4804,-302 2335.0432,-243.8481 2325.7897,-215.6875 2245.4804,-179 2180.4525,-149.2934 2097.0423,-138.764 2047.627,-135.0368"/>
-<polygon fill="#191970" stroke="#191970" points="2047.836,-131.5429 2037.6172,-134.3402 2047.35,-138.526 2047.836,-131.5429"/>
+<path fill="none" stroke="#191970" d="M1558.0068,-665.9022C1572.6667,-655.7597 1596.2542,-639.1646 1616,-624 1678.1021,-576.3063 1694.0737,-564.6868 1752,-512 1880.566,-395.063 1887.9873,-337.9558 2028,-235 2055.8667,-214.5087 2079.0694,-228.5794 2097,-199 2105.5213,-184.9427 2101.5297,-166.2009 2096.2753,-152.2815"/>
+<polygon fill="#191970" stroke="#191970" points="2099.3443,-150.5559 2092.1975,-142.7346 2092.9069,-153.3056 2099.3443,-150.5559"/>
</g>
<!-- Node1->Node8 -->
-<g id="edge126" class="edge">
+<g id="edge125" class="edge">
<title>Node1->Node8</title>
-<path fill="none" stroke="#191970" d="M1516.6166,-663.2391C1375.8369,-639.1915 902.5855,-556.8499 755.4804,-512 691.3781,-492.4563 672.3466,-491.4585 615.4804,-456 480.4166,-371.782 341.7317,-331.1495 388.4804,-179 397.3145,-150.2485 401.0836,-139.115 426.4804,-123 462.3826,-100.219 574.1497,-87.6565 648.5263,-81.6288"/>
-<polygon fill="#191970" stroke="#191970" points="649.0675,-85.0971 658.7612,-80.8212 648.5168,-78.1188 649.0675,-85.0971"/>
+<path fill="none" stroke="#191970" d="M1504.2337,-674.412C1336.0212,-669.6633 687.569,-649.8291 600,-624 503.7684,-595.6158 476.6244,-581.2233 404,-512 291.3763,-404.6506 197.4877,-299.4298 296,-179 337.2233,-128.6051 376.2291,-163.5759 438,-143 459.9267,-135.6962 464.0258,-130.1596 486,-123 529.8284,-108.7199 580.5937,-96.7488 618.9126,-88.6016"/>
+<polygon fill="#191970" stroke="#191970" points="619.806,-91.9904 628.8734,-86.5101 618.3675,-85.1398 619.806,-91.9904"/>
</g>
<!-- Node14 -->
<g id="node18" class="node">
<title>Node14</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="2895.4804,-6 2895.4804,-25 2939.4804,-25 2939.4804,-6 2895.4804,-6"/>
-<text text-anchor="middle" x="2917.4804" y="-13" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">string</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="2997,-6 2997,-25 3041,-25 3041,-6 2997,-6"/>
+<text text-anchor="middle" x="3019" y="-13" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">string</text>
</g>
<!-- Node1->Node14 -->
-<g id="edge129" class="edge">
+<g id="edge128" class="edge">
<title>Node1->Node14</title>
-<path fill="none" stroke="#191970" d="M1596.3437,-669.0228C1708.2924,-665.9611 2034.9682,-654.8488 2304.4804,-624 2357.7632,-617.9012 2370.4521,-612.0155 2423.4804,-604 2542.3115,-586.038 2579.1539,-610.7341 2691.4804,-568 2855.7329,-505.511 2913.2936,-480.0777 3009.4804,-333 3074.7136,-233.253 3116.7745,-160.9831 3043.4804,-67 3020.8299,-37.9558 2979.0556,-25.1879 2949.8394,-19.6432"/>
-<polygon fill="#191970" stroke="#191970" points="2950.1492,-16.1454 2939.7024,-17.9104 2948.9697,-23.0454 2950.1492,-16.1454"/>
+<path fill="none" stroke="#191970" d="M1583.5651,-674.6535C1785.9182,-670.218 2697.7522,-648.9107 2752,-624 2856.014,-576.2366 2834.1526,-501.5203 2928,-436 2961.6392,-412.5145 2976.9072,-419.5056 3013,-400 3060.0646,-374.5649 3078.0861,-373.5344 3113,-333 3139.4686,-302.2705 3152,-291.0572 3152,-250.5 3152,-250.5 3152,-250.5 3152,-133 3152,-79.1753 3090.0653,-44.0942 3050.7248,-27.2401"/>
+<polygon fill="#191970" stroke="#191970" points="3051.8409,-23.9147 3041.2612,-23.3475 3049.178,-30.3884 3051.8409,-23.9147"/>
</g>
<!-- Node15 -->
<g id="node19" class="node">
<title>Node15</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="445.9804,-6 445.9804,-25 514.9804,-25 514.9804,-6 445.9804,-6"/>
-<text text-anchor="middle" x="480.4804" y="-13" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">type_traits</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="921.5,-6 921.5,-25 990.5,-25 990.5,-6 921.5,-6"/>
+<text text-anchor="middle" x="956" y="-13" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">type_traits</text>
</g>
<!-- Node1->Node15 -->
-<g id="edge130" class="edge">
+<g id="edge129" class="edge">
<title>Node1->Node15</title>
-<path fill="none" stroke="#191970" d="M1516.9299,-666.1388C1448.3967,-659.2274 1303.224,-643.6127 1181.4804,-624 1056.1869,-603.8154 1025.1753,-596.3803 901.4804,-568 719.9118,-526.3412 221.632,-421.7808 119.4804,-266 75.095,-198.3124 352.3622,-70.7102 448.5123,-29.0385"/>
-<polygon fill="#191970" stroke="#191970" points="449.9586,-32.2265 457.7575,-25.0551 447.1887,-25.7978 449.9586,-32.2265"/>
+<path fill="none" stroke="#191970" d="M1504.2672,-673.9203C1360.1086,-668.0272 869.5801,-646.506 803,-624 760.1428,-609.513 744.3265,-604.7904 718,-568 630.577,-445.8292 601.3166,-373.2923 660,-235 707.8775,-122.1728 849.8656,-55.2878 918.2958,-28.7698"/>
+<polygon fill="#191970" stroke="#191970" points="919.7301,-31.9691 927.8384,-25.1495 917.2471,-25.4243 919.7301,-31.9691"/>
</g>
<!-- Node1->Node26 -->
-<g id="edge125" class="edge">
+<g id="edge124" class="edge">
<title>Node1->Node26</title>
-<path fill="none" stroke="#191970" d="M1516.7902,-660.7356C1483.2443,-652.0646 1438.877,-638.3288 1427.4804,-624 1396.2867,-584.7806 1417.3741,-562.0284 1414.4804,-512 1412.5163,-478.043 1406.8633,-465.6784 1423.4804,-436 1462.5372,-366.2442 1589.3728,-367.4614 1543.4804,-302 1534.651,-289.4057 1504.098,-277.54 1473.5836,-268.3797"/>
-<polygon fill="#191970" stroke="#191970" points="1474.2851,-264.9386 1463.7053,-265.5031 1472.3279,-271.6595 1474.2851,-264.9386"/>
+<path fill="none" stroke="#191970" d="M1557.1894,-665.8965C1569.2056,-656.4907 1586.6148,-641.1094 1597,-624 1668.5756,-506.0809 1697.1664,-339.248 1705.8539,-276.0759"/>
+<polygon fill="#191970" stroke="#191970" points="1709.3648,-276.2242 1707.2085,-265.8512 1702.4254,-275.3049 1709.3648,-276.2242"/>
</g>
<!-- Node45 -->
<g id="node35" class="node">
<title>Node45</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="1308.4804,-308 1308.4804,-327 1352.4804,-327 1352.4804,-308 1308.4804,-308"/>
-<text text-anchor="middle" x="1330.4804" y="-315" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">limits</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="712,-308 712,-327 756,-327 756,-308 712,-308"/>
+<text text-anchor="middle" x="734" y="-315" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">limits</text>
</g>
<!-- Node1->Node45 -->
-<g id="edge128" class="edge">
+<g id="edge127" class="edge">
<title>Node1->Node45</title>
-<path fill="none" stroke="#191970" d="M1563.1787,-660.4571C1574.9311,-643.0634 1598.8378,-604.6109 1607.4804,-568 1618.8794,-519.7132 1590.0409,-457.3137 1572.4804,-436 1505.1057,-354.2252 1452.9368,-378.9842 1357.4804,-333 1356.7865,-332.6657 1356.0862,-332.3206 1355.3824,-331.967"/>
-<polygon fill="#191970" stroke="#191970" points="1356.7312,-328.7185 1346.2656,-327.0687 1353.4181,-334.8849 1356.7312,-328.7185"/>
+<path fill="none" stroke="#191970" d="M1504.3532,-674.0666C1340.1556,-667.6138 722,-638.0082 722,-558 722,-558 722,-558 722,-446 722,-407.3337 727.453,-362.4053 731.0072,-337.2279"/>
+<polygon fill="#191970" stroke="#191970" points="734.4981,-337.544 732.4802,-327.1431 727.5716,-336.5323 734.4981,-337.544"/>
</g>
<!-- Node49 -->
-<g id="node38" class="node">
+<g id="node37" class="node">
<title>Node49</title>
-<g id="a_node38"><a xlink:href="ir_2type_8h.html" target="_top" xlink:title="IR/AST nodes for the unified type system in TVM. ">
-<polygon fill="#ffffff" stroke="#000000" points="1448.4804,-604.5 1448.4804,-623.5 1528.4804,-623.5 1528.4804,-604.5 1448.4804,-604.5"/>
-<text text-anchor="middle" x="1488.4804" y="-611.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/ir/type.h</text>
+<g id="a_node37"><a xlink:href="ir_2type_8h.html" target="_top" xlink:title="IR/AST nodes for the unified type system in TVM. ">
+<polygon fill="#ffffff" stroke="#000000" points="1508,-604.5 1508,-623.5 1588,-623.5 1588,-604.5 1508,-604.5"/>
+<text text-anchor="middle" x="1548" y="-611.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/ir/type.h</text>
</a>
</g>
</g>
<!-- Node1->Node49 -->
-<g id="edge117" class="edge">
+<g id="edge116" class="edge">
<title>Node1->Node49</title>
-<path fill="none" stroke="#191970" d="M1544.6357,-660.2455C1534.5925,-651.9746 1519.981,-639.9416 1508.1801,-630.2232"/>
-<polygon fill="#191970" stroke="#191970" points="1510.1337,-627.298 1500.1894,-623.6427 1505.6837,-632.7015 1510.1337,-627.298"/>
+<path fill="none" stroke="#191970" d="M1544.625,-665.8906C1545.1795,-657.3657 1546.0072,-644.6392 1546.7042,-633.9235"/>
+<polygon fill="#191970" stroke="#191970" points="1550.2097,-633.951 1547.3662,-623.7449 1543.2244,-633.4966 1550.2097,-633.951"/>
</g>
<!-- Node2->Node3 -->
<g id="edge3" class="edge">
<title>Node2->Node3</title>
-<path fill="none" stroke="#191970" d="M1542.6746,-548.2455C1529.7548,-539.7337 1510.7873,-527.2375 1495.8252,-517.3801"/>
-<polygon fill="#191970" stroke="#191970" points="1497.3928,-514.2216 1487.1166,-511.6427 1493.5417,-520.067 1497.3928,-514.2216"/>
+<path fill="none" stroke="#191970" d="M1528,-548.2455C1528,-540.9382 1528,-530.6944 1528,-521.7046"/>
+<polygon fill="#191970" stroke="#191970" points="1531.5001,-521.6426 1528,-511.6427 1524.5001,-521.6427 1531.5001,-521.6426"/>
</g>
<!-- Node2->Node8 -->
-<g id="edge115" class="edge">
+<g id="edge114" class="edge">
<title>Node2->Node8</title>
-<path fill="none" stroke="#191970" d="M1516.9418,-557.5981C1351.8786,-555.0555 733.2436,-536.43 608.4804,-400 541.0088,-326.219 532.3998,-272.9923 566.4804,-179 576.5662,-151.1839 579.6324,-141.7996 602.4804,-123 621.2677,-107.5417 645.8945,-96.7937 667.6222,-89.5981"/>
-<polygon fill="#191970" stroke="#191970" points="668.8773,-92.8726 677.3698,-86.5379 666.7805,-86.194 668.8773,-92.8726"/>
+<path fill="none" stroke="#191970" d="M1487.4299,-555.1064C1343.0467,-544.4767 854.9213,-505.6097 707,-456 659.8038,-440.1714 642.3989,-437.7934 610,-400 589.2112,-375.7498 589.8782,-364.3958 584,-333 566.584,-239.98 633.6704,-136.1517 663.919,-95.1015"/>
+<polygon fill="#191970" stroke="#191970" points="666.8779,-96.9905 670.0975,-86.8967 661.286,-92.7796 666.8779,-96.9905"/>
</g>
<!-- Node2->Node14 -->
-<g id="edge116" class="edge">
+<g id="edge115" class="edge">
<title>Node2->Node14</title>
-<path fill="none" stroke="#191970" d="M1597.9939,-557.5525C1743.9492,-555.6654 2242.4069,-546.7652 2397.4804,-512 2603.4857,-465.8167 2656.4624,-441.644 2837.4804,-333 2886.5895,-303.5256 2936.4804,-307.7752 2936.4804,-250.5 2936.4804,-250.5 2936.4804,-250.5 2936.4804,-133 2936.4804,-98.1936 2928.2551,-58.3226 2922.6284,-35.1197"/>
-<polygon fill="#191970" stroke="#191970" points="2925.9574,-34.0065 2920.1223,-25.1633 2919.1692,-35.7152 2925.9574,-34.0065"/>
+<path fill="none" stroke="#191970" d="M1568.6442,-555.0917C1771.1763,-540.0517 2671.2795,-466.6029 2913,-333 2997.2539,-286.4314 2997.7495,-237.3227 3017,-143 3023.7831,-109.7644 3025.6409,-100.881 3024,-67 3023.494,-56.5527 3022.414,-44.9549 3021.3867,-35.4145"/>
+<polygon fill="#191970" stroke="#191970" points="3024.85,-34.8928 3020.2426,-25.3522 3017.8948,-35.6837 3024.85,-34.8928"/>
</g>
<!-- Node3->Node4 -->
<g id="edge4" class="edge">
<title>Node3->Node4</title>
-<path fill="none" stroke="#191970" d="M1422.8461,-497.8822C1304.4813,-488.0623 1005.3061,-463.2418 868.1507,-451.863"/>
-<polygon fill="#191970" stroke="#191970" points="868.4187,-448.3733 858.1635,-451.0344 867.8399,-455.3493 868.4187,-448.3733"/>
+<path fill="none" stroke="#191970" d="M1478.2352,-497.6524C1365.252,-487.7818 1089.0471,-463.6515 958.1012,-452.2116"/>
+<polygon fill="#191970" stroke="#191970" points="958.055,-448.6944 947.7883,-451.3107 957.4458,-455.6678 958.055,-448.6944"/>
</g>
<!-- Node5 -->
<g id="node6" class="node">
<title>Node5</title>
<g id="a_node6"><a xlink:href="structural__equal_8h.html" target="_top" xlink:title="Structural equality comparison. ">
-<polygon fill="#ffffff" stroke="#ff0000" points="1572.9804,-369.5 1572.9804,-399.5 1685.9804,-399.5 1685.9804,-369.5 1572.9804,-369.5"/>
-<text text-anchor="start" x="1580.9804" y="-387.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/structural</text>
-<text text-anchor="middle" x="1629.4804" y="-376.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_equal.h</text>
+<polygon fill="#ffffff" stroke="#ff0000" points="1516.5,-369.5 1516.5,-399.5 1629.5,-399.5 1629.5,-369.5 1516.5,-369.5"/>
+<text text-anchor="start" x="1524.5" y="-387.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/structural</text>
+<text text-anchor="middle" x="1573" y="-376.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_equal.h</text>
</a>
</g>
</g>
<!-- Node3->Node5 -->
-<g id="edge106" class="edge">
+<g id="edge105" class="edge">
<title>Node3->Node5</title>
-<path fill="none" stroke="#191970" d="M1502.4962,-492.3866C1523.4857,-484.7078 1551.4184,-472.4971 1572.4804,-456 1589.5095,-442.6618 1604.5588,-423.2665 1614.9116,-408.0407"/>
-<polygon fill="#191970" stroke="#191970" points="1617.8697,-409.9124 1620.4439,-399.6351 1612.0225,-406.064 1617.8697,-409.9124"/>
+<path fill="none" stroke="#191970" d="M1532.8073,-492.2576C1537.2725,-483.0332 1543.9704,-468.7382 1549,-456 1555.0162,-440.7631 1560.8842,-423.3423 1565.3118,-409.5112"/>
+<polygon fill="#191970" stroke="#191970" points="1568.7268,-410.3195 1568.3956,-399.7298 1562.0507,-408.2147 1568.7268,-410.3195"/>
</g>
<!-- Node16 -->
<g id="node10" class="node">
<title>Node16</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="2042.9804,-6 2042.9804,-25 2087.9804,-25 2087.9804,-6 2042.9804,-6"/>
-<text text-anchor="middle" x="2065.4804" y="-13" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">utility</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="2289.5,-6 2289.5,-25 2334.5,-25 2334.5,-6 2289.5,-6"/>
+<text text-anchor="middle" x="2312" y="-13" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">utility</text>
</g>
<!-- Node3->Node16 -->
-<g id="edge113" class="edge">
+<g id="edge112" class="edge">
<title>Node3->Node16</title>
-<path fill="none" stroke="#191970" d="M1522.0349,-495.456C1681.4383,-473.2242 2175.3264,-394.7389 2256.4804,-266 2307.492,-185.0776 2252.7006,-122.0656 2174.4804,-67 2150.4786,-50.1032 2120.0614,-36.2633 2097.4614,-27.2022"/>
-<polygon fill="#191970" stroke="#191970" points="2098.5759,-23.88 2087.9879,-23.499 2096.0274,-30.3996 2098.5759,-23.88"/>
+<path fill="none" stroke="#191970" d="M1577.5598,-495.5187C1678.0794,-481.8772 1912.7978,-447.4988 2106,-400 2201.5468,-376.5098 2224.5669,-366.721 2317,-333 2390.1961,-306.2969 2426.3948,-323.4753 2479,-266 2505.8561,-236.6577 2516.4913,-215.1975 2500,-179 2467.0029,-106.5729 2384.3188,-53.7958 2340.4186,-29.8375"/>
+<polygon fill="#191970" stroke="#191970" points="2341.8865,-26.6536 2331.4164,-25.0327 2338.5904,-32.8291 2341.8865,-26.6536"/>
</g>
<!-- Node18 -->
<g id="node11" class="node">
<title>Node18</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="2169.9804,-179.5 2169.9804,-198.5 2216.9804,-198.5 2216.9804,-179.5 2169.9804,-179.5"/>
-<text text-anchor="middle" x="2193.4804" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">vector</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="2443.5,-179.5 2443.5,-198.5 2490.5,-198.5 2490.5,-179.5 2443.5,-179.5"/>
+<text text-anchor="middle" x="2467" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">vector</text>
</g>
<!-- Node3->Node18 -->
-<g id="edge114" class="edge">
+<g id="edge113" class="edge">
<title>Node3->Node18</title>
-<path fill="none" stroke="#191970" d="M1498.4139,-492.3554C1523.3188,-483.0805 1561.9941,-468.6438 1595.4804,-456 1775.2509,-388.122 1819.4027,-369.0586 1999.4804,-302 2043.3558,-285.6614 2057.7332,-289.0578 2098.4804,-266 2128.4437,-249.0446 2158.5663,-222.6728 2176.6232,-205.618"/>
-<polygon fill="#191970" stroke="#191970" points="2179.2405,-207.957 2184.0267,-198.5048 2174.3907,-202.9092 2179.2405,-207.957"/>
+<path fill="none" stroke="#191970" d="M1559.107,-492.4887C1604.0641,-478.9285 1689.9364,-453.7215 1764,-436 1991.4376,-381.5801 2064.1514,-422.9931 2280,-333 2302.7551,-323.5128 2303.5674,-312.2265 2326,-302 2382.1117,-276.4199 2420.2965,-313.1857 2460,-266 2473.2572,-250.2445 2473.14,-225.7235 2470.9166,-208.5989"/>
+<polygon fill="#191970" stroke="#191970" points="2474.3455,-207.8805 2469.2707,-198.5801 2467.4381,-209.0153 2474.3455,-207.8805"/>
</g>
<!-- Node22 -->
<g id="node15" class="node">
<title>Node22</title>
<g id="a_node15"><a xlink:href="runtime_2memory_8h.html" target="_top" xlink:title="Runtime memory management. ">
-<polygon fill="#ffffff" stroke="#ff0000" points="948.9804,-123.5 948.9804,-142.5 1077.9804,-142.5 1077.9804,-123.5 948.9804,-123.5"/>
-<text text-anchor="middle" x="1013.4804" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/memory.h</text>
+<polygon fill="#ffffff" stroke="#ff0000" points="1190.5,-123.5 1190.5,-142.5 1319.5,-142.5 1319.5,-123.5 1190.5,-123.5"/>
+<text text-anchor="middle" x="1255" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/memory.h</text>
</a>
</g>
</g>
<!-- Node3->Node22 -->
-<g id="edge109" class="edge">
+<g id="edge108" class="edge">
<title>Node3->Node22</title>
-<path fill="none" stroke="#191970" d="M1422.9288,-495.8056C1290.875,-478.1807 935.8824,-423.1279 867.4804,-333 815.6417,-264.6962 933.2143,-181.4163 987.676,-147.9622"/>
-<polygon fill="#191970" stroke="#191970" points="989.6951,-150.8318 996.4493,-142.669 986.0789,-144.8381 989.6951,-150.8318"/>
+<path fill="none" stroke="#191970" d="M1478.2549,-494.4947C1405.1244,-482.1135 1265.6274,-453.3057 1158,-400 1113.9499,-378.1829 1091.8245,-377.0464 1070,-333 1063.883,-320.6546 1064.4414,-314.6067 1070,-302 1102.5386,-228.204 1184.9754,-172.659 1227.9718,-147.634"/>
+<polygon fill="#191970" stroke="#191970" points="1229.809,-150.6155 1236.7641,-142.6232 1226.343,-144.5338 1229.809,-150.6155"/>
</g>
<!-- Node3->Node8 -->
-<g id="edge110" class="edge">
+<g id="edge109" class="edge">
<title>Node3->Node8</title>
-<path fill="none" stroke="#191970" d="M1422.7205,-498.4271C1350.9719,-492.6936 1214.5469,-479.6116 1100.4804,-456 872.1585,-408.7376 735.2154,-463.6216 611.4804,-266 572.2501,-203.3438 655.8644,-125.8065 697.174,-92.9296"/>
-<polygon fill="#191970" stroke="#191970" points="699.5467,-95.518 705.2874,-86.6133 695.2466,-89.9945 699.5467,-95.518"/>
+<path fill="none" stroke="#191970" d="M1478.3237,-501.4654C1333.8343,-499.5103 915.6583,-490.908 784,-456 686.765,-430.219 635.3972,-424.6819 594,-333 568.7376,-277.0516 625.4461,-272.9348 666,-143 670.771,-127.7137 673.8787,-109.8082 675.7336,-96.5069"/>
+<polygon fill="#191970" stroke="#191970" points="679.2067,-96.939 677.0059,-86.5753 672.2635,-96.0494 679.2067,-96.939"/>
</g>
<!-- Node9 -->
<g id="node17" class="node">
<title>Node9</title>
<g id="a_node17"><a xlink:href="c__runtime__api_8h.html" target="_top" xlink:title="tvm/runtime/c_runtime\l_api.h">
-<polygon fill="#ffffff" stroke="#ff0000" points="298.9804,-.5 298.9804,-30.5 427.9804,-30.5 427.9804,-.5 298.9804,-.5"/>
-<text text-anchor="start" x="306.9804" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/c_runtime</text>
-<text text-anchor="middle" x="363.4804" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_api.h</text>
+<polygon fill="#ffffff" stroke="#ff0000" points="321.5,-.5 321.5,-30.5 450.5,-30.5 450.5,-.5 321.5,-.5"/>
+<text text-anchor="start" x="329.5" y="-18.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/c_runtime</text>
+<text text-anchor="middle" x="386" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_api.h</text>
</a>
</g>
</g>
<!-- Node3->Node9 -->
-<g id="edge108" class="edge">
+<g id="edge107" class="edge">
<title>Node3->Node9</title>
-<path fill="none" stroke="#191970" d="M1422.7269,-500.5582C1248.5996,-495.3527 672.7936,-476.7378 592.4804,-456 437.2892,-415.9279 202.342,-289.9983 46.4804,-143 19.007,-117.0889 -16.2321,-95.5563 8.4804,-67 26.5369,-46.1351 189.7351,-29.4769 288.4587,-21.2009"/>
-<polygon fill="#191970" stroke="#191970" points="289.0114,-24.6672 298.6885,-20.354 288.4337,-17.6911 289.0114,-24.6672"/>
+<path fill="none" stroke="#191970" d="M1478.3911,-500.5605C1312.0992,-495.55 780.7045,-477.9762 707,-456 504.4502,-395.6066 495.2101,-279.0749 407,-87 400.2019,-72.1972 395.011,-54.6601 391.4908,-40.6588"/>
+<polygon fill="#191970" stroke="#191970" points="394.8508,-39.6598 389.1256,-30.7451 388.0419,-41.2843 394.8508,-39.6598"/>
</g>
<!-- Node3->Node14 -->
-<g id="edge111" class="edge">
+<g id="edge110" class="edge">
<title>Node3->Node14</title>
-<path fill="none" stroke="#191970" d="M1522.1265,-498.6436C1731.085,-484.0485 2539.2191,-422.777 2776.4804,-333 2837.7004,-309.8351 2898.4804,-315.956 2898.4804,-250.5 2898.4804,-250.5 2898.4804,-250.5 2898.4804,-133 2898.4804,-98.1936 2906.7057,-58.3226 2912.3325,-35.1197"/>
-<polygon fill="#191970" stroke="#191970" points="2915.7917,-35.7152 2914.8386,-25.1633 2909.0034,-34.0065 2915.7917,-35.7152"/>
+<path fill="none" stroke="#191970" d="M1577.6452,-495.8923C1648.9713,-487.1102 1784.5713,-470.3878 1900,-456 1903.3997,-455.5762 2875.9789,-334.6158 2879,-333 2991.5367,-272.811 3013.6912,-94.8141 3017.9857,-35.2049"/>
+<polygon fill="#191970" stroke="#191970" points="3021.4791,-35.4179 3018.6129,-25.2181 3014.4929,-34.9791 3021.4791,-35.4179"/>
</g>
<!-- Node3->Node15 -->
-<g id="edge112" class="edge">
+<g id="edge111" class="edge">
<title>Node3->Node15</title>
-<path fill="none" stroke="#191970" d="M1422.7423,-500.2367C1278.5403,-494.9171 861.9973,-478.0301 727.4804,-456 633.0028,-440.5272 600.5866,-450.8647 519.4804,-400 412.1751,-332.7049 446.8427,-249.5357 452.4804,-123 453.5926,-98.0369 451.1885,-91.1828 457.4804,-67 460.445,-55.6059 465.6983,-43.5544 470.4365,-33.9765"/>
-<polygon fill="#191970" stroke="#191970" points="473.5829,-35.5118 475.0703,-25.0219 467.3659,-32.2947 473.5829,-35.5118"/>
+<path fill="none" stroke="#191970" d="M1478.3375,-500.891C1326.841,-497.216 877.2843,-484.0439 818,-456 712.2444,-405.9733 623.7097,-340.1486 675,-235 682.1929,-220.254 870.228,-79.3643 934.6127,-31.3986"/>
+<polygon fill="#191970" stroke="#191970" points="936.9139,-34.0488 942.8453,-25.2699 932.7339,-28.4339 936.9139,-34.0488"/>
</g>
<!-- Node34 -->
<g id="node23" class="node">
<title>Node34</title>
<g id="a_node23"><a xlink:href="structural__hash_8h.html" target="_top" xlink:title="tvm/node/structural\l_hash.h">
-<polygon fill="#ffffff" stroke="#ff0000" points="1325.9804,-369.5 1325.9804,-399.5 1438.9804,-399.5 1438.9804,-369.5 1325.9804,-369.5"/>
-<text text-anchor="start" x="1333.9804" y="-387.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/structural</text>
-<text text-anchor="middle" x="1382.4804" y="-376.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_hash.h</text>
+<polygon fill="#ffffff" stroke="#ff0000" points="1340.5,-369.5 1340.5,-399.5 1453.5,-399.5 1453.5,-369.5 1340.5,-369.5"/>
+<text text-anchor="start" x="1348.5" y="-387.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/structural</text>
+<text text-anchor="middle" x="1397" y="-376.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_hash.h</text>
</a>
</g>
</g>
<!-- Node3->Node34 -->
-<g id="edge107" class="edge">
+<g id="edge106" class="edge">
<title>Node3->Node34</title>
-<path fill="none" stroke="#191970" d="M1460.2351,-492.2823C1449.6899,-483.5225 1434.4774,-469.9147 1423.4804,-456 1411.9462,-441.4056 1401.5536,-423.1957 1394.1176,-408.8058"/>
-<polygon fill="#191970" stroke="#191970" points="1397.0816,-406.909 1389.4642,-399.5452 1390.8268,-410.052 1397.0816,-406.909"/>
-</g>
-<!-- Node47 -->
-<g id="node37" class="node">
-<title>Node47</title>
-<g id="a_node37"><a xlink:href="repr__printer_8h.html" target="_top" xlink:title="Printer class to print repr string of each AST/IR nodes. ">
-<polygon fill="#ffffff" stroke="#ff0000" points="1431.9804,-436.5 1431.9804,-455.5 1562.9804,-455.5 1562.9804,-436.5 1431.9804,-436.5"/>
-<text text-anchor="middle" x="1497.4804" y="-443.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/repr_printer.h</text>
-</a>
-</g>
-</g>
-<!-- Node3->Node47 -->
-<g id="edge105" class="edge">
-<title>Node3->Node47</title>
-<path fill="none" stroke="#191970" d="M1476.8351,-492.2455C1480.2048,-484.6973 1484.9734,-474.0158 1489.0788,-464.8197"/>
-<polygon fill="#191970" stroke="#191970" points="1492.2951,-466.2009 1493.1757,-455.6427 1485.9031,-463.3473 1492.2951,-466.2009"/>
+<path fill="none" stroke="#191970" d="M1517.2797,-492.3845C1496.5015,-473.7476 1450.2713,-432.2815 1421.5601,-406.5291"/>
+<polygon fill="#191970" stroke="#191970" points="1423.7436,-403.7859 1413.9623,-399.7143 1419.0696,-408.9969 1423.7436,-403.7859"/>
</g>
<!-- Node4->Node5 -->
<g id="edge5" class="edge">
<title>Node4->Node5</title>
-<path fill="none" stroke="#191970" d="M858.0166,-442.7096C977.5105,-436.0432 1252.1046,-419.8737 1482.4804,-400 1508.6595,-397.7416 1537.4134,-394.8102 1562.4944,-392.1054"/>
-<polygon fill="#191970" stroke="#191970" points="1563.0727,-395.5632 1572.6357,-391.0026 1562.3159,-388.6042 1563.0727,-395.5632"/>
+<path fill="none" stroke="#191970" d="M947.6515,-442.3635C1052.7702,-435.7912 1275.538,-420.674 1463,-400 1476.9602,-398.4604 1491.8794,-396.5425 1506.0823,-394.586"/>
+<polygon fill="#191970" stroke="#191970" points="1506.8554,-398.012 1516.2736,-393.1594 1505.885,-391.0796 1506.8554,-398.012"/>
</g>
<!-- Node4->Node18 -->
<g id="edge104" class="edge">
<title>Node4->Node18</title>
-<path fill="none" stroke="#191970" d="M858.1244,-443.1954C997.222,-436.5225 1335.8942,-418.8445 1448.4804,-400 1501.1131,-391.1904 1512.2717,-380.0479 1564.4804,-369 1665.2761,-347.6706 1692.5905,-353.879 1793.4804,-333 1910.335,-308.8172 1942.553,-309.9877 2053.4804,-266 2096.6143,-248.8955 2143.1539,-221.2676 2170.2316,-204.172"/>
-<polygon fill="#191970" stroke="#191970" points="2172.4153,-206.9303 2178.9603,-198.5988 2168.6482,-201.0304 2172.4153,-206.9303"/>
+<path fill="none" stroke="#191970" d="M947.5085,-443.3068C1075.3446,-437.3952 1382.2684,-422.0311 1639,-400 1672.7478,-397.104 2213.3883,-345.1656 2245,-333 2266.9877,-324.5381 2266.601,-311.8561 2288,-302 2349.4873,-273.6798 2384.8094,-310.2971 2436,-266 2452.8056,-251.4575 2460.6133,-226.4925 2464.1653,-208.94"/>
+<polygon fill="#191970" stroke="#191970" points="2467.6876,-209.109 2465.9366,-198.6599 2460.7893,-207.9203 2467.6876,-209.109"/>
</g>
<!-- Node4->Node22 -->
<g id="edge69" class="edge">
<title>Node4->Node22</title>
-<path fill="none" stroke="#191970" d="M754.2237,-436.4706C730.807,-429.5356 702.6335,-418.1092 682.4804,-400 623.0981,-346.6402 562.4598,-298.012 611.4804,-235 651.2725,-183.8505 833.8608,-154.2018 938.7577,-141.1066"/>
-<polygon fill="#191970" stroke="#191970" points="939.471,-144.5454 948.9702,-139.8533 938.6183,-137.5975 939.471,-144.5454"/>
+<path fill="none" stroke="#191970" d="M893.7066,-436.3263C920.4618,-398.3797 1023.7645,-257.91 1140,-179 1161.5416,-164.3759 1188.4321,-153.295 1210.7927,-145.6771"/>
+<polygon fill="#191970" stroke="#191970" points="1211.9831,-148.9701 1220.3901,-142.5224 1209.7972,-142.3202 1211.9831,-148.9701"/>
</g>
<!-- Node4->Node8 -->
<g id="edge71" class="edge">
<title>Node4->Node8</title>
-<path fill="none" stroke="#191970" d="M736.6311,-436.6578C707.1673,-429.9707 672.4284,-418.7151 645.4804,-400 614.7428,-378.653 608.5258,-367.6875 594.4804,-333 568.6541,-269.2175 547.2807,-240.3334 578.4804,-179 600.4457,-135.8199 650.2065,-106.5929 684.0817,-90.8574"/>
-<polygon fill="#191970" stroke="#191970" points="685.911,-93.8732 693.607,-86.5915 683.0498,-87.4846 685.911,-93.8732"/>
+<path fill="none" stroke="#191970" d="M861.5299,-436.3883C822.4662,-420.466 747.6105,-385.1168 703,-333 682.0132,-308.4819 681.3461,-297.6434 675,-266 663.9559,-210.931 695.458,-198.5122 704,-143 705.3519,-134.2145 706.2613,-131.5964 704,-123 701.3888,-113.0734 696.0933,-103.0366 690.9588,-94.8822"/>
+<polygon fill="#191970" stroke="#191970" points="693.864,-92.9303 685.3819,-86.5817 688.0537,-96.8342 693.864,-92.9303"/>
</g>
<!-- Node4->Node9 -->
<g id="edge67" class="edge">
<title>Node4->Node9</title>
-<path fill="none" stroke="#191970" d="M736.7197,-438.4574C677.583,-430.3382 592.5164,-416.3627 563.4804,-400 425.6844,-322.3476 379.3344,-112.9703 367.1619,-40.5517"/>
-<polygon fill="#191970" stroke="#191970" points="370.5919,-39.8361 365.5497,-30.5181 363.6805,-40.9467 370.5919,-39.8361"/>
+<path fill="none" stroke="#191970" d="M837.4121,-436.4533C807.534,-429.3098 769.494,-417.6788 739,-400 595.3953,-316.7457 578.9464,-266.4685 468,-143 446.472,-119.0422 439.572,-113.9936 422,-87 412.2049,-71.953 403.128,-53.8844 396.5389,-39.6757"/>
+<polygon fill="#191970" stroke="#191970" points="399.7138,-38.2021 392.3964,-30.5401 393.3386,-41.093 399.7138,-38.2021"/>
</g>
<!-- Node4->Node14 -->
<g id="edge102" class="edge">
<title>Node4->Node14</title>
-<path fill="none" stroke="#191970" d="M857.999,-444.1597C1178.1064,-434.216 2662.3577,-385.4115 2743.4804,-333 2855.8689,-260.3885 2816.9949,-182.5392 2884.4804,-67 2891.0555,-55.743 2898.9135,-43.4392 2905.3118,-33.6813"/>
-<polygon fill="#191970" stroke="#191970" points="2908.3092,-35.494 2910.9095,-25.2232 2902.4718,-31.6307 2908.3092,-35.494"/>
+<path fill="none" stroke="#191970" d="M947.6455,-443.337C1094.5947,-436.8015 1481.3785,-419.105 1804,-400 2005.8939,-388.0443 2056.1914,-382.3192 2258,-369 2407.9981,-359.1003 2924,-400.8245 2924,-250.5 2924,-250.5 2924,-250.5 2924,-133 2924,-88.262 2966.161,-50.9849 2994.3091,-31.0828"/>
+<polygon fill="#191970" stroke="#191970" points="2996.5884,-33.7652 3002.8845,-25.2441 2992.6488,-27.979 2996.5884,-33.7652"/>
</g>
<!-- Node4->Node15 -->
<g id="edge103" class="edge">
<title>Node4->Node15</title>
-<path fill="none" stroke="#191970" d="M739.3101,-436.4192C691.9806,-427.7979 629.6805,-414.316 608.4804,-400 517.4076,-338.5005 498.091,-303.6268 464.4804,-199 446.0352,-141.5817 463.4495,-69.1764 473.8996,-35.026"/>
-<polygon fill="#191970" stroke="#191970" points="477.2816,-35.9408 476.9935,-25.3499 470.6141,-33.8089 477.2816,-35.9408"/>
+<path fill="none" stroke="#191970" d="M888.5708,-436.1995C897.386,-381.2007 940.7411,-110.7022 952.8523,-35.1388"/>
+<polygon fill="#191970" stroke="#191970" points="956.3348,-35.5263 954.4616,-25.0984 949.423,-34.4184 956.3348,-35.5263"/>
</g>
<!-- Node33 -->
<g id="node22" class="node">
<title>Node33</title>
<g id="a_node22"><a xlink:href="data__type_8h.html" target="_top" xlink:title="tvm/runtime/data_type.h">
-<polygon fill="#ffffff" stroke="#000000" points="620.4804,-241 620.4804,-260 758.4804,-260 758.4804,-241 620.4804,-241"/>
-<text text-anchor="middle" x="689.4804" y="-248" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/data_type.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="684,-241 684,-260 822,-260 822,-241 684,-241"/>
+<text text-anchor="middle" x="753" y="-248" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/data_type.h</text>
</a>
</g>
</g>
<!-- Node4->Node33 -->
<g id="edge68" class="edge">
<title>Node4->Node33</title>
-<path fill="none" stroke="#191970" d="M792.1247,-436.3051C775.1381,-405.5563 722.2674,-309.8505 699.7753,-269.1357"/>
-<polygon fill="#191970" stroke="#191970" points="702.7677,-267.3141 694.8685,-260.2534 696.6404,-270.699 702.7677,-267.3141"/>
+<path fill="none" stroke="#191970" d="M877.36,-436.1943C868.7007,-427.1589 855.8682,-413.1922 846,-400 812.5882,-355.3338 779.5545,-298.3816 763.3373,-269.355"/>
+<polygon fill="#191970" stroke="#191970" points="766.238,-267.3686 758.3293,-260.3185 760.1154,-270.7617 766.238,-267.3686"/>
</g>
<!-- Node4->Node34 -->
<g id="edge34" class="edge">
<title>Node4->Node34</title>
-<path fill="none" stroke="#191970" d="M858.1567,-443.4246C953.5447,-438.7602 1144.48,-426.9383 1315.7496,-399.9637"/>
-<polygon fill="#191970" stroke="#191970" points="1316.3775,-403.4079 1325.7009,-398.3756 1315.2743,-396.4954 1316.3775,-403.4079"/>
+<path fill="none" stroke="#191970" d="M947.8005,-439.7354C1027.0951,-431.4184 1170.0813,-415.918 1292,-400 1304.399,-398.3812 1317.6025,-396.5314 1330.3128,-394.687"/>
+<polygon fill="#191970" stroke="#191970" points="1330.9883,-398.1255 1340.3756,-393.2134 1329.974,-391.1993 1330.9883,-398.1255"/>
</g>
<!-- Node35 -->
<g id="node24" class="node">
<title>Node35</title>
<g id="a_node24"><a xlink:href="ndarray_8h.html" target="_top" xlink:title="A device-independent managed NDArray abstraction. ">
-<polygon fill="#ffffff" stroke="#ff0000" points="974.9804,-308 974.9804,-327 1099.9804,-327 1099.9804,-308 974.9804,-308"/>
-<text text-anchor="middle" x="1037.4804" y="-315" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/ndarray.h</text>
+<polygon fill="#ffffff" stroke="#ff0000" points="1290.5,-308 1290.5,-327 1415.5,-327 1415.5,-308 1290.5,-308"/>
+<text text-anchor="middle" x="1353" y="-315" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/ndarray.h</text>
</a>
</g>
</g>
<!-- Node4->Node35 -->
<g id="edge70" class="edge">
<title>Node4->Node35</title>
-<path fill="none" stroke="#191970" d="M815.5194,-436.3416C857.4063,-413.9147 961.431,-358.2181 1010.739,-331.8178"/>
-<polygon fill="#191970" stroke="#191970" points="1012.3957,-334.901 1019.5595,-327.0952 1009.0915,-328.7298 1012.3957,-334.901"/>
+<path fill="none" stroke="#191970" d="M919.9006,-436.4381C970.8414,-421.6816 1071.9128,-392.5958 1158,-369 1208.481,-355.1636 1266.4978,-339.9258 1306.0366,-329.6413"/>
+<polygon fill="#191970" stroke="#191970" points="1307.1263,-332.9744 1315.9248,-327.0723 1305.366,-326.1994 1307.1263,-332.9744"/>
</g>
<!-- Node4->Node41 -->
<g id="edge72" class="edge">
<title>Node4->Node41</title>
-<path fill="none" stroke="#191970" d="M858.4392,-437.7058C942.4639,-426.2733 1093.4122,-405.7349 1181.1562,-393.7963"/>
-<polygon fill="#191970" stroke="#191970" points="1181.8364,-397.2361 1191.2732,-392.4198 1180.8926,-390.3 1181.8364,-397.2361"/>
+<path fill="none" stroke="#191970" d="M942.8518,-436.4684C995.5446,-427.3931 1076.7184,-413.1988 1147,-400 1150.1888,-399.4012 1153.4494,-398.7816 1156.7418,-398.1503"/>
+<polygon fill="#191970" stroke="#191970" points="1157.5206,-401.5646 1166.6742,-396.2296 1156.1916,-394.6919 1157.5206,-401.5646"/>
</g>
<!-- Node5->Node32 -->
<g id="edge6" class="edge">
<title>Node5->Node32</title>
-<path fill="none" stroke="#191970" d="M1685.9836,-371.7539C1690.8774,-370.7747 1695.7605,-369.8415 1700.4804,-369 1729.0669,-363.9033 1898.3693,-340.848 1998.1114,-327.3753"/>
-<polygon fill="#191970" stroke="#191970" points="1998.6915,-330.8288 2008.1333,-326.0222 1997.7549,-323.8918 1998.6915,-330.8288"/>
+<path fill="none" stroke="#191970" d="M1629.7167,-378.1666C1738.8566,-365.9793 1978.0005,-339.2749 2099.304,-325.7294"/>
+<polygon fill="#191970" stroke="#191970" points="2100.0542,-329.1675 2109.604,-324.5792 2099.2773,-322.2107 2100.0542,-329.1675"/>
</g>
<!-- Node5->Node14 -->
<g id="edge33" class="edge">
<title>Node5->Node14</title>
-<path fill="none" stroke="#191970" d="M1686.2909,-371.1563C1691.0727,-370.3201 1695.8487,-369.5823 1700.4804,-369 1756.8496,-361.9127 2675.1115,-364.3672 2722.4804,-333 2729.2776,-328.499 2841.2928,-73.2888 2846.4804,-67 2858.7578,-52.1164 2876.2743,-39.3502 2890.8365,-30.2513"/>
-<polygon fill="#191970" stroke="#191970" points="2892.6815,-33.226 2899.4484,-25.0737 2889.0746,-27.2267 2892.6815,-33.226"/>
+<path fill="none" stroke="#191970" d="M1629.7137,-382.1759C1868.8531,-372.3657 2782.7901,-334.7666 2786,-333 2854.2631,-295.4307 2886,-266.9186 2886,-189 2886,-189 2886,-189 2886,-133 2886,-101.7875 2888.9748,-90.0686 2910,-67 2930.478,-44.5318 2962.8079,-30.9807 2986.9482,-23.4609"/>
+<polygon fill="#191970" stroke="#191970" points="2988.1187,-26.7653 2996.7342,-20.599 2986.1538,-20.0467 2988.1187,-26.7653"/>
</g>
<!-- Node5->Node33 -->
<g id="edge28" class="edge">
<title>Node5->Node33</title>
-<path fill="none" stroke="#191970" d="M1572.9613,-372.0098C1505.5058,-357.079 1401.4673,-333.9654 1399.4804,-333 1379.8761,-323.4747 1381.7184,-310.0925 1361.4804,-302 1334.3504,-291.1516 940.3469,-265.8734 768.5728,-255.2922"/>
-<polygon fill="#191970" stroke="#191970" points="768.7726,-251.798 758.5766,-254.6776 768.343,-258.7848 768.7726,-251.798"/>
+<path fill="none" stroke="#191970" d="M1516.4364,-375.7208C1444.7207,-364.3758 1325.2094,-344.6962 1282,-333 1243.5878,-322.6023 1236.7065,-311.2419 1198,-302 1163.3613,-293.7294 950.4913,-270.9305 832.1618,-258.6264"/>
+<polygon fill="#191970" stroke="#191970" points="832.3711,-255.1294 822.0631,-257.5779 831.6482,-262.092 832.3711,-255.1294"/>
</g>
<!-- Node32->Node24 -->
<g id="edge7" class="edge">
<title>Node32->Node24</title>
-<path fill="none" stroke="#191970" d="M2134.6053,-306.9532C2160.7444,-299.5685 2189.3574,-287.0765 2208.4804,-266 2235.0129,-236.7572 2251.3943,-209.6335 2226.4804,-179 2204.4239,-151.8797 2105.787,-140.2193 2048.0125,-135.642"/>
-<polygon fill="#191970" stroke="#191970" points="2047.9625,-132.1284 2037.7279,-134.8676 2047.4369,-139.1086 2047.9625,-132.1284"/>
+<path fill="none" stroke="#191970" d="M2115.6954,-302.4995C2083.6495,-292.8266 2048.3141,-279.558 2039,-266 2031.1985,-254.6438 2030.438,-245.7944 2039,-235 2082.6807,-179.9306 2149.3193,-254.0694 2193,-199 2214.6417,-171.7157 2167.3818,-152.6848 2129.1722,-142.2929"/>
+<polygon fill="#191970" stroke="#191970" points="2129.759,-138.8296 2119.2012,-139.7143 2128.0063,-145.6066 2129.759,-138.8296"/>
</g>
<!-- Node29 -->
<g id="node9" class="node">
<title>Node29</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="2093.4804,-179.5 2093.4804,-198.5 2151.4804,-198.5 2151.4804,-179.5 2093.4804,-179.5"/>
-<text text-anchor="middle" x="2122.4804" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">memory</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="1258,-179.5 1258,-198.5 1316,-198.5 1316,-179.5 1258,-179.5"/>
+<text text-anchor="middle" x="1287" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">memory</text>
</g>
<!-- Node32->Node29 -->
<g id="edge8" class="edge">
<title>Node32->Node29</title>
-<path fill="none" stroke="#191970" d="M2060.6993,-302.3889C2049.8799,-285.1991 2036.4165,-256.7039 2048.4804,-235 2056.4383,-220.6832 2070.9033,-210.2479 2084.947,-202.972"/>
-<polygon fill="#191970" stroke="#191970" points="2086.5206,-206.0993 2094.0423,-198.6379 2083.5093,-199.7801 2086.5206,-206.0993"/>
+<path fill="none" stroke="#191970" d="M2109.8471,-314.6898C2009.1714,-309.4809 1806.3741,-296.0017 1637,-266 1583.1837,-256.4674 1571.3356,-246.9321 1518,-235 1436.3354,-216.7301 1412.4703,-217.6929 1326.2979,-198.937"/>
+<polygon fill="#191970" stroke="#191970" points="1326.8567,-195.4762 1316.3378,-196.7425 1325.3505,-202.3122 1326.8567,-195.4762"/>
</g>
<!-- Node32->Node16 -->
<g id="edge9" class="edge">
<title>Node32->Node16</title>
-<path fill="none" stroke="#191970" d="M2134.8785,-304.4156C2174.3584,-294.9773 2220.1001,-281.2523 2233.4804,-266 2259.2213,-236.6578 2261.2787,-214.6927 2245.4804,-179 2214.0267,-107.9372 2134.7397,-54.5344 2092.6717,-30.137"/>
-<polygon fill="#191970" stroke="#191970" points="2094.227,-26.9953 2083.8024,-25.1041 2090.7723,-33.0834 2094.227,-26.9953"/>
+<path fill="none" stroke="#191970" d="M2227.7423,-302.4202C2270.1111,-290.2774 2322.7701,-273.9653 2330,-266 2363.4838,-229.1104 2346.2819,-71.3764 2345,-67 2341.426,-54.798 2333.9763,-42.7442 2327.0582,-33.3692"/>
+<polygon fill="#191970" stroke="#191970" points="2329.6509,-31.003 2320.732,-25.2842 2324.1379,-35.3167 2329.6509,-31.003"/>
</g>
<!-- Node32->Node18 -->
<g id="edge10" class="edge">
<title>Node32->Node18</title>
-<path fill="none" stroke="#191970" d="M2134.7712,-304.5769C2155.9754,-296.9661 2177.503,-284.9426 2190.4804,-266 2201.9071,-249.321 2200.8882,-225.5004 2198.1356,-208.7825"/>
-<polygon fill="#191970" stroke="#191970" points="2201.4764,-207.6388 2196.0847,-198.5185 2194.6121,-209.0104 2201.4764,-207.6388"/>
+<path fill="none" stroke="#191970" d="M2236.1432,-304.6327C2302.6178,-290.9219 2398.7021,-270.5671 2406,-266 2429.0544,-251.5724 2447.0567,-225.2407 2457.355,-207.434"/>
+<polygon fill="#191970" stroke="#191970" points="2460.4922,-208.9934 2462.2669,-198.5482 2454.3659,-205.6068 2460.4922,-208.9934"/>
</g>
<!-- Node21 -->
<g id="node12" class="node">
<title>Node21</title>
<g id="a_node12"><a xlink:href="runtime_2container_2base_8h.html" target="_top" xlink:title="Base utilities for common POD(plain old data) container types. ">
-<polygon fill="#ffffff" stroke="#000000" points="1447.9804,-179.5 1447.9804,-198.5 1502.9804,-198.5 1502.9804,-179.5 1447.9804,-179.5"/>
-<text text-anchor="middle" x="1475.4804" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">./base.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1905.5,-179.5 1905.5,-198.5 1960.5,-198.5 1960.5,-179.5 1905.5,-179.5"/>
+<text text-anchor="middle" x="1933" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">./base.h</text>
</a>
</g>
</g>
<!-- Node32->Node21 -->
<g id="edge11" class="edge">
<title>Node32->Node21</title>
-<path fill="none" stroke="#191970" d="M2008.4586,-304.4905C1965.7545,-295.1327 1908.2525,-281.4713 1858.4804,-266 1821.6713,-254.5581 1814.7477,-244.8476 1777.4804,-235 1752.3211,-228.3518 1588.2275,-204.8593 1513.0226,-194.2584"/>
-<polygon fill="#191970" stroke="#191970" points="1513.4147,-190.7792 1503.0245,-192.8513 1512.439,-197.7109 1513.4147,-190.7792"/>
+<path fill="none" stroke="#191970" d="M2113.1785,-302.4895C2072.9881,-291.7593 2024.1726,-277.2821 2006,-266 1980.9452,-250.4453 1958.6782,-224.232 1945.5107,-206.7743"/>
+<polygon fill="#191970" stroke="#191970" points="1948.2186,-204.548 1939.4879,-198.546 1942.5701,-208.6826 1948.2186,-204.548"/>
</g>
<!-- Node21->Node24 -->
<g id="edge25" class="edge">
<title>Node21->Node24</title>
-<path fill="none" stroke="#191970" d="M1503.2504,-185.1438C1517.6322,-183.1981 1535.4856,-180.8622 1551.4804,-179 1702.6657,-161.3984 1882.6316,-144.3078 1963.0277,-136.8749"/>
-<polygon fill="#191970" stroke="#191970" points="1963.6064,-140.3365 1973.2428,-135.933 1962.9636,-133.3661 1963.6064,-140.3365"/>
+<path fill="none" stroke="#191970" d="M1959.4735,-179.3733C1984.8597,-170.1419 2023.3645,-156.1402 2051.422,-145.9375"/>
+<polygon fill="#191970" stroke="#191970" points="2052.6701,-149.2079 2060.8719,-142.5011 2050.2778,-142.6293 2052.6701,-149.2079"/>
</g>
<!-- Node21->Node16 -->
<g id="edge27" class="edge">
<title>Node21->Node16</title>
-<path fill="none" stroke="#191970" d="M1493.5802,-179.4709C1519.7893,-165.889 1570.0411,-140.6559 1614.4804,-123 1689.5866,-93.1602 1709.1385,-86.8468 1787.4804,-67 1875.1752,-44.7838 1980.6896,-27.8633 2032.9132,-20.1282"/>
-<polygon fill="#191970" stroke="#191970" points="2033.5377,-23.5741 2042.9248,-18.6615 2032.523,-16.6481 2033.5377,-23.5741"/>
+<path fill="none" stroke="#191970" d="M1936.1859,-179.409C1944.5564,-155.8441 1969.5715,-94.9948 2013,-67 2056.7056,-38.8266 2210.8448,-23.4773 2279.0793,-17.9291"/>
+<polygon fill="#191970" stroke="#191970" points="2279.5706,-21.4012 2289.2629,-17.1223 2279.0177,-14.4231 2279.5706,-21.4012"/>
</g>
<!-- Node7 -->
<g id="node13" class="node">
<title>Node7</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="1362.9804,-123.5 1362.9804,-142.5 1451.9804,-142.5 1451.9804,-123.5 1362.9804,-123.5"/>
-<text text-anchor="middle" x="1407.4804" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">dmlc/logging.h</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="1547.5,-123.5 1547.5,-142.5 1636.5,-142.5 1636.5,-123.5 1547.5,-123.5"/>
+<text text-anchor="middle" x="1592" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">dmlc/logging.h</text>
</g>
<!-- Node21->Node7 -->
<g id="edge12" class="edge">
<title>Node21->Node7</title>
-<path fill="none" stroke="#191970" d="M1463.6357,-179.2455C1453.5925,-170.9746 1438.981,-158.9416 1427.1801,-149.2232"/>
-<polygon fill="#191970" stroke="#191970" points="1429.1337,-146.298 1419.1894,-142.6427 1424.6837,-151.7015 1429.1337,-146.298"/>
+<path fill="none" stroke="#191970" d="M1905.1788,-180.8913C1902.4269,-180.204 1899.6664,-179.5591 1897,-179 1789.8255,-156.526 1758.9427,-160.9816 1646.7825,-143.0599"/>
+<polygon fill="#191970" stroke="#191970" points="1647.196,-139.5813 1636.7645,-141.4346 1646.075,-146.4909 1647.196,-139.5813"/>
</g>
<!-- Node13 -->
<g id="node14" class="node">
<title>Node13</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="1043.9804,-6 1043.9804,-25 1168.9804,-25 1168.9804,-6 1043.9804,-6"/>
-<text text-anchor="middle" x="1106.4804" y="-13" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/logging.h</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="541.5,-6 541.5,-25 666.5,-25 666.5,-6 541.5,-6"/>
+<text text-anchor="middle" x="604" y="-13" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/logging.h</text>
</g>
<!-- Node21->Node13 -->
<g id="edge13" class="edge">
<title>Node21->Node13</title>
-<path fill="none" stroke="#191970" d="M1479.8128,-179.3853C1485.3563,-165.47 1492.8556,-139.5737 1480.4804,-123 1444.467,-74.7684 1277.3814,-41.5967 1179.1434,-25.8883"/>
-<polygon fill="#191970" stroke="#191970" points="1179.4042,-22.3863 1168.9812,-24.2863 1178.3141,-29.3009 1179.4042,-22.3863"/>
+<path fill="none" stroke="#191970" d="M1915.9868,-179.3763C1888.6909,-164.4481 1832.9939,-136.0709 1782,-123 1340.2229,-9.7626 1214.0281,-73.9907 760,-31 732.8505,-28.4293 703.0552,-25.4964 676.8444,-22.8787"/>
+<polygon fill="#191970" stroke="#191970" points="676.9094,-19.3678 666.6105,-21.8545 676.2122,-26.333 676.9094,-19.3678"/>
</g>
<!-- Node21->Node22 -->
<g id="edge14" class="edge">
<title>Node21->Node22</title>
-<path fill="none" stroke="#191970" d="M1447.9638,-185.6646C1378.3107,-177.2218 1193.1284,-154.7755 1088.3906,-142.08"/>
-<polygon fill="#191970" stroke="#191970" points="1088.5507,-138.5739 1078.2022,-140.8451 1087.7083,-145.523 1088.5507,-138.5739"/>
+<path fill="none" stroke="#191970" d="M1905.2168,-180.6924C1902.4572,-180.0453 1899.6845,-179.4642 1897,-179 1789.9414,-160.4862 1475.096,-143.5569 1329.9065,-136.4872"/>
+<polygon fill="#191970" stroke="#191970" points="1329.9088,-132.9833 1319.7512,-135.9954 1329.5701,-139.9752 1329.9088,-132.9833"/>
</g>
<!-- Node21->Node8 -->
<g id="edge24" class="edge">
<title>Node21->Node8</title>
-<path fill="none" stroke="#191970" d="M1476.5292,-179.1164C1477.4814,-164.3671 1476.7735,-136.8526 1460.4804,-123 1435.0999,-101.4211 967.9071,-84.7253 788.1529,-79.0747"/>
-<polygon fill="#191970" stroke="#191970" points="788.1191,-75.572 778.0147,-78.7581 787.9006,-82.5686 788.1191,-75.572"/>
+<path fill="none" stroke="#191970" d="M1905.4988,-181.2584C1876.4431,-172.8238 1829.4603,-158.4547 1790,-143 1770.3744,-135.3136 1767.4772,-127.9933 1747,-123 1650.5672,-99.4852 970.0152,-83.171 747.9152,-78.4248"/>
+<polygon fill="#191970" stroke="#191970" points="747.7618,-74.9208 737.6897,-78.2075 747.6131,-81.9193 747.7618,-74.9208"/>
</g>
<!-- Node25 -->
<g id="node21" class="node">
<title>Node25</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="1622.9804,-123.5 1622.9804,-142.5 1705.9804,-142.5 1705.9804,-123.5 1622.9804,-123.5"/>
-<text text-anchor="middle" x="1664.4804" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">initializer_list</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="1654.5,-123.5 1654.5,-142.5 1737.5,-142.5 1737.5,-123.5 1654.5,-123.5"/>
+<text text-anchor="middle" x="1696" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">initializer_list</text>
</g>
<!-- Node21->Node25 -->
<g id="edge26" class="edge">
<title>Node21->Node25</title>
-<path fill="none" stroke="#191970" d="M1503.368,-180.737C1534.8106,-171.4207 1586.2711,-156.1731 1622.5589,-145.4212"/>
-<polygon fill="#191970" stroke="#191970" points="1623.7686,-148.7132 1632.3623,-142.5165 1621.78,-142.0016 1623.7686,-148.7132"/>
+<path fill="none" stroke="#191970" d="M1905.1257,-181.1209C1902.3845,-180.3871 1899.6411,-179.6686 1897,-179 1846.4528,-166.2027 1788.3562,-153.0766 1747.6645,-144.1386"/>
+<polygon fill="#191970" stroke="#191970" points="1748.2265,-140.6788 1737.7093,-141.9592 1746.7294,-147.5168 1748.2265,-140.6788"/>
</g>
<!-- Node22->Node16 -->
<g id="edge23" class="edge">
<title>Node22->Node16</title>
-<path fill="none" stroke="#191970" d="M1035.2357,-123.3886C1070.4274,-108.3419 1142.3797,-79.64 1206.4804,-67 1369.1725,-34.9188 1895.6224,-19.7508 2032.6371,-16.2868"/>
-<polygon fill="#191970" stroke="#191970" points="2032.9873,-19.7792 2042.8969,-16.031 2032.8127,-12.7814 2032.9873,-19.7792"/>
+<path fill="none" stroke="#191970" d="M1288.8058,-123.4144C1342.2791,-108.6709 1449.874,-80.7008 1543,-67 1825.848,-25.387 2172.2312,-17.3617 2279.0663,-15.8454"/>
+<polygon fill="#191970" stroke="#191970" points="2279.2609,-19.3432 2289.2137,-15.7111 2279.1682,-12.3438 2279.2609,-19.3432"/>
</g>
<!-- Node22->Node8 -->
<g id="edge15" class="edge">
<title>Node22->Node8</title>
-<path fill="none" stroke="#191970" d="M963.1033,-123.4369C911.9799,-113.7321 832.8349,-98.708 778.4952,-88.3926"/>
-<polygon fill="#191970" stroke="#191970" points="779.1271,-84.9502 768.6498,-86.5237 777.8215,-91.8273 779.1271,-84.9502"/>
+<path fill="none" stroke="#191970" d="M1190.4903,-126.7391C1081.2695,-116.1388 860.8307,-94.7444 748.0019,-83.7939"/>
+<polygon fill="#191970" stroke="#191970" points="748.0475,-80.282 737.7561,-82.7996 747.3712,-87.2493 748.0475,-80.282"/>
</g>
<!-- Node22->Node15 -->
<g id="edge22" class="edge">
<title>Node22->Node15</title>
-<path fill="none" stroke="#191970" d="M984.7375,-123.474C941.926,-109.553 858.696,-83.5164 786.4804,-67 694.5808,-45.9816 585.3375,-29.6341 525.4031,-21.3929"/>
-<polygon fill="#191970" stroke="#191970" points="525.5361,-17.8787 515.1551,-19.9968 524.5912,-24.8147 525.5361,-17.8787"/>
+<path fill="none" stroke="#191970" d="M1244.114,-123.4403C1227.2475,-109.1142 1193.2652,-82.1449 1160,-67 1107.9428,-43.2995 1043.2439,-29.3245 1000.8455,-22.0447"/>
+<polygon fill="#191970" stroke="#191970" points="1001.2712,-18.5675 990.8324,-20.3789 1000.1224,-25.4726 1001.2712,-18.5675"/>
</g>
<!-- Node8->Node16 -->
<g id="edge20" class="edge">
<title>Node8->Node16</title>
-<path fill="none" stroke="#191970" d="M778.0974,-74.2781C1010.9944,-63.6447 1855.3006,-25.0962 2032.6536,-16.9988"/>
-<polygon fill="#191970" stroke="#191970" points="2033.0746,-20.4833 2042.9046,-16.5307 2032.7553,-13.4906 2033.0746,-20.4833"/>
+<path fill="none" stroke="#191970" d="M737.6056,-74.7566C1003.482,-64.7496 2076.9893,-24.3453 2279.1,-16.7383"/>
+<polygon fill="#191970" stroke="#191970" points="2279.4991,-20.2258 2289.3604,-16.3521 2279.2358,-13.2308 2279.4991,-20.2258"/>
</g>
<!-- Node8->Node13 -->
<g id="edge17" class="edge">
<title>Node8->Node13</title>
-<path fill="none" stroke="#191970" d="M778.2551,-67.5254C847.7906,-56.5037 962.4516,-38.3293 1036.1937,-26.6408"/>
-<polygon fill="#191970" stroke="#191970" points="1036.9898,-30.0584 1046.3185,-25.036 1035.8938,-23.1447 1036.9898,-30.0584"/>
+<path fill="none" stroke="#191970" d="M666.4375,-67.3906C654.9921,-57.8786 637.2527,-43.1357 623.603,-31.7917"/>
+<polygon fill="#191970" stroke="#191970" points="625.6533,-28.9447 615.7255,-25.2449 621.1792,-34.3283 625.6533,-28.9447"/>
</g>
<!-- Node8->Node9 -->
<g id="edge16" class="edge">
<title>Node8->Node9</title>
-<path fill="none" stroke="#191970" d="M658.9207,-68.1258C603.9806,-59.7491 519.9087,-46.4503 438.269,-31.2933"/>
-<polygon fill="#191970" stroke="#191970" points="438.7999,-27.832 428.3271,-29.4359 437.5142,-34.7129 438.7999,-27.832"/>
+<path fill="none" stroke="#191970" d="M632.6954,-67.4581C586.7756,-57.7866 514.9445,-42.6578 460.6108,-31.2143"/>
+<polygon fill="#191970" stroke="#191970" points="461.186,-27.7587 450.6794,-29.1225 459.7433,-34.6084 461.186,-27.7587"/>
</g>
<!-- Node8->Node14 -->
<g id="edge18" class="edge">
<title>Node8->Node14</title>
-<path fill="none" stroke="#191970" d="M778.1528,-75.3311C1101.9156,-66.2764 2642.0765,-23.2023 2885.1461,-16.4043"/>
-<polygon fill="#191970" stroke="#191970" points="2885.5271,-19.8951 2895.4253,-16.1168 2885.3313,-12.8978 2885.5271,-19.8951"/>
+<path fill="none" stroke="#191970" d="M737.6281,-75.4335C1074.5676,-66.5818 2734.0111,-22.9869 2986.7113,-16.3483"/>
+<polygon fill="#191970" stroke="#191970" points="2986.8169,-19.8468 2996.7215,-16.0853 2986.633,-12.8492 2986.8169,-19.8468"/>
</g>
<!-- Node8->Node15 -->
<g id="edge19" class="edge">
<title>Node8->Node15</title>
-<path fill="none" stroke="#191970" d="M681.5541,-67.4581C639.3119,-56.5426 570.1677,-38.6755 524.9871,-27.0007"/>
-<polygon fill="#191970" stroke="#191970" points="525.7269,-23.5769 515.1692,-24.4637 523.9755,-30.3543 525.7269,-23.5769"/>
+<path fill="none" stroke="#191970" d="M721.1325,-67.4581C772.7475,-56.0397 858.7483,-37.0143 911.1236,-25.4277"/>
+<polygon fill="#191970" stroke="#191970" points="912.1223,-28.7915 921.1302,-23.214 910.6103,-21.9567 912.1223,-28.7915"/>
</g>
<!-- Node17 -->
<g id="node20" class="node">
<title>Node17</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="756.4804,-6 756.4804,-25 806.4804,-25 806.4804,-6 756.4804,-6"/>
-<text text-anchor="middle" x="781.4804" y="-13" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">atomic</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="769,-6 769,-25 819,-25 819,-6 769,-6"/>
+<text text-anchor="middle" x="794" y="-13" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">atomic</text>
</g>
<!-- Node8->Node17 -->
<g id="edge21" class="edge">
<title>Node8->Node17</title>
-<path fill="none" stroke="#191970" d="M728.3242,-67.3906C737.8843,-58.0581 752.6027,-43.6902 764.1299,-32.4374"/>
-<polygon fill="#191970" stroke="#191970" points="766.787,-34.7348 771.4979,-25.2449 761.8972,-29.7257 766.787,-34.7348"/>
+<path fill="none" stroke="#191970" d="M696.125,-67.3906C715.1083,-57.3262 745.1373,-41.4057 766.9307,-29.8514"/>
+<polygon fill="#191970" stroke="#191970" points="768.8066,-32.8184 776.0023,-25.0419 765.5277,-26.6338 768.8066,-32.8184"/>
</g>
<!-- Node33->Node13 -->
<g id="edge30" class="edge">
<title>Node33->Node13</title>
-<path fill="none" stroke="#191970" d="M706.3438,-240.9967C770.9147,-204.6078 1002.7141,-73.9774 1080.683,-30.0381"/>
-<polygon fill="#191970" stroke="#191970" points="1082.4137,-33.0803 1089.4072,-25.1216 1078.977,-26.982 1082.4137,-33.0803"/>
+<path fill="none" stroke="#191970" d="M739.9762,-240.8157C710.3712,-217.9339 638.3224,-157.1462 609,-87 602.1162,-70.5322 601.3767,-50.0717 602.0573,-35.2626"/>
+<polygon fill="#191970" stroke="#191970" points="605.5546,-35.4233 602.7763,-25.1993 598.5724,-34.9244 605.5546,-35.4233"/>
</g>
<!-- Node33->Node9 -->
<g id="edge29" class="edge">
<title>Node33->Node9</title>
-<path fill="none" stroke="#191970" d="M659.0124,-240.9348C613.3318,-225.553 525.9086,-191.9959 464.4804,-143 427.1916,-113.2579 395.1407,-67.2592 377.6486,-39.4032"/>
-<polygon fill="#191970" stroke="#191970" points="380.5131,-37.38 372.2881,-30.7016 374.5532,-41.0515 380.5131,-37.38"/>
+<path fill="none" stroke="#191970" d="M737.3434,-240.9032C720.6169,-230.6232 693.3535,-213.7881 670,-199 577.8753,-140.6639 469.917,-70.373 417.6063,-36.19"/>
+<polygon fill="#191970" stroke="#191970" points="419.3854,-33.1716 409.1001,-30.6293 415.5551,-39.0307 419.3854,-33.1716"/>
</g>
<!-- Node33->Node14 -->
<g id="edge31" class="edge">
<title>Node33->Node14</title>
-<path fill="none" stroke="#191970" d="M757.6619,-240.9706C956.9288,-213.1385 1529.6399,-133.2846 1614.4804,-123 1851.8198,-94.2291 1911.2795,-87.446 2149.4804,-67 2434.6006,-42.5267 2780.1133,-22.9608 2885.4705,-17.2172"/>
-<polygon fill="#191970" stroke="#191970" points="2885.668,-20.7117 2895.4636,-16.6747 2885.2885,-13.722 2885.668,-20.7117"/>
+<path fill="none" stroke="#191970" d="M822.4253,-248.2418C1076.6752,-239.886 1943.5737,-210.6067 1970,-199 2013.7364,-179.7905 2002.9392,-143.6796 2046,-123 2218.0663,-40.3664 2837.3119,-19.9801 2986.498,-16.2246"/>
+<polygon fill="#191970" stroke="#191970" points="2986.8891,-19.7162 2996.8007,-15.9732 2986.7182,-12.7183 2986.8891,-19.7162"/>
</g>
<!-- Node33->Node15 -->
<g id="edge32" class="edge">
<title>Node33->Node15</title>
-<path fill="none" stroke="#191970" d="M681.0285,-240.9967C649.3234,-205.3474 537.1761,-79.2487 495.8871,-32.8233"/>
... 23555 lines suppressed ...