You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by tq...@apache.org on 2022/07/07 08:14:30 UTC
[tvm-site] branch asf-site updated: deploying docs (apache/tvm@40d242a3c8f9630223e5775c1f1bf23362c8850e)
This is an automated email from the ASF dual-hosted git repository.
tqchen pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/tvm-site.git
The following commit(s) were added to refs/heads/asf-site by this push:
new de83aacf6 deploying docs (apache/tvm@40d242a3c8f9630223e5775c1f1bf23362c8850e)
de83aacf6 is described below
commit de83aacf621b2cf2cf30af7b0eb79e2d422a330a
Author: tvm-bot <95...@users.noreply.github.com>
AuthorDate: Thu Jul 7 08:14:23 2022 +0000
deploying docs (apache/tvm@40d242a3c8f9630223e5775c1f1bf23362c8850e)
---
.../how_to/compile_models/from_darknet.rst.txt | 2 +-
.../how_to/compile_models/from_mxnet.rst.txt | 2 +-
.../how_to/compile_models/from_oneflow.rst.txt | 2 +-
.../how_to/compile_models/from_pytorch.rst.txt | 2 +-
.../how_to/compile_models/from_tensorflow.rst.txt | 2 +-
.../compile_models/sg_execution_times.rst.txt | 22 +-
.../deploy_models/deploy_model_on_android.rst.txt | 2 +-
.../deploy_object_detection_pytorch.rst.txt | 4 +-
.../deploy_models/deploy_prequantized.rst.txt | 6 +-
.../deploy_prequantized_tflite.rst.txt | 4 +-
.../how_to/deploy_models/deploy_quantized.rst.txt | 2 +-
.../deploy_models/deploy_ssd_gluoncv.rst.txt | 4 +-
.../deploy_models/sg_execution_times.rst.txt | 16 +-
.../extend_tvm/bring_your_own_datatypes.rst.txt | 4 +-
.../how_to/extend_tvm/sg_execution_times.rst.txt | 10 +-
.../how_to/extend_tvm/use_pass_instrument.rst.txt | 16 +-
.../optimize_operators/opt_conv_cuda.rst.txt | 2 +-
.../optimize_operators/opt_conv_tensorcore.rst.txt | 2 +-
.../how_to/optimize_operators/opt_gemm.rst.txt | 16 +-
.../optimize_operators/sg_execution_times.rst.txt | 8 +-
.../sg_execution_times.rst.txt | 14 +-
.../tune_conv2d_layer_cuda.rst.txt | 2658 ++++++---
.../tune_network_cuda.rst.txt | 2 +-
.../tune_network_x86.rst.txt | 4 +-
.../tune_sparse_x86.rst.txt | 157 +-
.../tune_with_autotvm/sg_execution_times.rst.txt | 6 +-
.../tune_with_autotvm/tune_conv2d_cuda.rst.txt | 34 +-
.../work_with_microtvm/micro_autotune.rst.txt | 16 +-
.../how_to/work_with_microtvm/micro_train.rst.txt | 16 +-
.../work_with_microtvm/sg_execution_times.rst.txt | 8 +-
.../work_with_relay/sg_execution_times.rst.txt | 6 +-
.../how_to/work_with_schedules/intrin_math.rst.txt | 2 +-
.../work_with_schedules/sg_execution_times.rst.txt | 18 +-
.../how_to/work_with_schedules/tensorize.rst.txt | 2 +-
.../tutorials/autotvm/sg_execution_times.rst.txt | 6 +-
.../frontend/deploy_classification.rst.txt | 2 +-
.../tutorials/frontend/deploy_detection.rst.txt | 2 +-
.../tutorials/frontend/sg_execution_times.rst.txt | 6 +-
.../tutorials/optimize/sg_execution_times.rst.txt | 6 +-
.../topic/vta/tutorials/sg_execution_times.rst.txt | 6 +-
.../tutorial/auto_scheduler_matmul_x86.rst.txt | 2 +-
docs/_sources/tutorial/autotvm_matmul_x86.rst.txt | 20 +-
docs/_sources/tutorial/autotvm_relay_x86.rst.txt | 54 +-
.../tutorial/cross_compilation_and_rpc.rst.txt | 2 +-
docs/_sources/tutorial/intro_topi.rst.txt | 2 +-
docs/_sources/tutorial/sg_execution_times.rst.txt | 22 +-
.../tutorial/tensor_expr_get_started.rst.txt | 45 +-
docs/commit_hash | 2 +-
docs/how_to/compile_models/from_darknet.html | 2 +-
docs/how_to/compile_models/from_mxnet.html | 2 +-
docs/how_to/compile_models/from_oneflow.html | 16 +-
docs/how_to/compile_models/from_pytorch.html | 6 +-
docs/how_to/compile_models/from_tensorflow.html | 2 +-
docs/how_to/compile_models/sg_execution_times.html | 34 +-
.../deploy_models/deploy_model_on_android.html | 2 +-
.../deploy_object_detection_pytorch.html | 60 +-
docs/how_to/deploy_models/deploy_prequantized.html | 6 +-
.../deploy_models/deploy_prequantized_tflite.html | 4 +-
docs/how_to/deploy_models/deploy_quantized.html | 2 +-
docs/how_to/deploy_models/deploy_ssd_gluoncv.html | 36 +-
docs/how_to/deploy_models/sg_execution_times.html | 16 +-
.../extend_tvm/bring_your_own_datatypes.html | 4 +-
docs/how_to/extend_tvm/sg_execution_times.html | 10 +-
docs/how_to/extend_tvm/use_pass_instrument.html | 16 +-
docs/how_to/optimize_operators/opt_conv_cuda.html | 2 +-
.../optimize_operators/opt_conv_tensorcore.html | 2 +-
docs/how_to/optimize_operators/opt_gemm.html | 16 +-
.../optimize_operators/sg_execution_times.html | 8 +-
.../sg_execution_times.html | 18 +-
.../tune_conv2d_layer_cuda.html | 2658 ++++++---
.../tune_with_autoscheduler/tune_network_cuda.html | 2 +-
.../tune_with_autoscheduler/tune_network_x86.html | 4 +-
.../tune_with_autoscheduler/tune_sparse_x86.html | 157 +-
.../tune_with_autotvm/sg_execution_times.html | 6 +-
.../how_to/tune_with_autotvm/tune_conv2d_cuda.html | 34 +-
docs/how_to/work_with_microtvm/micro_autotune.html | 16 +-
docs/how_to/work_with_microtvm/micro_train.html | 16 +-
.../work_with_microtvm/sg_execution_times.html | 8 +-
.../how_to/work_with_relay/sg_execution_times.html | 6 +-
docs/how_to/work_with_schedules/intrin_math.html | 2 +-
.../work_with_schedules/sg_execution_times.html | 18 +-
docs/how_to/work_with_schedules/tensorize.html | 2 +-
docs/reference/api/doxygen/annotated.html | 764 +--
docs/reference/api/doxygen/array_8h__dep__incl.svg | 148 +-
docs/reference/api/doxygen/c__runtime__api_8h.html | 2 +-
.../api/doxygen/c__runtime__api_8h__dep__incl.svg | 1632 +++--
docs/reference/api/doxygen/classes.html | 439 +-
.../api/doxygen/classtvm_1_1runtime_1_1Object.html | 2 +-
.../doxygen/classtvm_1_1runtime_1_1ObjectRef.html | 2 +-
...asstvm_1_1runtime_1_1ObjectRef__coll__graph.svg | 12 +-
.../classtvm_1_1runtime_1_1Object__coll__graph.svg | 8 +-
...asstvm_1_1script_1_1printer_1_1Doc-members.html | 102 +
.../classtvm_1_1script_1_1printer_1_1Doc.html | 265 +
...vm_1_1script_1_1printer_1_1DocNode-members.html | 114 +
.../classtvm_1_1script_1_1printer_1_1DocNode.html | 307 +
..._1script_1_1printer_1_1DocNode__coll__graph.svg | 82 +
...cript_1_1printer_1_1DocNode__inherit__graph.svg | 119 +
...vm_1_1script_1_1printer_1_1Doc__coll__graph.svg | 91 +
...1_1script_1_1printer_1_1Doc__inherit__graph.svg | 110 +
...vm_1_1script_1_1printer_1_1ExprDoc-members.html | 104 +
.../classtvm_1_1script_1_1printer_1_1ExprDoc.html | 246 +
..._1script_1_1printer_1_1ExprDocNode-members.html | 115 +
...asstvm_1_1script_1_1printer_1_1ExprDocNode.html | 288 +
...ript_1_1printer_1_1ExprDocNode__coll__graph.svg | 103 +
...t_1_1printer_1_1ExprDocNode__inherit__graph.svg | 119 +
..._1script_1_1printer_1_1ExprDoc__coll__graph.svg | 113 +
...cript_1_1printer_1_1ExprDoc__inherit__graph.svg | 110 +
...1_1script_1_1printer_1_1LiteralDoc-members.html | 111 +
...lasstvm_1_1script_1_1printer_1_1LiteralDoc.html | 435 ++
...cript_1_1printer_1_1LiteralDocNode-members.html | 117 +
...tvm_1_1script_1_1printer_1_1LiteralDocNode.html | 325 +
...t_1_1printer_1_1LiteralDocNode__coll__graph.svg | 190 +
..._1printer_1_1LiteralDocNode__inherit__graph.svg | 119 +
...cript_1_1printer_1_1LiteralDoc__coll__graph.svg | 140 +
...pt_1_1printer_1_1LiteralDoc__inherit__graph.svg | 110 +
docs/reference/api/doxygen/data__type_8h.html | 2 +-
.../api/doxygen/data__type_8h__dep__incl.svg | 1411 ++---
...r_000003_000024.html => dir_000003_000026.html} | 0
...r_000003_000025.html => dir_000003_000027.html} | 0
...r_000004_000024.html => dir_000004_000026.html} | 0
...r_000004_000025.html => dir_000004_000027.html} | 0
...r_000005_000024.html => dir_000005_000026.html} | 0
...r_000005_000025.html => dir_000005_000027.html} | 0
...r_000006_000024.html => dir_000006_000026.html} | 0
...r_000006_000025.html => dir_000006_000027.html} | 0
...r_000007_000024.html => dir_000007_000026.html} | 0
...r_000011_000024.html => dir_000011_000026.html} | 0
...r_000014_000024.html => dir_000014_000026.html} | 0
...r_000015_000024.html => dir_000015_000026.html} | 0
...r_000016_000024.html => dir_000016_000026.html} | 0
...r_000016_000025.html => dir_000016_000027.html} | 0
docs/reference/api/doxygen/dir_000024_000007.html | 6 +-
docs/reference/api/doxygen/dir_000024_000008.html | 6 +-
docs/reference/api/doxygen/dir_000024_000013.html | 73 -
docs/reference/api/doxygen/dir_000024_000017.html | 6 +-
docs/reference/api/doxygen/dir_000025_000002.html | 73 -
...r_000003_000024.html => dir_000025_000007.html} | 6 +-
docs/reference/api/doxygen/dir_000025_000008.html | 6 +-
...r_000004_000025.html => dir_000025_000017.html} | 6 +-
docs/reference/api/doxygen/dir_000026_000007.html | 6 +-
...r_000024_000008.html => dir_000026_000008.html} | 0
...r_000024_000011.html => dir_000026_000011.html} | 0
docs/reference/api/doxygen/dir_000026_000013.html | 6 +-
docs/reference/api/doxygen/dir_000026_000017.html | 6 +-
docs/reference/api/doxygen/dir_000027_000002.html | 6 +-
docs/reference/api/doxygen/dir_000027_000008.html | 6 +-
...r_000025_000011.html => dir_000027_000011.html} | 0
...r_000025_000013.html => dir_000027_000013.html} | 0
...r_000026_000007.html => dir_000028_000007.html} | 0
...r_000026_000013.html => dir_000028_000013.html} | 0
...r_000026_000017.html => dir_000028_000017.html} | 0
...r_000027_000002.html => dir_000029_000002.html} | 0
...r_000027_000008.html => dir_000029_000008.html} | 0
...r_000028_000002.html => dir_000030_000002.html} | 0
...r_000028_000011.html => dir_000030_000011.html} | 0
...r_000028_000025.html => dir_000030_000027.html} | 0
docs/reference/api/doxygen/dir_000030_000028.html | 73 -
docs/reference/api/doxygen/dir_000030_000031.html | 73 -
...r_000029_000025.html => dir_000031_000027.html} | 0
...r_000029_000028.html => dir_000031_000030.html} | 0
...r_000030_000024.html => dir_000032_000026.html} | 0
...r_000030_000025.html => dir_000032_000027.html} | 0
docs/reference/api/doxygen/dir_000032_000030.html | 6 +-
docs/reference/api/doxygen/dir_000032_000031.html | 6 +-
docs/reference/api/doxygen/dir_000032_000033.html | 6 +-
...r_000030_000033.html => dir_000032_000035.html} | 0
...r_000031_000002.html => dir_000033_000002.html} | 0
...r_000031_000025.html => dir_000033_000027.html} | 0
...r_000031_000028.html => dir_000033_000030.html} | 0
...r_000032_000024.html => dir_000034_000026.html} | 0
...r_000032_000025.html => dir_000034_000027.html} | 0
...r_000032_000028.html => dir_000034_000030.html} | 0
...r_000032_000029.html => dir_000034_000031.html} | 0
...r_000032_000030.html => dir_000034_000032.html} | 0
...r_000032_000031.html => dir_000034_000033.html} | 0
...r_000032_000033.html => dir_000034_000035.html} | 0
...r_000033_000024.html => dir_000035_000026.html} | 0
...r_000033_000025.html => dir_000035_000027.html} | 0
...r_000033_000028.html => dir_000035_000030.html} | 0
...r_000034_000024.html => dir_000036_000026.html} | 0
...r_000034_000025.html => dir_000036_000027.html} | 0
...r_000034_000028.html => dir_000036_000030.html} | 0
...r_000035_000025.html => dir_000037_000027.html} | 0
...r_000035_000028.html => dir_000037_000030.html} | 0
.../dir_006b1f4ac353a18abb55f74cc4796db6_dep.svg | 6 +-
.../dir_02be2c9d68e402f80df60bd528724ee5_dep.svg | 22 +-
.../dir_05ffda4d144d7985f926507abde48dbb_dep.svg | 12 +-
.../dir_1f1b12d204a071c9e67e47fcbb552b86_dep.svg | 10 +-
.../dir_2b0ef9f1c86b565a92e96353e1195b2c_dep.svg | 8 +-
.../dir_3a038e7bfa2370c6aee2a5aecd5d3ef1_dep.svg | 12 +-
.../dir_4378f18824ae7d4ad48f8d7785cd7ac8_dep.svg | 24 +-
.../dir_519be2d4a83a987dbf989f1de527b870_dep.svg | 10 +-
.../dir_54983dd6d74c59f67ee9e8e5a50aafc4_dep.svg | 42 +-
.../dir_5da96592f3a7c442b838b075c58254c2_dep.svg | 14 +-
.../dir_63946bee875c6d52bce55e72a67a86ad_dep.svg | 20 +-
.../dir_72c2f11201cd7636dc7624de0754daa5_dep.svg | 22 +-
.../dir_8395ded0a3205c0748976a0d4487d38d_dep.svg | 8 +-
...l => dir_84875704194fd544d29fe0c7fedd8939.html} | 22 +-
.../dir_84875704194fd544d29fe0c7fedd8939_dep.svg | 139 +
.../dir_8e4e25e66b8623d88c5b5dd2040bca97_dep.svg | 74 +-
...l => dir_a59a89c7dd2e4e6561fe59bf359ce2f3.html} | 24 +-
.../dir_a59a89c7dd2e4e6561fe59bf359ce2f3_dep.svg | 127 +
.../dir_a98464176f1216e334ac3bbacd433085_dep.svg | 16 +-
.../dir_ac57496531ccbad72f774fa62e6de987_dep.svg | 28 +-
.../dir_b4c7d8e826c599ba55146c099a14beb5.html | 4 +-
.../dir_b4c7d8e826c599ba55146c099a14beb5_dep.svg | 581 +-
.../dir_d331277d4303e21ded95616eb56c1a9e_dep.svg | 6 +-
.../dir_d3953cf7eb33eca56fc6850c0e98447d_dep.svg | 6 +-
.../dir_d4a54fa981698f72ef4cd62f8b9e1a8f_dep.svg | 4 +-
.../dir_dc867ff9a37cad1764f1670dc7eba6c1_dep.svg | 12 +-
.../dir_f97d855a3173728370e632aa77170e34_dep.svg | 8 +-
.../{structural__hash_8h.html => doc_8h.html} | 46 +-
docs/reference/api/doxygen/doc_8h__dep__incl.svg | 36 +
docs/reference/api/doxygen/doc_8h__incl.svg | 1310 +++++
docs/reference/api/doxygen/doc_8h_source.html | 110 +
.../doxygen/{node_8h.html => doc__printer_8h.html} | 53 +-
.../api/doxygen/doc__printer_8h__incl.svg | 1326 +++++
.../api/doxygen/doc__printer_8h_source.html | 80 +
docs/reference/api/doxygen/files.html | 210 +-
docs/reference/api/doxygen/functions_0x7e.html | 6 +
docs/reference/api/doxygen/functions__.html | 3 +
docs/reference/api/doxygen/functions_b.html | 3 +
docs/reference/api/doxygen/functions_d.html | 3 +
docs/reference/api/doxygen/functions_e.html | 3 +
docs/reference/api/doxygen/functions_f.html | 7 +-
.../reference/api/doxygen/functions_func_0x7e.html | 6 +
docs/reference/api/doxygen/functions_func_b.html | 5 +-
docs/reference/api/doxygen/functions_func_d.html | 3 +
docs/reference/api/doxygen/functions_func_e.html | 5 +-
docs/reference/api/doxygen/functions_func_f.html | 5 +-
docs/reference/api/doxygen/functions_func_i.html | 3 +-
docs/reference/api/doxygen/functions_func_l.html | 3 +
docs/reference/api/doxygen/functions_func_n.html | 3 +
docs/reference/api/doxygen/functions_func_r.html | 2 +-
docs/reference/api/doxygen/functions_func_s.html | 19 +-
docs/reference/api/doxygen/functions_func_t.html | 14 +-
docs/reference/api/doxygen/functions_func_v.html | 35 +-
docs/reference/api/doxygen/functions_i.html | 13 +-
docs/reference/api/doxygen/functions_l.html | 5 +-
docs/reference/api/doxygen/functions_m.html | 2 +-
docs/reference/api/doxygen/functions_n.html | 3 +
docs/reference/api/doxygen/functions_s.html | 11 +-
docs/reference/api/doxygen/functions_t.html | 14 +-
docs/reference/api/doxygen/functions_v.html | 36 +-
docs/reference/api/doxygen/functions_vars.html | 3 +
docs/reference/api/doxygen/functions_vars_v.html | 1 +
docs/reference/api/doxygen/functor_8h.html | 2 +-
.../api/doxygen/functor_8h__dep__incl.svg | 1343 +++--
docs/reference/api/doxygen/hierarchy.html | 368 +-
docs/reference/api/doxygen/inherit_graph_10.svg | 16 +-
docs/reference/api/doxygen/inherit_graph_108.svg | 6202 ++++++++++----------
docs/reference/api/doxygen/inherit_graph_117.svg | 4295 +++++++-------
docs/reference/api/doxygen/inherit_graph_189.svg | 8 +-
docs/reference/api/doxygen/inherit_graph_203.svg | 16 +-
docs/reference/api/doxygen/inherit_graph_204.svg | 16 +-
docs/reference/api/doxygen/inherit_graph_39.svg | 16 +-
docs/reference/api/doxygen/inherit_graph_43.svg | 8 +-
docs/reference/api/doxygen/inherits.html | 4 +-
docs/reference/api/doxygen/ir_2expr_8h.html | 2 +-
.../api/doxygen/ir_2expr_8h__dep__incl.svg | 1044 ++--
docs/reference/api/doxygen/ir_2span_8h.html | 2 +-
.../api/doxygen/ir_2span_8h__dep__incl.svg | 1110 ++--
docs/reference/api/doxygen/ir_2type_8h.html | 2 +-
.../api/doxygen/ir_2type_8h__dep__incl.svg | 1199 ++--
docs/reference/api/doxygen/map_8h__dep__incl.svg | 132 +-
docs/reference/api/doxygen/namespacemembers_d.html | 7 +-
.../api/doxygen/namespacemembers_func_d.html | 5 +-
.../api/doxygen/namespacemembers_func_s.html | 6 +-
docs/reference/api/doxygen/namespacemembers_s.html | 6 +-
docs/reference/api/doxygen/namespaces.html | 38 +-
docs/reference/api/doxygen/namespacetvm.html | 2 +
...030_000029.html => namespacetvm_1_1script.html} | 18 +-
.../doxygen/namespacetvm_1_1script_1_1printer.html | 147 +
.../api/doxygen/ndarray_8h__dep__incl.svg | 276 +-
docs/reference/api/doxygen/node_8h.html | 2 +-
docs/reference/api/doxygen/node_8h__dep__incl.svg | 1210 ++--
.../reference/api/doxygen/object_8h__dep__incl.svg | 468 +-
.../api/doxygen/optional_8h__dep__incl.svg | 136 +-
.../api/doxygen/packed__func_8h__dep__incl.svg | 112 +-
docs/reference/api/doxygen/reflection_8h.html | 2 +-
.../api/doxygen/reflection_8h__dep__incl.svg | 1322 +++--
docs/reference/api/doxygen/repr__printer_8h.html | 2 +-
.../api/doxygen/repr__printer_8h__dep__incl.svg | 1225 ++--
.../runtime_2container_2base_8h__dep__incl.svg | 464 +-
docs/reference/api/doxygen/runtime_2memory_8h.html | 2 +-
.../api/doxygen/runtime_2memory_8h__dep__incl.svg | 1766 +++---
.../api/doxygen/runtime_2module_8h__dep__incl.svg | 100 +-
docs/reference/api/doxygen/search/all_1.js | 2 +-
docs/reference/api/doxygen/search/all_10.js | 2 +-
docs/reference/api/doxygen/search/all_13.js | 6 +-
docs/reference/api/doxygen/search/all_14.js | 10 +-
docs/reference/api/doxygen/search/all_15.js | 22 +-
docs/reference/api/doxygen/search/all_16.js | 4 +-
docs/reference/api/doxygen/search/all_17.js | 6 +-
docs/reference/api/doxygen/search/all_18.js | 2 +-
docs/reference/api/doxygen/search/all_1b.js | 2 +
docs/reference/api/doxygen/search/all_3.js | 1 +
docs/reference/api/doxygen/search/all_5.js | 5 +
docs/reference/api/doxygen/search/all_6.js | 2 +
docs/reference/api/doxygen/search/all_7.js | 2 +-
docs/reference/api/doxygen/search/all_a.js | 2 +-
docs/reference/api/doxygen/search/all_d.js | 2 +
docs/reference/api/doxygen/search/all_e.js | 2 +-
docs/reference/api/doxygen/search/all_f.js | 1 +
docs/reference/api/doxygen/search/classes_11.js | 4 +-
docs/reference/api/doxygen/search/classes_13.js | 2 +-
docs/reference/api/doxygen/search/classes_3.js | 2 +
docs/reference/api/doxygen/search/classes_4.js | 2 +
docs/reference/api/doxygen/search/classes_9.js | 2 +
docs/reference/api/doxygen/search/files_3.js | 2 +
docs/reference/api/doxygen/search/functions_12.js | 4 +-
docs/reference/api/doxygen/search/functions_13.js | 6 +-
docs/reference/api/doxygen/search/functions_14.js | 10 +-
docs/reference/api/doxygen/search/functions_15.js | 2 +-
docs/reference/api/doxygen/search/functions_16.js | 2 +-
docs/reference/api/doxygen/search/functions_19.js | 2 +
docs/reference/api/doxygen/search/functions_2.js | 1 +
docs/reference/api/doxygen/search/functions_4.js | 2 +
docs/reference/api/doxygen/search/functions_5.js | 1 +
docs/reference/api/doxygen/search/functions_6.js | 2 +-
docs/reference/api/doxygen/search/functions_9.js | 2 +-
docs/reference/api/doxygen/search/functions_c.js | 1 +
docs/reference/api/doxygen/search/functions_e.js | 1 +
docs/reference/api/doxygen/search/functions_f.js | 2 +-
docs/reference/api/doxygen/search/namespaces_1.js | 2 +
docs/reference/api/doxygen/search/variables_0.js | 2 +-
docs/reference/api/doxygen/search/variables_14.js | 2 +-
.../api/doxygen/serializer_8h__dep__incl.svg | 264 +-
.../api/doxygen/shape__tuple_8h__dep__incl.svg | 268 +-
docs/reference/api/doxygen/string_8h.html | 2 +-
.../reference/api/doxygen/string_8h__dep__incl.svg | 1324 +++--
.../api/doxygen/structural__equal_8h.html | 2 +-
.../doxygen/structural__equal_8h__dep__incl.svg | 1068 ++--
.../reference/api/doxygen/structural__hash_8h.html | 2 +-
.../api/doxygen/structural__hash_8h__dep__incl.svg | 1068 ++--
docs/reference/api/python/auto_scheduler.html | 4 +-
.../api/typedoc/classes/bytestreamreader.html | 12 +-
.../api/typedoc/classes/cachedcallstack.html | 34 +-
docs/reference/api/typedoc/classes/dldatatype.html | 12 +-
docs/reference/api/typedoc/classes/dldevice.html | 10 +-
.../reference/api/typedoc/classes/environment.html | 12 +-
docs/reference/api/typedoc/classes/ffilibrary.html | 20 +-
.../api/typedoc/classes/graphexecutor.html | 16 +-
docs/reference/api/typedoc/classes/instance.html | 40 +-
docs/reference/api/typedoc/classes/memory.html | 34 +-
docs/reference/api/typedoc/classes/module.html | 10 +-
docs/reference/api/typedoc/classes/ndarray.html | 22 +-
.../api/typedoc/classes/packedfunccell.html | 6 +-
docs/reference/api/typedoc/classes/rpcserver.html | 14 +-
docs/reference/api/typedoc/classes/scalar.html | 6 +-
.../api/typedoc/classes/webgpucontext.html | 12 +-
docs/reference/api/typedoc/enums/argtypecode.html | 30 +-
.../api/typedoc/enums/aynccallbackcode.html | 4 +-
.../api/typedoc/enums/dldatatypecode.html | 8 +-
.../api/typedoc/enums/rpcserverstate.html | 12 +-
docs/reference/api/typedoc/enums/sizeof.html | 18 +-
docs/reference/api/typedoc/index.html | 112 +-
.../api/typedoc/interfaces/disposable.html | 2 +-
.../api/typedoc/interfaces/functioninfo.html | 6 +-
.../api/typedoc/interfaces/libraryprovider.html | 4 +-
docs/searchindex.js | 2 +-
.../vta/tutorials/autotvm/sg_execution_times.html | 6 +-
.../tutorials/frontend/deploy_classification.html | 2 +-
.../vta/tutorials/frontend/deploy_detection.html | 2 +-
.../vta/tutorials/frontend/sg_execution_times.html | 6 +-
.../vta/tutorials/optimize/sg_execution_times.html | 6 +-
docs/topic/vta/tutorials/sg_execution_times.html | 6 +-
docs/tutorial/auto_scheduler_matmul_x86.html | 2 +-
docs/tutorial/autotvm_matmul_x86.html | 20 +-
docs/tutorial/autotvm_relay_x86.html | 258 +-
docs/tutorial/cross_compilation_and_rpc.html | 2 +-
docs/tutorial/intro_topi.html | 2 +-
docs/tutorial/sg_execution_times.html | 28 +-
docs/tutorial/tensor_expr_get_started.html | 41 +-
374 files changed, 28395 insertions(+), 19500 deletions(-)
diff --git a/docs/_sources/how_to/compile_models/from_darknet.rst.txt b/docs/_sources/how_to/compile_models/from_darknet.rst.txt
index 3c231da85..5330df5cb 100644
--- a/docs/_sources/how_to/compile_models/from_darknet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_darknet.rst.txt
@@ -317,7 +317,7 @@ The process is no different from other examples.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 3.129 seconds)
+ **Total running time of the script:** ( 1 minutes 2.162 seconds)
.. _sphx_glr_download_how_to_compile_models_from_darknet.py:
diff --git a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
index 0cebcb158..3e9d1ddab 100644
--- a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
@@ -115,7 +115,7 @@ In this section, we download a pretrained imagenet model and classify an image.
.. code-block:: none
- Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip0b98df0f-efed-4fd1-bd0f-a5c6f7ab728e from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+ Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip5bbd24b5-9964-4f2e-912e-359d733978e5 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
x (1, 3, 224, 224)
diff --git a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
index a1fff0d39..4c74f3fb5 100644
--- a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
@@ -113,7 +113,7 @@ Load a pretrained OneFlow model and save model
.. code-block:: none
Downloading: "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip" to /workspace/.oneflow/flowvision_cache/resnet18.zip
-
0%| | 0.00/41.5M [00:00<?, ?B/s]
19%|#9 | 7.99M/41.5M [00:00<00:00, 47.7MB/s]
35%|###4 | 14.3M/41.5M [00:00<00:00, 54.6MB/s]
48%|####7 | 19.7M/41.5M [00:00<00:00, 47.6MB/s]
59%|#####8 | 24.4M/41.5M [00:00<00:00, 33.5MB/s]
80%|######## | 33.4M/41.5M [00:00<00:00, 48.3MB/s]
96%|#########6| 40.0M/41.5M [00:00<00:00, 48.6MB/s]
100%|##########| 41.5M/41.5M [00:00<00:00, 48.0MB/s]
+
0%| | 0.00/41.5M [00:00<?, ?B/s]
15%|#5 | 6.33M/41.5M [00:00<00:00, 45.3MB/s]
26%|##5 | 10.6M/41.5M [00:00<00:00, 41.6MB/s]
35%|###5 | 14.6M/41.5M [00:00<00:00, 34.7MB/s]
43%|####3 | 17.9M/41.5M [00:00<00:00, 30.2MB/s]
54%|#####3 | 22.3M/41.5M [00:00<00:00, 32.4MB/s]
61%|######1 | 25.5M/41.5M [00:00<00:00, 27.4MB/s]
77%|#######7 | 32.0M/41.5M [00:00<00:00, 36.0MB/s]
92%|#########2| 38.3M/41.5M [00:01<00:00, 34.5MB/s]
100%|##########| 41.5M/41.5M [00:01<00:00, 34.3MB/s]
diff --git a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
index a3213f0a7..8bdf55492 100644
--- a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
@@ -94,7 +94,7 @@ Load a pretrained PyTorch model
.. code-block:: none
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
-
0%| | 0.00/44.7M [00:00<?, ?B/s]
48%|####7 | 21.3M/44.7M [00:00<00:00, 224MB/s]
100%|##########| 44.7M/44.7M [00:00<00:00, 242MB/s]
+
0%| | 0.00/44.7M [00:00<?, ?B/s]
8%|7 | 3.49M/44.7M [00:00<00:01, 36.6MB/s]
17%|#6 | 7.47M/44.7M [00:00<00:00, 39.6MB/s]
65%|######4 | 28.9M/44.7M [00:00<00:00, 121MB/s]
100%|##########| 44.7M/44.7M [00:00<00:00, 129MB/s]
diff --git a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
index 4a4435a3d..8593d0ec5 100644
--- a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
@@ -423,7 +423,7 @@ Run the corresponding model on tensorflow
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 4.491 seconds)
+ **Total running time of the script:** ( 1 minutes 3.436 seconds)
.. _sphx_glr_download_how_to_compile_models_from_tensorflow.py:
diff --git a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
index ea046f0a9..69757cc6e 100644
--- a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
@@ -5,26 +5,26 @@
Computation times
=================
-**05:03.432** total execution time for **how_to_compile_models** files:
+**05:20.160** total execution time for **how_to_compile_models** files:
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:04.491 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:03.436 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``) | 01:03.129 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``) | 01:02.162 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``) | 00:41.193 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``) | 00:41.505 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``) | 00:26.693 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``) | 00:34.835 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``) | 00:25.879 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``) | 00:26.439 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``) | 00:22.921 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``) | 00:25.923 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``) | 00:22.758 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``) | 00:22.532 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``) | 00:19.567 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``) | 00:21.015 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``) | 00:14.396 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``) | 00:19.411 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``) | 00:02.407 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``) | 00:02.902 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
index 8778cdfd1..11811a880 100644
--- a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
@@ -441,7 +441,7 @@ Execute on TVM
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 16.2909 16.2089 16.8662 16.1445 0.2069
+ 15.8547 15.6650 16.5339 15.5613 0.3511
diff --git a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
index cbc3459be..182c42893 100644
--- a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
@@ -123,7 +123,7 @@ Load pre-trained maskrcnn from torchvision and do tracing
.. code-block:: none
Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
-
0%| | 0.00/170M [00:00<?, ?B/s]
11%|# | 18.4M/170M [00:00<00:00, 193MB/s]
22%|##2 | 37.6M/170M [00:00<00:00, 197MB/s]
33%|###3 | 56.4M/170M [00:00<00:00, 171MB/s]
45%|####5 | 77.2M/170M [00:00<00:00, 188MB/s]
56%|#####6 | 95.5M/170M [00:00<00:00, 172MB/s]
68%|######8 | 116M/170M [00:00<00:00, 185MB/s]
79%|#######8 | 134M/170M [00:00<00:00, 175MB/s]
90%|########9 | 152M/170M [00:00<00:00, 180MB/s]
100%|##########| 170M/170M [00:00<00:00, 184MB/s]
+
0%| | 0.00/170M [00:00<?, ?B/s]
3%|2 | 5.02M/170M [00:00<00:03, 52.7MB/s]
6%|5 | 10.0M/170M [00:00<00:04, 37.2MB/s]
8%|8 | 13.9M/170M [00:00<00:05, 29.6MB/s]
10%|# | 17.3M/170M [00:00<00:05, 31.5MB/s]
12%|#2 | 20.5M/170M [00:00<00:05, 30.7MB/s]
14%|#3 | 23.6M/170M [00:00<00:04, 30.7MB/s]
17%|#6 | 28.6M/170M [00:00<00:04, 37.0MB/s]
20%|## | 34.4M/170M [00:00<00:03, 43.7MB/s]
23%|##2 | 38.7M/170M [00:01<00:03, 39.8MB/s]
25%|##5 | 42.6M/170M [00:01<00:03, 34.1MB/s]
28%|##7 | 47.5M/170M [00:01<00:03, 38.3MB/s]
30%|### | 51.4M/170M [00:01<00:03, 38.8MB/s]
33%|###3 | 56.8M/170M [00:01<00:02, 43.7MB/s]
37%|###7 | 62.9M/170M [00:01<00:02, 49.0MB/s]
40%|###9 | 67.7M/170M [00:01<00:02, 49.2MB/s]
43%|####2 | 72.5M/170M [00:01<00:02, 44.7MB/s]
45%|####5 | 77.2M/170M [00:02<00:02, 45.8MB/
s]
48%|####8 | 81.9M/170M [00:02<00:02, 45.9MB/s]
51%|#####1 | 87.2M/170M [00:02<00:01, 48.6MB/s]
54%|#####4 | 92.1M/170M [00:02<00:01, 49.3MB/s]
57%|#####7 | 96.9M/170M [00:02<00:01, 45.7MB/s]
60%|#####9 | 101M/170M [00:02<00:01, 41.6MB/s]
62%|######2 | 105M/170M [00:02<00:01, 35.5MB/s]
64%|######4 | 109M/170M [00:02<00:02, 31.3MB/s]
66%|######6 | 112M/170M [00:03<00:01, 30.7MB/s]
68%|######7 | 115M/170M [00:03<00:01, 28.9MB/s]
69%|######9 | 118M/170M [00:03<00:01, 27.8MB/s]
71%|#######1 | 121M/170M [00:03<00:01, 27.3MB/s]
73%|#######2 | 123M/170M [00:03<00:02, 21.6MB/s]
74%|#######3 | 126M/170M [00:03<00:02, 20.5MB/s]
75%|#######5 | 128M/170M [00:03<00:02, 20.8MB/s]
76%|#######6 | 130M/170M [00:03<00:02, 20.3MB/s]
78%|#######7 | 132M/170M [00:04<00:01, 21.2MB/s]
79%|#######9 | 134M/170M [00:04<00:01, 20.4MB/s]
80%|######## | 136M/170M [00:04<00:01, 20.4MB/s]
82%|########1 | 139M/170M [00:04<00:01, 21.7MB/s]
83%|########2 | 141M/170M [00:04<00:01, 21.9MB/s]
84%|########4 | 143M/170M [00:04<00:01, 22.4MB/s]
86%|########5 | 145M/170M [00:04<00:01, 22.7MB/s]
87%|########6 | 148M/170M [00:04<00:01, 20.9MB/s]
88%|########8 | 150M/170M [00:04<00:00, 22.2MB/s]
90%|########9 | 153M/170M [00:05<00:00, 23.5MB/s]
91%|#########1| 155M/170M [00:05<00:00, 22.7MB/s]
93%|#########2| 157M/170M [00:05<00:00, 23.3MB/s]
94%|#########3| 159M/170M [00:05<00:00, 23.0MB/s]
96%|#########5| 163M/170M [00:05<00:00, 26.1MB/s]
98%|#########8| 167M/170M [00:05<00:00, 29.9MB/s]
100%|#########9| 169M/170M [00:05<00:00, 28.6MB/s]
100%|##########| 170M/170M [00:05<00:00, 31.4MB/s]
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:3878: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
for i in range(dim)
/usr/local/lib/python3.7/dist-packages/torchvision/models/detection/anchor_utils.py:127: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
@@ -292,7 +292,7 @@ Get boxes with score larger than 0.9
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 3 minutes 10.065 seconds)
+ **Total running time of the script:** ( 2 minutes 59.469 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_object_detection_pytorch.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
index 49712eac8..171982d3a 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
@@ -232,7 +232,7 @@ training. Other models require a full post training calibration.
.. code-block:: none
Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
-
0%| | 0.00/13.6M [00:00<?, ?B/s]
100%|##########| 13.6M/13.6M [00:00<00:00, 197MB/s]
+
0%| | 0.00/13.6M [00:00<?, ?B/s]
100%|##########| 13.6M/13.6M [00:00<00:00, 180MB/s]
@@ -412,7 +412,7 @@ Here we give an example of how to measure performance of TVM compiled models.
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 90.6104 90.5887 91.4744 90.2353 0.1939
+ 90.2709 90.1941 95.4556 90.0354 0.5425
@@ -461,7 +461,7 @@ TODO
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 10.371 seconds)
+ **Total running time of the script:** ( 1 minutes 9.887 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_prequantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
index 38322a9a7..e79ab11f8 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
@@ -439,7 +439,7 @@ Here we give an example of how to measure performance of TVM compiled models.
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 121.3128 121.2697 125.7490 120.6289 0.5435
+ 119.0905 119.1786 122.8603 117.4642 0.6454
@@ -476,7 +476,7 @@ Here we give an example of how to measure performance of TVM compiled models.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 59.960 seconds)
+ **Total running time of the script:** ( 2 minutes 6.017 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_prequantized_tflite.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
index 69badb824..c4e3aff75 100644
--- a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
@@ -255,7 +255,7 @@ We create a Relay VM to build and execute the model.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 29.098 seconds)
+ **Total running time of the script:** ( 1 minutes 38.816 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_quantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
index f4024df04..c4166f2d8 100644
--- a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
@@ -158,7 +158,7 @@ Convert and compile model for CPU.
data: None
input_sym_arg_type = in_param.infer_type()[0]
Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
-
0%| | 0/132723 [00:00<?, ?KB/s]
5%|4 | 6579/132723 [00:00<00:01, 65785.98KB/s]
10%|# | 13883/132723 [00:00<00:01, 70040.90KB/s]
17%|#6 | 21945/132723 [00:00<00:01, 74865.76KB/s]
23%|##2 | 30032/132723 [00:00<00:01, 77234.20KB/s]
29%|##8 | 37906/132723 [00:00<00:01, 77768.42KB/s]
34%|###4 | 45683/132723 [00:00<00:01, 77173.50KB/s]
40%|#### | 53685/132723 [00:00<00:01, 78092.32KB/s]
47%|####6 | 61754/132723 [00:00<00:00, 78914.70KB/s]
53%|#####2 | 69842/132723 [00:00<00:00, 79525.01KB/s]
59%|#####8 | 77819/132723 [00:01<00:00, 79584.98KB/s]
65%|######4 | 85829/132723 [00:01<00:00, 79741.31KB/s]
71%|####### | 93804/132723 [00:01<00:00, 79064.34KB/s]
77%|#######6 | 101712/132723 [00:01<00:00, 76571.38KB/s]
83%|########2 | 109660/132723 [00:01<00:00, 77423.66KB/s]
88%|########8 | 117417/132723 [00:01<00:00, 74891.30KB/s]
95%|#########4| 125477/132723 [00:01<00:00, 76544.12KB/s]
100%|##########| 132723/132723 [00:01<00:00, 77261.94KB/s]
+
0%| | 0/132723 [00:00<?, ?KB/s]
5%|4 | 6393/132723 [00:00<00:01, 63927.01KB/s]
11%|#1 | 14929/132723 [00:00<00:01, 76520.93KB/s]
18%|#7 | 23387/132723 [00:00<00:01, 80197.46KB/s]
24%|##3 | 31544/132723 [00:00<00:01, 75715.08KB/s]
30%|##9 | 39155/132723 [00:00<00:01, 55369.54KB/s]
36%|###5 | 47594/132723 [00:00<00:01, 63061.31KB/s]
41%|####1 | 54550/132723 [00:00<00:01, 53502.08KB/s]
47%|####7 | 63037/132723 [00:01<00:01, 61173.11KB/s]
53%|#####3 | 70464/132723 [00:01<00:00, 64563.11KB/s]
60%|#####9 | 79075/132723 [00:01<00:00, 70379.63KB/s]
65%|######5 | 86555/132723 [00:01<00:00, 59549.95KB/s]
72%|#######1 | 95126/132723 [00:01<00:00, 66049.23KB/s]
78%|#######8 | 103702/132723 [00:01<00:00, 71212.40KB/s]
85%|########4 | 112197/132723 [00:01<00:00, 74950.75KB/s]
90%|######### | 120050/132723 [00:01<00:00, 69506.51KB/s]
97%|#########6| 128617/132723 [00:01<00:00, 73834.90KB/s]
100%|##########| 132723/132723 [00:01<00:00, 67144.39KB/s]
@@ -241,7 +241,7 @@ Display result
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 2 minutes 26.781 seconds)
+ **Total running time of the script:** ( 2 minutes 17.546 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_ssd_gluoncv.py:
diff --git a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
index baf4a830e..a7e1c9f61 100644
--- a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
@@ -5,22 +5,22 @@
Computation times
=================
-**11:09.358** total execution time for **how_to_deploy_models** files:
+**11:03.296** total execution time for **how_to_deploy_models** files:
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 03:10.065 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 02:59.469 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``) | 02:26.781 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``) | 02:17.546 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``) | 01:59.960 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``) | 02:06.017 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``) | 01:29.098 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``) | 01:38.816 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``) | 01:10.371 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``) | 01:09.887 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``) | 00:29.863 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``) | 00:28.542 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``) | 00:23.214 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``) | 00:23.012 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_deploy_models_deploy_sparse.py` (``deploy_sparse.py``) | 00:00.006 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
index 2775becf8..c240473ed 100644
--- a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
@@ -476,7 +476,7 @@ First let us define two helper functions to get the mobilenet model and a cat im
.. code-block:: none
- Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zipf47a4a5d-6e6a-4037-9310-597ac1787583 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+ Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zipe6a90132-9029-4551-b725-8a2c585f77c0 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
@@ -590,7 +590,7 @@ Now, to actually convert the entire network, we have written `a pass in Relay <h
/workspace/python/tvm/driver/build_module.py:268: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
"target_host parameter is going to be deprecated. "
- Check failed: (lower) is false: Intrinsic lowering function for target llvm, intrinsic name tir.sqrt, type 150 not found
+ Check failed: (lower) is false: FloatImm lowering function for target llvm type 150 not found
diff --git a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
index 5f44e1c85..140167003 100644
--- a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
Computation times
=================
-**00:43.007** total execution time for **how_to_extend_tvm** files:
+**00:39.293** total execution time for **how_to_extend_tvm** files:
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:39.619 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:36.110 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``) | 00:02.383 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``) | 00:02.234 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``) | 00:00.993 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``) | 00:00.942 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_low_level_custom_pass.py` (``low_level_custom_pass.py``) | 00:00.011 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_low_level_custom_pass.py` (``low_level_custom_pass.py``) | 00:00.008 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
index 452a1e485..7201fc7d7 100644
--- a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
@@ -216,10 +216,10 @@ profile the execution time of each passes.
.. code-block:: none
Printing results of timing profile...
- InferType: 7143us [7143us] (45.97%; 45.97%)
- FoldScaleAxis: 8396us [8us] (54.03%; 54.03%)
- FoldConstant: 8388us [1627us] (53.98%; 99.90%)
- InferType: 6761us [6761us] (43.51%; 80.60%)
+ InferType: 6763us [6763us] (45.98%; 45.98%)
+ FoldScaleAxis: 7946us [6us] (54.02%; 54.02%)
+ FoldConstant: 7940us [1567us] (53.98%; 99.93%)
+ InferType: 6373us [6373us] (43.32%; 80.26%)
@@ -258,10 +258,10 @@ Refer to following sections and :py:func:`tvm.instrument.pass_instrument` for th
.. code-block:: none
Printing results of timing profile...
- InferType: 6875us [6875us] (44.95%; 44.95%)
- FoldScaleAxis: 8419us [8us] (55.05%; 55.05%)
- FoldConstant: 8411us [1681us] (54.99%; 99.90%)
- InferType: 6730us [6730us] (44.00%; 80.02%)
+ InferType: 6440us [6440us] (44.75%; 44.75%)
+ FoldScaleAxis: 7950us [5us] (55.25%; 55.25%)
+ FoldConstant: 7945us [1557us] (55.21%; 99.94%)
+ InferType: 6388us [6388us] (44.39%; 80.40%)
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
index a136487f9..ef9ec1cb4 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
@@ -340,7 +340,7 @@ latency of convolution.
.. code-block:: none
- Convolution: 54.207339 ms
+ Convolution: 54.158303 ms
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
index 3d32069a4..1bda7f0ee 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
@@ -671,7 +671,7 @@ be able to run on our build server
.. code-block:: none
- conv2d with tensor core: 7.021100 ms
+ conv2d with tensor core: 7.704298 ms
diff --git a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
index f27abcb80..7ed30a0d2 100644
--- a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
@@ -143,8 +143,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
.. code-block:: none
- Numpy running time: 0.019769
- Baseline: 3.259536
+ Numpy running time: 0.018352
+ Baseline: 3.511263
@@ -239,7 +239,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
.. code-block:: none
- Opt1: 0.330347
+ Opt1: 0.307780
@@ -342,7 +342,7 @@ In this tutorial, we chose to vectorize the inner loop row data since it is cach
.. code-block:: none
- Opt2: 0.343379
+ Opt2: 0.339759
@@ -438,7 +438,7 @@ the access pattern for A matrix is more cache friendly.
.. code-block:: none
- Opt3: 0.137935
+ Opt3: 0.115245
@@ -563,7 +563,7 @@ flattening.
.. code-block:: none
- Opt4: 0.112368
+ Opt4: 0.109447
@@ -685,7 +685,7 @@ write to C when all the block results are ready.
.. code-block:: none
- Opt5: 0.113770
+ Opt5: 0.111666
@@ -810,7 +810,7 @@ Futhermore, we can also utilize multi-core processors to do the thread-level par
.. code-block:: none
- Opt6: 0.147954
+ Opt6: 0.144600
diff --git a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
index 20abbc5a8..495f012e2 100644
--- a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
Computation times
=================
-**00:35.100** total execution time for **how_to_optimize_operators** files:
+**00:34.856** total execution time for **how_to_optimize_operators** files:
+-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``) | 00:32.752 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``) | 00:32.567 | 0.0 MB |
+-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.279 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.269 | 0.0 MB |
+-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``) | 00:01.069 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``) | 00:01.020 | 0.0 MB |
+-----------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
index 856b89003..f8941cdc4 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
@@ -5,18 +5,18 @@
Computation times
=================
-**05:27.036** total execution time for **how_to_tune_with_autoscheduler** files:
+**05:35.790** total execution time for **how_to_tune_with_autoscheduler** files:
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 02:40.059 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 02:54.405 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``) | 01:22.883 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``) | 01:21.949 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``) | 00:44.603 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``) | 00:43.870 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``) | 00:21.244 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``) | 00:18.355 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``) | 00:09.256 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``) | 00:08.789 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``) | 00:08.990 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``) | 00:08.421 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
index c26c43b14..68c54d957 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
@@ -240,471 +240,912 @@ cooperative fetching, unrolling and operator fusion.
compute: Buffer(compute_2: Pointer(float32), float32, [25088], [])}
buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute}
preflattened_buffer_map = {data_1: data_3: Buffer(data_2, float32, [1, 512, 7, 7], []), kernel_1: kernel_3: Buffer(kernel_2, float32, [512, 512, 3, 3], []), bias_1: bias_3: Buffer(bias_2, float32, [1, 512, 1, 1], []), compute_1: compute_3: Buffer(compute_2, float32, [1, 512, 7, 7], [])} {
- attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 32;
- allocate(conv2d_nchw: Pointer(local float32), float32, [14]), storage_scope = local;
- allocate(pad_temp.shared: Pointer(shared float32), float32, [1296]), storage_scope = shared;
- allocate(kernel.shared: Pointer(shared float32), float32, [2304]), storage_scope = shared;
- attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
- conv2d_nchw_1: Buffer(conv2d_nchw, float32, [14], [], scope="local", align=32)[0] = 0f32
- conv2d_nchw_1[1] = 0f32
+ attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 64;
+ allocate(conv2d_nchw: Pointer(local float32), float32, [4]), storage_scope = local;
+ allocate(pad_temp.shared: Pointer(shared float32), float32, [4032]), storage_scope = shared;
+ allocate(kernel.shared: Pointer(shared float32), float32, [1536]), storage_scope = shared;
+ attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98 {
+ conv2d_nchw_1: Buffer(conv2d_nchw, float32, [4], [], scope="local", align=8)[0] = 0f32
conv2d_nchw_1[2] = 0f32
+ conv2d_nchw_1[1] = 0f32
conv2d_nchw_1[3] = 0f32
- conv2d_nchw_1[4] = 0f32
- conv2d_nchw_1[5] = 0f32
- conv2d_nchw_1[6] = 0f32
- conv2d_nchw_1[7] = 0f32
- conv2d_nchw_1[8] = 0f32
- conv2d_nchw_1[9] = 0f32
- conv2d_nchw_1[10] = 0f32
- conv2d_nchw_1[11] = 0f32
- conv2d_nchw_1[12] = 0f32
- conv2d_nchw_1[13] = 0f32
- for (rc.outer.outer: int32, 0, 32) {
- let cse_var_2: int32 = (rc.outer.outer*784)
- let cse_var_1: int32 = (rc.outer.outer*144)
- {
- attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1: Buffer(pad_temp.shared, float32, [1296], [], scope="shared")[threadIdx.x_1] = @tir.if_then_else((((9 <= threadIdx.x_1) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), data[(((cse_var_2 + (floordiv(threadIdx.x_1, 9)*7)) + floormod(threadIdx.x_1, 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 56)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 56), 81)) && (floormod((threadIdx.x_1 + 56), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 2), 9))) && (floormod((threadIdx.x_1 + 2), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 56), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 56), 81), 9)*7)) + floormod((threadIdx.x_1 + 2), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 112)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 31), 81)) && (floormod((threadIdx.x_1 + 31), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 4), 9))) && (floormod((threadIdx.x_1 + 4), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 112), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 31), 81), 9)*7)) + floormod((threadIdx.x_1 + 4), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 168)] = @tir.if_then_else((((9 <= floormod((threadIdx.x_1 + 6), 81)) && (1 <= floormod((threadIdx.x_1 + 6), 9))) && (floormod((threadIdx.x_1 + 6), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 168), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 6), 81), 9)*7)) + floormod((threadIdx.x_1 + 6), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 224)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 62), 81)) && (floormod((threadIdx.x_1 + 62), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 8), 9))) && (floormod((threadIdx.x_1 + 8), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 224), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 62), 81), 9)*7)) + floormod((threadIdx.x_1 + 8), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 280)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 37), 81)) && (floormod((threadIdx.x_1 + 37), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 1), 9))) && (floormod((threadIdx.x_1 + 1), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 280), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 37), 81), 9)*7)) + floormod((threadIdx.x_1 + 1), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 336)] = @tir.if_then_else(((1 <= floormod((threadIdx.x_1 + 3), 9)) && (floormod((threadIdx.x_1 + 3), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 336), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 12), 81), 9)*7)) + floormod((threadIdx.x_1 + 3), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 392)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 68), 81)) && (floormod((threadIdx.x_1 + 68), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 5), 9))) && (floormod((threadIdx.x_1 + 5), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 392), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 68), 81), 9)*7)) + floormod((threadIdx.x_1 + 5), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 448)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 43), 81)) && (floormod((threadIdx.x_1 + 43), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 7), 9))) && (floormod((threadIdx.x_1 + 7), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 448), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 43), 81), 9)*7)) + floormod((threadIdx.x_1 + 7), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 504)] = @tir.if_then_else((((threadIdx.x_1 < 54) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 504), 81)*49)) + ((floordiv(threadIdx.x_1, 9) + 2)*7)) + floormod(threadIdx.x_1, 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 560)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 74), 81)) && (floormod((threadIdx.x_1 + 74), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 2), 9))) && (floormod((threadIdx.x_1 + 2), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 560), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 74), 81), 9)*7)) + floormod((threadIdx.x_1 + 2), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 616)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 49), 81)) && (floormod((threadIdx.x_1 + 49), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 4), 9))) && (floormod((threadIdx.x_1 + 4), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 616), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 49), 81), 9)*7)) + floormod((threadIdx.x_1 + 4), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 672)] = @tir.if_then_else((((threadIdx.x_1 < 48) && (1 <= floormod((threadIdx.x_1 + 6), 9))) && (floormod((threadIdx.x_1 + 6), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 672), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 24), 81), 9)*7)) + floormod((threadIdx.x_1 + 6), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 728)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 80), 81)) && (floormod((threadIdx.x_1 + 80), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 8), 9))) && (floormod((threadIdx.x_1 + 8), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 728), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 80), 81), 9)*7)) + floormod((threadIdx.x_1 + 8), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 784)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 55), 81)) && (floormod((threadIdx.x_1 + 55), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 1), 9))) && (floormod((threadIdx.x_1 + 1), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 784), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 55), 81), 9)*7)) + floormod((threadIdx.x_1 + 1), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 840)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 30), 81)) && (floormod((threadIdx.x_1 + 30), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 3), 9))) && (floormod((threadIdx.x_1 + 3), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 840), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 30), 81), 9)*7)) + floormod((threadIdx.x_1 + 3), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 896)] = @tir.if_then_else((((9 <= floormod((threadIdx.x_1 + 5), 81)) && (1 <= floormod((threadIdx.x_1 + 5), 9))) && (floormod((threadIdx.x_1 + 5), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 896), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 5), 81), 9)*7)) + floormod((threadIdx.x_1 + 5), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 952)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 61), 81)) && (floormod((threadIdx.x_1 + 61), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 7), 9))) && (floormod((threadIdx.x_1 + 7), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 952), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 61), 81), 9)*7)) + floormod((threadIdx.x_1 + 7), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 1008)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 9) + 4), 9)) && (floormod((threadIdx.x_1 + 36), 81) < 72)) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1008), 81)*49)) + (floormod((floordiv(threadIdx.x_1, 9) + 4), 9)*7)) + floormod(threadIdx.x_1, 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 1064)] = @tir.if_then_else(((1 <= floormod((threadIdx.x_1 + 2), 9)) && (floormod((threadIdx.x_1 + 2), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1064), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 11), 81), 9)*7)) + floormod((threadIdx.x_1 + 2), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 1120)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 67), 81)) && (floormod((threadIdx.x_1 + 67), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 4), 9))) && (floormod((threadIdx.x_1 + 4), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1120), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 67), 81), 9)*7)) + floormod((threadIdx.x_1 + 4), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 1176)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 42), 81)) && (floormod((threadIdx.x_1 + 42), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 6), 9))) && (floormod((threadIdx.x_1 + 6), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1176), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 42), 81), 9)*7)) + floormod((threadIdx.x_1 + 6), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 1232)] = @tir.if_then_else((((threadIdx.x_1 < 55) && (1 <= floormod((threadIdx.x_1 + 8), 9))) && (floormod((threadIdx.x_1 + 8), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1232), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 17), 81), 9)*7)) + floormod((threadIdx.x_1 + 8), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- if @tir.likely((threadIdx.x_1 < 8), dtype=bool) {
- pad_temp.shared_1[(threadIdx.x_1 + 1288)] = 0f32
- }
- attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
- kernel.shared_1: Buffer(kernel.shared, float32, [2304], [], scope="shared")[(threadIdx.x_2*32)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2*32), 144), 3)*3)) + floormod((threadIdx.x_2*2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 1)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 1), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 2)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 2), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 3)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 1), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 4)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 4), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 5)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 5), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 6)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 2), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 7)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 7), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 8)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 8), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 9)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 3), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 10)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 10), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 11)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 11), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 12)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 4), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 13)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 13), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 14)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 14), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 15)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 5), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 16)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 16), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 17)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 17), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 18)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 6), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 19)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 19), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 20)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 20), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 21)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 7), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 22)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 22), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 23)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 23), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 24)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 8), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 25)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 25), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 26)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 26), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 27)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 9), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 28)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 28), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 29)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 29), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 30)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 10), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 31)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 31), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- }
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1792)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 64), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1793)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 65), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1794)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 22), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1795)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*32) + 1792), 3) + 1), 48)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1796)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 68), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1797)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 23), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1798)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*32) + 1792), 3) + 2), 48)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1799)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 71), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1800)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 24), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1801)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*32) + 1792), 3) + 3), 48)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1802)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 74), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1803)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 25), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1804)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*32) + 1792), 3) + 4), 48)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1805)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 77), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1806)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 26), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1807)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*32) + 1792), 3) + 5), 48)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
+ for (rc.outer.outer: int32, 0, 8) {
+ for (rx.outer.outer: int32, 0, 3) {
+ let cse_var_2: int32 = (rc.outer.outer*3136)
+ let cse_var_1: int32 = (rc.outer.outer*576)
+ {
+ attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1: Buffer(pad_temp.shared, float32, [4032], [], scope="shared")[threadIdx.x_1] = @tir.if_then_else(((((7 <= floormod(threadIdx.x_1, 63)) && (floormod(threadIdx.x_1, 63) < 56)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[((((cse_var_2 + (floordiv(threadIdx.x_1, 63)*49)) + rx.outer.outer) + floormod(threadIdx.x_1, 63)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 98)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 5), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 98), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 5), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 196)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 1), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 196), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 1), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 294)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 6), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 294), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 6), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 392)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 2), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 392), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 2), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 490)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 7), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 490), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 7), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 588)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 3), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 588), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 3), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 686)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 8), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 686), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 8), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 784)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 4), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 784), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 4), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 882)] = @tir.if_then_else(((((7 <= floormod(threadIdx.x_1, 63)) && (floormod(threadIdx.x_1, 63) < 56)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[((((cse_var_2 + (floordiv(threadIdx.x_1, 63)*49)) + rx.outer.outer) + floormod(threadIdx.x_1, 63)) + 678)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 980)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 5), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 980), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 5), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 1078)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 1), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1078), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 1), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 1176)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 6), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1176), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 6), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 1274)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 2), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1274), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 2), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 1372)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 7), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1372), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 7), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 1470)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 3), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1470), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 3), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 1568)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 8), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1568), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 8), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 1666)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 4), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1666), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 4), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 1764)] = @tir.if_then_else(((((7 <= floormod(threadIdx.x_1, 63)) && (floormod(threadIdx.x_1, 63) < 56)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[((((cse_var_2 + (floordiv(threadIdx.x_1, 63)*49)) + rx.outer.outer) + floormod(threadIdx.x_1, 63)) + 1364)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 1862)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 5), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1862), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 5), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 1960)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 1), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1960), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 1), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 2058)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 6), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 2058), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 6), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 2156)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 2), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 2156), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 2), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 2254)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 7), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 2254), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 7), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 2352)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 3), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 2352), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 3), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 2450)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 8), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 2450), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 8), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 2548)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 4), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 2548), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 4), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 2646)] = @tir.if_then_else(((((7 <= floormod(threadIdx.x_1, 63)) && (floormod(threadIdx.x_1, 63) < 56)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[((((cse_var_2 + (floordiv(threadIdx.x_1, 63)*49)) + rx.outer.outer) + floormod(threadIdx.x_1, 63)) + 2050)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 2744)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 5), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 2744), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 5), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 2842)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 1), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 2842), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 1), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 2940)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 6), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 2940), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 6), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 3038)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 2), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 3038), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 2), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 3136)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 7), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 3136), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 7), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 3234)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 3), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 3234), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 3), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 3332)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 8), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 3332), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 8), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 3430)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 4), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 3430), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 4), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 3528)] = @tir.if_then_else(((((7 <= floormod(threadIdx.x_1, 63)) && (floormod(threadIdx.x_1, 63) < 56)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[((((cse_var_2 + (floordiv(threadIdx.x_1, 63)*49)) + rx.outer.outer) + floormod(threadIdx.x_1, 63)) + 2736)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 3626)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 5), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 3626), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 5), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 3724)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 1), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 3724), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 1), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 3822)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 6), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 3822), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 6), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 3920)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 2), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 3920), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 2), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, 7)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ if @tir.likely((threadIdx.x_1 < 14), dtype=bool) {
+ pad_temp.shared_1[(threadIdx.x_1 + 4018)] = @tir.if_then_else((((threadIdx.x_1 < 7) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 4018), 63)*49)) + rx.outer.outer) + threadIdx.x_1) + 41)], 0f32, dtype=float32)
}
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1808)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 80), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
+ attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98 {
+ kernel.shared_1: Buffer(kernel.shared, float32, [1536], [], scope="shared")[(threadIdx.x_2*4)] = kernel[((((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 48)*4608)) + cse_var_1) + (floordiv((floormod(threadIdx.x_2, 48)*4), 3)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ kernel.shared_1[((threadIdx.x_2*4) + 1)] = kernel[((((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 48)*4608)) + cse_var_1) + (floordiv(((floormod(threadIdx.x_2, 48)*4) + 1), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ kernel.shared_1[((threadIdx.x_2*4) + 2)] = kernel[((((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 48)*4608)) + cse_var_1) + (floordiv(((floormod(threadIdx.x_2, 48)*4) + 2), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ kernel.shared_1[((threadIdx.x_2*4) + 3)] = kernel[((((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 48)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*4), 3) + 1), 64)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
}
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1809)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 27), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98 {
+ kernel.shared_1[((threadIdx.x_2*4) + 392)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 98), 48)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*4) + 8), 192), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ kernel.shared_1[((threadIdx.x_2*4) + 393)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 98), 48)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*4), 3) + 3), 64)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ kernel.shared_1[((threadIdx.x_2*4) + 394)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 98), 48)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*4) + 10), 192), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ kernel.shared_1[((threadIdx.x_2*4) + 395)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 98), 48)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*4) + 392), 3) + 1), 64)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
}
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1810)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*32) + 1792), 3) + 6), 48)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98 {
+ kernel.shared_1[((threadIdx.x_2*4) + 784)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 196), 48)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*4) + 16), 192), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ kernel.shared_1[((threadIdx.x_2*4) + 785)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 196), 48)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*4) + 17), 192), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ kernel.shared_1[((threadIdx.x_2*4) + 786)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 196), 48)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*4), 3) + 6), 64)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ kernel.shared_1[((threadIdx.x_2*4) + 787)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 196), 48)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*4) + 784), 3) + 1), 64)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
}
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1811)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 83), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1812)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 28), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1813)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*32) + 1792), 3) + 7), 48)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1814)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 86), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1815)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 29), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1816)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*32) + 1792), 3) + 8), 48)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1817)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 89), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1818)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 30), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1819)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*32) + 1792), 3) + 9), 48)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1820)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 92), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1821)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 31), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1822)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*32) + 1792), 3) + 10), 48)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1823)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 95), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98 {
+ if @tir.likely((threadIdx.x_2 < 90), dtype=bool) {
+ kernel.shared_1[((threadIdx.x_2*4) + 1176)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 294), 48)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*4), 3) + 8), 64)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ }
+ if @tir.likely((threadIdx.x_2 < 90), dtype=bool) {
+ kernel.shared_1[((threadIdx.x_2*4) + 1177)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 294), 48)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*4) + 25), 192), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ }
+ if @tir.likely((threadIdx.x_2 < 90), dtype=bool) {
+ kernel.shared_1[((threadIdx.x_2*4) + 1178)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 294), 48)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*4) + 26), 192), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ }
+ if @tir.likely((threadIdx.x_2 < 90), dtype=bool) {
+ kernel.shared_1[((threadIdx.x_2*4) + 1179)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 294), 48)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*4), 3) + 9), 64)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ }
}
- }
- for (rc.outer.inner: int32, 0, 8) {
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9))]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 1)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 2)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 9)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 10)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 11)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 18)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 19)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 81)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 82)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 83)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 90)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 92)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 99)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 100)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 1)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 2)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 3)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 10)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 11)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 12)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 19)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 21)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 82)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 83)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 92)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 93)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 100)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 102)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 2)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 3)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 4)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 11)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 12)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 13)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 21)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 22)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 83)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 85)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 92)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 93)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 94)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 102)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 103)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 3)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 4)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 5)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 12)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 13)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 14)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 21)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 22)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 23)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 85)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 86)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 93)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 94)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 95)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 102)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 103)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 104)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 4)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 5)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 6)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 13)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 14)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 15)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 22)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 23)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 24)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 85)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 86)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 87)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 94)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 95)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 96)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 103)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 104)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 5)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 6)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 7)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 14)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 15)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 16)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 23)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 24)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 25)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 86)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 87)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 88)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 95)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 96)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 97)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 104)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 106)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 6)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 7)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 8)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 15)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 16)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 17)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 24)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 25)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 26)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 87)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 88)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 89)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 96)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 97)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 98)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 106)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 107)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9))]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 1)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 2)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 9)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 10)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 11)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 18)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 19)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 81)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 82)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 83)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 90)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 92)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 99)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 100)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 1)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 2)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 3)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 10)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 11)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 12)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 19)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 21)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 82)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 83)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 92)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 93)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 100)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 102)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 2)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 3)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 4)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 11)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 12)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 13)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 21)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 22)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 83)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 85)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 92)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 93)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 94)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 102)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 103)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 3)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 4)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 5)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 12)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 13)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 14)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 21)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 22)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 23)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 85)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 86)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 93)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 94)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 95)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 102)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 103)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 104)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 4)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 5)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 6)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 13)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 14)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 15)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 22)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 23)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 24)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 85)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 86)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 87)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 94)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 95)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 96)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 103)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 104)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 5)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 6)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 7)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 14)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 15)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 16)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 23)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 24)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 25)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 86)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 87)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 88)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 95)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 96)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 97)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 104)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 106)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 6)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 7)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 8)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 15)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 16)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 17)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 24)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 25)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 26)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 87)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 88)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 89)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 96)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 97)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 98)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 106)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 107)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[floormod(threadIdx.x, 49)]*kernel.shared_1[(floordiv(threadIdx.x, 49)*384)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[floormod(threadIdx.x, 49)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 768)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[floormod(threadIdx.x, 49)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 192)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[floormod(threadIdx.x, 49)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 960)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 7)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 7)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 769)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 7)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 193)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 7)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 961)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 14)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 2)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 14)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 770)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 14)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 194)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 14)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 962)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 63)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 3)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 63)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 771)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 63)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 195)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 63)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 963)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 70)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 4)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 70)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 772)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 70)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 196)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 70)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 964)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 77)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 5)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 77)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 773)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 77)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 197)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 77)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 965)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 126)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 6)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 126)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 774)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 126)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 198)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 126)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 966)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 133)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 7)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 133)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 775)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 133)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 199)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 133)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 967)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 140)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 8)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 140)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 776)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 140)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 200)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 140)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 968)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 189)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 9)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 189)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 777)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 189)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 201)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 189)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 969)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 196)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 10)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 196)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 778)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 196)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 202)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 196)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 970)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 203)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 11)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 203)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 779)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 203)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 203)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 203)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 971)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 252)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 12)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 252)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 780)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 252)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 204)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 252)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 972)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 259)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 13)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 259)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 781)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 259)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 205)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 259)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 973)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 266)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 14)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 266)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 782)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 266)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 206)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 266)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 974)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 315)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 15)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 315)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 783)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 315)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 207)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 315)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 975)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 322)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 16)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 322)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 784)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 322)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 208)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 322)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 976)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 329)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 17)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 329)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 785)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 329)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 209)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 329)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 977)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 378)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 18)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 378)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 786)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 378)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 210)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 378)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 978)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 385)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 19)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 385)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 787)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 385)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 211)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 385)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 979)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 392)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 20)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 392)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 788)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 392)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 212)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 392)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 980)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 441)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 21)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 441)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 789)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 441)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 213)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 441)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 981)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 448)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 22)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 448)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 790)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 448)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 214)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 448)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 982)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 455)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 23)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 455)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 791)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 455)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 215)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 455)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 983)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 504)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 24)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 504)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 792)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 504)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 216)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 504)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 984)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 511)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 25)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 511)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 793)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 511)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 217)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 511)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 985)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 518)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 26)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 518)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 794)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 518)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 218)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 518)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 986)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 567)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 27)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 567)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 795)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 567)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 219)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 567)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 987)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 574)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 28)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 574)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 796)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 574)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 220)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 574)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 988)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 581)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 29)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 581)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 797)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 581)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 221)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 581)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 989)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 630)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 30)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 630)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 798)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 630)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 222)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 630)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 990)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 637)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 31)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 637)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 799)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 637)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 223)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 637)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 991)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 644)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 32)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 644)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 800)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 644)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 224)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 644)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 992)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 693)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 33)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 693)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 801)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 693)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 225)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 693)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 993)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 700)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 34)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 700)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 802)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 700)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 226)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 700)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 994)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 707)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 35)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 707)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 803)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 707)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 227)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 707)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 995)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 756)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 36)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 756)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 804)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 756)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 228)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 756)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 996)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 763)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 37)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 763)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 805)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 763)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 229)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 763)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 997)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 770)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 38)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 770)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 806)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 770)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 230)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 770)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 998)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 819)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 39)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 819)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 807)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 819)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 231)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 819)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 999)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 826)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 40)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 826)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 808)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 826)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 232)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 826)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1000)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 833)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 41)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 833)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 809)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 833)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 233)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 833)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1001)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 882)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 42)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 882)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 810)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 882)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 234)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 882)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1002)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 889)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 43)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 889)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 811)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 889)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 235)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 889)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1003)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 896)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 44)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 896)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 812)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 896)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 236)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 896)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1004)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 945)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 45)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 945)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 813)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 945)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 237)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 945)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1005)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 952)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 46)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 952)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 814)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 952)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 238)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 952)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1006)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 959)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 47)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 959)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 815)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 959)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 239)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 959)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1007)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1008)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 48)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1008)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 816)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1008)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 240)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1008)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1008)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1015)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 49)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1015)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 817)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1015)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 241)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1015)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1009)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1022)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 50)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1022)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 818)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1022)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 242)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1022)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1010)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1071)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 51)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1071)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 819)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1071)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 243)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1071)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1011)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1078)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 52)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1078)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 820)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1078)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 244)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1078)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1012)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1085)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 53)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1085)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 821)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1085)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 245)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1085)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1013)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1134)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 54)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1134)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 822)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1134)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 246)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1134)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1014)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1141)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 55)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1141)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 823)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1141)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 247)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1141)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1015)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1148)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 56)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1148)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 824)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1148)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 248)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1148)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1016)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1197)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 57)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1197)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 825)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1197)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 249)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1197)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1017)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1204)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 58)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1204)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 826)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1204)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 250)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1204)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1018)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1211)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 59)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1211)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 827)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1211)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 251)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1211)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1019)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1260)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 60)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1260)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 828)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1260)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 252)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1260)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1020)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1267)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 61)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1267)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 829)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1267)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 253)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1267)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1021)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1274)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 62)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1274)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 830)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1274)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 254)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1274)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1022)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1323)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 63)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1323)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 831)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1323)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 255)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1323)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1023)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1330)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 64)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1330)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 832)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1330)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 256)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1330)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1024)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1337)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 65)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1337)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 833)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1337)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 257)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1337)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1025)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1386)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 66)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1386)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 834)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1386)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 258)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1386)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1026)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1393)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 67)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1393)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 835)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1393)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 259)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1393)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1027)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1400)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 68)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1400)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 836)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1400)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 260)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1400)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1028)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1449)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 69)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1449)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 837)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1449)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 261)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1449)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1029)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1456)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 70)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1456)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 838)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1456)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 262)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1456)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1030)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1463)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 71)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1463)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 839)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1463)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 263)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1463)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1031)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1512)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 72)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1512)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 840)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1512)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 264)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1512)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1032)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1519)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 73)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1519)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 841)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1519)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 265)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1519)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1033)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1526)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 74)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1526)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 842)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1526)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 266)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1526)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1034)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1575)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 75)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1575)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 843)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1575)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 267)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1575)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1035)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1582)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 76)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1582)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 844)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1582)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 268)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1582)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1036)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1589)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 77)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1589)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 845)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1589)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 269)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1589)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1037)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1638)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 78)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1638)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 846)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1638)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 270)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1638)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1038)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1645)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 79)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1645)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 847)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1645)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 271)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1645)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1039)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1652)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 80)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1652)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 848)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1652)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 272)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1652)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1040)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1701)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 81)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1701)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 849)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1701)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 273)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1701)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1041)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1708)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 82)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1708)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 850)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1708)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 274)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1708)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1042)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1715)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 83)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1715)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 851)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1715)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 275)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1715)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1043)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1764)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 84)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1764)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 852)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1764)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 276)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1764)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1044)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1771)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 85)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1771)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 853)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1771)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 277)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1771)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1045)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1778)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 86)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1778)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 854)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1778)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 278)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1778)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1046)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1827)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 87)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1827)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 855)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1827)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 279)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1827)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1047)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1834)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 88)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1834)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 856)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1834)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 280)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1834)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1048)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1841)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 89)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1841)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 857)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1841)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 281)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1841)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1049)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1890)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 90)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1890)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 858)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1890)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 282)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1890)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1050)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1897)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 91)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1897)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 859)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1897)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 283)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1897)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1051)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1904)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 92)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1904)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 860)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1904)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 284)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1904)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1052)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1953)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 93)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1953)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 861)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1953)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 285)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1953)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1053)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1960)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 94)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1960)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 862)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1960)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 286)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1960)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1054)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1967)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 95)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1967)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 863)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1967)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 287)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1967)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1055)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2016)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 96)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2016)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 864)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2016)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 288)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2016)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1056)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2023)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 97)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2023)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 865)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2023)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 289)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2023)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1057)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2030)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 98)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2030)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 866)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2030)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 290)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2030)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1058)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2079)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 99)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2079)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 867)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2079)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 291)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2079)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1059)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2086)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 100)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2086)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 868)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2086)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 292)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2086)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1060)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2093)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 101)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2093)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 869)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2093)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 293)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2093)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1061)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2142)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 102)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2142)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 870)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2142)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 294)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2142)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1062)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2149)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 103)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2149)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 871)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2149)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 295)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2149)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1063)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2156)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 104)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2156)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 872)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2156)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 296)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2156)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1064)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2205)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 105)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2205)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 873)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2205)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 297)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2205)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1065)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2212)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 106)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2212)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 874)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2212)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 298)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2212)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1066)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2219)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 107)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2219)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 875)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2219)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 299)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2219)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1067)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2268)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 108)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2268)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 876)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2268)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 300)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2268)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1068)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2275)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 109)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2275)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 877)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2275)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 301)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2275)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1069)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2282)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 110)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2282)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 878)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2282)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 302)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2282)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1070)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2331)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 111)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2331)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 879)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2331)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 303)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2331)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1071)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2338)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 112)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2338)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 880)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2338)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 304)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2338)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1072)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2345)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 113)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2345)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 881)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2345)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 305)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2345)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1073)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2394)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 114)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2394)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 882)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2394)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 306)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2394)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1074)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2401)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 115)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2401)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 883)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2401)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 307)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2401)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1075)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2408)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 116)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2408)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 884)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2408)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 308)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2408)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1076)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2457)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 117)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2457)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 885)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2457)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 309)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2457)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1077)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2464)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 118)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2464)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 886)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2464)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 310)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2464)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1078)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2471)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 119)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2471)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 887)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2471)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 311)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2471)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1079)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2520)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 120)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2520)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 888)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2520)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 312)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2520)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1080)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2527)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 121)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2527)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 889)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2527)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 313)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2527)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1081)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2534)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 122)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2534)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 890)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2534)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 314)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2534)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1082)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2583)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 123)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2583)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 891)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2583)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 315)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2583)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1083)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2590)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 124)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2590)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 892)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2590)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 316)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2590)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1084)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2597)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 125)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2597)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 893)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2597)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 317)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2597)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1085)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2646)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 126)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2646)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 894)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2646)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 318)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2646)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1086)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2653)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 127)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2653)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 895)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2653)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 319)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2653)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1087)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2660)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 128)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2660)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 896)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2660)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 320)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2660)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1088)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2709)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 129)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2709)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 897)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2709)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 321)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2709)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1089)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2716)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 130)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2716)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 898)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2716)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 322)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2716)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1090)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2723)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 131)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2723)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 899)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2723)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 323)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2723)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1091)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2772)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 132)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2772)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 900)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2772)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 324)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2772)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1092)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2779)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 133)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2779)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 901)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2779)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 325)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2779)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1093)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2786)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 134)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2786)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 902)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2786)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 326)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2786)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1094)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2835)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 135)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2835)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 903)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2835)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 327)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2835)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1095)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2842)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 136)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2842)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 904)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2842)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 328)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2842)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1096)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2849)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 137)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2849)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 905)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2849)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 329)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2849)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1097)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2898)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 138)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2898)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 906)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2898)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 330)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2898)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1098)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2905)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 139)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2905)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 907)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2905)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 331)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2905)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1099)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2912)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 140)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2912)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 908)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2912)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 332)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2912)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1100)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2961)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 141)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2961)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 909)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2961)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 333)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2961)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1101)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2968)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 142)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2968)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 910)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2968)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 334)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2968)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1102)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2975)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 143)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2975)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 911)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2975)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 335)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2975)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1103)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3024)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 144)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3024)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 912)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3024)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 336)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3024)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1104)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3031)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 145)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3031)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 913)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3031)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 337)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3031)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1105)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3038)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 146)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3038)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 914)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3038)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 338)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3038)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1106)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3087)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 147)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3087)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 915)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3087)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 339)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3087)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1107)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3094)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 148)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3094)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 916)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3094)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 340)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3094)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1108)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3101)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 149)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3101)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 917)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3101)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 341)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3101)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1109)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3150)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 150)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3150)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 918)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3150)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 342)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3150)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1110)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3157)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 151)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3157)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 919)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3157)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 343)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3157)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1111)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3164)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 152)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3164)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 920)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3164)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 344)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3164)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1112)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3213)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 153)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3213)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 921)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3213)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 345)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3213)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1113)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3220)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 154)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3220)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 922)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3220)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 346)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3220)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1114)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3227)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 155)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3227)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 923)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3227)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 347)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3227)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1115)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3276)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 156)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3276)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 924)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3276)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 348)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3276)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1116)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3283)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 157)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3283)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 925)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3283)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 349)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3283)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1117)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3290)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 158)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3290)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 926)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3290)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 350)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3290)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1118)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3339)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 159)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3339)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 927)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3339)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 351)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3339)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1119)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3346)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 160)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3346)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 928)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3346)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 352)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3346)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1120)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3353)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 161)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3353)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 929)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3353)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 353)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3353)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1121)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3402)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 162)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3402)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 930)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3402)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 354)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3402)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1122)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3409)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 163)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3409)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 931)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3409)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 355)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3409)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1123)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3416)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 164)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3416)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 932)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3416)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 356)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3416)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1124)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3465)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 165)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3465)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 933)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3465)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 357)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3465)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1125)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3472)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 166)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3472)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 934)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3472)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 358)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3472)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1126)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3479)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 167)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3479)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 935)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3479)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 359)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3479)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1127)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3528)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 168)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3528)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 936)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3528)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 360)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3528)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1128)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3535)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 169)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3535)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 937)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3535)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 361)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3535)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1129)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3542)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 170)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3542)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 938)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3542)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 362)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3542)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1130)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3591)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 171)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3591)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 939)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3591)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 363)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3591)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1131)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3598)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 172)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3598)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 940)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3598)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 364)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3598)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1132)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3605)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 173)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3605)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 941)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3605)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 365)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3605)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1133)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3654)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 174)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3654)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 942)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3654)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 366)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3654)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1134)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3661)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 175)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3661)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 943)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3661)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 367)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3661)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1135)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3668)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 176)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3668)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 944)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3668)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 368)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3668)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1136)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3717)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 177)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3717)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 945)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3717)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 369)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3717)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1137)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3724)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 178)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3724)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 946)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3724)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 370)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3724)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1138)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3731)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 179)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3731)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 947)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3731)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 371)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3731)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1139)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3780)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 180)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3780)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 948)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3780)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 372)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3780)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1140)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3787)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 181)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3787)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 949)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3787)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 373)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3787)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1141)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3794)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 182)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3794)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 950)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3794)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 374)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3794)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1142)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3843)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 183)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3843)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 951)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3843)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 375)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3843)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1143)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3850)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 184)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3850)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 952)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3850)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 376)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3850)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1144)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3857)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 185)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3857)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 953)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3857)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 377)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3857)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1145)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3906)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 186)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3906)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 954)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3906)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 378)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3906)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1146)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3913)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 187)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3913)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 955)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3913)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 379)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3913)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1147)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3920)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 188)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3920)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 956)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3920)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 380)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3920)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1148)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3969)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 189)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3969)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 957)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3969)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 381)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3969)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1149)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3976)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 190)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3976)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 958)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3976)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 382)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3976)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1150)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3983)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 191)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3983)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 959)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3983)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 383)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3983)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1151)]))
}
}
}
for (i1.inner: int32, 0, 2) {
- for (i3.inner: int32, 0, 7) {
- compute[(((((blockIdx.x*784) + (floordiv(threadIdx.x, 7)*98)) + (i1.inner*49)) + (floormod(threadIdx.x, 7)*7)) + i3.inner)] = max((conv2d_nchw_1[((i1.inner*7) + i3.inner)] + bias[(((blockIdx.x*16) + (floordiv(threadIdx.x, 7)*2)) + i1.inner)]), 0f32)
- }
+ compute[((((blockIdx.x*392) + (floordiv(threadIdx.x, 49)*98)) + (i1.inner*49)) + floormod(threadIdx.x, 49))] = max((conv2d_nchw_1[i1.inner] + bias[(((blockIdx.x*8) + (floordiv(threadIdx.x, 49)*2)) + i1.inner)]), 0f32)
+ compute[(((((blockIdx.x*392) + (floordiv(threadIdx.x, 49)*98)) + (i1.inner*49)) + floormod(threadIdx.x, 49)) + 196)] = max((conv2d_nchw_1[(i1.inner + 2)] + bias[((((blockIdx.x*8) + (floordiv(threadIdx.x, 49)*2)) + i1.inner) + 4)]), 0f32)
}
}
}
@@ -759,7 +1200,7 @@ We build the binary and check its correctness and performance.
.. code-block:: none
- Execution time of this operator: 0.229 ms
+ Execution time of this operator: 0.330 ms
@@ -807,36 +1248,36 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
conv2d_nchw_nn_o_o_i, conv2d_nchw_nn_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_i, factor=1)
conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_i, factor=1)
conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
- conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
- conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=2)
- conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=8)
- conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
+ conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=2)
+ conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=1)
+ conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=2)
+ conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=2)
conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=7)
conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
- conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=7)
- conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
+ conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
+ conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7)
conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
- conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=8)
+ conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=32)
conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=3)
conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
- conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=3)
+ conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1)
conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=1)
s[conv2d_nchw].reorder(conv2d_nchw_nn_o_o_o_o, conv2d_nchw_ff_o_o_o_o, conv2d_nchw_yy_o_o_o_o, conv2d_nchw_xx_o_o_o_o, conv2d_nchw_nn_o_o_o_i, conv2d_nchw_ff_o_o_o_i, conv2d_nchw_yy_o_o_o_i, conv2d_nchw_xx_o_o_o_i, conv2d_nchw_nn_o_o_i, conv2d_nchw_ff_o_o_i, conv2d_nchw_yy_o_o_i, conv2d_nchw_xx_o_o_i, conv2d_nchw_rc_o_o, conv2d_nchw_ry_o_o, conv2d_nchw_rx_o_o, conv2d_nchw_rc_o_i, conv2d_nchw_ry_o_i, conv2d_nchw_rx_o_i, conv2d_nchw_nn_o_i, conv2d_nchw_ff_o_i, conv2d_nchw_yy_o_i, conv2 [...]
compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
- compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=8)
- compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
+ compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=2)
+ compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=2)
compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=7)
compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
- compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
- compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
+ compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
+ compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7)
compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
@@ -854,14 +1295,14 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused = s[compute].fuse(compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i)
s[compute].bind(compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused, te.thread_axis("threadIdx.x"))
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
- kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=32)
+ kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
- kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=56)
+ kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=98)
s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
- pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=56)
+ pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=98)
s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 1024)
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True)
@@ -881,440 +1322,859 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
#define int64_t long long
#define uint64_t unsigned long long
#endif
- extern "C" __global__ void __launch_bounds__(56) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
- float conv2d_nchw[14];
- __shared__ float pad_temp_shared[1296];
- __shared__ float kernel_shared[2304];
+ extern "C" __global__ void __launch_bounds__(98) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+ float conv2d_nchw[4];
+ __shared__ float pad_temp_shared[4032];
+ __shared__ float kernel_shared[1536];
conv2d_nchw[0] = 0.000000e+00f;
- conv2d_nchw[1] = 0.000000e+00f;
conv2d_nchw[2] = 0.000000e+00f;
+ conv2d_nchw[1] = 0.000000e+00f;
conv2d_nchw[3] = 0.000000e+00f;
- conv2d_nchw[4] = 0.000000e+00f;
- conv2d_nchw[5] = 0.000000e+00f;
- conv2d_nchw[6] = 0.000000e+00f;
- conv2d_nchw[7] = 0.000000e+00f;
- conv2d_nchw[8] = 0.000000e+00f;
- conv2d_nchw[9] = 0.000000e+00f;
- conv2d_nchw[10] = 0.000000e+00f;
- conv2d_nchw[11] = 0.000000e+00f;
- conv2d_nchw[12] = 0.000000e+00f;
- conv2d_nchw[13] = 0.000000e+00f;
- for (int rc_outer_outer = 0; rc_outer_outer < 32; ++rc_outer_outer) {
- __syncthreads();
- pad_temp_shared[((int)threadIdx.x)] = ((((9 <= ((int)threadIdx.x)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[((((rc_outer_outer * 784) + ((((int)threadIdx.x) / 9) * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 56)] = (((((9 <= ((((int)threadIdx.x) + 56) % 81)) && (((((int)threadIdx.x) + 56) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 2) % 9))) && (((((int)threadIdx.x) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 56) / 81) * 49)) + ((((((int)threadIdx.x) + 56) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 2) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 112)] = (((((9 <= ((((int)threadIdx.x) + 31) % 81)) && (((((int)threadIdx.x) + 31) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 4) % 9))) && (((((int)threadIdx.x) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 112) / 81) * 49)) + ((((((int)threadIdx.x) + 31) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 168)] = ((((3 <= ((int)threadIdx.x)) && (1 <= ((((int)threadIdx.x) + 6) % 9))) && (((((int)threadIdx.x) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 168) / 81) * 49)) + (((((int)threadIdx.x) + 6) / 9) * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 224)] = (((((9 <= ((((int)threadIdx.x) + 62) % 81)) && (((((int)threadIdx.x) + 62) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 8) % 9))) && (((((int)threadIdx.x) + 8) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 224) / 81) * 49)) + ((((((int)threadIdx.x) + 62) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 280)] = (((((9 <= ((((int)threadIdx.x) + 37) % 81)) && (((((int)threadIdx.x) + 37) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 1) % 9))) && (((((int)threadIdx.x) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 280) / 81) * 49)) + ((((((int)threadIdx.x) + 37) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 1) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 336)] = (((1 <= ((((int)threadIdx.x) + 3) % 9)) && (((((int)threadIdx.x) + 3) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 336) / 81) * 49)) + (((((int)threadIdx.x) + 12) / 9) * 7)) + ((((int)threadIdx.x) + 3) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 392)] = (((((9 <= ((((int)threadIdx.x) + 68) % 81)) && (((((int)threadIdx.x) + 68) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 5) % 9))) && (((((int)threadIdx.x) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 392) / 81) * 49)) + ((((((int)threadIdx.x) + 68) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 448)] = (((((9 <= ((((int)threadIdx.x) + 43) % 81)) && (((((int)threadIdx.x) + 43) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 7) % 9))) && (((((int)threadIdx.x) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 448) / 81) * 49)) + ((((((int)threadIdx.x) + 43) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 504)] = ((((((int)threadIdx.x) < 54) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 504) / 81) * 49)) + ((((int)threadIdx.x) / 9) * 7)) + (((int)threadIdx.x) % 9)) + 6)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 560)] = (((((9 <= ((((int)threadIdx.x) + 74) % 81)) && (((((int)threadIdx.x) + 74) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 2) % 9))) && (((((int)threadIdx.x) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 560) / 81) * 49)) + ((((((int)threadIdx.x) + 74) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 2) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 616)] = (((((9 <= ((((int)threadIdx.x) + 49) % 81)) && (((((int)threadIdx.x) + 49) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 4) % 9))) && (((((int)threadIdx.x) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 616) / 81) * 49)) + ((((((int)threadIdx.x) + 49) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 672)] = ((((((int)threadIdx.x) < 48) && (1 <= ((((int)threadIdx.x) + 6) % 9))) && (((((int)threadIdx.x) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 672) / 81) * 49)) + (((((int)threadIdx.x) + 24) / 9) * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 728)] = (((((9 <= ((((int)threadIdx.x) + 80) % 81)) && (((((int)threadIdx.x) + 80) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 8) % 9))) && (((((int)threadIdx.x) + 8) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 728) / 81) * 49)) + ((((((int)threadIdx.x) + 80) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 784)] = (((((9 <= ((((int)threadIdx.x) + 55) % 81)) && (((((int)threadIdx.x) + 55) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 1) % 9))) && (((((int)threadIdx.x) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 784) / 81) * 49)) + ((((((int)threadIdx.x) + 55) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 1) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 840)] = (((((9 <= ((((int)threadIdx.x) + 30) % 81)) && (((((int)threadIdx.x) + 30) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 3) % 9))) && (((((int)threadIdx.x) + 3) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 840) / 81) * 49)) + ((((((int)threadIdx.x) + 30) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 3) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 896)] = ((((4 <= ((int)threadIdx.x)) && (1 <= ((((int)threadIdx.x) + 5) % 9))) && (((((int)threadIdx.x) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 896) / 81) * 49)) + (((((int)threadIdx.x) + 5) / 9) * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 952)] = (((((9 <= ((((int)threadIdx.x) + 61) % 81)) && (((((int)threadIdx.x) + 61) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 7) % 9))) && (((((int)threadIdx.x) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 952) / 81) * 49)) + ((((((int)threadIdx.x) + 61) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 1008)] = (((((1 <= (((((int)threadIdx.x) / 9) + 4) % 9)) && (((((int)threadIdx.x) + 36) % 81) < 72)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 1008) / 81) * 49)) + ((((((int)threadIdx.x) / 9) + 4) % 9) * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 1064)] = (((1 <= ((((int)threadIdx.x) + 2) % 9)) && (((((int)threadIdx.x) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 1064) / 81) * 49)) + (((((int)threadIdx.x) + 11) / 9) * 7)) + ((((int)threadIdx.x) + 2) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 1120)] = (((((9 <= ((((int)threadIdx.x) + 67) % 81)) && (((((int)threadIdx.x) + 67) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 4) % 9))) && (((((int)threadIdx.x) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 1120) / 81) * 49)) + ((((((int)threadIdx.x) + 67) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 1176)] = (((((9 <= ((((int)threadIdx.x) + 42) % 81)) && (((((int)threadIdx.x) + 42) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 6) % 9))) && (((((int)threadIdx.x) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 1176) / 81) * 49)) + ((((((int)threadIdx.x) + 42) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 1232)] = ((((((int)threadIdx.x) < 55) && (1 <= ((((int)threadIdx.x) + 8) % 9))) && (((((int)threadIdx.x) + 8) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 1232) / 81) * 49)) + (((((int)threadIdx.x) + 17) / 9) * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)] : 0.000000e+00f);
- if (((int)threadIdx.x) < 8) {
- pad_temp_shared[(((int)threadIdx.x) + 1288)] = 0.000000e+00f;
- }
- kernel_shared[(((int)threadIdx.x) * 32)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) * 32) % 144) / 3) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 1)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 1) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 2)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 2) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 3)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 1) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 4)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 4) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 5)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 5) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 6)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 2) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 7)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 7) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 8)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 8) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 9)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 3) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 10)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 10) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 11)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 11) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 12)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 4) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 13)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 13) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 14)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 14) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 15)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 5) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 16)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 16) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 17)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 17) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 18)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 6) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 19)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 19) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 20)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 20) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 21)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 7) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 22)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 22) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 23)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 23) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 24)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 8) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 25)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 25) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 26)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 26) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 27)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 9) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 28)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 28) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 29)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 29) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 30)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 10) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 31)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 31) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1792)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 64) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1793)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 65) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1794)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 22) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1795)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + ((((((((int)threadIdx.x) * 32) + 1792) / 3) + 1) % 48) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1796)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 68) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1797)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 23) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1798)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + ((((((((int)threadIdx.x) * 32) + 1792) / 3) + 2) % 48) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1799)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 71) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1800)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 24) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1801)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + ((((((((int)threadIdx.x) * 32) + 1792) / 3) + 3) % 48) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1802)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 74) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1803)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 25) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1804)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + ((((((((int)threadIdx.x) * 32) + 1792) / 3) + 4) % 48) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1805)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 77) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1806)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 26) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1807)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + ((((((((int)threadIdx.x) * 32) + 1792) / 3) + 5) % 48) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1808)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 80) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1809)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 27) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1810)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + ((((((((int)threadIdx.x) * 32) + 1792) / 3) + 6) % 48) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1811)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 83) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1812)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 28) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1813)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + ((((((((int)threadIdx.x) * 32) + 1792) / 3) + 7) % 48) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1814)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 86) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1815)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 29) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1816)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + ((((((((int)threadIdx.x) * 32) + 1792) / 3) + 8) % 48) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1817)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 89) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1818)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 30) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1819)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + ((((((((int)threadIdx.x) * 32) + 1792) / 3) + 9) % 48) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1820)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 92) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1821)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 31) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1822)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + ((((((((int)threadIdx.x) * 32) + 1792) / 3) + 10) % 48) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1823)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 95) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- }
- __syncthreads();
- for (int rc_outer_inner = 0; rc_outer_inner < 8; ++rc_outer_inner) {
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9))] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 1)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 9)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 10)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 18)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 81)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 82)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 83)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 90)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 92)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 99)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 100)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 1)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 3)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 10)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 12)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 21)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 82)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 83)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 92)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 93)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 100)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 102)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 2)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 3)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 4)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 12)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 13)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 21)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 22)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 83)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 85)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 92)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 93)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 94)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 102)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 103)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 4)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 5)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 12)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 13)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 21)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 22)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 23)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 85)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 86)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 93)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 94)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 95)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 102)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 103)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 104)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 4)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 5)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 6)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 13)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 15)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 22)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 23)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 24)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 85)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 86)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 87)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 94)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 95)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 96)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 103)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 104)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 5)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 6)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 7)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 15)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 16)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 23)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 24)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 25)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 86)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 87)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 88)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 95)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 96)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 97)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 104)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 106)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 6)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 7)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 8)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 15)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 16)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 17)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 24)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 25)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 26)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 87)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 88)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 89)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 96)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 97)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 98)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 106)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 107)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9))] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 1)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 9)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 10)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 18)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 81)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 82)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 83)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 90)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 92)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 99)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 100)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 1)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 3)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 10)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 12)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 21)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 82)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 83)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 92)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 93)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 100)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 102)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 3)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 4)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 12)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 13)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 21)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 22)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 83)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 85)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 92)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 93)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 94)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 102)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 103)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 3)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 4)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 5)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 12)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 13)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 21)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 22)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 23)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 85)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 86)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 93)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 94)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 95)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 102)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 103)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 104)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 4)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 5)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 6)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 13)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 15)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 22)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 23)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 24)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 85)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 86)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 87)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 94)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 95)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 96)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 103)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 104)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 5)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 6)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 7)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 15)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 16)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 23)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 24)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 25)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 86)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 87)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 88)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 95)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 96)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 97)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 104)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 106)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 6)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 7)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 8)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 15)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 16)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 17)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 24)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 25)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 26)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 87)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 88)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 89)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 96)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 97)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 98)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 106)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 107)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
+ for (int rc_outer_outer = 0; rc_outer_outer < 8; ++rc_outer_outer) {
+ for (int rx_outer_outer = 0; rx_outer_outer < 3; ++rx_outer_outer) {
+ __syncthreads();
+ pad_temp_shared[((int)threadIdx.x)] = (((((7 <= (((int)threadIdx.x) % 63)) && ((((int)threadIdx.x) % 63) < 56)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + ((((int)threadIdx.x) / 63) * 49)) + rx_outer_outer) + (((int)threadIdx.x) % 63)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 98)] = (((((1 <= (((((int)threadIdx.x) / 7) + 5) % 9)) && ((((((int)threadIdx.x) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 98) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 5) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 196)] = (((((1 <= (((((int)threadIdx.x) / 7) + 1) % 9)) && ((((((int)threadIdx.x) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 196) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 1) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 294)] = (((((1 <= (((((int)threadIdx.x) / 7) + 6) % 9)) && ((((((int)threadIdx.x) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 294) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 6) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 392)] = (((((1 <= (((((int)threadIdx.x) / 7) + 2) % 9)) && ((((((int)threadIdx.x) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 392) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 2) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 490)] = (((((1 <= (((((int)threadIdx.x) / 7) + 7) % 9)) && ((((((int)threadIdx.x) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 490) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 7) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 588)] = (((((1 <= (((((int)threadIdx.x) / 7) + 3) % 9)) && ((((((int)threadIdx.x) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 588) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 3) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 686)] = (((((1 <= (((((int)threadIdx.x) / 7) + 8) % 9)) && ((((((int)threadIdx.x) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 686) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 8) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 784)] = (((((1 <= (((((int)threadIdx.x) / 7) + 4) % 9)) && ((((((int)threadIdx.x) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 784) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 4) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 882)] = (((((7 <= (((int)threadIdx.x) % 63)) && ((((int)threadIdx.x) % 63) < 56)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + ((((int)threadIdx.x) / 63) * 49)) + rx_outer_outer) + (((int)threadIdx.x) % 63)) + 678)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 980)] = (((((1 <= (((((int)threadIdx.x) / 7) + 5) % 9)) && ((((((int)threadIdx.x) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 980) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 5) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1078)] = (((((1 <= (((((int)threadIdx.x) / 7) + 1) % 9)) && ((((((int)threadIdx.x) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 1078) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 1) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1176)] = (((((1 <= (((((int)threadIdx.x) / 7) + 6) % 9)) && ((((((int)threadIdx.x) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 1176) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 6) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1274)] = (((((1 <= (((((int)threadIdx.x) / 7) + 2) % 9)) && ((((((int)threadIdx.x) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 1274) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 2) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1372)] = (((((1 <= (((((int)threadIdx.x) / 7) + 7) % 9)) && ((((((int)threadIdx.x) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 1372) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 7) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1470)] = (((((1 <= (((((int)threadIdx.x) / 7) + 3) % 9)) && ((((((int)threadIdx.x) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 1470) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 3) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1568)] = (((((1 <= (((((int)threadIdx.x) / 7) + 8) % 9)) && ((((((int)threadIdx.x) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 1568) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 8) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1666)] = (((((1 <= (((((int)threadIdx.x) / 7) + 4) % 9)) && ((((((int)threadIdx.x) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 1666) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 4) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1764)] = (((((7 <= (((int)threadIdx.x) % 63)) && ((((int)threadIdx.x) % 63) < 56)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + ((((int)threadIdx.x) / 63) * 49)) + rx_outer_outer) + (((int)threadIdx.x) % 63)) + 1364)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1862)] = (((((1 <= (((((int)threadIdx.x) / 7) + 5) % 9)) && ((((((int)threadIdx.x) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 1862) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 5) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1960)] = (((((1 <= (((((int)threadIdx.x) / 7) + 1) % 9)) && ((((((int)threadIdx.x) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 1960) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 1) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 2058)] = (((((1 <= (((((int)threadIdx.x) / 7) + 6) % 9)) && ((((((int)threadIdx.x) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 2058) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 6) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 2156)] = (((((1 <= (((((int)threadIdx.x) / 7) + 2) % 9)) && ((((((int)threadIdx.x) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 2156) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 2) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 2254)] = (((((1 <= (((((int)threadIdx.x) / 7) + 7) % 9)) && ((((((int)threadIdx.x) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 2254) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 7) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 2352)] = (((((1 <= (((((int)threadIdx.x) / 7) + 3) % 9)) && ((((((int)threadIdx.x) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 2352) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 3) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 2450)] = (((((1 <= (((((int)threadIdx.x) / 7) + 8) % 9)) && ((((((int)threadIdx.x) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 2450) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 8) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 2548)] = (((((1 <= (((((int)threadIdx.x) / 7) + 4) % 9)) && ((((((int)threadIdx.x) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 2548) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 4) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 2646)] = (((((7 <= (((int)threadIdx.x) % 63)) && ((((int)threadIdx.x) % 63) < 56)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + ((((int)threadIdx.x) / 63) * 49)) + rx_outer_outer) + (((int)threadIdx.x) % 63)) + 2050)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 2744)] = (((((1 <= (((((int)threadIdx.x) / 7) + 5) % 9)) && ((((((int)threadIdx.x) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 2744) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 5) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 2842)] = (((((1 <= (((((int)threadIdx.x) / 7) + 1) % 9)) && ((((((int)threadIdx.x) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 2842) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 1) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 2940)] = (((((1 <= (((((int)threadIdx.x) / 7) + 6) % 9)) && ((((((int)threadIdx.x) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 2940) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 6) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 3038)] = (((((1 <= (((((int)threadIdx.x) / 7) + 2) % 9)) && ((((((int)threadIdx.x) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 3038) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 2) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 3136)] = (((((1 <= (((((int)threadIdx.x) / 7) + 7) % 9)) && ((((((int)threadIdx.x) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 3136) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 7) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 3234)] = (((((1 <= (((((int)threadIdx.x) / 7) + 3) % 9)) && ((((((int)threadIdx.x) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 3234) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 3) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 3332)] = (((((1 <= (((((int)threadIdx.x) / 7) + 8) % 9)) && ((((((int)threadIdx.x) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 3332) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 8) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 3430)] = (((((1 <= (((((int)threadIdx.x) / 7) + 4) % 9)) && ((((((int)threadIdx.x) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 3430) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 4) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 3528)] = (((((7 <= (((int)threadIdx.x) % 63)) && ((((int)threadIdx.x) % 63) < 56)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + ((((int)threadIdx.x) / 63) * 49)) + rx_outer_outer) + (((int)threadIdx.x) % 63)) + 2736)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 3626)] = (((((1 <= (((((int)threadIdx.x) / 7) + 5) % 9)) && ((((((int)threadIdx.x) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 3626) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 5) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 3724)] = (((((1 <= (((((int)threadIdx.x) / 7) + 1) % 9)) && ((((((int)threadIdx.x) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 3724) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 1) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 3822)] = (((((1 <= (((((int)threadIdx.x) / 7) + 6) % 9)) && ((((((int)threadIdx.x) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 3822) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 6) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 3920)] = (((((1 <= (((((int)threadIdx.x) / 7) + 2) % 9)) && ((((((int)threadIdx.x) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 3920) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 2) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ if (((int)threadIdx.x) < 14) {
+ pad_temp_shared[(((int)threadIdx.x) + 4018)] = ((((((int)threadIdx.x) < 7) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 4018) / 63) * 49)) + rx_outer_outer) + ((int)threadIdx.x)) + 41)] : 0.000000e+00f);
+ }
+ kernel_shared[(((int)threadIdx.x) * 4)] = kernel[((((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 576)) + ((((((int)threadIdx.x) % 48) * 4) / 3) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[((((int)threadIdx.x) * 4) + 1)] = kernel[((((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) % 48) * 4) + 1) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[((((int)threadIdx.x) * 4) + 2)] = kernel[((((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) % 48) * 4) + 2) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[((((int)threadIdx.x) * 4) + 3)] = kernel[((((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) * 4) / 3) + 1) & 63) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[((((int)threadIdx.x) * 4) + 392)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 98) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) * 4) + 8) % 192) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[((((int)threadIdx.x) * 4) + 393)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 98) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) * 4) / 3) + 3) & 63) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[((((int)threadIdx.x) * 4) + 394)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 98) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) * 4) + 10) % 192) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[((((int)threadIdx.x) * 4) + 395)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 98) / 48) * 4608)) + (rc_outer_outer * 576)) + ((((((((int)threadIdx.x) * 4) + 392) / 3) + 1) & 63) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[((((int)threadIdx.x) * 4) + 784)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) * 4) + 16) % 192) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[((((int)threadIdx.x) * 4) + 785)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) * 4) + 17) % 192) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[((((int)threadIdx.x) * 4) + 786)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) * 4) / 3) + 6) & 63) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[((((int)threadIdx.x) * 4) + 787)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) / 48) * 4608)) + (rc_outer_outer * 576)) + ((((((((int)threadIdx.x) * 4) + 784) / 3) + 1) & 63) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ if (((int)threadIdx.x) < 90) {
+ kernel_shared[((((int)threadIdx.x) * 4) + 1176)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 294) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) * 4) / 3) + 8) & 63) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+ }
+ if (((int)threadIdx.x) < 90) {
+ kernel_shared[((((int)threadIdx.x) * 4) + 1177)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 294) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) * 4) + 25) % 192) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ }
+ if (((int)threadIdx.x) < 90) {
+ kernel_shared[((((int)threadIdx.x) * 4) + 1178)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 294) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) * 4) + 26) % 192) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ }
+ if (((int)threadIdx.x) < 90) {
+ kernel_shared[((((int)threadIdx.x) * 4) + 1179)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 294) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) * 4) / 3) + 9) & 63) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+ }
+ __syncthreads();
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) % 49)] * kernel_shared[((((int)threadIdx.x) / 49) * 384)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) % 49)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 768)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) % 49)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 192)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) % 49)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 960)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 7)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 7)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 769)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 7)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 193)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 7)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 961)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 14)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 2)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 14)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 770)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 14)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 194)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 14)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 962)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 63)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 3)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 63)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 771)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 63)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 195)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 63)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 963)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 70)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 4)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 70)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 772)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 70)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 196)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 70)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 964)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 77)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 5)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 77)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 773)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 77)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 197)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 77)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 965)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 126)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 6)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 126)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 774)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 126)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 198)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 126)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 966)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 133)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 7)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 133)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 775)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 133)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 199)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 133)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 967)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 140)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 8)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 140)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 776)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 140)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 200)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 140)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 968)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 189)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 9)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 189)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 777)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 189)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 201)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 189)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 969)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 196)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 10)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 196)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 778)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 196)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 202)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 196)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 970)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 203)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 11)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 203)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 779)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 203)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 203)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 203)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 971)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 252)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 12)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 252)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 780)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 252)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 204)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 252)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 972)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 259)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 13)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 259)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 781)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 259)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 205)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 259)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 973)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 266)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 14)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 266)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 782)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 266)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 206)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 266)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 974)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 315)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 15)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 315)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 783)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 315)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 207)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 315)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 975)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 322)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 16)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 322)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 784)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 322)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 208)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 322)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 976)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 329)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 17)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 329)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 785)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 329)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 209)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 329)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 977)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 378)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 18)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 378)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 786)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 378)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 210)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 378)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 978)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 385)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 19)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 385)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 787)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 385)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 211)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 385)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 979)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 392)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 20)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 392)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 788)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 392)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 212)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 392)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 980)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 441)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 21)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 441)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 789)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 441)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 213)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 441)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 981)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 448)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 22)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 448)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 790)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 448)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 214)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 448)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 982)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 455)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 23)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 455)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 791)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 455)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 215)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 455)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 983)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 504)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 24)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 504)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 792)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 504)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 216)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 504)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 984)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 511)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 25)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 511)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 793)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 511)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 217)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 511)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 985)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 518)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 26)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 518)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 794)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 518)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 218)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 518)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 986)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 567)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 27)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 567)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 795)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 567)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 219)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 567)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 987)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 574)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 28)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 574)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 796)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 574)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 220)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 574)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 988)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 581)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 29)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 581)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 797)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 581)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 221)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 581)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 989)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 630)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 30)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 630)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 798)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 630)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 222)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 630)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 990)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 637)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 31)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 637)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 799)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 637)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 223)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 637)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 991)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 644)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 32)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 644)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 800)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 644)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 224)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 644)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 992)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 693)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 33)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 693)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 801)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 693)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 225)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 693)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 993)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 700)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 34)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 700)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 802)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 700)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 226)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 700)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 994)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 707)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 35)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 707)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 803)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 707)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 227)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 707)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 995)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 756)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 36)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 756)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 804)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 756)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 228)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 756)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 996)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 763)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 37)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 763)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 805)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 763)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 229)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 763)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 997)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 770)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 38)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 770)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 806)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 770)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 230)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 770)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 998)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 819)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 39)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 819)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 807)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 819)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 231)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 819)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 999)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 826)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 40)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 826)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 808)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 826)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 232)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 826)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1000)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 833)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 41)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 833)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 809)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 833)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 233)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 833)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1001)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 882)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 42)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 882)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 810)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 882)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 234)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 882)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1002)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 889)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 43)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 889)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 811)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 889)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 235)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 889)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1003)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 896)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 44)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 896)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 812)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 896)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 236)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 896)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1004)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 945)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 45)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 945)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 813)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 945)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 237)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 945)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1005)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 952)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 46)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 952)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 814)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 952)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 238)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 952)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1006)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 959)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 47)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 959)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 815)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 959)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 239)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 959)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1007)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1008)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 48)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1008)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 816)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1008)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 240)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1008)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1008)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1015)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 49)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1015)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 817)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1015)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 241)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1015)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1009)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1022)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 50)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1022)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 818)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1022)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 242)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1022)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1010)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1071)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 51)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1071)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 819)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1071)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 243)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1071)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1011)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1078)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 52)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1078)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 820)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1078)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 244)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1078)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1012)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1085)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 53)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1085)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 821)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1085)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 245)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1085)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1013)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1134)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 54)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1134)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 822)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1134)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 246)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1134)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1014)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1141)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 55)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1141)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 823)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1141)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 247)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1141)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1015)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1148)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 56)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1148)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 824)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1148)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 248)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1148)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1016)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1197)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 57)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1197)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 825)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1197)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 249)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1197)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1017)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1204)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 58)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1204)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 826)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1204)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 250)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1204)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1018)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1211)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 59)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1211)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 827)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1211)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 251)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1211)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1019)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1260)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 60)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1260)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 828)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1260)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 252)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1260)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1020)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1267)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 61)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1267)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 829)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1267)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 253)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1267)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1021)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1274)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 62)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1274)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 830)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1274)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 254)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1274)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1022)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1323)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 63)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1323)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 831)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1323)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 255)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1323)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1023)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1330)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 64)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1330)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 832)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1330)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 256)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1330)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1024)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1337)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 65)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1337)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 833)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1337)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 257)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1337)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1025)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1386)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 66)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1386)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 834)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1386)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 258)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1386)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1026)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1393)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 67)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1393)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 835)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1393)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 259)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1393)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1027)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1400)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 68)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1400)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 836)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1400)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 260)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1400)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1028)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1449)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 69)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1449)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 837)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1449)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 261)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1449)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1029)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1456)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 70)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1456)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 838)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1456)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 262)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1456)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1030)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1463)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 71)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1463)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 839)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1463)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 263)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1463)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1031)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1512)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 72)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1512)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 840)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1512)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 264)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1512)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1032)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1519)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 73)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1519)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 841)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1519)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 265)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1519)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1033)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1526)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 74)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1526)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 842)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1526)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 266)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1526)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1034)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1575)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 75)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1575)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 843)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1575)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 267)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1575)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1035)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1582)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 76)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1582)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 844)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1582)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 268)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1582)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1036)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1589)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 77)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1589)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 845)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1589)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 269)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1589)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1037)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1638)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 78)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1638)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 846)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1638)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 270)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1638)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1038)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1645)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 79)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1645)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 847)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1645)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 271)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1645)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1039)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1652)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 80)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1652)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 848)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1652)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 272)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1652)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1040)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1701)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 81)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1701)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 849)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1701)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 273)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1701)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1041)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1708)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 82)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1708)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 850)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1708)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 274)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1708)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1042)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1715)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 83)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1715)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 851)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1715)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 275)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1715)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1043)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1764)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 84)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1764)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 852)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1764)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 276)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1764)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1044)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1771)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 85)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1771)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 853)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1771)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 277)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1771)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1045)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1778)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 86)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1778)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 854)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1778)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 278)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1778)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1046)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1827)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 87)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1827)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 855)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1827)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 279)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1827)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1047)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1834)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 88)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1834)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 856)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1834)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 280)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1834)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1048)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1841)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 89)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1841)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 857)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1841)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 281)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1841)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1049)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1890)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 90)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1890)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 858)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1890)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 282)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1890)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1050)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1897)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 91)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1897)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 859)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1897)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 283)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1897)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1051)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1904)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 92)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1904)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 860)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1904)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 284)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1904)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1052)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1953)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 93)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1953)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 861)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1953)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 285)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1953)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1053)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1960)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 94)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1960)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 862)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1960)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 286)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1960)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1054)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1967)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 95)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1967)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 863)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1967)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 287)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1967)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1055)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2016)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 96)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2016)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 864)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2016)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 288)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2016)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1056)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2023)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 97)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2023)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 865)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2023)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 289)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2023)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1057)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2030)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 98)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2030)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 866)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2030)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 290)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2030)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1058)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2079)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 99)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2079)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 867)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2079)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 291)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2079)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1059)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2086)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 100)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2086)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 868)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2086)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 292)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2086)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1060)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2093)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 101)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2093)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 869)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2093)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 293)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2093)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1061)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2142)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 102)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2142)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 870)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2142)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 294)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2142)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1062)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2149)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 103)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2149)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 871)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2149)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 295)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2149)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1063)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2156)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 104)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2156)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 872)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2156)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 296)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2156)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1064)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2205)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 105)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2205)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 873)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2205)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 297)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2205)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1065)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2212)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 106)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2212)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 874)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2212)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 298)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2212)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1066)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2219)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 107)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2219)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 875)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2219)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 299)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2219)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1067)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2268)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 108)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2268)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 876)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2268)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 300)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2268)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1068)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2275)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 109)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2275)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 877)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2275)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 301)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2275)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1069)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2282)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 110)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2282)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 878)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2282)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 302)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2282)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1070)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2331)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 111)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2331)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 879)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2331)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 303)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2331)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1071)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2338)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 112)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2338)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 880)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2338)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 304)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2338)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1072)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2345)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 113)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2345)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 881)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2345)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 305)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2345)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1073)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2394)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 114)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2394)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 882)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2394)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 306)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2394)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1074)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2401)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 115)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2401)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 883)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2401)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 307)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2401)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1075)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2408)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 116)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2408)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 884)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2408)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 308)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2408)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1076)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2457)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 117)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2457)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 885)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2457)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 309)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2457)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1077)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2464)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 118)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2464)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 886)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2464)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 310)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2464)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1078)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2471)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 119)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2471)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 887)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2471)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 311)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2471)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1079)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2520)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 120)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2520)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 888)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2520)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 312)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2520)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1080)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2527)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 121)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2527)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 889)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2527)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 313)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2527)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1081)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2534)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 122)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2534)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 890)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2534)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 314)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2534)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1082)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2583)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 123)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2583)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 891)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2583)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 315)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2583)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1083)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2590)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 124)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2590)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 892)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2590)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 316)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2590)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1084)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2597)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 125)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2597)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 893)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2597)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 317)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2597)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1085)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2646)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 126)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2646)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 894)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2646)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 318)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2646)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1086)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2653)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 127)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2653)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 895)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2653)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 319)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2653)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1087)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2660)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 128)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2660)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 896)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2660)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 320)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2660)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1088)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2709)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 129)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2709)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 897)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2709)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 321)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2709)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1089)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2716)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 130)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2716)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 898)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2716)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 322)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2716)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1090)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2723)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 131)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2723)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 899)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2723)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 323)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2723)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1091)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2772)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 132)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2772)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 900)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2772)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 324)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2772)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1092)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2779)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 133)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2779)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 901)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2779)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 325)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2779)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1093)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2786)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 134)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2786)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 902)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2786)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 326)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2786)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1094)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2835)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 135)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2835)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 903)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2835)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 327)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2835)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1095)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2842)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 136)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2842)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 904)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2842)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 328)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2842)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1096)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2849)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 137)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2849)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 905)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2849)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 329)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2849)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1097)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2898)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 138)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2898)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 906)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2898)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 330)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2898)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1098)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2905)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 139)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2905)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 907)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2905)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 331)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2905)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1099)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2912)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 140)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2912)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 908)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2912)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 332)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2912)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1100)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2961)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 141)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2961)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 909)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2961)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 333)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2961)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1101)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2968)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 142)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2968)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 910)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2968)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 334)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2968)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1102)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2975)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 143)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2975)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 911)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2975)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 335)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2975)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1103)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3024)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 144)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3024)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 912)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3024)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 336)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3024)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1104)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3031)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 145)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3031)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 913)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3031)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 337)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3031)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1105)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3038)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 146)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3038)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 914)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3038)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 338)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3038)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1106)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3087)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 147)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3087)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 915)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3087)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 339)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3087)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1107)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3094)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 148)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3094)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 916)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3094)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 340)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3094)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1108)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3101)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 149)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3101)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 917)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3101)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 341)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3101)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1109)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3150)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 150)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3150)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 918)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3150)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 342)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3150)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1110)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3157)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 151)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3157)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 919)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3157)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 343)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3157)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1111)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3164)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 152)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3164)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 920)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3164)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 344)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3164)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1112)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3213)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 153)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3213)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 921)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3213)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 345)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3213)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1113)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3220)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 154)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3220)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 922)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3220)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 346)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3220)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1114)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3227)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 155)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3227)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 923)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3227)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 347)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3227)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1115)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3276)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 156)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3276)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 924)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3276)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 348)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3276)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1116)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3283)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 157)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3283)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 925)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3283)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 349)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3283)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1117)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3290)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 158)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3290)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 926)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3290)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 350)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3290)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1118)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3339)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 159)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3339)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 927)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3339)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 351)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3339)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1119)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3346)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 160)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3346)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 928)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3346)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 352)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3346)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1120)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3353)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 161)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3353)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 929)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3353)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 353)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3353)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1121)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3402)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 162)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3402)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 930)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3402)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 354)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3402)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1122)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3409)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 163)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3409)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 931)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3409)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 355)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3409)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1123)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3416)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 164)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3416)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 932)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3416)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 356)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3416)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1124)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3465)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 165)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3465)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 933)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3465)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 357)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3465)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1125)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3472)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 166)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3472)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 934)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3472)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 358)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3472)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1126)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3479)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 167)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3479)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 935)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3479)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 359)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3479)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1127)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3528)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 168)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3528)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 936)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3528)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 360)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3528)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1128)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3535)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 169)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3535)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 937)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3535)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 361)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3535)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1129)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3542)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 170)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3542)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 938)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3542)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 362)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3542)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1130)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3591)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 171)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3591)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 939)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3591)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 363)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3591)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1131)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3598)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 172)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3598)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 940)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3598)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 364)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3598)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1132)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3605)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 173)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3605)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 941)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3605)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 365)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3605)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1133)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3654)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 174)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3654)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 942)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3654)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 366)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3654)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1134)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3661)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 175)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3661)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 943)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3661)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 367)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3661)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1135)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3668)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 176)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3668)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 944)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3668)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 368)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3668)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1136)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3717)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 177)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3717)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 945)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3717)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 369)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3717)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1137)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3724)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 178)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3724)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 946)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3724)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 370)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3724)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1138)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3731)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 179)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3731)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 947)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3731)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 371)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3731)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1139)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3780)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 180)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3780)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 948)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3780)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 372)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3780)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1140)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3787)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 181)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3787)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 949)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3787)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 373)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3787)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1141)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3794)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 182)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3794)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 950)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3794)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 374)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3794)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1142)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3843)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 183)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3843)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 951)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3843)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 375)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3843)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1143)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3850)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 184)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3850)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 952)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3850)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 376)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3850)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1144)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3857)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 185)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3857)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 953)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3857)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 377)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3857)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1145)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3906)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 186)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3906)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 954)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3906)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 378)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3906)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1146)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3913)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 187)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3913)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 955)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3913)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 379)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3913)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1147)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3920)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 188)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3920)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 956)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3920)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 380)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3920)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1148)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3969)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 189)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3969)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 957)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3969)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 381)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3969)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1149)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3976)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 190)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3976)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 958)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3976)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 382)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3976)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1150)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3983)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 191)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3983)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 959)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3983)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 383)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3983)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1151)]));
}
}
for (int i1_inner = 0; i1_inner < 2; ++i1_inner) {
- for (int i3_inner = 0; i3_inner < 7; ++i3_inner) {
- compute[(((((((int)blockIdx.x) * 784) + ((((int)threadIdx.x) / 7) * 98)) + (i1_inner * 49)) + ((((int)threadIdx.x) % 7) * 7)) + i3_inner)] = max((conv2d_nchw[((i1_inner * 7) + i3_inner)] + bias[(((((int)blockIdx.x) * 16) + ((((int)threadIdx.x) / 7) * 2)) + i1_inner)]), 0.000000e+00f);
- }
+ compute[((((((int)blockIdx.x) * 392) + ((((int)threadIdx.x) / 49) * 98)) + (i1_inner * 49)) + (((int)threadIdx.x) % 49))] = max((conv2d_nchw[i1_inner] + bias[(((((int)blockIdx.x) * 8) + ((((int)threadIdx.x) / 49) * 2)) + i1_inner)]), 0.000000e+00f);
+ compute[(((((((int)blockIdx.x) * 392) + ((((int)threadIdx.x) / 49) * 98)) + (i1_inner * 49)) + (((int)threadIdx.x) % 49)) + 196)] = max((conv2d_nchw[(i1_inner + 2)] + bias[((((((int)blockIdx.x) * 8) + ((((int)threadIdx.x) / 49) * 2)) + i1_inner) + 4)]), 0.000000e+00f);
}
}
@@ -1376,7 +2236,7 @@ In the example below we resume the status and do more 5 trials.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 2 minutes 40.059 seconds)
+ **Total running time of the script:** ( 2 minutes 54.405 seconds)
.. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
index e0aec7014..5d12ebcfe 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
@@ -647,7 +647,7 @@ so we can read the log file and load the best schedules.
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 9.7007 9.7019 9.7260 9.6743 0.0211
+ 9.7218 9.7307 9.7445 9.6903 0.0230
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
index 91bed68f7..498fd1ec8 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
@@ -666,7 +666,7 @@ so we can read the log file and load the best schedules.
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 756.0719 756.0888 756.1048 756.0220 0.0359
+ 755.4379 755.4041 756.4968 754.4129 0.8511
@@ -694,7 +694,7 @@ Other Tips
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 22.883 seconds)
+ **Total running time of the script:** ( 1 minutes 21.949 seconds)
.. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_network_x86.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
index 228161775..c16cbda18 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
@@ -397,14 +397,14 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
placeholder_4: Buffer(placeholder_14: Pointer(float32), float32, [65536], []),
compute: Buffer(compute_2: Pointer(float32), float32, [65536], [])}
buffer_map = {placeholder_5: placeholder, placeholder_6: placeholder_1, placeholder_7: placeholder_2, placeholder_8: placeholder_3, placeholder_9: placeholder_4, compute_1: compute}
- preflattened_buffer_map = {placeholder_9: placeholder_15: Buffer(placeholder_14, float32, [128, 512], []), placeholder_5: placeholder_16: Buffer(placeholder_10, float32, [128, 256], []), placeholder_7: placeholder_17: Buffer(placeholder_12, int32, [4916], []), placeholder_6: placeholder_18: Buffer(placeholder_11, float32, [4916, 16, 1], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_8: placeholder_19: Buffer(placeholder_13, int32, [33], [])} {
- for (i0.outer.i1.outer.fused: int32, 0, 128) "parallel" {
- allocate(compute_4: Pointer(global float32), float32, [512]), storage_scope = global {
- for (i.outer.inner: int32, 0, 2) {
- for (i.inner.init: int32, 0, 16) {
- let cse_var_1: int32 = ((i.outer.inner*256) + (i.inner.init*16))
+ preflattened_buffer_map = {placeholder_9: placeholder_15: Buffer(placeholder_14, float32, [128, 512], []), placeholder_7: placeholder_16: Buffer(placeholder_12, int32, [4916], []), placeholder_6: placeholder_17: Buffer(placeholder_11, float32, [4916, 16, 1], []), placeholder_5: placeholder_18: Buffer(placeholder_10, float32, [128, 256], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_8: placeholder_19: Buffer(placeholder_13, int32, [33], [])} {
+ for (i0.outer.i1.outer.fused: int32, 0, 32) "parallel" {
+ allocate(compute_4: Pointer(global float32), float32, [2048]), storage_scope = global {
+ for (i.outer.inner: int32, 0, 4) {
+ for (i.inner.init: int32, 0, 32) {
+ let cse_var_1: int32 = ((i.outer.inner*512) + (i.inner.init*16))
{
- compute_5: Buffer(compute_4, float32, [512], [])[cse_var_1] = 0f32
+ compute_5: Buffer(compute_4, float32, [2048], [])[cse_var_1] = 0f32
compute_5[(cse_var_1 + 1)] = 0f32
compute_5[(cse_var_1 + 2)] = 0f32
compute_5[(cse_var_1 + 3)] = 0f32
@@ -422,83 +422,78 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
compute_5[(cse_var_1 + 15)] = 0f32
}
}
- for (elem_idx: int32, 0, let cse_var_2: int32 = floormod(i0.outer.i1.outer.fused, 32) in (placeholder_3[(cse_var_2 + 1)] - placeholder_3[cse_var_2])) {
- for (i.inner: int32, 0, 16) {
- let cse_var_3: int32 = floormod(i0.outer.i1.outer.fused, 32)
- {
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_4: int32 = ((i.outer.inner*256) + (i.inner*16))
- compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[((placeholder_3[cse_var_3]*16) + (elem_idx*16))]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_5: int32 = (((i.outer.inner*256) + (i.inner*16)) + 1)
- compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 1)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_6: int32 = (((i.outer.inner*256) + (i.inner*16)) + 2)
- compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 2)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_7: int32 = (((i.outer.inner*256) + (i.inner*16)) + 3)
- compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 3)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_8: int32 = (((i.outer.inner*256) + (i.inner*16)) + 4)
- compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 4)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_9: int32 = (((i.outer.inner*256) + (i.inner*16)) + 5)
- compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 5)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_10: int32 = (((i.outer.inner*256) + (i.inner*16)) + 6)
- compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 6)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_11: int32 = (((i.outer.inner*256) + (i.inner*16)) + 7)
- compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 7)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_12: int32 = (((i.outer.inner*256) + (i.inner*16)) + 8)
- compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 8)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_13: int32 = (((i.outer.inner*256) + (i.inner*16)) + 9)
- compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 9)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_14: int32 = (((i.outer.inner*256) + (i.inner*16)) + 10)
- compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 10)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_15: int32 = (((i.outer.inner*256) + (i.inner*16)) + 11)
- compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 11)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_16: int32 = (((i.outer.inner*256) + (i.inner*16)) + 12)
- compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 12)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_17: int32 = (((i.outer.inner*256) + (i.inner*16)) + 13)
- compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 13)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_18: int32 = (((i.outer.inner*256) + (i.inner*16)) + 14)
- compute_5[cse_var_18] = (compute_5[cse_var_18] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 14)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_19: int32 = (((i.outer.inner*256) + (i.inner*16)) + 15)
- compute_5[cse_var_19] = (compute_5[cse_var_19] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 15)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
+ for (elem_idx: int32, 0, (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])) {
+ for (i.inner: int32, 0, 32) {
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_2: int32 = ((i.outer.inner*512) + (i.inner*16))
+ compute_5[cse_var_2] = (compute_5[cse_var_2] + (placeholder_1[((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16))]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_3: int32 = (((i.outer.inner*512) + (i.inner*16)) + 1)
+ compute_5[cse_var_3] = (compute_5[cse_var_3] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 1)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_4: int32 = (((i.outer.inner*512) + (i.inner*16)) + 2)
+ compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 2)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_5: int32 = (((i.outer.inner*512) + (i.inner*16)) + 3)
+ compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 3)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_6: int32 = (((i.outer.inner*512) + (i.inner*16)) + 4)
+ compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 4)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_7: int32 = (((i.outer.inner*512) + (i.inner*16)) + 5)
+ compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 5)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_8: int32 = (((i.outer.inner*512) + (i.inner*16)) + 6)
+ compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 6)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_9: int32 = (((i.outer.inner*512) + (i.inner*16)) + 7)
+ compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 7)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_10: int32 = (((i.outer.inner*512) + (i.inner*16)) + 8)
+ compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 8)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_11: int32 = (((i.outer.inner*512) + (i.inner*16)) + 9)
+ compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 9)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_12: int32 = (((i.outer.inner*512) + (i.inner*16)) + 10)
+ compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 10)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_13: int32 = (((i.outer.inner*512) + (i.inner*16)) + 11)
+ compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 11)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_14: int32 = (((i.outer.inner*512) + (i.inner*16)) + 12)
+ compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 12)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_15: int32 = (((i.outer.inner*512) + (i.inner*16)) + 13)
+ compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 13)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_16: int32 = (((i.outer.inner*512) + (i.inner*16)) + 14)
+ compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 14)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_17: int32 = (((i.outer.inner*512) + (i.inner*16)) + 15)
+ compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 15)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
}
}
}
}
- for (i0.inner: int32, 0, 32) {
- for (i1.inner: int32, 0, 16) {
- let cse_var_20: int32 = ((((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 32)*16)) + i1.inner)
- compute[cse_var_20] = max((compute_5[((i0.inner*16) + i1.inner)] + placeholder_4[cse_var_20]), 0f32)
- }
+ for (i0.inner: int32, 0, 128) {
+ let cse_var_18: int32 = ((i0.inner*512) + (i0.outer.i1.outer.fused*16))
+ compute[ramp(cse_var_18, 1, 16)] = max((compute_5[ramp((i0.inner*16), 1, 16)] + placeholder_4[ramp(cse_var_18, 1, 16)]), broadcast(0f32, 16))
}
}
}
@@ -554,7 +549,7 @@ We build the binary and check its correctness and performance.
.. code-block:: none
- Execution time of this operator: 1.697 ms
+ Execution time of this operator: 1.699 ms
diff --git a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
index bc322f426..e46450f41 100644
--- a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
Computation times
=================
-**00:43.722** total execution time for **how_to_tune_with_autotvm** files:
+**00:44.500** total execution time for **how_to_tune_with_autotvm** files:
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``) | 00:43.691 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``) | 00:44.465 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``) | 00:00.016 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``) | 00:00.020 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_cuda.py` (``tune_relay_cuda.py``) | 00:00.005 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
index cb4da61af..082a0eee5 100644
--- a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
@@ -892,8 +892,8 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 4, 32]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2885496
- No: 6 GFLOPS: 103.61/103.61 result: MeasureResult(costs=(0.002234297895833333,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.6444010734558105, timestamp=1657174711.450943) [('tile_f', [-1, 1, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3754080
- No: 7 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+ No: 6 GFLOPS: 93.95/93.95 result: MeasureResult(costs=(0.002464068875,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.8310317993164062, timestamp=1657179140.2786798) [('tile_f', [-1, 1, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3754080
+ No: 7 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1016,7 +1016,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 16, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 256, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6225319
- No: 8 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+ No: 8 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1139,7 +1139,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 1, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 8, 64]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,943546
- No: 9 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+ No: 9 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1262,7 +1262,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 16, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2868708
- No: 10 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+ No: 10 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 142, in build
res = future.result()
File "/usr/lib/python3.7/concurrent/futures/_base.py", line 435, in result
@@ -1280,7 +1280,7 @@ for this template
TimeoutError
[('tile_f', [-1, 32, 2, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4691833
- No: 11 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+ No: 11 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1403,7 +1403,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 2, 64]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1042124
- No: 12 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+ No: 12 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1526,7 +1526,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 32, 1, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10013405
- No: 13 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+ No: 13 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1649,7 +1649,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 8, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 32]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6732082
- No: 14 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+ No: 14 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1772,7 +1772,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 4, 32]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7536735
- No: 15 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+ No: 15 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1895,7 +1895,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 1, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 128, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,482121
- No: 16 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+ No: 16 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2018,7 +2018,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 1, 16]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 32, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2824525
- No: 17 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+ No: 17 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2141,7 +2141,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 64, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4559286
- No: 18 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+ No: 18 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2264,7 +2264,7 @@ for this template
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 32, 16]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 512]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9677544
- No: 19 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+ No: 19 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 738, in __call__
yield remote, remote.load_module(os.path.split(build_result.filename)[1])
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 702, in run_through_rpc
@@ -2352,7 +2352,7 @@ for this template
15: _PyEval_EvalFrameDefault
14: 0x0000000000537c30
13: _PyObject_FastCallKeywords
- 12: 0x00007f2274af7fa2
+ 12: 0x00007f5e8b033fa2
11: _ctypes_callproc
10: ffi_call
9: ffi_call_unix64
@@ -2417,7 +2417,7 @@ for this template
21: _PyFunction_FastCallKeywords
20: _PyEval_EvalFrameDefault
19: _PyFunction_FastCall [('tile_f', [-1, 8, 2, 16]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6390073
- No: 20 GFLOPS: 144.75/144.75 result: MeasureResult(costs=(0.0015993116699999999,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.454596996307373, timestamp=1657174738.1003814) [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
+ No: 20 GFLOPS: 144.09/144.09 result: MeasureResult(costs=(0.00160666181,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4486920833587646, timestamp=1657179166.8123953) [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
@@ -2474,7 +2474,7 @@ and measure running time.
Best config:
[('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
Finish loading 20 records
- Time cost of this operator: 0.002037
+ Time cost of this operator: 0.002029
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
index c6d471827..d2381c58a 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
@@ -329,10 +329,10 @@ Timing the untuned program
########## Build without Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs Measurements(us)
--------- --- -------- ------- ----- ------ ------- ----------------
- tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 311.7 98.705 (1, 2, 10, 10, 3) 2 1 [311.7]
- tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.125 0.99 (1, 6, 10, 10) 1 1 [3.125]
- tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.964 0.305 (1, 1, 10, 10, 3) 1 1 [0.964]
- Total_time - 315.789 - - - - -
+ tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 310.8 98.713 (1, 2, 10, 10, 3) 2 1 [310.8]
+ tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.094 0.983 (1, 6, 10, 10) 1 1 [3.094]
+ tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.959 0.304 (1, 1, 10, 10, 3) 1 1 [0.959]
+ Total_time - 314.852 - - - - -
@@ -398,10 +398,10 @@ Timing the tuned program
########## Build with Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs Measurements(us)
--------- --- -------- ------- ----- ------ ------- ----------------
- tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 88.0 96.911 (1, 6, 10, 10, 1) 2 1 [88.0]
- tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.845 2.032 (1, 6, 10, 10) 1 1 [1.845]
- tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.96 1.057 (1, 1, 10, 10, 3) 1 1 [0.96]
- Total_time - 90.805 - - - - -
+ tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 133.7 97.938 (1, 6, 10, 10, 1) 2 1 [133.7]
+ tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.818 1.332 (1, 6, 10, 10) 1 1 [1.818]
+ tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.996 0.73 (1, 1, 10, 10, 3) 1 1 [0.996]
+ Total_time - 136.514 - - - - -
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
index 32147d8aa..50ddb4156 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
@@ -225,7 +225,7 @@ take about **2 minutes** to download the Stanford Cars, while COCO 2017 validati
.. code-block:: none
- '/tmp/tmpno5zwqco/images/random'
+ '/tmp/tmprszi7_mv/images/random'
@@ -325,8 +325,8 @@ objects to other stuff? We can display some examples from our datasets using ``m
.. code-block:: none
- /tmp/tmpno5zwqco/images/target contains 8144 images
- /tmp/tmpno5zwqco/images/random contains 5000 images
+ /tmp/tmprszi7_mv/images/target contains 8144 images
+ /tmp/tmprszi7_mv/images/random contains 5000 images
@@ -501,13 +501,13 @@ the time on our validation set).
.. code-block:: none
Epoch 1/3
- 328/328 - 56s - loss: 0.2213 - accuracy: 0.9240 - val_loss: 0.1338 - val_accuracy: 0.9562
+ 328/328 - 56s - loss: 0.2335 - accuracy: 0.9211 - val_loss: 0.1229 - val_accuracy: 0.9603
Epoch 2/3
- 328/328 - 53s - loss: 0.0986 - accuracy: 0.9627 - val_loss: 0.1379 - val_accuracy: 0.9543
+ 328/328 - 53s - loss: 0.0972 - accuracy: 0.9621 - val_loss: 0.1103 - val_accuracy: 0.9630
Epoch 3/3
- 328/328 - 53s - loss: 0.0656 - accuracy: 0.9755 - val_loss: 0.1003 - val_accuracy: 0.9653
+ 328/328 - 52s - loss: 0.0682 - accuracy: 0.9757 - val_loss: 0.1070 - val_accuracy: 0.9694
- <keras.callbacks.History object at 0x7f5b41476610>
+ <keras.callbacks.History object at 0x7f2b7a78bf90>
@@ -864,7 +864,7 @@ Arduino tutorial for how to do that `on GitHub <https://github.com/guberti/tvm-a
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 5 minutes 0.282 seconds)
+ **Total running time of the script:** ( 5 minutes 15.759 seconds)
.. _sphx_glr_download_how_to_work_with_microtvm_micro_train.py:
diff --git a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
index ee5233553..6df6e9f35 100644
--- a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
Computation times
=================
-**05:49.383** total execution time for **how_to_work_with_microtvm** files:
+**06:01.915** total execution time for **how_to_work_with_microtvm** files:
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``) | 05:00.282 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``) | 05:15.759 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``) | 00:45.588 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``) | 00:42.880 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``) | 00:03.510 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``) | 00:03.274 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_microtvm_micro_ethosu.py` (``micro_ethosu.py``) | 00:00.001 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
index 7d5e593bf..423d3f955 100644
--- a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
Computation times
=================
-**00:11.408** total execution time for **how_to_work_with_relay** files:
+**00:11.310** total execution time for **how_to_work_with_relay** files:
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``) | 00:09.906 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``) | 00:09.919 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``) | 00:01.496 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``) | 00:01.385 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_relay_using_relay_viz.py` (``using_relay_viz.py``) | 00:00.006 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
index b7f5eb2e2..2f4cfedaa 100644
--- a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
@@ -261,7 +261,7 @@ The following example customizes CUDA lowering rule for :code:`exp`.
.. code-block:: none
- <function my_cuda_math_rule at 0x7f5aba38c4d0>
+ <function my_cuda_math_rule at 0x7f2acde41680>
diff --git a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
index 72a0f37da..7b0831d60 100644
--- a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
@@ -5,22 +5,22 @@
Computation times
=================
-**00:04.025** total execution time for **how_to_work_with_schedules** files:
+**00:04.149** total execution time for **how_to_work_with_schedules** files:
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``) | 00:01.882 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``) | 00:01.955 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``) | 00:00.929 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``) | 00:00.908 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``) | 00:00.525 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``) | 00:00.566 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``) | 00:00.513 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``) | 00:00.540 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``) | 00:00.101 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``) | 00:00.099 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_schedule_primitives.py` (``schedule_primitives.py``) | 00:00.035 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_schedule_primitives.py` (``schedule_primitives.py``) | 00:00.036 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``) | 00:00.027 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``) | 00:00.030 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tuple_inputs.py` (``tuple_inputs.py``) | 00:00.014 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tuple_inputs.py` (``tuple_inputs.py``) | 00:00.015 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
index b663f0954..7ffc3bed2 100644
--- a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
@@ -347,7 +347,7 @@ The importing needs to happen before the tensorized GEMV being executed.
C: Buffer(C_2: Pointer(float32), float32, [524288], [])}
buffer_map = {A_1: A, B_1: B, C_1: C}
preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [1024, 64], []), B_1: B_3: Buffer(B_2, float32, [512, 64], []), C_1: C_3: Buffer(C_2, float32, [1024, 512], [])} {
- attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpgatjuc7x/input0.cc'\nsource_filename = \"/tmp/tmpgatjuc7x/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = alloca float*, align 8\n %8 = alloca float*, align 8\n %9 = alloca floa [...]
+ attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpg1mz_rs_/input0.cc'\nsource_filename = \"/tmp/tmpg1mz_rs_/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = alloca float*, align 8\n %8 = alloca float*, align 8\n %9 = alloca floa [...]
for (i, 0, 1024) {
for (j.outer: int32, 0, 32) {
@tir.call_extern("gemv_update", @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), C_2, ((i*512) + (j.outer*16)), 16, 2, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), A_2, (i*64), 64, 1, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), B_2, (j.outer*1024), 1024, 1, dtype=handle), 16, 64, 64, dtype=int32)
diff --git a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
index 94b74de91..1a2881e33 100644
--- a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**00:21.699** total execution time for **topic_vta_tutorials_autotvm** files:
+**00:21.165** total execution time for **topic_vta_tutorials_autotvm** files:
+---------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:21.692 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:21.158 | 0.0 MB |
+---------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_alu_vta.py` (``tune_alu_vta.py``) | 00:00.007 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_alu_vta.py` (``tune_alu_vta.py``) | 00:00.006 | 0.0 MB |
+---------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
index 38228f3f6..823b39539 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
@@ -291,7 +291,7 @@ The compilation steps are:
DeprecationWarning,
/workspace/vta/tutorials/frontend/deploy_classification.py:213: DeprecationWarning: legacy graph executor behavior of producing json / lib / params will be removed in the next release. Please see documents of tvm.contrib.graph_executor.GraphModule for the new recommended usage.
relay_prog, target=tvm.target.Target(target, host=env.target_host), params=params
- resnet18_v1 inference graph built in 23.84s!
+ resnet18_v1 inference graph built in 23.35s!
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
index 82014505a..bb66ed129 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
@@ -335,7 +335,7 @@ The compilation steps are:
"target_host parameter is going to be deprecated. "
/workspace/python/tvm/relay/build_module.py:411: DeprecationWarning: Please use input parameter mod (tvm.IRModule) instead of deprecated parameter mod (tvm.relay.function.Function)
DeprecationWarning,
- yolov3-tiny inference graph built in 16.79s!
+ yolov3-tiny inference graph built in 15.37s!
diff --git a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
index 4beed8fec..36d3d8930 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**01:31.991** total execution time for **topic_vta_tutorials_frontend** files:
+**01:31.940** total execution time for **topic_vta_tutorials_frontend** files:
+------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``) | 00:48.240 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``) | 00:48.371 | 0.0 MB |
+------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:43.751 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:43.569 | 0.0 MB |
+------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
index 0d1ab57ed..bec12741b 100644
--- a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**00:03.238** total execution time for **topic_vta_tutorials_optimize** files:
+**00:03.420** total execution time for **topic_vta_tutorials_optimize** files:
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``) | 00:02.843 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``) | 00:03.032 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.395 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.388 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
index 76c0ff228..7956d09c1 100644
--- a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**00:00.728** total execution time for **topic_vta_tutorials** files:
+**00:00.691** total execution time for **topic_vta_tutorials** files:
+---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.394 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.370 | 0.0 MB |
+---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.334 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.320 | 0.0 MB |
+---------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
index ef92afa1c..950450193 100644
--- a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
@@ -335,7 +335,7 @@ We build the binary and check its correctness and performance.
.. code-block:: none
- Execution time of this operator: 93.831 ms
+ Execution time of this operator: 93.286 ms
diff --git a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
index 4dda2971c..dd556a5e0 100644
--- a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
@@ -462,16 +462,16 @@ reduce variance, we take 5 measurements and average them.
waiting for device...
device available
Get devices for measurement successfully!
- No: 1 GFLOPS: 10.78/10.78 result: MeasureResult(costs=(0.024891427799999997,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5347135066986084, timestamp=1657173554.660016) [('tile_y', [-1, 1]), ('tile_x', [-1, 256])],None,80
- No: 2 GFLOPS: 2.96/10.78 result: MeasureResult(costs=(0.09058258799999999,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.5823323726654053, timestamp=1657173556.2725492) [('tile_y', [-1, 4]), ('tile_x', [-1, 8])],None,32
- No: 3 GFLOPS: 11.80/11.80 result: MeasureResult(costs=(0.0227486266,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5608701705932617, timestamp=1657173557.3425524) [('tile_y', [-1, 64]), ('tile_x', [-1, 32])],None,56
- No: 4 GFLOPS: 1.90/11.80 result: MeasureResult(costs=(0.1416140796,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.391812324523926, timestamp=1657173560.316899) [('tile_y', [-1, 1]), ('tile_x', [-1, 4])],None,20
- No: 5 GFLOPS: 3.63/11.80 result: MeasureResult(costs=(0.0739070932,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.320406436920166, timestamp=1657173561.7683496) [('tile_y', [-1, 256]), ('tile_x', [-1, 16])],None,48
- No: 6 GFLOPS: 1.75/11.80 result: MeasureResult(costs=(0.1529638476,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.619828939437866, timestamp=1657173564.4328322) [('tile_y', [-1, 512]), ('tile_x', [-1, 4])],None,29
- No: 7 GFLOPS: 0.88/11.80 result: MeasureResult(costs=(0.3066724936,), error_no=MeasureErrorNo.NO_ERROR, all_cost=5.0241453647613525, timestamp=1657173570.0457098) [('tile_y', [-1, 512]), ('tile_x', [-1, 2])],None,19
- No: 8 GFLOPS: 10.77/11.80 result: MeasureResult(costs=(0.024920171799999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5401318073272705, timestamp=1657173570.6089115) [('tile_y', [-1, 4]), ('tile_x', [-1, 64])],None,62
- No: 9 GFLOPS: 1.84/11.80 result: MeasureResult(costs=(0.1456923304,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.449063777923584, timestamp=1657173573.1786609) [('tile_y', [-1, 2]), ('tile_x', [-1, 2])],None,11
- No: 10 GFLOPS: 2.73/11.80 result: MeasureResult(costs=(0.09849079000000001,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.7016336917877197, timestamp=1657173574.9182162) [('tile_y', [-1, 4]), ('tile_x', [-1, 4])],None,22
+ No: 1 GFLOPS: 9.83/9.83 result: MeasureResult(costs=(0.0273207268,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5695722103118896, timestamp=1657177973.0337226) [('tile_y', [-1, 1]), ('tile_x', [-1, 256])],None,80
+ No: 2 GFLOPS: 2.46/9.83 result: MeasureResult(costs=(0.1090980332,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.894110918045044, timestamp=1657177974.9440458) [('tile_y', [-1, 4]), ('tile_x', [-1, 8])],None,32
+ No: 3 GFLOPS: 11.76/11.76 result: MeasureResult(costs=(0.0228197492,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.5580134391784668, timestamp=1657177975.9997888) [('tile_y', [-1, 64]), ('tile_x', [-1, 32])],None,56
+ No: 4 GFLOPS: 1.85/11.76 result: MeasureResult(costs=(0.144777739,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.441532611846924, timestamp=1657177979.0086226) [('tile_y', [-1, 1]), ('tile_x', [-1, 4])],None,20
+ No: 5 GFLOPS: 3.65/11.76 result: MeasureResult(costs=(0.0734487292,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.318424940109253, timestamp=1657177980.4598012) [('tile_y', [-1, 256]), ('tile_x', [-1, 16])],None,48
+ No: 6 GFLOPS: 1.77/11.76 result: MeasureResult(costs=(0.1518345532,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.5892674922943115, timestamp=1657177983.0920568) [('tile_y', [-1, 512]), ('tile_x', [-1, 4])],None,29
+ No: 7 GFLOPS: 0.87/11.76 result: MeasureResult(costs=(0.3070881112,), error_no=MeasureErrorNo.NO_ERROR, all_cost=5.034933805465698, timestamp=1657177988.6950371) [('tile_y', [-1, 512]), ('tile_x', [-1, 2])],None,19
+ No: 8 GFLOPS: 10.33/11.76 result: MeasureResult(costs=(0.025982635599999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.562802791595459, timestamp=1657177989.2742012) [('tile_y', [-1, 4]), ('tile_x', [-1, 64])],None,62
+ No: 9 GFLOPS: 1.77/11.76 result: MeasureResult(costs=(0.15163598080000001,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.5215632915496826, timestamp=1657177991.9121811) [('tile_y', [-1, 2]), ('tile_x', [-1, 2])],None,11
+ No: 10 GFLOPS: 2.26/11.76 result: MeasureResult(costs=(0.11866880860000002,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.0046634674072266, timestamp=1657177993.976864) [('tile_y', [-1, 4]), ('tile_x', [-1, 4])],None,22
diff --git a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
index 0dab7e528..50946d6fa 100644
--- a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
@@ -327,7 +327,7 @@ standard deviation.
.. code-block:: none
- {'mean': 500.61246781000136, 'median': 500.6702233999988, 'std': 0.3408716062195281}
+ {'mean': 492.3690949199863, 'median': 492.15409664998333, 'std': 0.9317648306325024}
@@ -563,31 +563,31 @@ the tuning data to.
/workspace/python/tvm/driver/build_module.py:268: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
"target_host parameter is going to be deprecated. "
-
[Task 1/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 1/25] Current/Best: 17.36/ 17.36 GFLOPS | Progress: (4/20) | 6.46 s
[Task 1/25] Current/Best: 6.15/ 17.36 GFLOPS | Progress: (8/20) | 9.53 s
[Task 1/25] Current/Best: 11.51/ 22.73 GFLOPS | Progress: (12/20) | 12.02 s
[Task 1/25] Current/Best: 16.69/ 22.73 GFLOPS | Progress: (16/20) | 13.72 s
[Task 1/25] Current/Best: 11.58/ 23.55 GFLOPS | Progress: (20/20) | 15.48 s Done.
-
[Task 2/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 2/25] Current/Best: 11.85/ 12.72 GFLOPS | Progress: (4/20) | 3.99 s
[Task 2/25] Current/Best: 14.08/ 18.61 GFLOPS | Progress: (8/20) | 5.30 s
[Task 2/25] Current/Best: 20.87/ 20.87 GFLOPS | Progress: (12/20) | 6.63 s
[Task 2/25] Current/Best: 11.54/ 20.87 GFLOPS | Progress: (16/20) | 7.93 s
[Task 2/25] Current/Best: 19.16/ 20.87 GFLOPS | Progress: (20/20) | 9.57 s Done.
-
[Task 3/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 3/25] Current/Best: 1.63/ 10.56 GFLOPS | Progress: (4/20) | 5.93 s
[Task 3/25] Current/Best: 15.50/ 16.91 GFLOPS | Progress: (8/20) | 7.87 s
[Task 3/25] Current/Best: 14.80/ 16.91 GFLOPS | Progress: (12/20) | 9.60 s
[Task 3/25] Current/Best: 7.16/ 23.76 GFLOPS | Progress: (16/20) | 11.52 s
[Task 3/25] Current/Best: 12.47/ 23.76 GFLOPS | Progress: (20/20) | 16.11 s Done.
-
[Task 4/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 4/25] Current/Best: 9.65/ 20.38 GFLOPS | Progress: (4/20) | 2.43 s
[Task 4/25] Current/Best: 6.70/ 20.38 GFLOPS | Progress: (8/20) | 7.22 s
[Task 4/25] Current/Best: 21.09/ 21.09 GFLOPS | Progress: (12/20) | 12.14 s
[Task 4/25] Current/Best: 16.77/ 21.09 GFLOPS | Progress: (16/20) | 14.56 s
[Task 4/25] Current/Best: 13.10/ 21.09 GFLOPS | Progress: (20/20) | 16.55 s Done.
-
[Task 5/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 5/25] Current/Best: 9.75/ 10.19 GFLOPS | Progress: (4/20) | 2.65 s
[Task 5/25] Current/Best: 11.80/ 13.18 GFLOPS | Progress: (8/20) | 4.70 s
[Task 5/25] Current/Best: 9.55/ 17.83 GFLOPS | Progress: (12/20) | 7.94 s
[Task 5/25] Current/Best: 11.72/ 22.21 GFLOPS | Progress: (16/20) | 9.37 s
[Task 5/25] Current/Best: 11.71/ 22.21 GFLOPS | Progress: (20/20) | 11.40 s Done.
-
[Task 6/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 6/25] Current/Best: 12.27/ 20.75 GFLOPS | Progress: (4/20) | 4.14 s
[Task 6/25] Current/Best: 18.97/ 20.75 GFLOPS | Progress: (8/20) | 5.89 s
[Task 6/25] Current/Best: 13.10/ 20.75 GFLOPS | Progress: (12/20) | 7.84 s
[Task 6/25] Current/Best: 19.71/ 20.75 GFLOPS | Progress: (16/20) | 10.10 s
[Task 6/25] Current/Best: 3.76/ 20.75 GFLOPS | Progress: (20/20) | 12.61 s Done.
-
[Task 7/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 7/25] Current/Best: 11.08/ 12.72 GFLOPS | Progress: (4/20) | 3.71 s
[Task 7/25] Current/Best: 20.08/ 21.09 GFLOPS | Progress: (8/20) | 5.24 s
[Task 7/25] Current/Best: 15.59/ 21.09 GFLOPS | Progress: (12/20) | 7.18 s
[Task 7/25] Current/Best: 12.26/ 21.09 GFLOPS | Progress: (16/20) | 9.23 s
[Task 7/25] Current/Best: 6.33/ 21.58 GFLOPS | Progress: (20/20) | 11.70 s Done.
-
[Task 8/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 8/25] Current/Best: 10.47/ 14.44 GFLOPS | Progress: (4/20) | 2.95 s
[Task 8/25] Current/Best: 10.21/ 14.44 GFLOPS | Progress: (8/20) | 8.09 s
[Task 8/25] Current/Best: 13.23/ 14.44 GFLOPS | Progress: (12/20) | 14.63 s
[Task 8/25] Current/Best: 18.99/ 18.99 GFLOPS | Progress: (16/20) | 16.72 s
[Task 8/25] Current/Best: 20.40/ 20.40 GFLOPS | Progress: (20/20) | 23.82 s Done.
-
[Task 9/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 9/25] Current/Best: 14.28/ 15.72 GFLOPS | Progress: (4/20) | 11.99 s
[Task 9/25] Current/Best: 22.76/ 22.76 GFLOPS | Progress: (8/20) | 13.78 s
[Task 9/25] Current/Best: 8.25/ 22.76 GFLOPS | Progress: (12/20) | 16.32 s
[Task 9/25] Current/Best: 17.77/ 22.76 GFLOPS | Progress: (16/20) | 19.20 s
[Task 9/25] Current/Best: 8.95/ 22.76 GFLOPS | Progress: (20/20) | 27.91 s
[Task 10/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 10/25] Current/Best: 18.45/ 18.45 GFLOPS | Progress: (4/20) | 2.58 s
[Task 10/25] Current/Best: 15.43/ 18.45 GFLOPS | Progress: (8/20) | 4.21 s
[Task 10/25] Current/Best: 12.67/ 19.05 GFLOPS | Progress: (12/20) | 5.77 s
[Task 10/25] Current/Best: 19.15/ 20.37 GFLOPS | Progress: (16/20) | 6.88 s
[Task 10/25] Current/Best: 8.99/ 20.37 GFLOPS | Progress: (20/20
) | 8.41 s Done.
-
[Task 11/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 11/25] Current/Best: 12.31/ 18.05 GFLOPS | Progress: (4/20) | 3.47 s
[Task 11/25] Current/Best: 16.86/ 18.05 GFLOPS | Progress: (8/20) | 6.32 s
[Task 11/25] Current/Best: 17.96/ 18.05 GFLOPS | Progress: (12/20) | 8.38 s
[Task 11/25] Current/Best: 13.47/ 21.07 GFLOPS | Progress: (16/20) | 11.35 s
[Task 11/25] Current/Best: 19.37/ 21.55 GFLOPS | Progress: (20/20) | 13.45 s Done.
-
[Task 12/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 12/25] Current/Best: 7.77/ 18.26 GFLOPS | Progress: (4/20) | 5.75 s
[Task 12/25] Current/Best: 5.30/ 18.26 GFLOPS | Progress: (8/20) | 9.70 s
[Task 12/25] Current/Best: 19.28/ 19.28 GFLOPS | Progress: (12/20) | 11.67 s
[Task 12/25] Current/Best: 12.52/ 19.28 GFLOPS | Progress: (16/20) | 14.65 s
[Task 12/25] Current/Best: 15.14/ 19.28 GFLOPS | Progress: (20/20) | 16.61 s Done.
-
[Task 13/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 13/25] Current/Best: 8.87/ 17.34 GFLOPS | Progress: (4/20) | 3.85 s
[Task 13/25] Current/Best: 16.03/ 20.75 GFLOPS | Progress: (8/20) | 6.48 s
[Task 13/25] Current/Best: 19.46/ 21.41 GFLOPS | Progress: (12/20) | 9.62 s
[Task 13/25] Current/Best: 12.21/ 21.41 GFLOPS | Progress: (16/20) | 13.08 s
[Task 13/25] Current/Best: 18.76/ 21.41 GFLOPS | Progress: (20/20) | 15.37 s Done.
-
[Task 14/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 14/25] Current/Best: 13.60/ 13.60 GFLOPS | Progress: (4/20) | 3.37 s
[Task 14/25] Current/Best: 6.10/ 13.60 GFLOPS | Progress: (8/20) | 5.55 s
[Task 14/25] Current/Best: 21.02/ 21.02 GFLOPS | Progress: (12/20) | 8.22 s
[Task 14/25] Current/Best: 16.84/ 21.02 GFLOPS | Progress: (16/20) | 9.91 s Done.
-
[Task 14/25] Current/Best: 17.18/ 21.02 GFLOPS | Progress: (20/20) | 11.66 s
[Task 15/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 15/25] Current/Best: 16.06/ 17.56 GFLOPS | Progress: (4/20) | 2.84 s
[Task 15/25] Current/Best: 14.23/ 17.85 GFLOPS | Progress: (8/20) | 4.15 s
[Task 15/25] Current/Best: 10.33/ 22.23 GFLOPS | Progress: (12/20) | 6.39 s
[Task 15/25] Current/Best: 20.27/ 22.23 GFLOPS | Progress: (16/20) | 9.82 s
[Task 15/25] Current/Best: 9.49/ 22.23 GFLOPS | Progress: (20/20) | 10.86 s
[Task 16/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 16/25] Current/Best: 20.46/ 20.46 GFLOPS | Progress: (4/20) | 3.21 s
[Task 16/25] Current/Best: 3.02/ 20.46 GFLOPS | Progress: (8/20) | 4.84 s
[Task 16/25] Current/Best: 19.30/ 20.46 GFLOPS | Progress: (12/20) | 6.07 s
[Task 16/25] Current/Best: 17.92/ 20.46 GFLOPS | Progress: (16/20) |
7.48 s
[Task 16/25] Current/Best: 9.94/ 22.23 GFLOPS | Progress: (20/20) | 9.65 s Done.
-
[Task 17/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 17/25] Current/Best: 14.03/ 18.79 GFLOPS | Progress: (4/20) | 4.88 s
[Task 17/25] Current/Best: 14.49/ 22.51 GFLOPS | Progress: (8/20) | 7.85 s
[Task 17/25] Current/Best: 16.91/ 22.51 GFLOPS | Progress: (12/20) | 9.91 s
[Task 17/25] Current/Best: 16.49/ 22.51 GFLOPS | Progress: (16/20) | 12.12 s
[Task 17/25] Current/Best: 10.02/ 22.51 GFLOPS | Progress: (20/20) | 14.33 s Done.
-
[Task 18/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 18/25] Current/Best: 11.25/ 17.85 GFLOPS | Progress: (4/20) | 3.87 s
[Task 18/25] Current/Best: 10.55/ 19.45 GFLOPS | Progress: (8/20) | 7.64 s
[Task 18/25] Current/Best: 19.20/ 19.45 GFLOPS | Progress: (12/20) | 9.58 s
[Task 18/25] Current/Best: 9.84/ 19.45 GFLOPS | Progress: (16/20) | 13.49 s
[Task 18/25] Current/Best: 20.65/ 20.65 GFLOPS | Progress: (20/20) | 15.03 s Done.
-
[Task 19/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 19/25] Current/Best: 6.40/ 20.06 GFLOPS | Progress: (4/20) | 6.44 s
[Task 19/25] Current/Best: 2.60/ 20.06 GFLOPS | Progress: (8/20) | 9.79 s
[Task 19/25] Current/Best: 18.76/ 20.66 GFLOPS | Progress: (12/20) | 12.74 s
[Task 19/25] Current/Best: 13.91/ 20.66 GFLOPS | Progress: (16/20) | 15.75 s
[Task 19/25] Current/Best: 2.70/ 23.06 GFLOPS | Progress: (20/20) | 18.57 s Done.
-
[Task 20/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 20/25] Current/Best: 9.16/ 14.99 GFLOPS | Progress: (4/20) | 3.46 s Done.
+
[Task 1/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 1/25] Current/Best: 17.55/ 17.55 GFLOPS | Progress: (4/20) | 6.30 s
[Task 1/25] Current/Best: 6.15/ 17.55 GFLOPS | Progress: (8/20) | 9.20 s
[Task 1/25] Current/Best: 11.58/ 22.84 GFLOPS | Progress: (12/20) | 11.65 s
[Task 1/25] Current/Best: 16.83/ 22.84 GFLOPS | Progress: (16/20) | 13.34 s
[Task 1/25] Current/Best: 11.63/ 23.91 GFLOPS | Progress: (20/20) | 15.07 s Done.
+
[Task 2/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 2/25] Current/Best: 12.20/ 13.05 GFLOPS | Progress: (4/20) | 3.92 s
[Task 2/25] Current/Best: 14.15/ 18.71 GFLOPS | Progress: (8/20) | 5.22 s
[Task 2/25] Current/Best: 21.03/ 21.03 GFLOPS | Progress: (12/20) | 6.54 s
[Task 2/25] Current/Best: 11.76/ 21.03 GFLOPS | Progress: (16/20) | 7.79 s
[Task 2/25] Current/Best: 18.66/ 21.03 GFLOPS | Progress: (20/20) | 9.42 s Done.
+
[Task 3/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 3/25] Current/Best: 1.63/ 10.60 GFLOPS | Progress: (4/20) | 5.86 s
[Task 3/25] Current/Best: 15.34/ 16.89 GFLOPS | Progress: (8/20) | 7.80 s
[Task 3/25] Current/Best: 14.95/ 16.89 GFLOPS | Progress: (12/20) | 9.51 s
[Task 3/25] Current/Best: 7.17/ 23.79 GFLOPS | Progress: (16/20) | 11.42 s
[Task 3/25] Current/Best: 12.67/ 23.79 GFLOPS | Progress: (20/20) | 16.00 s Done.
+
[Task 4/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 4/25] Current/Best: 9.51/ 19.36 GFLOPS | Progress: (4/20) | 2.39 s
[Task 4/25] Current/Best: 6.79/ 19.36 GFLOPS | Progress: (8/20) | 7.12 s
[Task 4/25] Current/Best: 21.86/ 21.86 GFLOPS | Progress: (12/20) | 12.05 s
[Task 4/25] Current/Best: 16.73/ 21.86 GFLOPS | Progress: (16/20) | 14.42 s
[Task 4/25] Current/Best: 13.37/ 21.86 GFLOPS | Progress: (20/20) | 16.48 s Done.
+
[Task 5/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 5/25] Current/Best: 9.42/ 10.18 GFLOPS | Progress: (4/20) | 2.58 s
[Task 5/25] Current/Best: 11.66/ 12.60 GFLOPS | Progress: (8/20) | 4.67 s
[Task 5/25] Current/Best: 11.80/ 18.04 GFLOPS | Progress: (12/20) | 7.93 s
[Task 5/25] Current/Best: 11.64/ 22.72 GFLOPS | Progress: (16/20) | 9.35 s
[Task 5/25] Current/Best: 12.07/ 22.72 GFLOPS | Progress: (20/20) | 11.28 s Done.
+
[Task 6/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 6/25] Current/Best: 12.25/ 20.73 GFLOPS | Progress: (4/20) | 4.09 s
[Task 6/25] Current/Best: 18.86/ 20.73 GFLOPS | Progress: (8/20) | 5.86 s
[Task 6/25] Current/Best: 13.24/ 20.73 GFLOPS | Progress: (12/20) | 7.83 s
[Task 6/25] Current/Best: 20.05/ 20.73 GFLOPS | Progress: (16/20) | 10.12 s
[Task 6/25] Current/Best: 3.69/ 20.73 GFLOPS | Progress: (20/20) | 12.67 s Done.
+
[Task 7/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 7/25] Current/Best: 11.24/ 12.97 GFLOPS | Progress: (4/20) | 3.59 s
[Task 7/25] Current/Best: 20.20/ 20.90 GFLOPS | Progress: (8/20) | 5.09 s
[Task 7/25] Current/Best: 16.08/ 20.90 GFLOPS | Progress: (12/20) | 7.04 s
[Task 7/25] Current/Best: 12.25/ 20.90 GFLOPS | Progress: (16/20) | 9.09 s
[Task 7/25] Current/Best: 6.23/ 21.69 GFLOPS | Progress: (20/20) | 11.56 s Done.
+
[Task 8/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 8/25] Current/Best: 9.77/ 13.95 GFLOPS | Progress: (4/20) | 2.93 s
[Task 8/25] Current/Best: 10.08/ 13.95 GFLOPS | Progress: (8/20) | 8.02 s
[Task 8/25] Current/Best: 12.37/ 13.95 GFLOPS | Progress: (12/20) | 14.48 s
[Task 8/25] Current/Best: 18.76/ 18.76 GFLOPS | Progress: (16/20) | 16.61 s
[Task 8/25] Current/Best: 19.24/ 19.24 GFLOPS | Progress: (20/20) | 23.62 s Done.
+
[Task 9/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 9/25] Current/Best: 14.21/ 15.88 GFLOPS | Progress: (4/20) | 11.92 s
[Task 9/25] Current/Best: 23.54/ 23.54 GFLOPS | Progress: (8/20) | 13.70 s
[Task 9/25] Current/Best: 8.24/ 23.54 GFLOPS | Progress: (12/20) | 16.21 s
[Task 9/25] Current/Best: 17.98/ 23.54 GFLOPS | Progress: (16/20) | 19.08 s
[Task 9/25] Current/Best: 9.10/ 23.54 GFLOPS | Progress: (20/20) | 27.64 s
[Task 10/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 10/25] Current/Best: 18.04/ 18.04 GFLOPS | Progress: (4/20) | 2.55 s
[Task 10/25] Current/Best: 15.59/ 18.04 GFLOPS | Progress: (8/20) | 4.21 s
[Task 10/25] Current/Best: 12.30/ 18.90 GFLOPS | Progress: (12/20) | 5.76 s
[Task 10/25] Current/Best: 19.14/ 20.30 GFLOPS | Progress: (16/20) | 6.87 s
[Task 10/25] Current/Best: 8.86/ 20.30 GFLOPS | Progress: (20/20
) | 8.40 s Done.
+
[Task 11/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 11/25] Current/Best: 12.28/ 18.15 GFLOPS | Progress: (4/20) | 3.32 s
[Task 11/25] Current/Best: 16.85/ 18.15 GFLOPS | Progress: (8/20) | 6.14 s
[Task 11/25] Current/Best: 18.21/ 18.21 GFLOPS | Progress: (12/20) | 8.21 s
[Task 11/25] Current/Best: 13.38/ 21.19 GFLOPS | Progress: (16/20) | 11.16 s
[Task 11/25] Current/Best: 19.41/ 21.57 GFLOPS | Progress: (20/20) | 13.23 s Done.
+
[Task 12/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 12/25] Current/Best: 7.80/ 17.85 GFLOPS | Progress: (4/20) | 5.78 s
[Task 12/25] Current/Best: 5.19/ 17.85 GFLOPS | Progress: (8/20) | 9.71 s
[Task 12/25] Current/Best: 18.95/ 18.95 GFLOPS | Progress: (12/20) | 11.73 s
[Task 12/25] Current/Best: 15.47/ 18.95 GFLOPS | Progress: (16/20) | 14.62 s
[Task 12/25] Current/Best: 15.17/ 18.95 GFLOPS | Progress: (20/20) | 16.54 s Done.
+
[Task 13/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 13/25] Current/Best: 8.73/ 17.36 GFLOPS | Progress: (4/20) | 3.77 s
[Task 13/25] Current/Best: 15.86/ 20.93 GFLOPS | Progress: (8/20) | 6.38 s
[Task 13/25] Current/Best: 19.57/ 21.63 GFLOPS | Progress: (12/20) | 9.44 s
[Task 13/25] Current/Best: 12.29/ 21.63 GFLOPS | Progress: (16/20) | 12.91 s
[Task 13/25] Current/Best: 18.78/ 21.63 GFLOPS | Progress: (20/20) | 15.21 s Done.
+
[Task 14/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 14/25] Current/Best: 13.60/ 13.60 GFLOPS | Progress: (4/20) | 3.43 s
[Task 14/25] Current/Best: 6.13/ 13.60 GFLOPS | Progress: (8/20) | 5.64 s
[Task 14/25] Current/Best: 19.90/ 19.90 GFLOPS | Progress: (12/20) | 8.32 s
[Task 14/25] Current/Best: 16.82/ 19.90 GFLOPS | Progress: (16/20) | 10.01 s Done.
+
[Task 14/25] Current/Best: 17.26/ 19.90 GFLOPS | Progress: (20/20) | 11.79 s
[Task 15/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 15/25] Current/Best: 16.17/ 17.62 GFLOPS | Progress: (4/20) | 2.71 s
[Task 15/25] Current/Best: 14.50/ 18.01 GFLOPS | Progress: (8/20) | 4.01 s
[Task 15/25] Current/Best: 10.37/ 22.30 GFLOPS | Progress: (12/20) | 6.25 s
[Task 15/25] Current/Best: 20.38/ 22.30 GFLOPS | Progress: (16/20) | 9.63 s
[Task 15/25] Current/Best: 9.71/ 22.30 GFLOPS | Progress: (20/20) | 10.65 s
[Task 16/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 16/25] Current/Best: 20.64/ 20.64 GFLOPS | Progress: (4/20) | 2.98 s
[Task 16/25] Current/Best: 3.04/ 20.64 GFLOPS | Progress: (8/20) | 4.61 s
[Task 16/25] Current/Best: 19.58/ 20.64 GFLOPS | Progress: (12/20) | 5.83 s
[Task 16/25] Current/Best: 17.27/ 20.64 GFLOPS | Progress: (16/20) |
7.23 s
[Task 16/25] Current/Best: 10.10/ 21.95 GFLOPS | Progress: (20/20) | 9.37 s Done.
+
[Task 17/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 17/25] Current/Best: 13.24/ 18.85 GFLOPS | Progress: (4/20) | 4.78 s
[Task 17/25] Current/Best: 14.30/ 23.41 GFLOPS | Progress: (8/20) | 7.57 s
[Task 17/25] Current/Best: 16.94/ 23.41 GFLOPS | Progress: (12/20) | 9.63 s
[Task 17/25] Current/Best: 16.44/ 23.41 GFLOPS | Progress: (16/20) | 11.84 s
[Task 17/25] Current/Best: 10.05/ 23.41 GFLOPS | Progress: (20/20) | 14.00 s Done.
+
[Task 18/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 18/25] Current/Best: 11.09/ 17.91 GFLOPS | Progress: (4/20) | 3.78 s
[Task 18/25] Current/Best: 10.56/ 17.91 GFLOPS | Progress: (8/20) | 7.48 s
[Task 18/25] Current/Best: 19.54/ 19.54 GFLOPS | Progress: (12/20) | 9.40 s
[Task 18/25] Current/Best: 10.08/ 19.54 GFLOPS | Progress: (16/20) | 13.25 s
[Task 18/25] Current/Best: 20.92/ 20.92 GFLOPS | Progress: (20/20) | 14.78 s Done.
+
[Task 19/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 19/25] Current/Best: 7.15/ 20.41 GFLOPS | Progress: (4/20) | 6.05 s
[Task 19/25] Current/Best: 2.61/ 20.41 GFLOPS | Progress: (8/20) | 9.42 s
[Task 19/25] Current/Best: 19.48/ 21.98 GFLOPS | Progress: (12/20) | 12.46 s
[Task 19/25] Current/Best: 14.87/ 21.98 GFLOPS | Progress: (16/20) | 15.47 s
[Task 19/25] Current/Best: 2.70/ 23.65 GFLOPS | Progress: (20/20) | 18.31 s Done.
+
[Task 20/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 20/25] Current/Best: 8.81/ 15.15 GFLOPS | Progress: (4/20) | 3.35 s Done.
Done.
-
[Task 20/25] Current/Best: 10.29/ 14.99 GFLOPS | Progress: (8/20) | 7.06 s
[Task 20/25] Current/Best: 2.32/ 16.51 GFLOPS | Progress: (12/20) | 11.02 s
[Task 20/25] Current/Best: 12.13/ 16.51 GFLOPS | Progress: (16/20) | 14.87 s
[Task 20/25] Current/Best: 13.22/ 21.71 GFLOPS | Progress: (20/20) | 16.99 s
[Task 21/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 21/25] Current/Best: 6.39/ 17.37 GFLOPS | Progress: (4/20) | 3.38 s
[Task 21/25] Current/Best: 14.30/ 17.37 GFLOPS | Progress: (8/20) | 5.05 s
[Task 21/25] Current/Best: 1.61/ 17.37 GFLOPS | Progress: (12/20) | 7.24 s
[Task 21/25] Current/Best: 18.18/ 18.18 GFLOPS | Progress: (16/20) | 10.81 s
[Task 21/25] Current/Best: 4.45/ 18.18 GFLOPS | Progress: (20/20) | 18.29 s
[Task 22/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 22/25] Current/Best: 2.70/ 17.02 GFLOPS | Progress: (4/20
) | 2.72 s
[Task 22/25] Current/Best: 9.22/ 21.68 GFLOPS | Progress: (8/20) | 4.69 s
[Task 22/25] Current/Best: 19.73/ 21.68 GFLOPS | Progress: (12/20) | 7.07 s
[Task 22/25] Current/Best: 15.26/ 21.68 GFLOPS | Progress: (16/20) | 9.19 s
[Task 22/25] Current/Best: 15.09/ 21.68 GFLOPS | Progress: (20/20) | 10.88 s Done.
-
[Task 23/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 23/25] Current/Best: 17.39/ 20.01 GFLOPS | Progress: (4/20) | 3.31 s
[Task 23/25] Current/Best: 15.75/ 20.01 GFLOPS | Progress: (8/20) | 6.73 s
[Task 23/25] Current/Best: 20.48/ 21.16 GFLOPS | Progress: (12/20) | 8.62 s
[Task 23/25] Current/Best: 5.77/ 21.16 GFLOPS | Progress: (16/20) | 15.84 s
[Task 23/25] Current/Best: 7.41/ 21.16 GFLOPS | Progress: (20/20) | 20.14 s Done.
-
[Task 24/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 24/25] Current/Best: 8.51/ 8.51 GFLOPS | Progress: (4/20) | 11.88 s
[Task 24/25] Current/Best: 1.94/ 8.51 GFLOPS | Progress: (8/20) | 22.98 s
[Task 24/25] Current/Best: 4.44/ 8.51 GFLOPS | Progress: (12/20) | 34.56 s Done.
+
[Task 20/25] Current/Best: 9.61/ 15.15 GFLOPS | Progress: (8/20) | 6.74 s
[Task 20/25] Current/Best: 2.32/ 16.51 GFLOPS | Progress: (12/20) | 10.69 s
[Task 20/25] Current/Best: 11.85/ 16.51 GFLOPS | Progress: (16/20) | 14.59 s
[Task 20/25] Current/Best: 12.65/ 22.33 GFLOPS | Progress: (20/20) | 16.73 s
[Task 21/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 21/25] Current/Best: 6.41/ 17.67 GFLOPS | Progress: (4/20) | 3.24 s
[Task 21/25] Current/Best: 14.66/ 17.67 GFLOPS | Progress: (8/20) | 4.87 s
[Task 21/25] Current/Best: 1.61/ 17.67 GFLOPS | Progress: (12/20) | 6.99 s
[Task 21/25] Current/Best: 18.10/ 18.10 GFLOPS | Progress: (16/20) | 10.51 s
[Task 21/25] Current/Best: 4.47/ 18.10 GFLOPS | Progress: (20/20) | 17.76 s
[Task 22/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 22/25] Current/Best: 2.70/ 17.04 GFLOPS | Progress: (4/20
) | 2.66 s
[Task 22/25] Current/Best: 8.59/ 21.58 GFLOPS | Progress: (8/20) | 4.66 s
[Task 22/25] Current/Best: 19.96/ 21.58 GFLOPS | Progress: (12/20) | 7.04 s
[Task 22/25] Current/Best: 14.81/ 21.58 GFLOPS | Progress: (16/20) | 9.19 s
[Task 22/25] Current/Best: 14.16/ 21.58 GFLOPS | Progress: (20/20) | 10.93 s Done.
+
[Task 23/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 23/25] Current/Best: 17.63/ 20.68 GFLOPS | Progress: (4/20) | 3.24 s
[Task 23/25] Current/Best: 14.56/ 20.68 GFLOPS | Progress: (8/20) | 6.75 s
[Task 23/25] Current/Best: 20.75/ 21.90 GFLOPS | Progress: (12/20) | 8.57 s
[Task 23/25] Current/Best: 6.44/ 21.90 GFLOPS | Progress: (16/20) | 15.53 s
[Task 23/25] Current/Best: 7.97/ 21.90 GFLOPS | Progress: (20/20) | 19.73 s Done.
+
[Task 24/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 24/25] Current/Best: 8.50/ 8.50 GFLOPS | Progress: (4/20) | 11.76 s
[Task 24/25] Current/Best: 3.67/ 8.50 GFLOPS | Progress: (8/20) | 22.99 s
[Task 24/25] Current/Best: 4.50/ 8.50 GFLOPS | Progress: (12/20) | 33.73 s Done.
Done.
-
[Task 24/25] Current/Best: 7.28/ 8.84 GFLOPS | Progress: (16/20) | 40.41 s
[Task 24/25] Current/Best: 3.25/ 8.84 GFLOPS | Progress: (20/20) | 46.48 s Done.
-
[Task 25/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 25/25] Current/Best: 1.54/ 2.80 GFLOPS | Progress: (4/20) | 11.66 s
[Task 25/25] Current/Best: 5.66/ 7.88 GFLOPS | Progress: (8/20) | 22.96 s
[Task 25/25] Current/Best: 5.84/ 7.88 GFLOPS | Progress: (12/20) | 34.29 s
[Task 25/25] Current/Best: 5.74/ 9.12 GFLOPS | Progress: (16/20) | 36.04 s
[Task 25/25] Current/Best: 2.82/ 9.12 GFLOPS | Progress: (20/20) | 46.77 s
+
[Task 24/25] Current/Best: 6.04/ 8.76 GFLOPS | Progress: (16/20) | 39.50 s
[Task 24/25] Current/Best: 3.39/ 8.92 GFLOPS | Progress: (20/20) | 45.54 s Done.
+
[Task 25/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 25/25] Current/Best: 1.53/ 2.74 GFLOPS | Progress: (4/20) | 11.61 s
[Task 25/25] Current/Best: 5.56/ 7.87 GFLOPS | Progress: (8/20) | 23.03 s
[Task 25/25] Current/Best: 6.05/ 7.87 GFLOPS | Progress: (12/20) | 34.50 s
[Task 25/25] Current/Best: 5.98/ 8.84 GFLOPS | Progress: (16/20) | 36.21 s
[Task 25/25] Current/Best: 2.89/ 9.45 GFLOPS | Progress: (20/20) | 46.86 s
@@ -748,8 +748,8 @@ improvement in comparing the optimized model to the unoptimized model.
.. code-block:: none
- optimized: {'mean': 415.3408216899993, 'median': 415.25963069999534, 'std': 0.49710023803746783}
- unoptimized: {'mean': 500.61246781000136, 'median': 500.6702233999988, 'std': 0.3408716062195281}
+ optimized: {'mean': 408.97684854001454, 'median': 408.9519365000342, 'std': 0.41429279321076506}
+ unoptimized: {'mean': 492.3690949199863, 'median': 492.15409664998333, 'std': 0.9317648306325024}
@@ -772,7 +772,7 @@ profiling/benchmarking.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 10 minutes 34.415 seconds)
+ **Total running time of the script:** ( 10 minutes 37.690 seconds)
.. _sphx_glr_download_tutorial_autotvm_relay_x86.py:
diff --git a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
index 2fd404072..ad86d39b5 100644
--- a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
+++ b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
@@ -282,7 +282,7 @@ device and returns the measured cost. Network overhead is excluded.
.. code-block:: none
- 1.217e-07 secs/op
+ 1.282e-07 secs/op
diff --git a/docs/_sources/tutorial/intro_topi.rst.txt b/docs/_sources/tutorial/intro_topi.rst.txt
index 88ef68351..875492e8b 100644
--- a/docs/_sources/tutorial/intro_topi.rst.txt
+++ b/docs/_sources/tutorial/intro_topi.rst.txt
@@ -263,7 +263,7 @@ As you can see, scheduled stages of computation have been accumulated and we can
.. code-block:: none
- [stage(a, placeholder(a, 0x20bc51c0)), stage(b, placeholder(b, 0x8a02f50)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min [...]
+ [stage(a, placeholder(a, 0x21dc96c0)), stage(b, placeholder(b, 0x6d536b0)), stage(T_add, compute(T_add, body=[(a[ax0, ax1, ax2] + b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min=0, ext=10))], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[(a[ax0, ax1, ax2]*b[ax1, ax2])], axis=[iter_var(ax0, range(min=0, ext=100)), iter_var(ax1, range(min=0, ext=10)), iter_var(ax2, range(min [...]
diff --git a/docs/_sources/tutorial/sg_execution_times.rst.txt b/docs/_sources/tutorial/sg_execution_times.rst.txt
index 3f59dafdd..092913566 100644
--- a/docs/_sources/tutorial/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorial/sg_execution_times.rst.txt
@@ -5,30 +5,30 @@
Computation times
=================
-**13:26.431** total execution time for **tutorial** files:
+**13:32.045** total execution time for **tutorial** files:
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``) | 10:34.415 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``) | 10:37.690 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``) | 00:59.727 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``) | 01:00.942 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 00:57.993 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 00:58.520 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``) | 00:28.506 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``) | 00:29.048 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``) | 00:23.802 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``) | 00:24.484 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``) | 00:01.115 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``) | 00:00.713 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``) | 00:00.705 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``) | 00:00.504 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.160 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.137 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorial_introduction.py` (``introduction.py``) | 00:00.005 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_install.py` (``install.py``) | 00:00.001 | 0.0 MB |
-+------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorial_tvmc_command_line_driver.py` (``tvmc_command_line_driver.py``) | 00:00.001 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorial_tvmc_python.py` (``tvmc_python.py``) | 00:00.001 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_tutorial_install.py` (``install.py``) | 00:00.001 | 0.0 MB |
++------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
index 1c640083e..7bdaba17d 100644
--- a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
+++ b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
@@ -403,7 +403,7 @@ compile and run this new schedule with the parallel operation applied:
/workspace/python/tvm/driver/build_module.py:268: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
"target_host parameter is going to be deprecated. "
- parallel: 0.000006
+ parallel: 0.000007
@@ -512,10 +512,10 @@ We can now compare the different schedules
.. code-block:: none
Operator Timing Performance
- numpy 8.172000000286062e-06 1.0
- naive 5.8387000000000005e-06 0.7144762603763603
- parallel 6.0191e-06 0.7365516397196892
- vector 2.46563e-05 3.0171683797279614
+ numpy 8.368310000150814e-06 1.0
+ naive 5.8476e-06 0.6987790844142502
+ parallel 6.976400000000001e-06 0.8336689247738518
+ vector 2.46018e-05 2.9398767492548226
@@ -936,7 +936,7 @@ matrix multiplication.
.. code-block:: none
- Numpy running time: 0.019095
+ Numpy running time: 0.018040
@@ -996,7 +996,7 @@ optimizations.
/workspace/python/tvm/driver/build_module.py:268: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
"target_host parameter is going to be deprecated. "
- none: 3.264284
+ none: 3.433590
@@ -1101,7 +1101,7 @@ schedule.
/workspace/python/tvm/driver/build_module.py:268: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
"target_host parameter is going to be deprecated. "
- blocking: 0.326766
+ blocking: 0.299341
@@ -1199,7 +1199,7 @@ already cache friendly from our previous optimizations.
/workspace/python/tvm/driver/build_module.py:268: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
"target_host parameter is going to be deprecated. "
- vectorization: 0.347377
+ vectorization: 0.335047
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1275,7 +1275,7 @@ more cache friendly.
/workspace/python/tvm/driver/build_module.py:268: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
"target_host parameter is going to be deprecated. "
- loop permutation: 0.131158
+ loop permutation: 0.113682
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1376,7 +1376,7 @@ optimized schedule.
/workspace/python/tvm/driver/build_module.py:268: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
"target_host parameter is going to be deprecated. "
- array packing: 0.109524
+ array packing: 0.108208
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1471,7 +1471,7 @@ to `C` when all the block results are ready.
/workspace/python/tvm/driver/build_module.py:268: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
"target_host parameter is going to be deprecated. "
- block caching: 0.111630
+ block caching: 0.110076
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1559,7 +1559,7 @@ of thread-level parallelization.
/workspace/python/tvm/driver/build_module.py:268: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
"target_host parameter is going to be deprecated. "
- parallelization: 0.145116
+ parallelization: 0.142007
@main = primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
buffers = {A: Buffer(A_2: Pointer(float32), float32, [1048576], []),
@@ -1640,13 +1640,13 @@ working, we can compare the results.
.. code-block:: none
Operator Timing Performance
- none 3.2642844886 1.0
- blocking 0.32676598760000003 0.10010340359156158
- vectorization 0.3473767875 0.10641743656631608
- loop permutation 0.1311578895 0.04017967488987198
- array packing 0.10952372889999999 0.03355213961359507
- block caching 0.1116296398 0.0341972766742142
- parallelization 0.1451159283 0.04445566212344376
+ none 3.4335895295 1.0
+ blocking 0.2993405374 0.08718005889410725
+ vectorization 0.33504689360000006 0.09757919248105049
+ loop permutation 0.1136822756 0.033108871815715965
+ array packing 0.1082077985 0.03151448289620024
+ block caching 0.11007550610000001 0.03205843481123069
+ parallelization 0.14200685370000002 0.041358133370321376
@@ -1686,6 +1686,11 @@ operations with tunable parameters that allows you to automatically optimize
the computation for specific platforms.
+.. rst-class:: sphx-glr-timing
+
+ **Total running time of the script:** ( 1 minutes 0.942 seconds)
+
+
.. _sphx_glr_download_tutorial_tensor_expr_get_started.py:
.. only:: html
diff --git a/docs/commit_hash b/docs/commit_hash
index d17fd6843..17cae850c 100644
--- a/docs/commit_hash
+++ b/docs/commit_hash
@@ -1 +1 @@
-1392e64e0bd9f55238256f5feb95eb2af90b6b97
+40d242a3c8f9630223e5775c1f1bf23362c8850e
diff --git a/docs/how_to/compile_models/from_darknet.html b/docs/how_to/compile_models/from_darknet.html
index 32f6e3128..81a70db90 100644
--- a/docs/how_to/compile_models/from_darknet.html
+++ b/docs/how_to/compile_models/from_darknet.html
@@ -569,7 +569,7 @@ class:['truck 0.9266'] left:471 top:83 right:689 bottom:169
class:['bicycle 0.9984'] left:111 top:113 right:577 bottom:447
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 3.129 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 2.162 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-darknet-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7716f96385bd5abb6e822041e285be54/from_darknet.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_darknet.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/from_mxnet.html b/docs/how_to/compile_models/from_mxnet.html
index 6135256f8..0c71c52a6 100644
--- a/docs/how_to/compile_models/from_mxnet.html
+++ b/docs/how_to/compile_models/from_mxnet.html
@@ -422,7 +422,7 @@ to download the full example code</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"x"</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#tuple" title="builtins.tuple" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">x</span><span class="o">.</span><span class="n">shape</span></a><span class="p">)</span>
</pre></div>
</div>
-<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip0b98df0f-efed-4fd1-bd0f-a5c6f7ab728e from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip5bbd24b5-9964-4f2e-912e-359d733978e5 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
x (1, 3, 224, 224)
</pre></div>
</div>
diff --git a/docs/how_to/compile_models/from_oneflow.html b/docs/how_to/compile_models/from_oneflow.html
index 0821fb8d8..4dc913b1e 100644
--- a/docs/how_to/compile_models/from_oneflow.html
+++ b/docs/how_to/compile_models/from_oneflow.html
@@ -427,13 +427,15 @@ python3 -m pip install -f https://release.oneflow.info <span class="nv">oneflow<
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip" to /workspace/.oneflow/flowvision_cache/resnet18.zip
0%| | 0.00/41.5M [00:00<?, ?B/s]
- 19%|#9 | 7.99M/41.5M [00:00<00:00, 47.7MB/s]
- 35%|###4 | 14.3M/41.5M [00:00<00:00, 54.6MB/s]
- 48%|####7 | 19.7M/41.5M [00:00<00:00, 47.6MB/s]
- 59%|#####8 | 24.4M/41.5M [00:00<00:00, 33.5MB/s]
- 80%|######## | 33.4M/41.5M [00:00<00:00, 48.3MB/s]
- 96%|#########6| 40.0M/41.5M [00:00<00:00, 48.6MB/s]
-100%|##########| 41.5M/41.5M [00:00<00:00, 48.0MB/s]
+ 15%|#5 | 6.33M/41.5M [00:00<00:00, 45.3MB/s]
+ 26%|##5 | 10.6M/41.5M [00:00<00:00, 41.6MB/s]
+ 35%|###5 | 14.6M/41.5M [00:00<00:00, 34.7MB/s]
+ 43%|####3 | 17.9M/41.5M [00:00<00:00, 30.2MB/s]
+ 54%|#####3 | 22.3M/41.5M [00:00<00:00, 32.4MB/s]
+ 61%|######1 | 25.5M/41.5M [00:00<00:00, 27.4MB/s]
+ 77%|#######7 | 32.0M/41.5M [00:00<00:00, 36.0MB/s]
+ 92%|#########2| 38.3M/41.5M [00:01<00:00, 34.5MB/s]
+100%|##########| 41.5M/41.5M [00:01<00:00, 34.3MB/s]
</pre></div>
</div>
</div>
diff --git a/docs/how_to/compile_models/from_pytorch.html b/docs/how_to/compile_models/from_pytorch.html
index 12c981557..7f3ef9b67 100644
--- a/docs/how_to/compile_models/from_pytorch.html
+++ b/docs/how_to/compile_models/from_pytorch.html
@@ -409,8 +409,10 @@ be unstable.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
0%| | 0.00/44.7M [00:00<?, ?B/s]
- 48%|####7 | 21.3M/44.7M [00:00<00:00, 224MB/s]
-100%|##########| 44.7M/44.7M [00:00<00:00, 242MB/s]
+ 8%|7 | 3.49M/44.7M [00:00<00:01, 36.6MB/s]
+ 17%|#6 | 7.47M/44.7M [00:00<00:00, 39.6MB/s]
+ 65%|######4 | 28.9M/44.7M [00:00<00:00, 121MB/s]
+100%|##########| 44.7M/44.7M [00:00<00:00, 129MB/s]
</pre></div>
</div>
</div>
diff --git a/docs/how_to/compile_models/from_tensorflow.html b/docs/how_to/compile_models/from_tensorflow.html
index 62b00d433..3267b6f6b 100644
--- a/docs/how_to/compile_models/from_tensorflow.html
+++ b/docs/how_to/compile_models/from_tensorflow.html
@@ -631,7 +631,7 @@ banana (score = 0.00022)
desk (score = 0.00019)
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 4.491 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 3.436 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-tensorflow-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7f1d3d1b878694c201c614c807cdebc8/from_tensorflow.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_tensorflow.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/sg_execution_times.html b/docs/how_to/compile_models/sg_execution_times.html
index b10937657..95b05e23a 100644
--- a/docs/how_to/compile_models/sg_execution_times.html
+++ b/docs/how_to/compile_models/sg_execution_times.html
@@ -322,7 +322,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-compile-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>05:03.432</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
+<p><strong>05:20.160</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 81%" />
@@ -331,43 +331,43 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="from_tensorflow.html#sphx-glr-how-to-compile-models-from-tensorflow-py"><span class="std std-ref">Compile Tensorflow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tensorflow.py</span></code>)</p></td>
-<td><p>01:04.491</p></td>
+<td><p>01:03.436</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_darknet.html#sphx-glr-how-to-compile-models-from-darknet-py"><span class="std std-ref">Compile YOLO-V2 and YOLO-V3 in DarkNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_darknet.py</span></code>)</p></td>
-<td><p>01:03.129</p></td>
+<td><p>01:02.162</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="from_paddle.html#sphx-glr-how-to-compile-models-from-paddle-py"><span class="std std-ref">Compile PaddlePaddle Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_paddle.py</span></code>)</p></td>
-<td><p>00:41.193</p></td>
+<td><p>00:41.505</p></td>
<td><p>0.0 MB</p></td>
</tr>
-<tr class="row-even"><td><p><a class="reference internal" href="from_oneflow.html#sphx-glr-how-to-compile-models-from-oneflow-py"><span class="std std-ref">Compile OneFlow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_oneflow.py</span></code>)</p></td>
-<td><p>00:26.693</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="from_keras.html#sphx-glr-how-to-compile-models-from-keras-py"><span class="std std-ref">Compile Keras Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_keras.py</span></code>)</p></td>
+<td><p>00:34.835</p></td>
<td><p>0.0 MB</p></td>
</tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="from_tflite.html#sphx-glr-how-to-compile-models-from-tflite-py"><span class="std std-ref">Compile TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tflite.py</span></code>)</p></td>
-<td><p>00:25.879</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="from_oneflow.html#sphx-glr-how-to-compile-models-from-oneflow-py"><span class="std std-ref">Compile OneFlow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_oneflow.py</span></code>)</p></td>
+<td><p>00:26.439</p></td>
<td><p>0.0 MB</p></td>
</tr>
-<tr class="row-even"><td><p><a class="reference internal" href="from_mxnet.html#sphx-glr-how-to-compile-models-from-mxnet-py"><span class="std std-ref">Compile MXNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_mxnet.py</span></code>)</p></td>
-<td><p>00:22.921</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="from_tflite.html#sphx-glr-how-to-compile-models-from-tflite-py"><span class="std std-ref">Compile TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tflite.py</span></code>)</p></td>
+<td><p>00:25.923</p></td>
<td><p>0.0 MB</p></td>
</tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="from_coreml.html#sphx-glr-how-to-compile-models-from-coreml-py"><span class="std std-ref">Compile CoreML Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_coreml.py</span></code>)</p></td>
-<td><p>00:22.758</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="from_mxnet.html#sphx-glr-how-to-compile-models-from-mxnet-py"><span class="std std-ref">Compile MXNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_mxnet.py</span></code>)</p></td>
+<td><p>00:22.532</p></td>
<td><p>0.0 MB</p></td>
</tr>
-<tr class="row-even"><td><p><a class="reference internal" href="from_pytorch.html#sphx-glr-how-to-compile-models-from-pytorch-py"><span class="std std-ref">Compile PyTorch Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_pytorch.py</span></code>)</p></td>
-<td><p>00:19.567</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="from_coreml.html#sphx-glr-how-to-compile-models-from-coreml-py"><span class="std std-ref">Compile CoreML Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_coreml.py</span></code>)</p></td>
+<td><p>00:21.015</p></td>
<td><p>0.0 MB</p></td>
</tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="from_keras.html#sphx-glr-how-to-compile-models-from-keras-py"><span class="std std-ref">Compile Keras Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_keras.py</span></code>)</p></td>
-<td><p>00:14.396</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="from_pytorch.html#sphx-glr-how-to-compile-models-from-pytorch-py"><span class="std std-ref">Compile PyTorch Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_pytorch.py</span></code>)</p></td>
+<td><p>00:19.411</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_onnx.html#sphx-glr-how-to-compile-models-from-onnx-py"><span class="std std-ref">Compile ONNX Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_onnx.py</span></code>)</p></td>
-<td><p>00:02.407</p></td>
+<td><p>00:02.902</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/deploy_models/deploy_model_on_android.html b/docs/how_to/deploy_models/deploy_model_on_android.html
index 2a87e9a52..da8dabbf2 100644
--- a/docs/how_to/deploy_models/deploy_model_on_android.html
+++ b/docs/how_to/deploy_models/deploy_model_on_android.html
@@ -648,7 +648,7 @@ to the remote android device.</p>
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 16.2909 16.2089 16.8662 16.1445 0.2069
+ 15.8547 15.6650 16.5339 15.5613 0.3511
</pre></div>
</div>
</div>
diff --git a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
index 30e6ab0af..2db99795a 100644
--- a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
+++ b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
@@ -431,15 +431,55 @@ be unstable.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
0%| | 0.00/170M [00:00<?, ?B/s]
- 11%|# | 18.4M/170M [00:00<00:00, 193MB/s]
- 22%|##2 | 37.6M/170M [00:00<00:00, 197MB/s]
- 33%|###3 | 56.4M/170M [00:00<00:00, 171MB/s]
- 45%|####5 | 77.2M/170M [00:00<00:00, 188MB/s]
- 56%|#####6 | 95.5M/170M [00:00<00:00, 172MB/s]
- 68%|######8 | 116M/170M [00:00<00:00, 185MB/s]
- 79%|#######8 | 134M/170M [00:00<00:00, 175MB/s]
- 90%|########9 | 152M/170M [00:00<00:00, 180MB/s]
-100%|##########| 170M/170M [00:00<00:00, 184MB/s]
+ 3%|2 | 5.02M/170M [00:00<00:03, 52.7MB/s]
+ 6%|5 | 10.0M/170M [00:00<00:04, 37.2MB/s]
+ 8%|8 | 13.9M/170M [00:00<00:05, 29.6MB/s]
+ 10%|# | 17.3M/170M [00:00<00:05, 31.5MB/s]
+ 12%|#2 | 20.5M/170M [00:00<00:05, 30.7MB/s]
+ 14%|#3 | 23.6M/170M [00:00<00:04, 30.7MB/s]
+ 17%|#6 | 28.6M/170M [00:00<00:04, 37.0MB/s]
+ 20%|## | 34.4M/170M [00:00<00:03, 43.7MB/s]
+ 23%|##2 | 38.7M/170M [00:01<00:03, 39.8MB/s]
+ 25%|##5 | 42.6M/170M [00:01<00:03, 34.1MB/s]
+ 28%|##7 | 47.5M/170M [00:01<00:03, 38.3MB/s]
+ 30%|### | 51.4M/170M [00:01<00:03, 38.8MB/s]
+ 33%|###3 | 56.8M/170M [00:01<00:02, 43.7MB/s]
+ 37%|###7 | 62.9M/170M [00:01<00:02, 49.0MB/s]
+ 40%|###9 | 67.7M/170M [00:01<00:02, 49.2MB/s]
+ 43%|####2 | 72.5M/170M [00:01<00:02, 44.7MB/s]
+ 45%|####5 | 77.2M/170M [00:02<00:02, 45.8MB/s]
+ 48%|####8 | 81.9M/170M [00:02<00:02, 45.9MB/s]
+ 51%|#####1 | 87.2M/170M [00:02<00:01, 48.6MB/s]
+ 54%|#####4 | 92.1M/170M [00:02<00:01, 49.3MB/s]
+ 57%|#####7 | 96.9M/170M [00:02<00:01, 45.7MB/s]
+ 60%|#####9 | 101M/170M [00:02<00:01, 41.6MB/s]
+ 62%|######2 | 105M/170M [00:02<00:01, 35.5MB/s]
+ 64%|######4 | 109M/170M [00:02<00:02, 31.3MB/s]
+ 66%|######6 | 112M/170M [00:03<00:01, 30.7MB/s]
+ 68%|######7 | 115M/170M [00:03<00:01, 28.9MB/s]
+ 69%|######9 | 118M/170M [00:03<00:01, 27.8MB/s]
+ 71%|#######1 | 121M/170M [00:03<00:01, 27.3MB/s]
+ 73%|#######2 | 123M/170M [00:03<00:02, 21.6MB/s]
+ 74%|#######3 | 126M/170M [00:03<00:02, 20.5MB/s]
+ 75%|#######5 | 128M/170M [00:03<00:02, 20.8MB/s]
+ 76%|#######6 | 130M/170M [00:03<00:02, 20.3MB/s]
+ 78%|#######7 | 132M/170M [00:04<00:01, 21.2MB/s]
+ 79%|#######9 | 134M/170M [00:04<00:01, 20.4MB/s]
+ 80%|######## | 136M/170M [00:04<00:01, 20.4MB/s]
+ 82%|########1 | 139M/170M [00:04<00:01, 21.7MB/s]
+ 83%|########2 | 141M/170M [00:04<00:01, 21.9MB/s]
+ 84%|########4 | 143M/170M [00:04<00:01, 22.4MB/s]
+ 86%|########5 | 145M/170M [00:04<00:01, 22.7MB/s]
+ 87%|########6 | 148M/170M [00:04<00:01, 20.9MB/s]
+ 88%|########8 | 150M/170M [00:04<00:00, 22.2MB/s]
+ 90%|########9 | 153M/170M [00:05<00:00, 23.5MB/s]
+ 91%|#########1| 155M/170M [00:05<00:00, 22.7MB/s]
+ 93%|#########2| 157M/170M [00:05<00:00, 23.3MB/s]
+ 94%|#########3| 159M/170M [00:05<00:00, 23.0MB/s]
+ 96%|#########5| 163M/170M [00:05<00:00, 26.1MB/s]
+ 98%|#########8| 167M/170M [00:05<00:00, 29.9MB/s]
+100%|#########9| 169M/170M [00:05<00:00, 28.6MB/s]
+100%|##########| 170M/170M [00:05<00:00, 31.4MB/s]
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py:3878: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
for i in range(dim)
/usr/local/lib/python3.7/dist-packages/torchvision/models/detection/anchor_utils.py:127: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
@@ -534,7 +574,7 @@ torchvision rcnn models.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Get 9 valid boxes
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes 10.065 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 59.469 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-object-detection-pytorch-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7795da4b258c8feff986668b95ef57ad/deploy_object_detection_pytorch.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_object_detection_pytorch.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized.html b/docs/how_to/deploy_models/deploy_prequantized.html
index 379d13dd1..fca246db1 100644
--- a/docs/how_to/deploy_models/deploy_prequantized.html
+++ b/docs/how_to/deploy_models/deploy_prequantized.html
@@ -475,7 +475,7 @@ training. Other models require a full post training calibration.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
0%| | 0.00/13.6M [00:00<?, ?B/s]
-100%|##########| 13.6M/13.6M [00:00<00:00, 197MB/s]
+100%|##########| 13.6M/13.6M [00:00<00:00, 180MB/s]
</pre></div>
</div>
</div>
@@ -564,7 +564,7 @@ output values are identical out of 1000 outputs from mobilenet v2.</p>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 90.6104 90.5887 91.4744 90.2353 0.1939
+ 90.2709 90.1941 95.4556 90.0354 0.5425
</pre></div>
</div>
<div class="admonition note">
@@ -603,7 +603,7 @@ This includes support for the VNNI 8 bit dot product instruction (CascadeLake or
<div class="section" id="deploy-a-quantized-tflite-model">
<h2>Deploy a quantized TFLite Model<a class="headerlink" href="#deploy-a-quantized-tflite-model" title="Permalink to this headline">¶</a></h2>
<p>TODO</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 10.371 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 9.887 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/fb8217c13f4351224c6cf3aacf1a87fc/deploy_prequantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized_tflite.html b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
index 26eebb97b..89172647f 100644
--- a/docs/how_to/deploy_models/deploy_prequantized_tflite.html
+++ b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
@@ -568,7 +568,7 @@ TFLite Top-5 labels: [387 102 386 341 349]
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 121.3128 121.2697 125.7490 120.6289 0.5435
+ 119.0905 119.1786 122.8603 117.4642 0.6454
</pre></div>
</div>
<div class="admonition note">
@@ -596,7 +596,7 @@ network for ARM CPU</span></a>.</p></li>
</ul>
</div></blockquote>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 59.960 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 6.017 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-tflite-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/56691c7a27d45da61d112276334640d3/deploy_prequantized_tflite.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized_tflite.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_quantized.html b/docs/how_to/deploy_models/deploy_quantized.html
index 39becbce9..45543a25e 100644
--- a/docs/how_to/deploy_models/deploy_quantized.html
+++ b/docs/how_to/deploy_models/deploy_quantized.html
@@ -504,7 +504,7 @@ for calibration. But the accuracy might be impacted.</p>
DeprecationWarning,
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 29.098 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 38.816 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-quantized-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7810ecf51bfc05f7d5e8a400ac3e815d/deploy_quantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_quantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
index f72281b2e..27726260a 100644
--- a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
+++ b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
@@ -436,23 +436,23 @@ to your device.</p>
Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
0%| | 0/132723 [00:00<?, ?KB/s]
- 5%|4 | 6579/132723 [00:00<00:01, 65785.98KB/s]
- 10%|# | 13883/132723 [00:00<00:01, 70040.90KB/s]
- 17%|#6 | 21945/132723 [00:00<00:01, 74865.76KB/s]
- 23%|##2 | 30032/132723 [00:00<00:01, 77234.20KB/s]
- 29%|##8 | 37906/132723 [00:00<00:01, 77768.42KB/s]
- 34%|###4 | 45683/132723 [00:00<00:01, 77173.50KB/s]
- 40%|#### | 53685/132723 [00:00<00:01, 78092.32KB/s]
- 47%|####6 | 61754/132723 [00:00<00:00, 78914.70KB/s]
- 53%|#####2 | 69842/132723 [00:00<00:00, 79525.01KB/s]
- 59%|#####8 | 77819/132723 [00:01<00:00, 79584.98KB/s]
- 65%|######4 | 85829/132723 [00:01<00:00, 79741.31KB/s]
- 71%|####### | 93804/132723 [00:01<00:00, 79064.34KB/s]
- 77%|#######6 | 101712/132723 [00:01<00:00, 76571.38KB/s]
- 83%|########2 | 109660/132723 [00:01<00:00, 77423.66KB/s]
- 88%|########8 | 117417/132723 [00:01<00:00, 74891.30KB/s]
- 95%|#########4| 125477/132723 [00:01<00:00, 76544.12KB/s]
-100%|##########| 132723/132723 [00:01<00:00, 77261.94KB/s]
+ 5%|4 | 6393/132723 [00:00<00:01, 63927.01KB/s]
+ 11%|#1 | 14929/132723 [00:00<00:01, 76520.93KB/s]
+ 18%|#7 | 23387/132723 [00:00<00:01, 80197.46KB/s]
+ 24%|##3 | 31544/132723 [00:00<00:01, 75715.08KB/s]
+ 30%|##9 | 39155/132723 [00:00<00:01, 55369.54KB/s]
+ 36%|###5 | 47594/132723 [00:00<00:01, 63061.31KB/s]
+ 41%|####1 | 54550/132723 [00:00<00:01, 53502.08KB/s]
+ 47%|####7 | 63037/132723 [00:01<00:01, 61173.11KB/s]
+ 53%|#####3 | 70464/132723 [00:01<00:00, 64563.11KB/s]
+ 60%|#####9 | 79075/132723 [00:01<00:00, 70379.63KB/s]
+ 65%|######5 | 86555/132723 [00:01<00:00, 59549.95KB/s]
+ 72%|#######1 | 95126/132723 [00:01<00:00, 66049.23KB/s]
+ 78%|#######8 | 103702/132723 [00:01<00:00, 71212.40KB/s]
+ 85%|########4 | 112197/132723 [00:01<00:00, 74950.75KB/s]
+ 90%|######### | 120050/132723 [00:01<00:00, 69506.51KB/s]
+ 97%|#########6| 128617/132723 [00:01<00:00, 73834.90KB/s]
+100%|##########| 132723/132723 [00:01<00:00, 67144.39KB/s]
</pre></div>
</div>
<p>Create TVM runtime and do inference
@@ -495,7 +495,7 @@ Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from h
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
-<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 26.781 seconds)</p>
+<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 17.546 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-ssd-gluoncv-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/cccb17d28e5e8b2e94ea8cd5ec59f6ed/deploy_ssd_gluoncv.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_ssd_gluoncv.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/sg_execution_times.html b/docs/how_to/deploy_models/sg_execution_times.html
index d96044aac..ac0feb41a 100644
--- a/docs/how_to/deploy_models/sg_execution_times.html
+++ b/docs/how_to/deploy_models/sg_execution_times.html
@@ -322,7 +322,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-deploy-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>11:09.358</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
+<p><strong>11:03.296</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 86%" />
@@ -331,31 +331,31 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_object_detection_pytorch.html#sphx-glr-how-to-deploy-models-deploy-object-detection-pytorch-py"><span class="std std-ref">Compile PyTorch Object Detection Models</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_object_detection_pytorch.py</span></code>)</p></td>
-<td><p>03:10.065</p></td>
+<td><p>02:59.469</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_ssd_gluoncv.html#sphx-glr-how-to-deploy-models-deploy-ssd-gluoncv-py"><span class="std std-ref">Deploy Single Shot Multibox Detector(SSD) model</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_ssd_gluoncv.py</span></code>)</p></td>
-<td><p>02:26.781</p></td>
+<td><p>02:17.546</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized_tflite.html#sphx-glr-how-to-deploy-models-deploy-prequantized-tflite-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM - Part 3 (TFLite)</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized_tflite.py</span></code>)</p></td>
-<td><p>01:59.960</p></td>
+<td><p>02:06.017</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_quantized.html#sphx-glr-how-to-deploy-models-deploy-quantized-py"><span class="std std-ref">Deploy a Quantized Model on Cuda</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_quantized.py</span></code>)</p></td>
-<td><p>01:29.098</p></td>
+<td><p>01:38.816</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized.html#sphx-glr-how-to-deploy-models-deploy-prequantized-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized.py</span></code>)</p></td>
-<td><p>01:10.371</p></td>
+<td><p>01:09.887</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_model_on_android.html#sphx-glr-how-to-deploy-models-deploy-model-on-android-py"><span class="std std-ref">Deploy the Pretrained Model on Android</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_android.py</span></code>)</p></td>
-<td><p>00:29.863</p></td>
+<td><p>00:28.542</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_model_on_rasp.html#sphx-glr-how-to-deploy-models-deploy-model-on-rasp-py"><span class="std std-ref">Deploy the Pretrained Model on Raspberry Pi</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_rasp.py</span></code>)</p></td>
-<td><p>00:23.214</p></td>
+<td><p>00:23.012</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_sparse.html#sphx-glr-how-to-deploy-models-deploy-sparse-py"><span class="std std-ref">Deploy a Hugging Face Pruned Model on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_sparse.py</span></code>)</p></td>
diff --git a/docs/how_to/extend_tvm/bring_your_own_datatypes.html b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
index 33a268510..4b8359ef2 100644
--- a/docs/how_to/extend_tvm/bring_your_own_datatypes.html
+++ b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
@@ -607,7 +607,7 @@ In this alpha state of the Bring Your Own Datatypes framework, we have not imple
<span class="n">module</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#dict" title="builtins.dict" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">params</span></a> <span class="o">=</span> <span class="n">get_mobilenet</span><span class="p">()</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zipf47a4a5d-6e6a-4037-9310-597ac1787583 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zipe6a90132-9029-4551-b725-8a2c585f77c0 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
</pre></div>
</div>
<p>It’s easy to execute MobileNet with native TVM:</p>
@@ -671,7 +671,7 @@ In this alpha state of the Bring Your Own Datatypes framework, we have not imple
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/workspace/python/tvm/driver/build_module.py:268: UserWarning: target_host parameter is going to be deprecated. Please pass in tvm.target.Target(target, host=target_host) instead.
"target_host parameter is going to be deprecated. "
- Check failed: (lower) is false: Intrinsic lowering function for target llvm, intrinsic name tir.sqrt, type 150 not found
+ Check failed: (lower) is false: FloatImm lowering function for target llvm type 150 not found
</pre></div>
</div>
<p>When we attempt to run the model, we get a familiar error telling us that more functions need to be registered for myfloat.</p>
diff --git a/docs/how_to/extend_tvm/sg_execution_times.html b/docs/how_to/extend_tvm/sg_execution_times.html
index 2a48f0d09..dcc6c6769 100644
--- a/docs/how_to/extend_tvm/sg_execution_times.html
+++ b/docs/how_to/extend_tvm/sg_execution_times.html
@@ -322,7 +322,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-extend-tvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:43.007</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
+<p><strong>00:39.293</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 84%" />
@@ -331,19 +331,19 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="bring_your_own_datatypes.html#sphx-glr-how-to-extend-tvm-bring-your-own-datatypes-py"><span class="std std-ref">Bring Your Own Datatypes to TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">bring_your_own_datatypes.py</span></code>)</p></td>
-<td><p>00:39.619</p></td>
+<td><p>00:36.110</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="use_pass_instrument.html#sphx-glr-how-to-extend-tvm-use-pass-instrument-py"><span class="std std-ref">How to Use TVM Pass Instrument</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_instrument.py</span></code>)</p></td>
-<td><p>00:02.383</p></td>
+<td><p>00:02.234</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="use_pass_infra.html#sphx-glr-how-to-extend-tvm-use-pass-infra-py"><span class="std std-ref">How to Use TVM Pass Infra</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_infra.py</span></code>)</p></td>
-<td><p>00:00.993</p></td>
+<td><p>00:00.942</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="low_level_custom_pass.html#sphx-glr-how-to-extend-tvm-low-level-custom-pass-py"><span class="std std-ref">Writing a Customized Pass</span></a> (<code class="docutils literal notranslate"><span class="pre">low_level_custom_pass.py</span></code>)</p></td>
-<td><p>00:00.011</p></td>
+<td><p>00:00.008</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/extend_tvm/use_pass_instrument.html b/docs/how_to/extend_tvm/use_pass_instrument.html
index 81bdddd6d..b8889bb40 100644
--- a/docs/how_to/extend_tvm/use_pass_instrument.html
+++ b/docs/how_to/extend_tvm/use_pass_instrument.html
@@ -507,10 +507,10 @@ profile the execution time of each passes.</p>
</pre></div>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 7143us [7143us] (45.97%; 45.97%)
-FoldScaleAxis: 8396us [8us] (54.03%; 54.03%)
- FoldConstant: 8388us [1627us] (53.98%; 99.90%)
- InferType: 6761us [6761us] (43.51%; 80.60%)
+InferType: 6763us [6763us] (45.98%; 45.98%)
+FoldScaleAxis: 7946us [6us] (54.02%; 54.02%)
+ FoldConstant: 7940us [1567us] (53.98%; 99.93%)
+ InferType: 6373us [6373us] (43.32%; 80.26%)
</pre></div>
</div>
</div>
@@ -532,10 +532,10 @@ Refer to following sections and <a class="reference internal" href="../../refere
</pre></div>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 6875us [6875us] (44.95%; 44.95%)
-FoldScaleAxis: 8419us [8us] (55.05%; 55.05%)
- FoldConstant: 8411us [1681us] (54.99%; 99.90%)
- InferType: 6730us [6730us] (44.00%; 80.02%)
+InferType: 6440us [6440us] (44.75%; 44.75%)
+FoldScaleAxis: 7950us [5us] (55.25%; 55.25%)
+ FoldConstant: 7945us [1557us] (55.21%; 99.94%)
+ InferType: 6388us [6388us] (44.39%; 80.40%)
</pre></div>
</div>
<p>Register empty list to clear existing instruments.</p>
diff --git a/docs/how_to/optimize_operators/opt_conv_cuda.html b/docs/how_to/optimize_operators/opt_conv_cuda.html
index 348ffa9db..9f84c7fa2 100644
--- a/docs/how_to/optimize_operators/opt_conv_cuda.html
+++ b/docs/how_to/optimize_operators/opt_conv_cuda.html
@@ -559,7 +559,7 @@ latency of convolution.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Convolution: </span><span class="si">%f</span><span class="s2"> ms"</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">b</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">*</span> <span cl [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 54.207339 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 54.158303 ms
</pre></div>
</div>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-optimize-operators-opt-conv-cuda-py">
diff --git a/docs/how_to/optimize_operators/opt_conv_tensorcore.html b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
index f06c384e8..53ec63e63 100644
--- a/docs/how_to/optimize_operators/opt_conv_tensorcore.html
+++ b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
@@ -901,7 +901,7 @@ be able to run on our build server</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"conv2d with tensor core: </span><span class="si">%f</span><span class="s2"> ms"</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">* [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 7.021100 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 7.704298 ms
</pre></div>
</div>
</div>
diff --git a/docs/how_to/optimize_operators/opt_gemm.html b/docs/how_to/optimize_operators/opt_gemm.html
index 0d4ef591a..3d45f3f89 100644
--- a/docs/how_to/optimize_operators/opt_gemm.html
+++ b/docs/how_to/optimize_operators/opt_gemm.html
@@ -456,8 +456,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
<span class="nb">print</span><span class="p">(</span><span class="s2">"Baseline: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.019769
-Baseline: 3.259536
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.018352
+Baseline: 3.511263
</pre></div>
</div>
<p>In TVM, we can always inspect lower level IR to debug or optimize our schedule.
@@ -517,7 +517,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt1: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.330347
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.307780
</pre></div>
</div>
<p>Here is the generated IR after blocking.</p>
@@ -584,7 +584,7 @@ vastly.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt2: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.343379
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.339759
</pre></div>
</div>
<p>Here is the generated IR after vectorization.</p>
@@ -645,7 +645,7 @@ the access pattern for A matrix is more cache friendly.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt3: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.137935
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.115245
</pre></div>
</div>
<p>Here is the generated IR after loop permutation.</p>
@@ -728,7 +728,7 @@ flattening.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt4: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.112368
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.109447
</pre></div>
</div>
<p>Here is the generated IR after array packing.</p>
@@ -814,7 +814,7 @@ write to C when all the block results are ready.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt5: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.113770
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.111666
</pre></div>
</div>
<p>Here is the generated IR after blocking.</p>
@@ -904,7 +904,7 @@ write to C when all the block results are ready.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt6: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">opt6_time</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.147954
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.144600
</pre></div>
</div>
<p>Here is the generated IR after parallelization.</p>
diff --git a/docs/how_to/optimize_operators/sg_execution_times.html b/docs/how_to/optimize_operators/sg_execution_times.html
index dd0f64b02..7c2a10034 100644
--- a/docs/how_to/optimize_operators/sg_execution_times.html
+++ b/docs/how_to/optimize_operators/sg_execution_times.html
@@ -322,7 +322,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-optimize-operators-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:35.100</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
+<p><strong>00:34.856</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 83%" />
@@ -331,15 +331,15 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="opt_gemm.html#sphx-glr-how-to-optimize-operators-opt-gemm-py"><span class="std std-ref">How to optimize GEMM on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_gemm.py</span></code>)</p></td>
-<td><p>00:32.752</p></td>
+<td><p>00:32.567</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="opt_conv_tensorcore.html#sphx-glr-how-to-optimize-operators-opt-conv-tensorcore-py"><span class="std std-ref">How to optimize convolution using TensorCores</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_tensorcore.py</span></code>)</p></td>
-<td><p>00:01.279</p></td>
+<td><p>00:01.269</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="opt_conv_cuda.html#sphx-glr-how-to-optimize-operators-opt-conv-cuda-py"><span class="std std-ref">How to optimize convolution on GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_cuda.py</span></code>)</p></td>
-<td><p>00:01.069</p></td>
+<td><p>00:01.020</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
index fe6e88f23..fc075be67 100644
--- a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
+++ b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
@@ -322,7 +322,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-tune-with-autoscheduler-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>05:27.036</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
+<p><strong>05:35.790</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 85%" />
@@ -331,27 +331,27 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_layer_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py"><span class="std std-ref">Auto-scheduling a Convolution Layer for GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_layer_cuda.py</span></code>)</p></td>
-<td><p>02:40.059</p></td>
+<td><p>02:54.405</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_network_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-x86-py"><span class="std std-ref">Auto-scheduling a Neural Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_x86.py</span></code>)</p></td>
-<td><p>01:22.883</p></td>
+<td><p>01:21.949</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_network_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-cuda-py"><span class="std std-ref">Auto-scheduling a Neural Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_cuda.py</span></code>)</p></td>
-<td><p>00:44.603</p></td>
+<td><p>00:43.870</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_sparse_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-sparse-x86-py"><span class="std std-ref">Auto-scheduling Sparse Matrix Multiplication on CPU with Custom Sketch Rule</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_sparse_x86.py</span></code>)</p></td>
-<td><p>00:21.244</p></td>
+<td><p>00:18.355</p></td>
<td><p>0.0 MB</p></td>
</tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="tune_network_mali.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-mali-py"><span class="std std-ref">Auto-scheduling a Neural Network for mali GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_mali.py</span></code>)</p></td>
-<td><p>00:09.256</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="tune_network_arm.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-arm-py"><span class="std std-ref">Auto-scheduling a Neural Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_arm.py</span></code>)</p></td>
+<td><p>00:08.789</p></td>
<td><p>0.0 MB</p></td>
</tr>
-<tr class="row-even"><td><p><a class="reference internal" href="tune_network_arm.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-arm-py"><span class="std std-ref">Auto-scheduling a Neural Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_arm.py</span></code>)</p></td>
-<td><p>00:08.990</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="tune_network_mali.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-mali-py"><span class="std std-ref">Auto-scheduling a Neural Network for mali GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_mali.py</span></code>)</p></td>
+<td><p>00:08.421</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
index 939395c7d..e587c893e 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
@@ -486,471 +486,912 @@ cooperative fetching, unrolling and operator fusion.</p>
compute: Buffer(compute_2: Pointer(float32), float32, [25088], [])}
buffer_map = {data_1: data, kernel_1: kernel, bias_1: bias, compute_1: compute}
preflattened_buffer_map = {data_1: data_3: Buffer(data_2, float32, [1, 512, 7, 7], []), kernel_1: kernel_3: Buffer(kernel_2, float32, [512, 512, 3, 3], []), bias_1: bias_3: Buffer(bias_2, float32, [1, 512, 1, 1], []), compute_1: compute_3: Buffer(compute_2, float32, [1, 512, 7, 7], [])} {
- attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 32;
- allocate(conv2d_nchw: Pointer(local float32), float32, [14]), storage_scope = local;
- allocate(pad_temp.shared: Pointer(shared float32), float32, [1296]), storage_scope = shared;
- allocate(kernel.shared: Pointer(shared float32), float32, [2304]), storage_scope = shared;
- attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
- conv2d_nchw_1: Buffer(conv2d_nchw, float32, [14], [], scope="local", align=32)[0] = 0f32
- conv2d_nchw_1[1] = 0f32
+ attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 64;
+ allocate(conv2d_nchw: Pointer(local float32), float32, [4]), storage_scope = local;
+ allocate(pad_temp.shared: Pointer(shared float32), float32, [4032]), storage_scope = shared;
+ allocate(kernel.shared: Pointer(shared float32), float32, [1536]), storage_scope = shared;
+ attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98 {
+ conv2d_nchw_1: Buffer(conv2d_nchw, float32, [4], [], scope="local", align=8)[0] = 0f32
conv2d_nchw_1[2] = 0f32
+ conv2d_nchw_1[1] = 0f32
conv2d_nchw_1[3] = 0f32
- conv2d_nchw_1[4] = 0f32
- conv2d_nchw_1[5] = 0f32
- conv2d_nchw_1[6] = 0f32
- conv2d_nchw_1[7] = 0f32
- conv2d_nchw_1[8] = 0f32
- conv2d_nchw_1[9] = 0f32
- conv2d_nchw_1[10] = 0f32
- conv2d_nchw_1[11] = 0f32
- conv2d_nchw_1[12] = 0f32
- conv2d_nchw_1[13] = 0f32
- for (rc.outer.outer: int32, 0, 32) {
- let cse_var_2: int32 = (rc.outer.outer*784)
- let cse_var_1: int32 = (rc.outer.outer*144)
- {
- attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1: Buffer(pad_temp.shared, float32, [1296], [], scope="shared")[threadIdx.x_1] = @tir.if_then_else((((9 <= threadIdx.x_1) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), data[(((cse_var_2 + (floordiv(threadIdx.x_1, 9)*7)) + floormod(threadIdx.x_1, 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 56)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 56), 81)) && (floormod((threadIdx.x_1 + 56), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 2), 9))) && (floormod((threadIdx.x_1 + 2), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 56), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 56), 81), 9)*7)) + floormod((threadIdx.x_1 + 2), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 112)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 31), 81)) && (floormod((threadIdx.x_1 + 31), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 4), 9))) && (floormod((threadIdx.x_1 + 4), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 112), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 31), 81), 9)*7)) + floormod((threadIdx.x_1 + 4), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 168)] = @tir.if_then_else((((9 <= floormod((threadIdx.x_1 + 6), 81)) && (1 <= floormod((threadIdx.x_1 + 6), 9))) && (floormod((threadIdx.x_1 + 6), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 168), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 6), 81), 9)*7)) + floormod((threadIdx.x_1 + 6), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 224)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 62), 81)) && (floormod((threadIdx.x_1 + 62), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 8), 9))) && (floormod((threadIdx.x_1 + 8), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 224), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 62), 81), 9)*7)) + floormod((threadIdx.x_1 + 8), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 280)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 37), 81)) && (floormod((threadIdx.x_1 + 37), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 1), 9))) && (floormod((threadIdx.x_1 + 1), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 280), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 37), 81), 9)*7)) + floormod((threadIdx.x_1 + 1), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 336)] = @tir.if_then_else(((1 <= floormod((threadIdx.x_1 + 3), 9)) && (floormod((threadIdx.x_1 + 3), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 336), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 12), 81), 9)*7)) + floormod((threadIdx.x_1 + 3), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 392)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 68), 81)) && (floormod((threadIdx.x_1 + 68), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 5), 9))) && (floormod((threadIdx.x_1 + 5), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 392), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 68), 81), 9)*7)) + floormod((threadIdx.x_1 + 5), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 448)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 43), 81)) && (floormod((threadIdx.x_1 + 43), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 7), 9))) && (floormod((threadIdx.x_1 + 7), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 448), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 43), 81), 9)*7)) + floormod((threadIdx.x_1 + 7), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 504)] = @tir.if_then_else((((threadIdx.x_1 < 54) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 504), 81)*49)) + ((floordiv(threadIdx.x_1, 9) + 2)*7)) + floormod(threadIdx.x_1, 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 560)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 74), 81)) && (floormod((threadIdx.x_1 + 74), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 2), 9))) && (floormod((threadIdx.x_1 + 2), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 560), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 74), 81), 9)*7)) + floormod((threadIdx.x_1 + 2), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 616)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 49), 81)) && (floormod((threadIdx.x_1 + 49), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 4), 9))) && (floormod((threadIdx.x_1 + 4), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 616), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 49), 81), 9)*7)) + floormod((threadIdx.x_1 + 4), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 672)] = @tir.if_then_else((((threadIdx.x_1 < 48) && (1 <= floormod((threadIdx.x_1 + 6), 9))) && (floormod((threadIdx.x_1 + 6), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 672), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 24), 81), 9)*7)) + floormod((threadIdx.x_1 + 6), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 728)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 80), 81)) && (floormod((threadIdx.x_1 + 80), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 8), 9))) && (floormod((threadIdx.x_1 + 8), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 728), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 80), 81), 9)*7)) + floormod((threadIdx.x_1 + 8), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 784)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 55), 81)) && (floormod((threadIdx.x_1 + 55), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 1), 9))) && (floormod((threadIdx.x_1 + 1), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 784), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 55), 81), 9)*7)) + floormod((threadIdx.x_1 + 1), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 840)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 30), 81)) && (floormod((threadIdx.x_1 + 30), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 3), 9))) && (floormod((threadIdx.x_1 + 3), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 840), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 30), 81), 9)*7)) + floormod((threadIdx.x_1 + 3), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 896)] = @tir.if_then_else((((9 <= floormod((threadIdx.x_1 + 5), 81)) && (1 <= floormod((threadIdx.x_1 + 5), 9))) && (floormod((threadIdx.x_1 + 5), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 896), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 5), 81), 9)*7)) + floormod((threadIdx.x_1 + 5), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 952)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 61), 81)) && (floormod((threadIdx.x_1 + 61), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 7), 9))) && (floormod((threadIdx.x_1 + 7), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 952), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 61), 81), 9)*7)) + floormod((threadIdx.x_1 + 7), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 1008)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 9) + 4), 9)) && (floormod((threadIdx.x_1 + 36), 81) < 72)) && (1 <= floormod(threadIdx.x_1, 9))) && (floormod(threadIdx.x_1, 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1008), 81)*49)) + (floormod((floordiv(threadIdx.x_1, 9) + 4), 9)*7)) + floormod(threadIdx.x_1, 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 1064)] = @tir.if_then_else(((1 <= floormod((threadIdx.x_1 + 2), 9)) && (floormod((threadIdx.x_1 + 2), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1064), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 11), 81), 9)*7)) + floormod((threadIdx.x_1 + 2), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 1120)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 67), 81)) && (floormod((threadIdx.x_1 + 67), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 4), 9))) && (floormod((threadIdx.x_1 + 4), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1120), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 67), 81), 9)*7)) + floormod((threadIdx.x_1 + 4), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 1176)] = @tir.if_then_else(((((9 <= floormod((threadIdx.x_1 + 42), 81)) && (floormod((threadIdx.x_1 + 42), 81) < 72)) && (1 <= floormod((threadIdx.x_1 + 6), 9))) && (floormod((threadIdx.x_1 + 6), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1176), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 42), 81), 9)*7)) + floormod((threadIdx.x_1 + 6), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- pad_temp.shared_1[(threadIdx.x_1 + 1232)] = @tir.if_then_else((((threadIdx.x_1 < 55) && (1 <= floormod((threadIdx.x_1 + 8), 9))) && (floormod((threadIdx.x_1 + 8), 9) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 1232), 81)*49)) + (floordiv(floormod((threadIdx.x_1 + 17), 81), 9)*7)) + floormod((threadIdx.x_1 + 8), 9)) - 8)], 0f32, dtype=float32)
- attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56;
- if @tir.likely((threadIdx.x_1 < 8), dtype=bool) {
- pad_temp.shared_1[(threadIdx.x_1 + 1288)] = 0f32
- }
- attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
- kernel.shared_1: Buffer(kernel.shared, float32, [2304], [], scope="shared")[(threadIdx.x_2*32)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floordiv(floormod((threadIdx.x_2*32), 144), 3)*3)) + floormod((threadIdx.x_2*2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 1)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 1), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 2)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 2), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 3)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 1), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 4)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 4), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 5)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 5), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 6)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 2), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 7)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 7), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 8)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 8), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 9)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 3), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 10)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 10), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 11)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 11), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 12)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 4), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 13)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 13), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 14)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 14), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 15)] = kernel[(((((blockIdx.x*73728) + (floordiv((threadIdx.x_2*2), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 5), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 16)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 16), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 17)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 17), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 18)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 6), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 19)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 19), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 20)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 20), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 21)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 7), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 22)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 22), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 23)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 23), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 24)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 8), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 25)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 25), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 26)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 26), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 27)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 9), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 28)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 28), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 29)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 29), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 30)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 10), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- kernel.shared_1[((threadIdx.x_2*32) + 31)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 1), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 31), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- }
- attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 56 {
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1792)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 64), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1793)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 65), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1794)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 22), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1795)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*32) + 1792), 3) + 1), 48)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1796)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 68), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1797)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 23), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1798)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*32) + 1792), 3) + 2), 48)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1799)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 71), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1800)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 24), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1801)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*32) + 1792), 3) + 3), 48)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1802)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 74), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1803)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 25), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1804)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*32) + 1792), 3) + 4), 48)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1805)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 77), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1806)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 26), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1807)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 112), 9)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*32) + 1792), 3) + 5), 48)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
+ for (rc.outer.outer: int32, 0, 8) {
+ for (rx.outer.outer: int32, 0, 3) {
+ let cse_var_2: int32 = (rc.outer.outer*3136)
+ let cse_var_1: int32 = (rc.outer.outer*576)
+ {
+ attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1: Buffer(pad_temp.shared, float32, [4032], [], scope="shared")[threadIdx.x_1] = @tir.if_then_else(((((7 <= floormod(threadIdx.x_1, 63)) && (floormod(threadIdx.x_1, 63) < 56)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[((((cse_var_2 + (floordiv(threadIdx.x_1, 63)*49)) + rx.outer.outer) + floormod(threadIdx.x_1, 63)) - 8)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 98)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 5), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 98), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 5), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1, [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 196)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 1), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 196), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 1), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 294)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 6), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 294), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 6), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 392)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 2), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 392), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 2), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 490)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 7), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 490), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 7), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 588)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 3), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 588), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 3), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 686)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 8), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 686), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 8), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 784)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 4), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 784), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 4), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 882)] = @tir.if_then_else(((((7 <= floormod(threadIdx.x_1, 63)) && (floormod(threadIdx.x_1, 63) < 56)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[((((cse_var_2 + (floordiv(threadIdx.x_1, 63)*49)) + rx.outer.outer) + floormod(threadIdx.x_1, 63)) + 678)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 980)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 5), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 980), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 5), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x_1 [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 1078)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 1), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1078), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 1), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 1176)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 6), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1176), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 6), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 1274)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 2), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1274), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 2), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 1372)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 7), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1372), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 7), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 1470)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 3), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1470), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 3), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 1568)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 8), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1568), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 8), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 1666)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 4), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1666), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 4), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 1764)] = @tir.if_then_else(((((7 <= floormod(threadIdx.x_1, 63)) && (floormod(threadIdx.x_1, 63) < 56)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[((((cse_var_2 + (floordiv(threadIdx.x_1, 63)*49)) + rx.outer.outer) + floormod(threadIdx.x_1, 63)) + 1364)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 1862)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 5), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1862), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 5), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 1960)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 1), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 1960), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 1), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 2058)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 6), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 2058), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 6), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 2156)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 2), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 2156), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 2), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 2254)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 7), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 2254), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 7), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 2352)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 3), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 2352), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 3), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 2450)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 8), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 2450), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 8), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 2548)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 4), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 2548), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 4), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 2646)] = @tir.if_then_else(((((7 <= floormod(threadIdx.x_1, 63)) && (floormod(threadIdx.x_1, 63) < 56)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[((((cse_var_2 + (floordiv(threadIdx.x_1, 63)*49)) + rx.outer.outer) + floormod(threadIdx.x_1, 63)) + 2050)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 2744)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 5), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 2744), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 5), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 2842)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 1), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 2842), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 1), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 2940)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 6), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 2940), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 6), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 3038)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 2), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 3038), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 2), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 3136)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 7), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 7), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 3136), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 7), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 3234)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 3), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 3), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 3234), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 3), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 3332)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 8), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 8), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 3332), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 8), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 3430)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 4), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 4), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 3430), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 4), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 3528)] = @tir.if_then_else(((((7 <= floormod(threadIdx.x_1, 63)) && (floormod(threadIdx.x_1, 63) < 56)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[((((cse_var_2 + (floordiv(threadIdx.x_1, 63)*49)) + rx.outer.outer) + floormod(threadIdx.x_1, 63)) + 2736)], 0f32, dtype=float32)
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 3626)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 5), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 5), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 3626), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 5), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 3724)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 1), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 1), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 3724), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 1), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 3822)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 6), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 6), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 3822), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 6), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ pad_temp.shared_1[(threadIdx.x_1 + 3920)] = @tir.if_then_else(((((1 <= floormod((floordiv(threadIdx.x_1, 7) + 2), 9)) && (floormod((floordiv(threadIdx.x_1, 7) + 2), 9) < 8)) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[(((((cse_var_2 + (floordiv((threadIdx.x_1 + 3920), 63)*49)) + (floormod((floordiv(threadIdx.x_1, 7) + 2), 9)*7)) + rx.outer.outer) + floormod(threadIdx.x [...]
+ attr [IterVar(threadIdx.x_1, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98;
+ if @tir.likely((threadIdx.x_1 < 14), dtype=bool) {
+ pad_temp.shared_1[(threadIdx.x_1 + 4018)] = @tir.if_then_else((((threadIdx.x_1 < 7) && (1 <= (rx.outer.outer + floormod(threadIdx.x_1, 7)))) && ((rx.outer.outer + floormod(threadIdx.x_1, 7)) < 8)), data[((((cse_var_2 + (floordiv((threadIdx.x_1 + 4018), 63)*49)) + rx.outer.outer) + threadIdx.x_1) + 41)], 0f32, dtype=float32)
}
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1808)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 80), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
+ attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98 {
+ kernel.shared_1: Buffer(kernel.shared, float32, [1536], [], scope="shared")[(threadIdx.x_2*4)] = kernel[((((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 48)*4608)) + cse_var_1) + (floordiv((floormod(threadIdx.x_2, 48)*4), 3)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ kernel.shared_1[((threadIdx.x_2*4) + 1)] = kernel[((((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 48)*4608)) + cse_var_1) + (floordiv(((floormod(threadIdx.x_2, 48)*4) + 1), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ kernel.shared_1[((threadIdx.x_2*4) + 2)] = kernel[((((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 48)*4608)) + cse_var_1) + (floordiv(((floormod(threadIdx.x_2, 48)*4) + 2), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ kernel.shared_1[((threadIdx.x_2*4) + 3)] = kernel[((((((blockIdx.x*36864) + (floordiv(threadIdx.x_2, 48)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*4), 3) + 1), 64)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
}
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1809)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 27), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98 {
+ kernel.shared_1[((threadIdx.x_2*4) + 392)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 98), 48)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*4) + 8), 192), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ kernel.shared_1[((threadIdx.x_2*4) + 393)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 98), 48)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*4), 3) + 3), 64)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ kernel.shared_1[((threadIdx.x_2*4) + 394)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 98), 48)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*4) + 10), 192), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ kernel.shared_1[((threadIdx.x_2*4) + 395)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 98), 48)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*4) + 392), 3) + 1), 64)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
}
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1810)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*32) + 1792), 3) + 6), 48)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98 {
+ kernel.shared_1[((threadIdx.x_2*4) + 784)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 196), 48)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*4) + 16), 192), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ kernel.shared_1[((threadIdx.x_2*4) + 785)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 196), 48)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*4) + 17), 192), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ kernel.shared_1[((threadIdx.x_2*4) + 786)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 196), 48)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*4), 3) + 6), 64)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ kernel.shared_1[((threadIdx.x_2*4) + 787)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 196), 48)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*4) + 784), 3) + 1), 64)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
}
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1811)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 83), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1812)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 28), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1813)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*32) + 1792), 3) + 7), 48)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1814)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 86), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1815)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 29), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1816)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*32) + 1792), 3) + 8), 48)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1817)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 89), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1818)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 30), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1819)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*32) + 1792), 3) + 9), 48)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1820)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 92), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1821)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*32), 3) + 31), 48)*3)) + floormod((threadIdx.x_2*2), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1822)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floormod((floordiv(((threadIdx.x_2*32) + 1792), 3) + 10), 48)*3)) + floormod(((threadIdx.x_2*2) + 1), 3))]
- }
- if @tir.likely((threadIdx.x_2 < 16), dtype=bool) {
- kernel.shared_1[((threadIdx.x_2*32) + 1823)] = kernel[(((((blockIdx.x*73728) + (floordiv(((threadIdx.x_2*2) + 113), 9)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*32) + 95), 144), 3)*3)) + floormod(((threadIdx.x_2*2) + 2), 3))]
+ attr [IterVar(threadIdx.x_2, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 98 {
+ if @tir.likely((threadIdx.x_2 < 90), dtype=bool) {
+ kernel.shared_1[((threadIdx.x_2*4) + 1176)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 294), 48)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*4), 3) + 8), 64)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ }
+ if @tir.likely((threadIdx.x_2 < 90), dtype=bool) {
+ kernel.shared_1[((threadIdx.x_2*4) + 1177)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 294), 48)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*4) + 25), 192), 3)*9)) + (floormod((threadIdx.x_2 + 1), 3)*3)) + rx.outer.outer)]
+ }
+ if @tir.likely((threadIdx.x_2 < 90), dtype=bool) {
+ kernel.shared_1[((threadIdx.x_2*4) + 1178)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 294), 48)*4608)) + cse_var_1) + (floordiv(floormod(((threadIdx.x_2*4) + 26), 192), 3)*9)) + (floormod((threadIdx.x_2 + 2), 3)*3)) + rx.outer.outer)]
+ }
+ if @tir.likely((threadIdx.x_2 < 90), dtype=bool) {
+ kernel.shared_1[((threadIdx.x_2*4) + 1179)] = kernel[((((((blockIdx.x*36864) + (floordiv((threadIdx.x_2 + 294), 48)*4608)) + cse_var_1) + (floormod((floordiv((threadIdx.x_2*4), 3) + 9), 64)*9)) + (floormod(threadIdx.x_2, 3)*3)) + rx.outer.outer)]
+ }
}
- }
- for (rc.outer.inner: int32, 0, 8) {
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9))]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 1)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 2)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 9)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 10)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 11)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 18)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 19)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 81)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 82)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 83)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 90)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 92)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 99)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 100)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
- conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 1)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 2)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 3)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 10)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 11)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 12)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 19)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 21)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 82)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 83)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 92)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 93)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 100)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
- conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 102)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 2)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 3)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 4)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 11)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 12)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 13)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 21)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 22)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 83)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 85)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 92)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 93)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 94)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 102)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
- conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 103)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 3)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 4)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 5)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 12)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 13)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 14)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 21)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 22)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 23)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 85)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 86)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 93)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 94)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 95)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 102)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 103)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
- conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 104)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 4)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 5)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 6)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 13)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 14)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 15)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 22)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 23)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 24)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 85)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 86)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 87)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 94)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 95)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 96)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 103)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 104)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
- conv2d_nchw_1[4] = (conv2d_nchw_1[4] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 5)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 6)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 7)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 14)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 15)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 16)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 23)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 24)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 25)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 86)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 87)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 88)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 95)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 96)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 97)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 104)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
- conv2d_nchw_1[5] = (conv2d_nchw_1[5] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 106)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 6)]*kernel.shared_1[((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18))]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 7)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 1)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 8)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 2)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 15)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 3)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 16)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 4)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 17)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 5)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 24)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 6)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 25)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 7)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 26)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 8)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 87)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 9)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 88)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 10)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 89)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 11)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 96)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 12)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 97)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 13)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 98)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 14)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 15)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 106)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 16)]))
- conv2d_nchw_1[6] = (conv2d_nchw_1[6] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 107)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 17)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9))]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 1)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 2)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 9)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 10)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 11)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 18)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 19)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 81)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 82)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 83)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 90)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 92)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 99)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 100)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
- conv2d_nchw_1[7] = (conv2d_nchw_1[7] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 1)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 2)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 3)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 10)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 11)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 12)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 19)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 21)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 82)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 83)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 91)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 92)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 93)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 100)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
- conv2d_nchw_1[8] = (conv2d_nchw_1[8] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 102)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 2)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 3)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 4)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 11)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 12)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 13)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 20)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 21)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 22)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 83)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 85)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 92)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 93)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 94)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 101)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 102)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
- conv2d_nchw_1[9] = (conv2d_nchw_1[9] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 103)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 3)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 4)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 5)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 12)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 13)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 14)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 21)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 22)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 23)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 84)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 85)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 86)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 93)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 94)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 95)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 102)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 103)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
- conv2d_nchw_1[10] = (conv2d_nchw_1[10] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 104)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 4)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 5)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 6)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 13)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 14)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 15)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 22)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 23)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 24)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 85)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 86)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 87)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 94)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 95)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 96)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 103)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 104)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
- conv2d_nchw_1[11] = (conv2d_nchw_1[11] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 5)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 6)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 7)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 14)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 15)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 16)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 23)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 24)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 25)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 86)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 87)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 88)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 95)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 96)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 97)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 104)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
- conv2d_nchw_1[12] = (conv2d_nchw_1[12] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 106)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 6)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 144)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 7)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 145)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 8)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 146)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 15)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 147)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 16)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 148)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 17)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 149)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 24)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 150)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 25)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 151)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 26)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 152)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 87)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 153)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 88)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 154)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 89)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 155)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 96)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 156)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 97)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 157)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 98)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 158)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 105)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 159)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 106)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 160)]))
- conv2d_nchw_1[13] = (conv2d_nchw_1[13] + (pad_temp.shared_1[(((rc.outer.inner*162) + (floormod(threadIdx.x, 7)*9)) + 107)]*kernel.shared_1[(((floordiv(threadIdx.x, 7)*288) + (rc.outer.inner*18)) + 161)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[floormod(threadIdx.x, 49)]*kernel.shared_1[(floordiv(threadIdx.x, 49)*384)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[floormod(threadIdx.x, 49)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 768)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[floormod(threadIdx.x, 49)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 192)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[floormod(threadIdx.x, 49)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 960)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 7)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 7)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 769)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 7)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 193)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 7)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 961)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 14)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 2)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 14)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 770)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 14)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 194)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 14)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 962)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 63)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 3)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 63)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 771)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 63)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 195)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 63)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 963)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 70)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 4)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 70)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 772)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 70)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 196)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 70)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 964)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 77)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 5)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 77)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 773)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 77)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 197)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 77)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 965)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 126)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 6)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 126)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 774)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 126)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 198)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 126)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 966)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 133)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 7)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 133)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 775)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 133)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 199)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 133)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 967)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 140)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 8)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 140)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 776)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 140)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 200)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 140)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 968)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 189)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 9)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 189)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 777)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 189)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 201)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 189)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 969)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 196)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 10)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 196)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 778)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 196)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 202)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 196)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 970)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 203)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 11)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 203)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 779)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 203)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 203)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 203)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 971)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 252)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 12)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 252)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 780)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 252)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 204)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 252)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 972)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 259)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 13)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 259)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 781)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 259)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 205)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 259)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 973)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 266)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 14)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 266)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 782)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 266)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 206)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 266)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 974)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 315)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 15)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 315)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 783)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 315)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 207)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 315)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 975)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 322)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 16)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 322)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 784)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 322)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 208)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 322)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 976)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 329)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 17)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 329)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 785)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 329)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 209)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 329)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 977)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 378)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 18)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 378)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 786)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 378)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 210)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 378)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 978)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 385)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 19)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 385)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 787)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 385)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 211)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 385)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 979)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 392)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 20)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 392)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 788)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 392)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 212)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 392)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 980)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 441)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 21)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 441)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 789)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 441)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 213)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 441)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 981)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 448)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 22)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 448)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 790)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 448)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 214)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 448)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 982)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 455)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 23)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 455)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 791)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 455)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 215)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 455)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 983)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 504)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 24)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 504)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 792)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 504)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 216)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 504)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 984)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 511)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 25)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 511)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 793)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 511)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 217)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 511)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 985)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 518)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 26)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 518)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 794)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 518)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 218)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 518)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 986)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 567)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 27)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 567)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 795)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 567)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 219)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 567)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 987)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 574)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 28)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 574)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 796)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 574)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 220)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 574)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 988)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 581)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 29)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 581)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 797)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 581)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 221)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 581)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 989)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 630)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 30)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 630)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 798)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 630)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 222)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 630)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 990)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 637)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 31)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 637)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 799)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 637)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 223)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 637)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 991)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 644)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 32)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 644)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 800)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 644)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 224)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 644)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 992)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 693)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 33)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 693)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 801)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 693)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 225)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 693)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 993)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 700)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 34)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 700)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 802)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 700)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 226)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 700)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 994)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 707)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 35)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 707)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 803)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 707)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 227)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 707)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 995)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 756)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 36)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 756)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 804)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 756)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 228)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 756)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 996)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 763)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 37)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 763)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 805)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 763)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 229)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 763)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 997)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 770)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 38)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 770)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 806)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 770)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 230)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 770)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 998)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 819)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 39)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 819)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 807)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 819)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 231)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 819)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 999)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 826)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 40)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 826)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 808)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 826)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 232)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 826)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1000)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 833)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 41)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 833)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 809)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 833)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 233)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 833)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1001)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 882)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 42)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 882)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 810)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 882)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 234)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 882)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1002)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 889)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 43)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 889)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 811)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 889)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 235)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 889)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1003)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 896)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 44)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 896)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 812)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 896)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 236)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 896)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1004)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 945)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 45)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 945)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 813)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 945)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 237)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 945)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1005)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 952)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 46)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 952)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 814)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 952)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 238)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 952)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1006)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 959)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 47)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 959)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 815)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 959)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 239)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 959)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1007)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1008)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 48)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1008)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 816)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1008)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 240)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1008)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1008)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1015)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 49)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1015)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 817)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1015)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 241)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1015)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1009)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1022)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 50)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1022)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 818)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1022)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 242)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1022)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1010)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1071)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 51)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1071)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 819)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1071)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 243)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1071)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1011)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1078)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 52)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1078)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 820)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1078)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 244)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1078)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1012)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1085)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 53)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1085)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 821)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1085)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 245)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1085)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1013)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1134)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 54)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1134)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 822)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1134)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 246)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1134)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1014)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1141)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 55)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1141)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 823)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1141)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 247)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1141)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1015)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1148)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 56)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1148)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 824)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1148)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 248)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1148)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1016)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1197)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 57)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1197)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 825)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1197)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 249)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1197)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1017)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1204)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 58)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1204)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 826)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1204)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 250)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1204)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1018)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1211)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 59)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1211)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 827)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1211)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 251)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1211)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1019)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1260)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 60)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1260)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 828)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1260)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 252)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1260)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1020)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1267)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 61)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1267)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 829)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1267)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 253)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1267)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1021)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1274)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 62)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1274)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 830)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1274)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 254)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1274)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1022)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1323)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 63)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1323)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 831)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1323)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 255)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1323)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1023)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1330)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 64)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1330)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 832)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1330)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 256)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1330)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1024)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1337)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 65)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1337)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 833)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1337)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 257)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1337)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1025)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1386)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 66)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1386)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 834)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1386)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 258)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1386)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1026)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1393)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 67)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1393)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 835)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1393)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 259)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1393)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1027)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1400)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 68)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1400)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 836)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1400)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 260)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1400)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1028)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1449)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 69)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1449)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 837)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1449)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 261)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1449)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1029)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1456)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 70)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1456)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 838)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1456)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 262)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1456)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1030)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1463)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 71)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1463)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 839)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1463)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 263)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1463)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1031)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1512)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 72)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1512)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 840)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1512)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 264)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1512)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1032)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1519)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 73)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1519)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 841)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1519)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 265)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1519)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1033)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1526)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 74)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1526)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 842)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1526)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 266)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1526)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1034)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1575)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 75)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1575)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 843)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1575)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 267)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1575)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1035)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1582)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 76)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1582)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 844)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1582)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 268)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1582)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1036)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1589)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 77)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1589)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 845)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1589)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 269)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1589)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1037)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1638)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 78)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1638)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 846)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1638)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 270)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1638)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1038)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1645)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 79)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1645)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 847)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1645)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 271)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1645)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1039)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1652)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 80)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1652)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 848)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1652)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 272)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1652)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1040)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1701)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 81)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1701)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 849)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1701)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 273)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1701)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1041)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1708)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 82)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1708)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 850)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1708)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 274)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1708)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1042)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1715)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 83)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1715)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 851)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1715)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 275)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1715)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1043)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1764)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 84)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1764)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 852)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1764)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 276)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1764)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1044)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1771)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 85)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1771)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 853)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1771)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 277)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1771)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1045)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1778)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 86)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1778)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 854)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1778)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 278)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1778)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1046)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1827)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 87)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1827)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 855)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1827)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 279)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1827)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1047)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1834)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 88)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1834)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 856)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1834)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 280)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1834)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1048)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1841)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 89)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1841)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 857)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1841)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 281)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1841)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1049)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1890)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 90)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1890)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 858)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1890)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 282)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1890)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1050)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1897)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 91)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1897)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 859)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1897)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 283)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1897)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1051)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1904)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 92)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1904)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 860)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1904)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 284)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1904)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1052)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1953)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 93)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1953)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 861)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1953)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 285)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1953)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1053)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1960)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 94)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1960)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 862)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1960)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 286)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1960)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1054)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1967)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 95)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1967)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 863)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1967)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 287)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 1967)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1055)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2016)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 96)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2016)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 864)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2016)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 288)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2016)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1056)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2023)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 97)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2023)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 865)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2023)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 289)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2023)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1057)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2030)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 98)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2030)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 866)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2030)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 290)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2030)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1058)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2079)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 99)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2079)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 867)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2079)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 291)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2079)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1059)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2086)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 100)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2086)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 868)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2086)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 292)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2086)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1060)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2093)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 101)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2093)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 869)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2093)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 293)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2093)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1061)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2142)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 102)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2142)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 870)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2142)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 294)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2142)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1062)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2149)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 103)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2149)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 871)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2149)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 295)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2149)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1063)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2156)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 104)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2156)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 872)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2156)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 296)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2156)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1064)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2205)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 105)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2205)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 873)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2205)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 297)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2205)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1065)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2212)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 106)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2212)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 874)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2212)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 298)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2212)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1066)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2219)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 107)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2219)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 875)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2219)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 299)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2219)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1067)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2268)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 108)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2268)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 876)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2268)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 300)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2268)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1068)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2275)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 109)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2275)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 877)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2275)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 301)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2275)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1069)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2282)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 110)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2282)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 878)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2282)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 302)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2282)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1070)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2331)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 111)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2331)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 879)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2331)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 303)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2331)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1071)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2338)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 112)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2338)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 880)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2338)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 304)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2338)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1072)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2345)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 113)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2345)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 881)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2345)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 305)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2345)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1073)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2394)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 114)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2394)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 882)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2394)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 306)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2394)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1074)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2401)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 115)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2401)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 883)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2401)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 307)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2401)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1075)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2408)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 116)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2408)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 884)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2408)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 308)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2408)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1076)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2457)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 117)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2457)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 885)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2457)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 309)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2457)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1077)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2464)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 118)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2464)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 886)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2464)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 310)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2464)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1078)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2471)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 119)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2471)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 887)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2471)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 311)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2471)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1079)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2520)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 120)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2520)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 888)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2520)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 312)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2520)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1080)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2527)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 121)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2527)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 889)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2527)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 313)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2527)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1081)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2534)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 122)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2534)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 890)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2534)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 314)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2534)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1082)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2583)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 123)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2583)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 891)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2583)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 315)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2583)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1083)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2590)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 124)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2590)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 892)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2590)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 316)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2590)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1084)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2597)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 125)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2597)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 893)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2597)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 317)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2597)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1085)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2646)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 126)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2646)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 894)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2646)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 318)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2646)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1086)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2653)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 127)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2653)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 895)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2653)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 319)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2653)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1087)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2660)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 128)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2660)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 896)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2660)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 320)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2660)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1088)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2709)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 129)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2709)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 897)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2709)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 321)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2709)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1089)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2716)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 130)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2716)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 898)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2716)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 322)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2716)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1090)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2723)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 131)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2723)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 899)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2723)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 323)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2723)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1091)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2772)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 132)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2772)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 900)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2772)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 324)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2772)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1092)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2779)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 133)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2779)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 901)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2779)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 325)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2779)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1093)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2786)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 134)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2786)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 902)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2786)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 326)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2786)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1094)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2835)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 135)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2835)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 903)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2835)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 327)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2835)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1095)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2842)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 136)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2842)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 904)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2842)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 328)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2842)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1096)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2849)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 137)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2849)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 905)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2849)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 329)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2849)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1097)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2898)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 138)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2898)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 906)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2898)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 330)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2898)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1098)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2905)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 139)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2905)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 907)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2905)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 331)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2905)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1099)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2912)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 140)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2912)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 908)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2912)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 332)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2912)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1100)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2961)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 141)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2961)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 909)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2961)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 333)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2961)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1101)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2968)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 142)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2968)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 910)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2968)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 334)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2968)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1102)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2975)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 143)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2975)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 911)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2975)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 335)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 2975)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1103)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3024)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 144)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3024)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 912)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3024)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 336)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3024)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1104)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3031)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 145)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3031)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 913)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3031)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 337)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3031)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1105)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3038)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 146)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3038)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 914)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3038)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 338)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3038)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1106)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3087)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 147)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3087)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 915)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3087)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 339)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3087)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1107)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3094)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 148)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3094)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 916)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3094)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 340)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3094)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1108)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3101)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 149)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3101)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 917)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3101)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 341)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3101)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1109)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3150)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 150)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3150)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 918)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3150)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 342)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3150)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1110)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3157)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 151)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3157)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 919)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3157)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 343)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3157)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1111)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3164)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 152)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3164)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 920)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3164)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 344)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3164)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1112)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3213)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 153)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3213)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 921)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3213)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 345)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3213)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1113)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3220)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 154)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3220)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 922)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3220)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 346)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3220)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1114)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3227)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 155)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3227)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 923)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3227)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 347)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3227)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1115)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3276)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 156)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3276)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 924)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3276)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 348)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3276)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1116)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3283)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 157)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3283)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 925)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3283)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 349)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3283)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1117)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3290)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 158)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3290)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 926)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3290)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 350)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3290)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1118)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3339)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 159)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3339)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 927)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3339)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 351)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3339)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1119)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3346)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 160)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3346)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 928)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3346)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 352)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3346)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1120)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3353)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 161)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3353)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 929)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3353)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 353)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3353)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1121)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3402)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 162)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3402)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 930)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3402)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 354)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3402)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1122)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3409)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 163)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3409)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 931)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3409)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 355)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3409)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1123)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3416)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 164)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3416)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 932)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3416)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 356)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3416)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1124)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3465)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 165)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3465)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 933)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3465)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 357)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3465)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1125)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3472)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 166)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3472)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 934)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3472)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 358)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3472)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1126)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3479)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 167)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3479)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 935)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3479)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 359)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3479)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1127)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3528)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 168)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3528)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 936)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3528)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 360)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3528)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1128)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3535)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 169)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3535)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 937)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3535)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 361)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3535)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1129)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3542)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 170)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3542)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 938)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3542)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 362)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3542)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1130)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3591)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 171)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3591)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 939)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3591)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 363)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3591)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1131)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3598)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 172)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3598)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 940)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3598)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 364)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3598)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1132)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3605)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 173)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3605)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 941)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3605)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 365)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3605)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1133)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3654)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 174)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3654)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 942)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3654)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 366)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3654)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1134)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3661)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 175)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3661)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 943)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3661)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 367)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3661)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1135)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3668)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 176)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3668)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 944)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3668)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 368)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3668)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1136)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3717)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 177)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3717)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 945)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3717)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 369)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3717)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1137)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3724)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 178)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3724)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 946)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3724)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 370)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3724)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1138)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3731)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 179)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3731)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 947)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3731)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 371)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3731)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1139)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3780)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 180)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3780)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 948)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3780)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 372)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3780)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1140)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3787)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 181)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3787)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 949)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3787)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 373)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3787)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1141)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3794)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 182)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3794)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 950)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3794)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 374)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3794)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1142)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3843)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 183)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3843)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 951)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3843)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 375)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3843)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1143)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3850)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 184)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3850)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 952)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3850)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 376)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3850)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1144)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3857)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 185)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3857)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 953)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3857)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 377)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3857)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1145)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3906)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 186)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3906)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 954)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3906)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 378)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3906)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1146)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3913)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 187)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3913)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 955)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3913)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 379)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3913)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1147)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3920)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 188)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3920)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 956)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3920)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 380)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3920)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1148)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3969)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 189)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3969)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 957)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3969)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 381)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3969)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1149)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3976)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 190)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3976)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 958)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3976)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 382)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3976)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1150)]))
+ conv2d_nchw_1[0] = (conv2d_nchw_1[0] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3983)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 191)]))
+ conv2d_nchw_1[2] = (conv2d_nchw_1[2] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3983)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 959)]))
+ conv2d_nchw_1[1] = (conv2d_nchw_1[1] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3983)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 383)]))
+ conv2d_nchw_1[3] = (conv2d_nchw_1[3] + (pad_temp.shared_1[(floormod(threadIdx.x, 49) + 3983)]*kernel.shared_1[((floordiv(threadIdx.x, 49)*384) + 1151)]))
}
}
}
for (i1.inner: int32, 0, 2) {
- for (i3.inner: int32, 0, 7) {
- compute[(((((blockIdx.x*784) + (floordiv(threadIdx.x, 7)*98)) + (i1.inner*49)) + (floormod(threadIdx.x, 7)*7)) + i3.inner)] = max((conv2d_nchw_1[((i1.inner*7) + i3.inner)] + bias[(((blockIdx.x*16) + (floordiv(threadIdx.x, 7)*2)) + i1.inner)]), 0f32)
- }
+ compute[((((blockIdx.x*392) + (floordiv(threadIdx.x, 49)*98)) + (i1.inner*49)) + floormod(threadIdx.x, 49))] = max((conv2d_nchw_1[i1.inner] + bias[(((blockIdx.x*8) + (floordiv(threadIdx.x, 49)*2)) + i1.inner)]), 0f32)
+ compute[(((((blockIdx.x*392) + (floordiv(threadIdx.x, 49)*98)) + (i1.inner*49)) + floormod(threadIdx.x, 49)) + 196)] = max((conv2d_nchw_1[(i1.inner + 2)] + bias[((((blockIdx.x*8) + (floordiv(threadIdx.x, 49)*2)) + i1.inner) + 4)]), 0f32)
}
}
}
@@ -987,7 +1428,7 @@ cooperative fetching, unrolling and operator fusion.</p>
<span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.229 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.330 ms
</pre></div>
</div>
</div>
@@ -1016,36 +1457,36 @@ conv2d_nchw_nn_o_i, conv2d_nchw_nn_i = s[conv2d_nchw].split(conv2d_nchw_nn, fact
conv2d_nchw_nn_o_o_i, conv2d_nchw_nn_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_i, factor=1)
conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_i, factor=1)
conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
-conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
-conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=2)
-conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=8)
-conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
+conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=2)
+conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=1)
+conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=2)
+conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=2)
conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=7)
conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
-conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=7)
-conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
+conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
+conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=7)
conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
-conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=8)
+conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=32)
conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=3)
conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
-conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=3)
+conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1)
conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=1)
s[conv2d_nchw].reorder(conv2d_nchw_nn_o_o_o_o, conv2d_nchw_ff_o_o_o_o, conv2d_nchw_yy_o_o_o_o, conv2d_nchw_xx_o_o_o_o, conv2d_nchw_nn_o_o_o_i, conv2d_nchw_ff_o_o_o_i, conv2d_nchw_yy_o_o_o_i, conv2d_nchw_xx_o_o_o_i, conv2d_nchw_nn_o_o_i, conv2d_nchw_ff_o_o_i, conv2d_nchw_yy_o_o_i, conv2d_nchw_xx_o_o_i, conv2d_nchw_rc_o_o, conv2d_nchw_ry_o_o, conv2d_nchw_rx_o_o, conv2d_nchw_rc_o_i, conv2d_nchw_ry_o_i, conv2d_nchw_rx_o_i, conv2d_nchw_nn_o_i, conv2d_nchw_ff_o_i, conv2d_nchw_yy_o_i, conv2d_nc [...]
compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
-compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=8)
-compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
+compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=2)
+compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=2)
compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=7)
compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
-compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
-compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
+compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
+compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=7)
compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
@@ -1063,14 +1504,14 @@ s[compute].bind(compute_i0_o_o_i_i1_o_o_i_fused_i2_o_o_i_fused_i3_o_o_i_fused, t
compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused = s[compute].fuse(compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i)
s[compute].bind(compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused, te.thread_axis("threadIdx.x"))
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
-kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=32)
+kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=56)
+kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=98)
s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=56)
+pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=98)
s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 1024)
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True)
@@ -1090,440 +1531,859 @@ CUDA source code:
#define int64_t long long
#define uint64_t unsigned long long
#endif
-extern "C" __global__ void __launch_bounds__(56) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
- float conv2d_nchw[14];
- __shared__ float pad_temp_shared[1296];
- __shared__ float kernel_shared[2304];
+extern "C" __global__ void __launch_bounds__(98) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+ float conv2d_nchw[4];
+ __shared__ float pad_temp_shared[4032];
+ __shared__ float kernel_shared[1536];
conv2d_nchw[0] = 0.000000e+00f;
- conv2d_nchw[1] = 0.000000e+00f;
conv2d_nchw[2] = 0.000000e+00f;
+ conv2d_nchw[1] = 0.000000e+00f;
conv2d_nchw[3] = 0.000000e+00f;
- conv2d_nchw[4] = 0.000000e+00f;
- conv2d_nchw[5] = 0.000000e+00f;
- conv2d_nchw[6] = 0.000000e+00f;
- conv2d_nchw[7] = 0.000000e+00f;
- conv2d_nchw[8] = 0.000000e+00f;
- conv2d_nchw[9] = 0.000000e+00f;
- conv2d_nchw[10] = 0.000000e+00f;
- conv2d_nchw[11] = 0.000000e+00f;
- conv2d_nchw[12] = 0.000000e+00f;
- conv2d_nchw[13] = 0.000000e+00f;
- for (int rc_outer_outer = 0; rc_outer_outer < 32; ++rc_outer_outer) {
- __syncthreads();
- pad_temp_shared[((int)threadIdx.x)] = ((((9 <= ((int)threadIdx.x)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[((((rc_outer_outer * 784) + ((((int)threadIdx.x) / 9) * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 56)] = (((((9 <= ((((int)threadIdx.x) + 56) % 81)) && (((((int)threadIdx.x) + 56) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 2) % 9))) && (((((int)threadIdx.x) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 56) / 81) * 49)) + ((((((int)threadIdx.x) + 56) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 2) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 112)] = (((((9 <= ((((int)threadIdx.x) + 31) % 81)) && (((((int)threadIdx.x) + 31) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 4) % 9))) && (((((int)threadIdx.x) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 112) / 81) * 49)) + ((((((int)threadIdx.x) + 31) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 168)] = ((((3 <= ((int)threadIdx.x)) && (1 <= ((((int)threadIdx.x) + 6) % 9))) && (((((int)threadIdx.x) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 168) / 81) * 49)) + (((((int)threadIdx.x) + 6) / 9) * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 224)] = (((((9 <= ((((int)threadIdx.x) + 62) % 81)) && (((((int)threadIdx.x) + 62) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 8) % 9))) && (((((int)threadIdx.x) + 8) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 224) / 81) * 49)) + ((((((int)threadIdx.x) + 62) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 280)] = (((((9 <= ((((int)threadIdx.x) + 37) % 81)) && (((((int)threadIdx.x) + 37) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 1) % 9))) && (((((int)threadIdx.x) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 280) / 81) * 49)) + ((((((int)threadIdx.x) + 37) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 1) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 336)] = (((1 <= ((((int)threadIdx.x) + 3) % 9)) && (((((int)threadIdx.x) + 3) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 336) / 81) * 49)) + (((((int)threadIdx.x) + 12) / 9) * 7)) + ((((int)threadIdx.x) + 3) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 392)] = (((((9 <= ((((int)threadIdx.x) + 68) % 81)) && (((((int)threadIdx.x) + 68) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 5) % 9))) && (((((int)threadIdx.x) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 392) / 81) * 49)) + ((((((int)threadIdx.x) + 68) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 448)] = (((((9 <= ((((int)threadIdx.x) + 43) % 81)) && (((((int)threadIdx.x) + 43) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 7) % 9))) && (((((int)threadIdx.x) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 448) / 81) * 49)) + ((((((int)threadIdx.x) + 43) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 504)] = ((((((int)threadIdx.x) < 54) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 504) / 81) * 49)) + ((((int)threadIdx.x) / 9) * 7)) + (((int)threadIdx.x) % 9)) + 6)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 560)] = (((((9 <= ((((int)threadIdx.x) + 74) % 81)) && (((((int)threadIdx.x) + 74) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 2) % 9))) && (((((int)threadIdx.x) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 560) / 81) * 49)) + ((((((int)threadIdx.x) + 74) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 2) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 616)] = (((((9 <= ((((int)threadIdx.x) + 49) % 81)) && (((((int)threadIdx.x) + 49) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 4) % 9))) && (((((int)threadIdx.x) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 616) / 81) * 49)) + ((((((int)threadIdx.x) + 49) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 672)] = ((((((int)threadIdx.x) < 48) && (1 <= ((((int)threadIdx.x) + 6) % 9))) && (((((int)threadIdx.x) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 672) / 81) * 49)) + (((((int)threadIdx.x) + 24) / 9) * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 728)] = (((((9 <= ((((int)threadIdx.x) + 80) % 81)) && (((((int)threadIdx.x) + 80) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 8) % 9))) && (((((int)threadIdx.x) + 8) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 728) / 81) * 49)) + ((((((int)threadIdx.x) + 80) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 784)] = (((((9 <= ((((int)threadIdx.x) + 55) % 81)) && (((((int)threadIdx.x) + 55) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 1) % 9))) && (((((int)threadIdx.x) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 784) / 81) * 49)) + ((((((int)threadIdx.x) + 55) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 1) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 840)] = (((((9 <= ((((int)threadIdx.x) + 30) % 81)) && (((((int)threadIdx.x) + 30) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 3) % 9))) && (((((int)threadIdx.x) + 3) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 840) / 81) * 49)) + ((((((int)threadIdx.x) + 30) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 3) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 896)] = ((((4 <= ((int)threadIdx.x)) && (1 <= ((((int)threadIdx.x) + 5) % 9))) && (((((int)threadIdx.x) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 896) / 81) * 49)) + (((((int)threadIdx.x) + 5) / 9) * 7)) + ((((int)threadIdx.x) + 5) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 952)] = (((((9 <= ((((int)threadIdx.x) + 61) % 81)) && (((((int)threadIdx.x) + 61) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 7) % 9))) && (((((int)threadIdx.x) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 952) / 81) * 49)) + ((((((int)threadIdx.x) + 61) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 7) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 1008)] = (((((1 <= (((((int)threadIdx.x) / 9) + 4) % 9)) && (((((int)threadIdx.x) + 36) % 81) < 72)) && (1 <= (((int)threadIdx.x) % 9))) && ((((int)threadIdx.x) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 1008) / 81) * 49)) + ((((((int)threadIdx.x) / 9) + 4) % 9) * 7)) + (((int)threadIdx.x) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 1064)] = (((1 <= ((((int)threadIdx.x) + 2) % 9)) && (((((int)threadIdx.x) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 1064) / 81) * 49)) + (((((int)threadIdx.x) + 11) / 9) * 7)) + ((((int)threadIdx.x) + 2) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 1120)] = (((((9 <= ((((int)threadIdx.x) + 67) % 81)) && (((((int)threadIdx.x) + 67) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 4) % 9))) && (((((int)threadIdx.x) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 1120) / 81) * 49)) + ((((((int)threadIdx.x) + 67) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 4) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 1176)] = (((((9 <= ((((int)threadIdx.x) + 42) % 81)) && (((((int)threadIdx.x) + 42) % 81) < 72)) && (1 <= ((((int)threadIdx.x) + 6) % 9))) && (((((int)threadIdx.x) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 1176) / 81) * 49)) + ((((((int)threadIdx.x) + 42) % 81) / 9) * 7)) + ((((int)threadIdx.x) + 6) % 9)) - 8)] : 0.000000e+00f);
- pad_temp_shared[(((int)threadIdx.x) + 1232)] = ((((((int)threadIdx.x) < 55) && (1 <= ((((int)threadIdx.x) + 8) % 9))) && (((((int)threadIdx.x) + 8) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) + 1232) / 81) * 49)) + (((((int)threadIdx.x) + 17) / 9) * 7)) + ((((int)threadIdx.x) + 8) % 9)) - 8)] : 0.000000e+00f);
- if (((int)threadIdx.x) < 8) {
- pad_temp_shared[(((int)threadIdx.x) + 1288)] = 0.000000e+00f;
- }
- kernel_shared[(((int)threadIdx.x) * 32)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) * 32) % 144) / 3) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 1)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 1) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 2)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 2) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 3)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 1) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 4)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 4) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 5)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 5) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 6)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 2) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 7)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 7) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 8)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 8) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 9)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 3) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 10)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 10) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 11)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 11) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 12)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 4) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 13)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 13) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 14)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 14) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 15)] = kernel[(((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) * 2) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 5) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 16)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 16) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 17)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 17) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 18)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 6) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 19)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 19) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 20)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 20) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 21)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 7) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 22)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 22) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 23)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 23) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 24)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 8) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 25)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 25) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 26)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 26) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 27)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 9) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 28)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 28) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 29)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 29) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 30)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 10) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- kernel_shared[((((int)threadIdx.x) * 32) + 31)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 1) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 31) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1792)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 64) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1793)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 65) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1794)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 22) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1795)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + ((((((((int)threadIdx.x) * 32) + 1792) / 3) + 1) % 48) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1796)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 68) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1797)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 23) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1798)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + ((((((((int)threadIdx.x) * 32) + 1792) / 3) + 2) % 48) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1799)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 71) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1800)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 24) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1801)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + ((((((((int)threadIdx.x) * 32) + 1792) / 3) + 3) % 48) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1802)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 74) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1803)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 25) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1804)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + ((((((((int)threadIdx.x) * 32) + 1792) / 3) + 4) % 48) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1805)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 77) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1806)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 26) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1807)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 112) / 9) * 4608)) + (rc_outer_outer * 144)) + ((((((((int)threadIdx.x) * 32) + 1792) / 3) + 5) % 48) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1808)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 80) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1809)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 27) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1810)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + ((((((((int)threadIdx.x) * 32) + 1792) / 3) + 6) % 48) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1811)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 83) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1812)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 28) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1813)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + ((((((((int)threadIdx.x) * 32) + 1792) / 3) + 7) % 48) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1814)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 86) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1815)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 29) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1816)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + ((((((((int)threadIdx.x) * 32) + 1792) / 3) + 8) % 48) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1817)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 89) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1818)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 30) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1819)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + ((((((((int)threadIdx.x) * 32) + 1792) / 3) + 9) % 48) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1820)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 92) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1821)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) / 3) + 31) % 48) * 3)) + ((((int)threadIdx.x) * 2) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1822)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + ((((((((int)threadIdx.x) * 32) + 1792) / 3) + 10) % 48) * 3)) + (((((int)threadIdx.x) * 2) + 1) % 3))];
- }
- if (((int)threadIdx.x) < 16) {
- kernel_shared[((((int)threadIdx.x) * 32) + 1823)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((((int)threadIdx.x) * 2) + 113) / 9) * 4608)) + (rc_outer_outer * 144)) + (((((((int)threadIdx.x) * 32) + 95) % 144) / 3) * 3)) + (((((int)threadIdx.x) * 2) + 2) % 3))];
- }
- __syncthreads();
- for (int rc_outer_inner = 0; rc_outer_inner < 8; ++rc_outer_inner) {
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9))] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 1)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 9)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 10)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 18)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 81)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 82)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 83)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 90)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 92)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 99)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 100)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 1)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 3)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 10)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 12)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 21)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 82)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 83)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 92)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 93)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 100)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 102)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 2)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 3)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 4)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 12)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 13)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 21)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 22)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 83)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 85)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 92)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 93)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 94)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 102)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 103)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 4)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 5)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 12)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 13)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 21)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 22)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 23)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 85)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 86)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 93)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 94)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 95)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 102)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 103)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 104)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 4)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 5)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 6)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 13)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 15)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 22)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 23)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 24)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 85)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 86)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 87)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 94)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 95)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 96)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 103)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 104)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 5)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 6)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 7)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 15)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 16)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 23)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 24)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 25)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 86)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 87)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 88)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 95)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 96)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 97)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 104)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 106)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 6)] * kernel_shared[(((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18))]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 7)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 1)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 8)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 2)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 15)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 3)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 16)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 4)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 17)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 5)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 24)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 6)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 25)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 7)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 26)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 8)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 87)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 9)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 88)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 10)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 89)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 11)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 96)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 12)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 97)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 13)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 98)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 14)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 15)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 106)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 16)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 107)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 17)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9))] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 1)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 9)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 10)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 18)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 81)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 82)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 83)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 90)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 92)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 99)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 100)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 1)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 3)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 10)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 12)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 21)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 82)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 83)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 91)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 92)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 93)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 100)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 102)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 3)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 4)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 12)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 13)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 21)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 22)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 83)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 85)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 92)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 93)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 94)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 101)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 102)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 103)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 3)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 4)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 5)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 12)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 13)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 21)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 22)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 23)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 84)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 85)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 86)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 93)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 94)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 95)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 102)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 103)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 104)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 4)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 5)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 6)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 13)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 15)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 22)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 23)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 24)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 85)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 86)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 87)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 94)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 95)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 96)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 103)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 104)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 5)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 6)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 7)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 15)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 16)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 23)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 24)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 25)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 86)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 87)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 88)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 95)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 96)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 97)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 104)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 106)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 6)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 144)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 7)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 145)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 8)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 146)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 15)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 147)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 16)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 148)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 17)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 149)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 24)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 150)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 25)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 151)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 26)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 152)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 87)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 153)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 88)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 154)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 89)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 155)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 96)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 156)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 97)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 157)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 98)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 158)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 105)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 159)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 106)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 160)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 162) + ((((int)threadIdx.x) % 7) * 9)) + 107)] * kernel_shared[((((((int)threadIdx.x) / 7) * 288) + (rc_outer_inner * 18)) + 161)]));
+ for (int rc_outer_outer = 0; rc_outer_outer < 8; ++rc_outer_outer) {
+ for (int rx_outer_outer = 0; rx_outer_outer < 3; ++rx_outer_outer) {
+ __syncthreads();
+ pad_temp_shared[((int)threadIdx.x)] = (((((7 <= (((int)threadIdx.x) % 63)) && ((((int)threadIdx.x) % 63) < 56)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + ((((int)threadIdx.x) / 63) * 49)) + rx_outer_outer) + (((int)threadIdx.x) % 63)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 98)] = (((((1 <= (((((int)threadIdx.x) / 7) + 5) % 9)) && ((((((int)threadIdx.x) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 98) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 5) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 196)] = (((((1 <= (((((int)threadIdx.x) / 7) + 1) % 9)) && ((((((int)threadIdx.x) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 196) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 1) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 294)] = (((((1 <= (((((int)threadIdx.x) / 7) + 6) % 9)) && ((((((int)threadIdx.x) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 294) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 6) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 392)] = (((((1 <= (((((int)threadIdx.x) / 7) + 2) % 9)) && ((((((int)threadIdx.x) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 392) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 2) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 490)] = (((((1 <= (((((int)threadIdx.x) / 7) + 7) % 9)) && ((((((int)threadIdx.x) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 490) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 7) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 588)] = (((((1 <= (((((int)threadIdx.x) / 7) + 3) % 9)) && ((((((int)threadIdx.x) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 588) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 3) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 686)] = (((((1 <= (((((int)threadIdx.x) / 7) + 8) % 9)) && ((((((int)threadIdx.x) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 686) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 8) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 784)] = (((((1 <= (((((int)threadIdx.x) / 7) + 4) % 9)) && ((((((int)threadIdx.x) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 784) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 4) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 882)] = (((((7 <= (((int)threadIdx.x) % 63)) && ((((int)threadIdx.x) % 63) < 56)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + ((((int)threadIdx.x) / 63) * 49)) + rx_outer_outer) + (((int)threadIdx.x) % 63)) + 678)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 980)] = (((((1 <= (((((int)threadIdx.x) / 7) + 5) % 9)) && ((((((int)threadIdx.x) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 980) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 5) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1078)] = (((((1 <= (((((int)threadIdx.x) / 7) + 1) % 9)) && ((((((int)threadIdx.x) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 1078) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 1) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1176)] = (((((1 <= (((((int)threadIdx.x) / 7) + 6) % 9)) && ((((((int)threadIdx.x) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 1176) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 6) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1274)] = (((((1 <= (((((int)threadIdx.x) / 7) + 2) % 9)) && ((((((int)threadIdx.x) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 1274) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 2) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1372)] = (((((1 <= (((((int)threadIdx.x) / 7) + 7) % 9)) && ((((((int)threadIdx.x) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 1372) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 7) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1470)] = (((((1 <= (((((int)threadIdx.x) / 7) + 3) % 9)) && ((((((int)threadIdx.x) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 1470) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 3) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1568)] = (((((1 <= (((((int)threadIdx.x) / 7) + 8) % 9)) && ((((((int)threadIdx.x) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 1568) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 8) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1666)] = (((((1 <= (((((int)threadIdx.x) / 7) + 4) % 9)) && ((((((int)threadIdx.x) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 1666) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 4) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1764)] = (((((7 <= (((int)threadIdx.x) % 63)) && ((((int)threadIdx.x) % 63) < 56)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + ((((int)threadIdx.x) / 63) * 49)) + rx_outer_outer) + (((int)threadIdx.x) % 63)) + 1364)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1862)] = (((((1 <= (((((int)threadIdx.x) / 7) + 5) % 9)) && ((((((int)threadIdx.x) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 1862) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 5) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 1960)] = (((((1 <= (((((int)threadIdx.x) / 7) + 1) % 9)) && ((((((int)threadIdx.x) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 1960) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 1) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 2058)] = (((((1 <= (((((int)threadIdx.x) / 7) + 6) % 9)) && ((((((int)threadIdx.x) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 2058) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 6) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 2156)] = (((((1 <= (((((int)threadIdx.x) / 7) + 2) % 9)) && ((((((int)threadIdx.x) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 2156) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 2) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 2254)] = (((((1 <= (((((int)threadIdx.x) / 7) + 7) % 9)) && ((((((int)threadIdx.x) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 2254) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 7) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 2352)] = (((((1 <= (((((int)threadIdx.x) / 7) + 3) % 9)) && ((((((int)threadIdx.x) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 2352) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 3) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 2450)] = (((((1 <= (((((int)threadIdx.x) / 7) + 8) % 9)) && ((((((int)threadIdx.x) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 2450) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 8) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 2548)] = (((((1 <= (((((int)threadIdx.x) / 7) + 4) % 9)) && ((((((int)threadIdx.x) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 2548) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 4) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 2646)] = (((((7 <= (((int)threadIdx.x) % 63)) && ((((int)threadIdx.x) % 63) < 56)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + ((((int)threadIdx.x) / 63) * 49)) + rx_outer_outer) + (((int)threadIdx.x) % 63)) + 2050)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 2744)] = (((((1 <= (((((int)threadIdx.x) / 7) + 5) % 9)) && ((((((int)threadIdx.x) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 2744) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 5) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 2842)] = (((((1 <= (((((int)threadIdx.x) / 7) + 1) % 9)) && ((((((int)threadIdx.x) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 2842) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 1) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 2940)] = (((((1 <= (((((int)threadIdx.x) / 7) + 6) % 9)) && ((((((int)threadIdx.x) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 2940) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 6) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 3038)] = (((((1 <= (((((int)threadIdx.x) / 7) + 2) % 9)) && ((((((int)threadIdx.x) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 3038) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 2) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 3136)] = (((((1 <= (((((int)threadIdx.x) / 7) + 7) % 9)) && ((((((int)threadIdx.x) / 7) + 7) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 3136) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 7) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 3234)] = (((((1 <= (((((int)threadIdx.x) / 7) + 3) % 9)) && ((((((int)threadIdx.x) / 7) + 3) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 3234) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 3) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 3332)] = (((((1 <= (((((int)threadIdx.x) / 7) + 8) % 9)) && ((((((int)threadIdx.x) / 7) + 8) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 3332) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 8) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 3430)] = (((((1 <= (((((int)threadIdx.x) / 7) + 4) % 9)) && ((((((int)threadIdx.x) / 7) + 4) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 3430) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 4) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 3528)] = (((((7 <= (((int)threadIdx.x) % 63)) && ((((int)threadIdx.x) % 63) < 56)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + ((((int)threadIdx.x) / 63) * 49)) + rx_outer_outer) + (((int)threadIdx.x) % 63)) + 2736)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 3626)] = (((((1 <= (((((int)threadIdx.x) / 7) + 5) % 9)) && ((((((int)threadIdx.x) / 7) + 5) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 3626) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 5) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 3724)] = (((((1 <= (((((int)threadIdx.x) / 7) + 1) % 9)) && ((((((int)threadIdx.x) / 7) + 1) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 3724) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 1) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 3822)] = (((((1 <= (((((int)threadIdx.x) / 7) + 6) % 9)) && ((((((int)threadIdx.x) / 7) + 6) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 3822) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 6) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) + 3920)] = (((((1 <= (((((int)threadIdx.x) / 7) + 2) % 9)) && ((((((int)threadIdx.x) / 7) + 2) % 9) < 8)) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[((((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 3920) / 63) * 49)) + ((((((int)threadIdx.x) / 7) + 2) % 9) * 7)) + rx_outer_outer) + (((int)threadIdx.x) % 7)) - 8)] : 0.000000e+00f);
+ if (((int)threadIdx.x) < 14) {
+ pad_temp_shared[(((int)threadIdx.x) + 4018)] = ((((((int)threadIdx.x) < 7) && (1 <= (rx_outer_outer + (((int)threadIdx.x) % 7)))) && ((rx_outer_outer + (((int)threadIdx.x) % 7)) < 8)) ? data[(((((rc_outer_outer * 3136) + (((((int)threadIdx.x) + 4018) / 63) * 49)) + rx_outer_outer) + ((int)threadIdx.x)) + 41)] : 0.000000e+00f);
+ }
+ kernel_shared[(((int)threadIdx.x) * 4)] = kernel[((((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 576)) + ((((((int)threadIdx.x) % 48) * 4) / 3) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[((((int)threadIdx.x) * 4) + 1)] = kernel[((((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) % 48) * 4) + 1) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[((((int)threadIdx.x) * 4) + 2)] = kernel[((((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) % 48) * 4) + 2) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[((((int)threadIdx.x) * 4) + 3)] = kernel[((((((((int)blockIdx.x) * 36864) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) * 4) / 3) + 1) & 63) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[((((int)threadIdx.x) * 4) + 392)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 98) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) * 4) + 8) % 192) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[((((int)threadIdx.x) * 4) + 393)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 98) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) * 4) / 3) + 3) & 63) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[((((int)threadIdx.x) * 4) + 394)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 98) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) * 4) + 10) % 192) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[((((int)threadIdx.x) * 4) + 395)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 98) / 48) * 4608)) + (rc_outer_outer * 576)) + ((((((((int)threadIdx.x) * 4) + 392) / 3) + 1) & 63) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[((((int)threadIdx.x) * 4) + 784)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) * 4) + 16) % 192) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[((((int)threadIdx.x) * 4) + 785)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) * 4) + 17) % 192) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[((((int)threadIdx.x) * 4) + 786)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) * 4) / 3) + 6) & 63) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[((((int)threadIdx.x) * 4) + 787)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 196) / 48) * 4608)) + (rc_outer_outer * 576)) + ((((((((int)threadIdx.x) * 4) + 784) / 3) + 1) & 63) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ if (((int)threadIdx.x) < 90) {
+ kernel_shared[((((int)threadIdx.x) * 4) + 1176)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 294) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) * 4) / 3) + 8) & 63) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+ }
+ if (((int)threadIdx.x) < 90) {
+ kernel_shared[((((int)threadIdx.x) * 4) + 1177)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 294) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) * 4) + 25) % 192) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ }
+ if (((int)threadIdx.x) < 90) {
+ kernel_shared[((((int)threadIdx.x) * 4) + 1178)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 294) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) * 4) + 26) % 192) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ }
+ if (((int)threadIdx.x) < 90) {
+ kernel_shared[((((int)threadIdx.x) * 4) + 1179)] = kernel[((((((((int)blockIdx.x) * 36864) + (((((int)threadIdx.x) + 294) / 48) * 4608)) + (rc_outer_outer * 576)) + (((((((int)threadIdx.x) * 4) / 3) + 9) & 63) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+ }
+ __syncthreads();
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((int)threadIdx.x) % 49)] * kernel_shared[((((int)threadIdx.x) / 49) * 384)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((int)threadIdx.x) % 49)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 768)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((int)threadIdx.x) % 49)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 192)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((int)threadIdx.x) % 49)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 960)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 7)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 7)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 769)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 7)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 193)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 7)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 961)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 14)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 2)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 14)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 770)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 14)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 194)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 14)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 962)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 63)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 3)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 63)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 771)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 63)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 195)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 63)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 963)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 70)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 4)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 70)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 772)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 70)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 196)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 70)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 964)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 77)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 5)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 77)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 773)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 77)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 197)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 77)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 965)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 126)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 6)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 126)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 774)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 126)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 198)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 126)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 966)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 133)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 7)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 133)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 775)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 133)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 199)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 133)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 967)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 140)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 8)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 140)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 776)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 140)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 200)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 140)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 968)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 189)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 9)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 189)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 777)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 189)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 201)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 189)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 969)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 196)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 10)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 196)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 778)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 196)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 202)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 196)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 970)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 203)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 11)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 203)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 779)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 203)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 203)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 203)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 971)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 252)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 12)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 252)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 780)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 252)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 204)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 252)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 972)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 259)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 13)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 259)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 781)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 259)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 205)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 259)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 973)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 266)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 14)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 266)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 782)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 266)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 206)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 266)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 974)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 315)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 15)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 315)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 783)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 315)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 207)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 315)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 975)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 322)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 16)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 322)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 784)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 322)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 208)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 322)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 976)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 329)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 17)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 329)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 785)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 329)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 209)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 329)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 977)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 378)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 18)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 378)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 786)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 378)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 210)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 378)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 978)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 385)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 19)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 385)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 787)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 385)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 211)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 385)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 979)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 392)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 20)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 392)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 788)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 392)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 212)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 392)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 980)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 441)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 21)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 441)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 789)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 441)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 213)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 441)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 981)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 448)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 22)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 448)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 790)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 448)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 214)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 448)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 982)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 455)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 23)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 455)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 791)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 455)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 215)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 455)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 983)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 504)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 24)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 504)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 792)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 504)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 216)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 504)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 984)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 511)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 25)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 511)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 793)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 511)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 217)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 511)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 985)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 518)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 26)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 518)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 794)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 518)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 218)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 518)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 986)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 567)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 27)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 567)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 795)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 567)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 219)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 567)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 987)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 574)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 28)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 574)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 796)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 574)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 220)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 574)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 988)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 581)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 29)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 581)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 797)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 581)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 221)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 581)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 989)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 630)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 30)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 630)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 798)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 630)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 222)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 630)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 990)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 637)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 31)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 637)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 799)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 637)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 223)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 637)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 991)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 644)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 32)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 644)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 800)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 644)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 224)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 644)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 992)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 693)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 33)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 693)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 801)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 693)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 225)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 693)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 993)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 700)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 34)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 700)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 802)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 700)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 226)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 700)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 994)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 707)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 35)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 707)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 803)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 707)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 227)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 707)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 995)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 756)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 36)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 756)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 804)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 756)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 228)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 756)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 996)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 763)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 37)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 763)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 805)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 763)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 229)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 763)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 997)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 770)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 38)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 770)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 806)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 770)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 230)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 770)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 998)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 819)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 39)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 819)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 807)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 819)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 231)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 819)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 999)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 826)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 40)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 826)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 808)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 826)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 232)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 826)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1000)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 833)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 41)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 833)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 809)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 833)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 233)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 833)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1001)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 882)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 42)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 882)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 810)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 882)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 234)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 882)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1002)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 889)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 43)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 889)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 811)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 889)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 235)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 889)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1003)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 896)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 44)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 896)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 812)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 896)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 236)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 896)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1004)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 945)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 45)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 945)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 813)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 945)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 237)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 945)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1005)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 952)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 46)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 952)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 814)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 952)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 238)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 952)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1006)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 959)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 47)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 959)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 815)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 959)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 239)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 959)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1007)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1008)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 48)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1008)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 816)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1008)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 240)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1008)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1008)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1015)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 49)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1015)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 817)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1015)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 241)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1015)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1009)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1022)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 50)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1022)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 818)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1022)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 242)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1022)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1010)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1071)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 51)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1071)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 819)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1071)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 243)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1071)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1011)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1078)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 52)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1078)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 820)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1078)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 244)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1078)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1012)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1085)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 53)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1085)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 821)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1085)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 245)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1085)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1013)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1134)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 54)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1134)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 822)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1134)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 246)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1134)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1014)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1141)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 55)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1141)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 823)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1141)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 247)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1141)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1015)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1148)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 56)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1148)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 824)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1148)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 248)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1148)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1016)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1197)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 57)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1197)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 825)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1197)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 249)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1197)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1017)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1204)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 58)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1204)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 826)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1204)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 250)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1204)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1018)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1211)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 59)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1211)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 827)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1211)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 251)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1211)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1019)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1260)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 60)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1260)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 828)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1260)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 252)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1260)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1020)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1267)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 61)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1267)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 829)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1267)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 253)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1267)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1021)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1274)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 62)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1274)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 830)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1274)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 254)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1274)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1022)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1323)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 63)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1323)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 831)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1323)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 255)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1323)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1023)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1330)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 64)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1330)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 832)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1330)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 256)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1330)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1024)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1337)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 65)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1337)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 833)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1337)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 257)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1337)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1025)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1386)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 66)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1386)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 834)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1386)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 258)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1386)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1026)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1393)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 67)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1393)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 835)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1393)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 259)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1393)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1027)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1400)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 68)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1400)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 836)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1400)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 260)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1400)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1028)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1449)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 69)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1449)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 837)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1449)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 261)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1449)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1029)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1456)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 70)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1456)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 838)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1456)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 262)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1456)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1030)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1463)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 71)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1463)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 839)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1463)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 263)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1463)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1031)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1512)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 72)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1512)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 840)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1512)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 264)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1512)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1032)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1519)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 73)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1519)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 841)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1519)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 265)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1519)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1033)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1526)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 74)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1526)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 842)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1526)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 266)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1526)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1034)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1575)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 75)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1575)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 843)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1575)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 267)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1575)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1035)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1582)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 76)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1582)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 844)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1582)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 268)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1582)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1036)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1589)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 77)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1589)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 845)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1589)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 269)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1589)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1037)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1638)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 78)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1638)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 846)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1638)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 270)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1638)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1038)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1645)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 79)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1645)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 847)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1645)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 271)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1645)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1039)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1652)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 80)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1652)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 848)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1652)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 272)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1652)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1040)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1701)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 81)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1701)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 849)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1701)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 273)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1701)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1041)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1708)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 82)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1708)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 850)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1708)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 274)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1708)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1042)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1715)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 83)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1715)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 851)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1715)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 275)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1715)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1043)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1764)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 84)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1764)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 852)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1764)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 276)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1764)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1044)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1771)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 85)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1771)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 853)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1771)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 277)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1771)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1045)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1778)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 86)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1778)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 854)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1778)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 278)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1778)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1046)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1827)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 87)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1827)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 855)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1827)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 279)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1827)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1047)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1834)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 88)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1834)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 856)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1834)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 280)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1834)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1048)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1841)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 89)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1841)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 857)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1841)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 281)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1841)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1049)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1890)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 90)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1890)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 858)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1890)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 282)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1890)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1050)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1897)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 91)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1897)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 859)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1897)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 283)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1897)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1051)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1904)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 92)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1904)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 860)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1904)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 284)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1904)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1052)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1953)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 93)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1953)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 861)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1953)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 285)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1953)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1053)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1960)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 94)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1960)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 862)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1960)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 286)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1960)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1054)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1967)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 95)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1967)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 863)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1967)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 287)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 1967)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1055)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2016)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 96)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2016)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 864)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2016)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 288)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2016)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1056)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2023)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 97)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2023)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 865)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2023)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 289)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2023)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1057)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2030)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 98)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2030)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 866)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2030)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 290)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2030)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1058)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2079)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 99)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2079)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 867)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2079)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 291)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2079)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1059)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2086)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 100)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2086)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 868)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2086)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 292)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2086)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1060)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2093)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 101)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2093)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 869)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2093)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 293)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2093)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1061)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2142)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 102)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2142)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 870)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2142)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 294)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2142)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1062)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2149)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 103)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2149)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 871)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2149)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 295)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2149)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1063)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2156)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 104)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2156)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 872)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2156)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 296)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2156)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1064)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2205)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 105)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2205)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 873)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2205)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 297)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2205)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1065)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2212)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 106)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2212)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 874)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2212)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 298)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2212)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1066)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2219)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 107)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2219)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 875)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2219)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 299)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2219)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1067)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2268)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 108)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2268)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 876)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2268)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 300)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2268)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1068)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2275)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 109)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2275)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 877)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2275)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 301)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2275)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1069)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2282)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 110)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2282)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 878)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2282)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 302)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2282)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1070)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2331)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 111)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2331)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 879)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2331)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 303)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2331)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1071)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2338)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 112)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2338)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 880)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2338)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 304)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2338)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1072)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2345)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 113)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2345)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 881)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2345)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 305)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2345)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1073)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2394)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 114)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2394)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 882)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2394)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 306)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2394)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1074)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2401)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 115)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2401)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 883)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2401)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 307)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2401)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1075)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2408)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 116)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2408)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 884)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2408)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 308)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2408)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1076)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2457)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 117)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2457)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 885)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2457)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 309)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2457)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1077)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2464)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 118)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2464)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 886)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2464)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 310)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2464)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1078)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2471)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 119)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2471)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 887)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2471)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 311)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2471)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1079)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2520)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 120)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2520)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 888)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2520)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 312)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2520)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1080)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2527)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 121)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2527)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 889)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2527)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 313)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2527)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1081)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2534)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 122)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2534)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 890)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2534)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 314)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2534)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1082)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2583)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 123)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2583)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 891)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2583)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 315)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2583)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1083)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2590)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 124)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2590)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 892)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2590)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 316)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2590)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1084)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2597)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 125)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2597)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 893)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2597)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 317)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2597)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1085)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2646)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 126)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2646)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 894)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2646)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 318)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2646)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1086)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2653)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 127)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2653)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 895)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2653)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 319)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2653)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1087)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2660)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 128)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2660)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 896)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2660)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 320)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2660)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1088)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2709)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 129)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2709)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 897)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2709)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 321)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2709)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1089)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2716)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 130)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2716)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 898)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2716)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 322)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2716)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1090)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2723)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 131)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2723)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 899)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2723)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 323)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2723)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1091)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2772)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 132)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2772)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 900)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2772)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 324)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2772)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1092)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2779)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 133)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2779)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 901)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2779)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 325)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2779)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1093)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2786)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 134)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2786)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 902)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2786)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 326)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2786)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1094)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2835)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 135)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2835)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 903)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2835)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 327)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2835)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1095)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2842)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 136)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2842)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 904)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2842)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 328)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2842)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1096)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2849)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 137)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2849)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 905)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2849)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 329)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2849)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1097)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2898)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 138)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2898)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 906)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2898)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 330)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2898)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1098)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2905)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 139)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2905)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 907)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2905)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 331)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2905)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1099)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2912)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 140)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2912)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 908)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2912)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 332)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2912)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1100)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2961)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 141)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2961)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 909)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2961)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 333)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2961)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1101)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2968)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 142)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2968)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 910)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2968)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 334)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2968)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1102)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2975)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 143)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2975)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 911)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2975)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 335)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 2975)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1103)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3024)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 144)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3024)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 912)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3024)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 336)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3024)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1104)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3031)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 145)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3031)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 913)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3031)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 337)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3031)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1105)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3038)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 146)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3038)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 914)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3038)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 338)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3038)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1106)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3087)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 147)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3087)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 915)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3087)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 339)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3087)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1107)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3094)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 148)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3094)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 916)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3094)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 340)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3094)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1108)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3101)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 149)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3101)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 917)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3101)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 341)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3101)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1109)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3150)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 150)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3150)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 918)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3150)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 342)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3150)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1110)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3157)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 151)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3157)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 919)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3157)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 343)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3157)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1111)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3164)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 152)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3164)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 920)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3164)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 344)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3164)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1112)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3213)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 153)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3213)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 921)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3213)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 345)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3213)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1113)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3220)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 154)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3220)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 922)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3220)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 346)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3220)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1114)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3227)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 155)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3227)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 923)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3227)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 347)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3227)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1115)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3276)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 156)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3276)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 924)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3276)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 348)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3276)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1116)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3283)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 157)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3283)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 925)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3283)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 349)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3283)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1117)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3290)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 158)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3290)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 926)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3290)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 350)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3290)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1118)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3339)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 159)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3339)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 927)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3339)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 351)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3339)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1119)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3346)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 160)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3346)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 928)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3346)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 352)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3346)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1120)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3353)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 161)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3353)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 929)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3353)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 353)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3353)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1121)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3402)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 162)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3402)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 930)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3402)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 354)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3402)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1122)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3409)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 163)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3409)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 931)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3409)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 355)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3409)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1123)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3416)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 164)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3416)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 932)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3416)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 356)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3416)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1124)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3465)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 165)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3465)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 933)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3465)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 357)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3465)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1125)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3472)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 166)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3472)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 934)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3472)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 358)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3472)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1126)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3479)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 167)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3479)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 935)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3479)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 359)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3479)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1127)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3528)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 168)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3528)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 936)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3528)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 360)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3528)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1128)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3535)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 169)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3535)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 937)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3535)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 361)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3535)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1129)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3542)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 170)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3542)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 938)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3542)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 362)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3542)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1130)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3591)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 171)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3591)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 939)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3591)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 363)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3591)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1131)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3598)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 172)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3598)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 940)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3598)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 364)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3598)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1132)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3605)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 173)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3605)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 941)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3605)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 365)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3605)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1133)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3654)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 174)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3654)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 942)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3654)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 366)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3654)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1134)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3661)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 175)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3661)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 943)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3661)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 367)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3661)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1135)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3668)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 176)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3668)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 944)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3668)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 368)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3668)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1136)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3717)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 177)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3717)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 945)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3717)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 369)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3717)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1137)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3724)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 178)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3724)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 946)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3724)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 370)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3724)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1138)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3731)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 179)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3731)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 947)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3731)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 371)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3731)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1139)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3780)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 180)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3780)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 948)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3780)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 372)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3780)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1140)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3787)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 181)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3787)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 949)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3787)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 373)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3787)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1141)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3794)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 182)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3794)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 950)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3794)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 374)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3794)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1142)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3843)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 183)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3843)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 951)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3843)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 375)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3843)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1143)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3850)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 184)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3850)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 952)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3850)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 376)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3850)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1144)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3857)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 185)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3857)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 953)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3857)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 377)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3857)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1145)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3906)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 186)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3906)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 954)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3906)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 378)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3906)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1146)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3913)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 187)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3913)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 955)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3913)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 379)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3913)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1147)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3920)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 188)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3920)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 956)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3920)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 380)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3920)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1148)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3969)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 189)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3969)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 957)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3969)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 381)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3969)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1149)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3976)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 190)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3976)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 958)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3976)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 382)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3976)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1150)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3983)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 191)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3983)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 959)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3983)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 383)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[((((int)threadIdx.x) % 49) + 3983)] * kernel_shared[(((((int)threadIdx.x) / 49) * 384) + 1151)]));
}
}
for (int i1_inner = 0; i1_inner < 2; ++i1_inner) {
- for (int i3_inner = 0; i3_inner < 7; ++i3_inner) {
- compute[(((((((int)blockIdx.x) * 784) + ((((int)threadIdx.x) / 7) * 98)) + (i1_inner * 49)) + ((((int)threadIdx.x) % 7) * 7)) + i3_inner)] = max((conv2d_nchw[((i1_inner * 7) + i3_inner)] + bias[(((((int)blockIdx.x) * 16) + ((((int)threadIdx.x) / 7) * 2)) + i1_inner)]), 0.000000e+00f);
- }
+ compute[((((((int)blockIdx.x) * 392) + ((((int)threadIdx.x) / 49) * 98)) + (i1_inner * 49)) + (((int)threadIdx.x) % 49))] = max((conv2d_nchw[i1_inner] + bias[(((((int)blockIdx.x) * 8) + ((((int)threadIdx.x) / 49) * 2)) + i1_inner)]), 0.000000e+00f);
+ compute[(((((((int)blockIdx.x) * 392) + ((((int)threadIdx.x) / 49) * 98)) + (i1_inner * 49)) + (((int)threadIdx.x) % 49)) + 196)] = max((conv2d_nchw[(i1_inner + 2)] + bias[((((((int)blockIdx.x) * 8) + ((((int)threadIdx.x) / 49) * 2)) + i1_inner) + 4)]), 0.000000e+00f);
}
}
</pre></div>
@@ -1560,7 +2420,7 @@ In the example below we resume the status and do more 5 trials.</p>
Get devices for measurement successfully!
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 40.059 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 54.405 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/e3e540f3b477c0c52d8eb73e674e8ffd/tune_conv2d_layer_cuda.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_conv2d_layer_cuda.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
index b77300381..91d31c990 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
@@ -901,7 +901,7 @@ so we can read the log file and load the best schedules.</p>
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 9.7007 9.7019 9.7260 9.6743 0.0211
+ 9.7218 9.7307 9.7445 9.6903 0.0230
</pre></div>
</div>
</div>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
index dad8c158f..be9f83fff 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
@@ -920,7 +920,7 @@ so we can read the log file and load the best schedules.</p>
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 756.0719 756.0888 756.1048 756.0220 0.0359
+ 755.4379 755.4041 756.4968 754.4129 0.8511
</pre></div>
</div>
</div>
@@ -942,7 +942,7 @@ to learn how to use the RPC Tracker and RPC Server.
To use the RPC Tracker in auto-scheduler, replace the runner in <code class="code docutils literal notranslate"><span class="pre">TuningOptions</span></code>
with <a class="reference internal" href="../../reference/api/python/auto_scheduler.html#tvm.auto_scheduler.RPCRunner" title="tvm.auto_scheduler.RPCRunner"><code class="xref any py py-class docutils literal notranslate"><span class="pre">auto_scheduler.RPCRunner</span></code></a>.</p></li>
</ol>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 22.883 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 21.949 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-network-x86-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/e416b94ca1090b0897c0f6e0df95b911/tune_network_x86.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_network_x86.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
index 2b8b15168..9488a8ca2 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
@@ -620,14 +620,14 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
placeholder_4: Buffer(placeholder_14: Pointer(float32), float32, [65536], []),
compute: Buffer(compute_2: Pointer(float32), float32, [65536], [])}
buffer_map = {placeholder_5: placeholder, placeholder_6: placeholder_1, placeholder_7: placeholder_2, placeholder_8: placeholder_3, placeholder_9: placeholder_4, compute_1: compute}
- preflattened_buffer_map = {placeholder_9: placeholder_15: Buffer(placeholder_14, float32, [128, 512], []), placeholder_5: placeholder_16: Buffer(placeholder_10, float32, [128, 256], []), placeholder_7: placeholder_17: Buffer(placeholder_12, int32, [4916], []), placeholder_6: placeholder_18: Buffer(placeholder_11, float32, [4916, 16, 1], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_8: placeholder_19: Buffer(placeholder_13, int32, [33], [])} {
- for (i0.outer.i1.outer.fused: int32, 0, 128) "parallel" {
- allocate(compute_4: Pointer(global float32), float32, [512]), storage_scope = global {
- for (i.outer.inner: int32, 0, 2) {
- for (i.inner.init: int32, 0, 16) {
- let cse_var_1: int32 = ((i.outer.inner*256) + (i.inner.init*16))
+ preflattened_buffer_map = {placeholder_9: placeholder_15: Buffer(placeholder_14, float32, [128, 512], []), placeholder_7: placeholder_16: Buffer(placeholder_12, int32, [4916], []), placeholder_6: placeholder_17: Buffer(placeholder_11, float32, [4916, 16, 1], []), placeholder_5: placeholder_18: Buffer(placeholder_10, float32, [128, 256], []), compute_1: compute_3: Buffer(compute_2, float32, [128, 512], []), placeholder_8: placeholder_19: Buffer(placeholder_13, int32, [33], [])} {
+ for (i0.outer.i1.outer.fused: int32, 0, 32) "parallel" {
+ allocate(compute_4: Pointer(global float32), float32, [2048]), storage_scope = global {
+ for (i.outer.inner: int32, 0, 4) {
+ for (i.inner.init: int32, 0, 32) {
+ let cse_var_1: int32 = ((i.outer.inner*512) + (i.inner.init*16))
{
- compute_5: Buffer(compute_4, float32, [512], [])[cse_var_1] = 0f32
+ compute_5: Buffer(compute_4, float32, [2048], [])[cse_var_1] = 0f32
compute_5[(cse_var_1 + 1)] = 0f32
compute_5[(cse_var_1 + 2)] = 0f32
compute_5[(cse_var_1 + 3)] = 0f32
@@ -645,83 +645,78 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
compute_5[(cse_var_1 + 15)] = 0f32
}
}
- for (elem_idx: int32, 0, let cse_var_2: int32 = floormod(i0.outer.i1.outer.fused, 32) in (placeholder_3[(cse_var_2 + 1)] - placeholder_3[cse_var_2])) {
- for (i.inner: int32, 0, 16) {
- let cse_var_3: int32 = floormod(i0.outer.i1.outer.fused, 32)
- {
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_4: int32 = ((i.outer.inner*256) + (i.inner*16))
- compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[((placeholder_3[cse_var_3]*16) + (elem_idx*16))]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_5: int32 = (((i.outer.inner*256) + (i.inner*16)) + 1)
- compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 1)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_6: int32 = (((i.outer.inner*256) + (i.inner*16)) + 2)
- compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 2)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_7: int32 = (((i.outer.inner*256) + (i.inner*16)) + 3)
- compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 3)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_8: int32 = (((i.outer.inner*256) + (i.inner*16)) + 4)
- compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 4)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_9: int32 = (((i.outer.inner*256) + (i.inner*16)) + 5)
- compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 5)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_10: int32 = (((i.outer.inner*256) + (i.inner*16)) + 6)
- compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 6)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_11: int32 = (((i.outer.inner*256) + (i.inner*16)) + 7)
- compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 7)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_12: int32 = (((i.outer.inner*256) + (i.inner*16)) + 8)
- compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 8)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_13: int32 = (((i.outer.inner*256) + (i.inner*16)) + 9)
- compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 9)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_14: int32 = (((i.outer.inner*256) + (i.inner*16)) + 10)
- compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 10)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_15: int32 = (((i.outer.inner*256) + (i.inner*16)) + 11)
- compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 11)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_16: int32 = (((i.outer.inner*256) + (i.inner*16)) + 12)
- compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 12)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_17: int32 = (((i.outer.inner*256) + (i.inner*16)) + 13)
- compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 13)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_18: int32 = (((i.outer.inner*256) + (i.inner*16)) + 14)
- compute_5[cse_var_18] = (compute_5[cse_var_18] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 14)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
- if @tir.likely((elem_idx < (placeholder_3[(cse_var_3 + 1)] - placeholder_3[cse_var_3])), dtype=bool) {
- let cse_var_19: int32 = (((i.outer.inner*256) + (i.inner*16)) + 15)
- compute_5[cse_var_19] = (compute_5[cse_var_19] + (placeholder_1[(((placeholder_3[cse_var_3]*16) + (elem_idx*16)) + 15)]*max(placeholder[((((floordiv(i0.outer.i1.outer.fused, 32)*8192) + (i.outer.inner*4096)) + (i.inner*256)) + placeholder_2[(placeholder_3[cse_var_3] + elem_idx)])], 0f32)))
- }
+ for (elem_idx: int32, 0, (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])) {
+ for (i.inner: int32, 0, 32) {
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_2: int32 = ((i.outer.inner*512) + (i.inner*16))
+ compute_5[cse_var_2] = (compute_5[cse_var_2] + (placeholder_1[((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16))]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_3: int32 = (((i.outer.inner*512) + (i.inner*16)) + 1)
+ compute_5[cse_var_3] = (compute_5[cse_var_3] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 1)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_4: int32 = (((i.outer.inner*512) + (i.inner*16)) + 2)
+ compute_5[cse_var_4] = (compute_5[cse_var_4] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 2)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_5: int32 = (((i.outer.inner*512) + (i.inner*16)) + 3)
+ compute_5[cse_var_5] = (compute_5[cse_var_5] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 3)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_6: int32 = (((i.outer.inner*512) + (i.inner*16)) + 4)
+ compute_5[cse_var_6] = (compute_5[cse_var_6] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 4)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_7: int32 = (((i.outer.inner*512) + (i.inner*16)) + 5)
+ compute_5[cse_var_7] = (compute_5[cse_var_7] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 5)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_8: int32 = (((i.outer.inner*512) + (i.inner*16)) + 6)
+ compute_5[cse_var_8] = (compute_5[cse_var_8] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 6)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_9: int32 = (((i.outer.inner*512) + (i.inner*16)) + 7)
+ compute_5[cse_var_9] = (compute_5[cse_var_9] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 7)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_10: int32 = (((i.outer.inner*512) + (i.inner*16)) + 8)
+ compute_5[cse_var_10] = (compute_5[cse_var_10] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 8)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_11: int32 = (((i.outer.inner*512) + (i.inner*16)) + 9)
+ compute_5[cse_var_11] = (compute_5[cse_var_11] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 9)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_12: int32 = (((i.outer.inner*512) + (i.inner*16)) + 10)
+ compute_5[cse_var_12] = (compute_5[cse_var_12] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 10)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_13: int32 = (((i.outer.inner*512) + (i.inner*16)) + 11)
+ compute_5[cse_var_13] = (compute_5[cse_var_13] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 11)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_14: int32 = (((i.outer.inner*512) + (i.inner*16)) + 12)
+ compute_5[cse_var_14] = (compute_5[cse_var_14] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 12)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_15: int32 = (((i.outer.inner*512) + (i.inner*16)) + 13)
+ compute_5[cse_var_15] = (compute_5[cse_var_15] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 13)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_16: int32 = (((i.outer.inner*512) + (i.inner*16)) + 14)
+ compute_5[cse_var_16] = (compute_5[cse_var_16] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 14)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
+ }
+ if @tir.likely((elem_idx < (placeholder_3[(i0.outer.i1.outer.fused + 1)] - placeholder_3[i0.outer.i1.outer.fused])), dtype=bool) {
+ let cse_var_17: int32 = (((i.outer.inner*512) + (i.inner*16)) + 15)
+ compute_5[cse_var_17] = (compute_5[cse_var_17] + (placeholder_1[(((placeholder_3[i0.outer.i1.outer.fused]*16) + (elem_idx*16)) + 15)]*max(placeholder[(((i.outer.inner*8192) + (i.inner*256)) + placeholder_2[(placeholder_3[i0.outer.i1.outer.fused] + elem_idx)])], 0f32)))
}
}
}
}
- for (i0.inner: int32, 0, 32) {
- for (i1.inner: int32, 0, 16) {
- let cse_var_20: int32 = ((((floordiv(i0.outer.i1.outer.fused, 32)*16384) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 32)*16)) + i1.inner)
- compute[cse_var_20] = max((compute_5[((i0.inner*16) + i1.inner)] + placeholder_4[cse_var_20]), 0f32)
- }
+ for (i0.inner: int32, 0, 128) {
+ let cse_var_18: int32 = ((i0.inner*512) + (i0.outer.i1.outer.fused*16))
+ compute[ramp(cse_var_18, 1, 16)] = max((compute_5[ramp((i0.inner*16), 1, 16)] + placeholder_4[ramp(cse_var_18, 1, 16)]), broadcast(0f32, 16))
}
}
}
@@ -759,7 +754,7 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
<span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.697 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.699 ms
</pre></div>
</div>
<div class="admonition note">
diff --git a/docs/how_to/tune_with_autotvm/sg_execution_times.html b/docs/how_to/tune_with_autotvm/sg_execution_times.html
index c5653a0e8..dc10a0e03 100644
--- a/docs/how_to/tune_with_autotvm/sg_execution_times.html
+++ b/docs/how_to/tune_with_autotvm/sg_execution_times.html
@@ -322,7 +322,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-tune-with-autotvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:43.722</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
+<p><strong>00:44.500</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 84%" />
@@ -331,11 +331,11 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-conv2d-cuda-py"><span class="std std-ref">Tuning High Performance Convolution on NVIDIA GPUs</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_cuda.py</span></code>)</p></td>
-<td><p>00:43.691</p></td>
+<td><p>00:44.465</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_relay_x86.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-x86-py"><span class="std std-ref">Auto-tuning a Convolutional Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_x86.py</span></code>)</p></td>
-<td><p>00:00.016</p></td>
+<td><p>00:00.020</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_relay_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-cuda-py"><span class="std std-ref">Auto-tuning a Convolutional Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_cuda.py</span></code>)</p></td>
diff --git a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
index 2d5ba49b2..a7233f3f2 100644
--- a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
+++ b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
@@ -1167,8 +1167,8 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 4, 32]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2885496
-No: 6 GFLOPS: 103.61/103.61 result: MeasureResult(costs=(0.002234297895833333,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.6444010734558105, timestamp=1657174711.450943) [('tile_f', [-1, 1, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3754080
-No: 7 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+No: 6 GFLOPS: 93.95/93.95 result: MeasureResult(costs=(0.002464068875,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.8310317993164062, timestamp=1657179140.2786798) [('tile_f', [-1, 1, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3754080
+No: 7 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1291,7 +1291,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 16, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 256, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6225319
-No: 8 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+No: 8 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1414,7 +1414,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 1, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 8, 64]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,943546
-No: 9 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+No: 9 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1537,7 +1537,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 16, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2868708
-No: 10 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+No: 10 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 142, in build
res = future.result()
File "/usr/lib/python3.7/concurrent/futures/_base.py", line 435, in result
@@ -1555,7 +1555,7 @@ No: 10 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
TimeoutError
[('tile_f', [-1, 32, 2, 4]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 2]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4691833
-No: 11 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+No: 11 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1678,7 +1678,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 2, 64]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1042124
-No: 12 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+No: 12 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1801,7 +1801,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 32, 1, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 32, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10013405
-No: 13 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+No: 13 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -1924,7 +1924,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 8, 2]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 32]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6732082
-No: 14 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+No: 14 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2047,7 +2047,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 4, 32]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7536735
-No: 15 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+No: 15 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2170,7 +2170,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 1, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 128, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,482121
-No: 16 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+No: 16 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2293,7 +2293,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 1, 16]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 32, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2824525
-No: 17 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+No: 17 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2416,7 +2416,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 64, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 8, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4559286
-No: 18 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+No: 18 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 588, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 540, in _build_func_common
@@ -2539,7 +2539,7 @@ Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 871, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 32, 16]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 512]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9677544
-No: 19 GFLOPS: 0.00/103.61 result: Traceback (most recent call last):
+No: 19 GFLOPS: 0.00/93.95 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 738, in __call__
yield remote, remote.load_module(os.path.split(build_result.filename)[1])
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 702, in run_through_rpc
@@ -2627,7 +2627,7 @@ tvm._ffi.base.TVMError: Traceback (most recent call last):
15: _PyEval_EvalFrameDefault
14: 0x0000000000537c30
13: _PyObject_FastCallKeywords
- 12: 0x00007f2274af7fa2
+ 12: 0x00007f5e8b033fa2
11: _ctypes_callproc
10: ffi_call
9: ffi_call_unix64
@@ -2692,7 +2692,7 @@ Traceback (most recent call last):
21: _PyFunction_FastCallKeywords
20: _PyEval_EvalFrameDefault
19: _PyFunction_FastCall [('tile_f', [-1, 8, 2, 16]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6390073
-No: 20 GFLOPS: 144.75/144.75 result: MeasureResult(costs=(0.0015993116699999999,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.454596996307373, timestamp=1657174738.1003814) [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
+No: 20 GFLOPS: 144.09/144.09 result: MeasureResult(costs=(0.00160666181,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.4486920833587646, timestamp=1657179166.8123953) [('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
</pre></div>
</div>
<p>Finally we can inspect the best config from log file, check correctness,
@@ -2733,7 +2733,7 @@ and measure running time.</p>
Best config:
[('tile_f', [-1, 1, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9881539
Finish loading 20 records
-Time cost of this operator: 0.002037
+Time cost of this operator: 0.002029
</pre></div>
</div>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autotvm-tune-conv2d-cuda-py">
diff --git a/docs/how_to/work_with_microtvm/micro_autotune.html b/docs/how_to/work_with_microtvm/micro_autotune.html
index 3b398cb79..60476accd 100644
--- a/docs/how_to/work_with_microtvm/micro_autotune.html
+++ b/docs/how_to/work_with_microtvm/micro_autotune.html
@@ -578,10 +578,10 @@ the tuned operator.</p>
########## Build without Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs Measurements(us)
--------- --- -------- ------- ----- ------ ------- ----------------
-tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 311.7 98.705 (1, 2, 10, 10, 3) 2 1 [311.7]
-tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.125 0.99 (1, 6, 10, 10) 1 1 [3.125]
-tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.964 0.305 (1, 1, 10, 10, 3) 1 1 [0.964]
-Total_time - 315.789 - - - - -
+tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 310.8 98.713 (1, 2, 10, 10, 3) 2 1 [310.8]
+tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.094 0.983 (1, 6, 10, 10) 1 1 [3.094]
+tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.959 0.304 (1, 1, 10, 10, 3) 1 1 [0.959]
+Total_time - 314.852 - - - - -
</pre></div>
</div>
</div>
@@ -634,10 +634,10 @@ Total_time -
########## Build with Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs Measurements(us)
--------- --- -------- ------- ----- ------ ------- ----------------
-tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 88.0 96.911 (1, 6, 10, 10, 1) 2 1 [88.0]
-tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.845 2.032 (1, 6, 10, 10) 1 1 [1.845]
-tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.96 1.057 (1, 1, 10, 10, 3) 1 1 [0.96]
-Total_time - 90.805 - - - - -
+tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 133.7 97.938 (1, 6, 10, 10, 1) 2 1 [133.7]
+tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.818 1.332 (1, 6, 10, 10) 1 1 [1.818]
+tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.996 0.73 (1, 1, 10, 10, 3) 1 1 [0.996]
+Total_time - 136.514 - - - - -
</pre></div>
</div>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-autotune-py">
diff --git a/docs/how_to/work_with_microtvm/micro_train.html b/docs/how_to/work_with_microtvm/micro_train.html
index c4158ca53..08cd012bb 100644
--- a/docs/how_to/work_with_microtvm/micro_train.html
+++ b/docs/how_to/work_with_microtvm/micro_train.html
@@ -510,7 +510,7 @@ take about <strong>2 minutes</strong> to download the Stanford Cars, while COCO
<a href="https://docs.python.org/3/library/shutil.html#shutil.move" title="shutil.move" class="sphx-glr-backref-module-shutil sphx-glr-backref-type-py-function"><span class="n">shutil</span><span class="o">.</span><span class="n">move</span></a><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><a href="https://docs.python.org/3/library/stdtypes.html#str" title="builtins.str" class="sphx-glr-backref-module-builtins sphx-glr-backref-typ [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>'/tmp/tmpno5zwqco/images/random'
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>'/tmp/tmprszi7_mv/images/random'
</pre></div>
</div>
</div>
@@ -570,8 +570,8 @@ objects to other stuff? We can display some examples from our datasets using <co
<span class="n">plt</span><span class="o">.</span><span class="n">axis</span><span class="p">(</span><span class="s2">"off"</span><span class="p">)</span>
</pre></div>
</div>
-<img src="../../_images/sphx_glr_micro_train_001.png" srcset="../../_images/sphx_glr_micro_train_001.png" alt="[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/tmp/tmpno5zwqco/images/target contains 8144 images
-/tmp/tmpno5zwqco/images/random contains 5000 images
+<img src="../../_images/sphx_glr_micro_train_001.png" srcset="../../_images/sphx_glr_micro_train_001.png" alt="[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/tmp/tmprszi7_mv/images/target contains 8144 images
+/tmp/tmprszi7_mv/images/random contains 5000 images
</pre></div>
</div>
</div>
@@ -683,13 +683,13 @@ the time on our validation set).</p>
</pre></div>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Epoch 1/3
-328/328 - 56s - loss: 0.2213 - accuracy: 0.9240 - val_loss: 0.1338 - val_accuracy: 0.9562
+328/328 - 56s - loss: 0.2335 - accuracy: 0.9211 - val_loss: 0.1229 - val_accuracy: 0.9603
Epoch 2/3
-328/328 - 53s - loss: 0.0986 - accuracy: 0.9627 - val_loss: 0.1379 - val_accuracy: 0.9543
+328/328 - 53s - loss: 0.0972 - accuracy: 0.9621 - val_loss: 0.1103 - val_accuracy: 0.9630
Epoch 3/3
-328/328 - 53s - loss: 0.0656 - accuracy: 0.9755 - val_loss: 0.1003 - val_accuracy: 0.9653
+328/328 - 52s - loss: 0.0682 - accuracy: 0.9757 - val_loss: 0.1070 - val_accuracy: 0.9694
-<keras.callbacks.History object at 0x7f5b41476610>
+<keras.callbacks.History object at 0x7f2b7a78bf90>
</pre></div>
</div>
</div>
@@ -951,7 +951,7 @@ as intended.</p>
<p>From here, we could modify the model to read live images from the camera - we have another
Arduino tutorial for how to do that <a class="reference external" href="https://github.com/guberti/tvm-arduino-demos/tree/master/examples/person_detection">on GitHub</a>. Alternatively, we could also
<a class="reference external" href="https://tvm.apache.org/docs/how_to/work_with_microtvm/micro_autotune.html">use TVM’s autotuning capabilities</a> to dramatically improve the model’s performance.</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 5 minutes 0.282 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 5 minutes 15.759 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-train-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/b52cec46baf4f78d6bcd94cbe269c8a6/micro_train.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">micro_train.py</span></code></a></p>
diff --git a/docs/how_to/work_with_microtvm/sg_execution_times.html b/docs/how_to/work_with_microtvm/sg_execution_times.html
index 2483d91aa..310fb7318 100644
--- a/docs/how_to/work_with_microtvm/sg_execution_times.html
+++ b/docs/how_to/work_with_microtvm/sg_execution_times.html
@@ -322,7 +322,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-work-with-microtvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>05:49.383</strong> total execution time for <strong>how_to_work_with_microtvm</strong> files:</p>
+<p><strong>06:01.915</strong> total execution time for <strong>how_to_work_with_microtvm</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 83%" />
@@ -331,15 +331,15 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="micro_train.html#sphx-glr-how-to-work-with-microtvm-micro-train-py"><span class="std std-ref">Training Vision Models for microTVM on Arduino</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_train.py</span></code>)</p></td>
-<td><p>05:00.282</p></td>
+<td><p>05:15.759</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="micro_autotune.html#sphx-glr-how-to-work-with-microtvm-micro-autotune-py"><span class="std std-ref">Autotuning with microTVM</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_autotune.py</span></code>)</p></td>
-<td><p>00:45.588</p></td>
+<td><p>00:42.880</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="micro_tflite.html#sphx-glr-how-to-work-with-microtvm-micro-tflite-py"><span class="std std-ref">microTVM with TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_tflite.py</span></code>)</p></td>
-<td><p>00:03.510</p></td>
+<td><p>00:03.274</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="micro_ethosu.html#sphx-glr-how-to-work-with-microtvm-micro-ethosu-py"><span class="std std-ref">Running TVM on bare metal Arm(R) Cortex(R)-M55 CPU and Ethos(TM)-U55 NPU with CMSIS-NN</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_ethosu.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_relay/sg_execution_times.html b/docs/how_to/work_with_relay/sg_execution_times.html
index bad59edda..a3c338184 100644
--- a/docs/how_to/work_with_relay/sg_execution_times.html
+++ b/docs/how_to/work_with_relay/sg_execution_times.html
@@ -322,7 +322,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-work-with-relay-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:11.408</strong> total execution time for <strong>how_to_work_with_relay</strong> files:</p>
+<p><strong>00:11.310</strong> total execution time for <strong>how_to_work_with_relay</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 83%" />
@@ -331,11 +331,11 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="using_external_lib.html#sphx-glr-how-to-work-with-relay-using-external-lib-py"><span class="std std-ref">Using External Libraries in Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_external_lib.py</span></code>)</p></td>
-<td><p>00:09.906</p></td>
+<td><p>00:09.919</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="build_gcn.html#sphx-glr-how-to-work-with-relay-build-gcn-py"><span class="std std-ref">Building a Graph Convolutional Network</span></a> (<code class="docutils literal notranslate"><span class="pre">build_gcn.py</span></code>)</p></td>
-<td><p>00:01.496</p></td>
+<td><p>00:01.385</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="using_relay_viz.html#sphx-glr-how-to-work-with-relay-using-relay-viz-py"><span class="std std-ref">Use Relay Visualizer to Visualize Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_relay_viz.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_schedules/intrin_math.html b/docs/how_to/work_with_schedules/intrin_math.html
index 511585852..54555a069 100644
--- a/docs/how_to/work_with_schedules/intrin_math.html
+++ b/docs/how_to/work_with_schedules/intrin_math.html
@@ -517,7 +517,7 @@ The following example customizes CUDA lowering rule for <code class="code docuti
<a href="../../reference/api/python/ir.html#tvm.ir.register_intrin_lowering" title="tvm.ir.register_intrin_lowering" class="sphx-glr-backref-module-tvm-ir sphx-glr-backref-type-py-function"><span class="n">register_intrin_lowering</span></a><span class="p">(</span><span class="s2">"tir.exp"</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">"cuda"</span><span class="p">,</span> <span class="n">f</span><span class="o">= [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span><function my_cuda_math_rule at 0x7f5aba38c4d0>
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span><function my_cuda_math_rule at 0x7f2acde41680>
</pre></div>
</div>
<p>Register the rule to TVM with override option to override existing rule.
diff --git a/docs/how_to/work_with_schedules/sg_execution_times.html b/docs/how_to/work_with_schedules/sg_execution_times.html
index b67e26c9d..181bc5bf1 100644
--- a/docs/how_to/work_with_schedules/sg_execution_times.html
+++ b/docs/how_to/work_with_schedules/sg_execution_times.html
@@ -322,7 +322,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-work-with-schedules-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:04.025</strong> total execution time for <strong>how_to_work_with_schedules</strong> files:</p>
+<p><strong>00:04.149</strong> total execution time for <strong>how_to_work_with_schedules</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 83%" />
@@ -331,35 +331,35 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="intrin_math.html#sphx-glr-how-to-work-with-schedules-intrin-math-py"><span class="std std-ref">Intrinsics and Math Functions</span></a> (<code class="docutils literal notranslate"><span class="pre">intrin_math.py</span></code>)</p></td>
-<td><p>00:01.882</p></td>
+<td><p>00:01.955</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tensorize.html#sphx-glr-how-to-work-with-schedules-tensorize-py"><span class="std std-ref">Use Tensorize to Leverage Hardware Intrinsics</span></a> (<code class="docutils literal notranslate"><span class="pre">tensorize.py</span></code>)</p></td>
-<td><p>00:00.929</p></td>
+<td><p>00:00.908</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="reduction.html#sphx-glr-how-to-work-with-schedules-reduction-py"><span class="std std-ref">Reduction</span></a> (<code class="docutils literal notranslate"><span class="pre">reduction.py</span></code>)</p></td>
-<td><p>00:00.525</p></td>
+<td><p>00:00.566</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="scan.html#sphx-glr-how-to-work-with-schedules-scan-py"><span class="std std-ref">Scan and Recurrent Kernel</span></a> (<code class="docutils literal notranslate"><span class="pre">scan.py</span></code>)</p></td>
-<td><p>00:00.513</p></td>
+<td><p>00:00.540</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="extern_op.html#sphx-glr-how-to-work-with-schedules-extern-op-py"><span class="std std-ref">External Tensor Functions</span></a> (<code class="docutils literal notranslate"><span class="pre">extern_op.py</span></code>)</p></td>
-<td><p>00:00.101</p></td>
+<td><p>00:00.099</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="schedule_primitives.html#sphx-glr-how-to-work-with-schedules-schedule-primitives-py"><span class="std std-ref">Schedule Primitives in TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">schedule_primitives.py</span></code>)</p></td>
-<td><p>00:00.035</p></td>
+<td><p>00:00.036</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tedd.html#sphx-glr-how-to-work-with-schedules-tedd-py"><span class="std std-ref">Use Tensor Expression Debug Display (TEDD) for Visualization</span></a> (<code class="docutils literal notranslate"><span class="pre">tedd.py</span></code>)</p></td>
-<td><p>00:00.027</p></td>
+<td><p>00:00.030</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tuple_inputs.html#sphx-glr-how-to-work-with-schedules-tuple-inputs-py"><span class="std std-ref">Compute and Reduce with Tuple Inputs</span></a> (<code class="docutils literal notranslate"><span class="pre">tuple_inputs.py</span></code>)</p></td>
-<td><p>00:00.014</p></td>
+<td><p>00:00.015</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/work_with_schedules/tensorize.html b/docs/how_to/work_with_schedules/tensorize.html
index aa71eb477..6647eb34c 100644
--- a/docs/how_to/work_with_schedules/tensorize.html
+++ b/docs/how_to/work_with_schedules/tensorize.html
@@ -572,7 +572,7 @@ The importing needs to happen before the tensorized GEMV being executed.</p>
C: Buffer(C_2: Pointer(float32), float32, [524288], [])}
buffer_map = {A_1: A, B_1: B, C_1: C}
preflattened_buffer_map = {A_1: A_3: Buffer(A_2, float32, [1024, 64], []), B_1: B_3: Buffer(B_2, float32, [512, 64], []), C_1: C_3: Buffer(C_2, float32, [1024, 512], [])} {
- attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpgatjuc7x/input0.cc'\nsource_filename = \"/tmp/tmpgatjuc7x/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = allo [...]
+ attr [IterVar(i: int32, (nullptr), "DataPar", "")] "pragma_import_llvm" = "; ModuleID = '/tmp/tmpg1mz_rs_/input0.cc'\nsource_filename = \"/tmp/tmpg1mz_rs_/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = allo [...]
for (i, 0, 1024) {
for (j.outer: int32, 0, 32) {
@tir.call_extern("gemv_update", @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), C_2, ((i*512) + (j.outer*16)), 16, 2, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), A_2, (i*64), 64, 1, dtype=handle), @tir.tvm_access_ptr(@tir.type_annotation(, dtype=float32), B_2, (j.outer*1024), 1024, 1, dtype=handle), 16, 64, 64, dtype=int32)
diff --git a/docs/reference/api/doxygen/annotated.html b/docs/reference/api/doxygen/annotated.html
index ea0b62871..a8cb30df0 100644
--- a/docs/reference/api/doxygen/annotated.html
+++ b/docs/reference/api/doxygen/annotated.html
@@ -671,393 +671,401 @@ $(function() {
<tr id="row_1_7_61_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1runtime_1_1TypedPackedFunc.html" target="_self">TypedPackedFunc</a></td><td class="desc">Please refer to <a class="el" href="classtvm_1_1runtime_1_1TypedPackedFunc_3_01R_07Args_8_8_8_08_4.html#TypedPackedFuncAnchor">TypedPackedFunc<R(Args..)></a> </td></tr>
<tr id="row_1_7_62_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1runtime_1_1TypedPackedFunc_3_01R_07Args_8_8_8_08_4.html" target="_self">TypedPackedFunc< R(Args...)></a></td><td class="desc">A <a class="el" href="classtvm_1_1runtime_1_1PackedFunc.html" title="Packed function is a type-erased function. The arguments are passed by [...]
<tr id="row_1_7_63_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1runtime_1_1TypeIndex.html" target="_self">TypeIndex</a></td><td class="desc">Namespace for the list of type index </td></tr>
-<tr id="row_1_8_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span id="arr_1_8_" class="arrow" onclick="toggleFolder('1_8_')">►</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacetvm_1_1support.html" target="_self">support</a></td><td class="desc"></td></tr>
-<tr id="row_1_8_0_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1support_1_1LinearCongruentialEngine.html" target="_self">LinearCongruentialEngine</a></td><td class="desc">This linear congruential engine is a drop-in replacement for std::minstd_rand. It strictly corresponds to std::minstd_rand and is designed to be platform-independent [...]
-<tr id="row_1_8_1_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span id="arr_1_8_1_" class="arrow" onclick="toggleFolder('1_8_1_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1support_1_1Span.html" target="_self">Span</a></td><td class="desc">A partial implementation of the C++20 std::span </td></tr>
-<tr id="row_1_8_1_0_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1support_1_1Span_1_1iterator__base.html" target="_self">iterator_base</a></td><td class="desc"></td></tr>
-<tr id="row_1_9_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span id="arr_1_9_" class="arrow" onclick="toggleFolder('1_9_')">►</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacetvm_1_1te.html" target="_self">te</a></td><td class="desc"><a class="el" href="classtvm_1_1te_1_1Tensor.html" title="Tensor structure representing a possible input, or intermediate computation result [...]
-<tr id="row_1_9_0_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1BaseComputeOpNode.html" target="_self">BaseComputeOpNode</a></td><td class="desc">A Compute op that compute a tensor on certain domain. This is the base class for <a class="el" href="classtvm_1_1te_1_1ComputeOp.html" title="Managed reference to ComputeOpNode. ">Compu [...]
-<tr id="row_1_9_1_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1ComputeOp.html" target="_self">ComputeOp</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1ComputeOpNode.html" title="A Compute op that compute a tensor on certain domain. ">ComputeOpNode</a> </td></tr>
-<tr id="row_1_9_2_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1ComputeOpNode.html" target="_self">ComputeOpNode</a></td><td class="desc">A Compute op that compute a tensor on certain domain </td></tr>
-<tr id="row_1_9_3_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1ExternOp.html" target="_self">ExternOp</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1ExternOpNode.html" title="External computation that cannot be splitted. ">ExternOpNode</a> </td></tr>
-<tr id="row_1_9_4_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1ExternOpNode.html" target="_self">ExternOpNode</a></td><td class="desc">External computation that cannot be splitted </td></tr>
-<tr id="row_1_9_5_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1Fuse.html" target="_self">Fuse</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1FuseNode.html" title="Fuse two domains into one domain. ">FuseNode</a> </td></tr>
-<tr id="row_1_9_6_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1FuseNode.html" target="_self">FuseNode</a></td><td class="desc"><a class="el" href="classtvm_1_1te_1_1Fuse.html" title="Managed reference to FuseNode. ">Fuse</a> two domains into one domain </td></tr>
-<tr id="row_1_9_7_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1HybridOp.html" target="_self">HybridOp</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1HybridOpNode.html" title="A computation operator that generated by hybrid script. ">HybridOpNode</a> </td></tr>
-<tr id="row_1_9_8_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1HybridOpNode.html" target="_self">HybridOpNode</a></td><td class="desc">A computation operator that generated by hybrid script </td></tr>
-<tr id="row_1_9_9_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1IterVarAttr.html" target="_self">IterVarAttr</a></td><td class="desc">Additional scheduable attributes about IterVar </td></tr>
-<tr id="row_1_9_10_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1IterVarAttrNode.html" target="_self">IterVarAttrNode</a></td><td class="desc">Node container for IterVar attr </td></tr>
-<tr id="row_1_9_11_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1IterVarRelation.html" target="_self">IterVarRelation</a></td><td class="desc">The schedule relation between IterVars can be <a class="el" href="classtvm_1_1te_1_1Split.html" title="Managed reference to SplitNode. ">Split</a>, <a class="el" href="classtvm_1_1te_1_1Fu [...]
-<tr id="row_1_9_12_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1IterVarRelationNode.html" target="_self">IterVarRelationNode</a></td><td class="desc">Base node of iteration var </td></tr>
-<tr id="row_1_9_13_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1Operation.html" target="_self">Operation</a></td><td class="desc"><a class="el" href="classtvm_1_1te_1_1Operation.html" title="Operation that produces tensors. ">Operation</a> that produces tensors </td></tr>
-<tr id="row_1_9_14_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1OperationNode.html" target="_self">OperationNode</a></td><td class="desc">Base class of all operation nodes </td></tr>
-<tr id="row_1_9_15_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1PlaceholderOp.html" target="_self">PlaceholderOp</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1PlaceholderOpNode.html" title="A placeholder op represents an input placeholder. ">PlaceholderOpNode</a> </td></tr>
-<tr id="row_1_9_16_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1PlaceholderOpNode.html" target="_self">PlaceholderOpNode</a></td><td class="desc">A placeholder op represents an input placeholder </td></tr>
-<tr id="row_1_9_17_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1Rebase.html" target="_self">Rebase</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1RebaseNode.html" title="Rebase the iteration to make min to be 0. This is useful to normalize the Schedule to make every leaf...">RebaseNode</a> </ [...]
-<tr id="row_1_9_18_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1RebaseNode.html" target="_self">RebaseNode</a></td><td class="desc"><a class="el" href="classtvm_1_1te_1_1Rebase.html" title="Managed reference to RebaseNode. ">Rebase</a> the iteration to make min to be 0. This is useful to normalize the <a class="el" href="classtv [...]
-<tr id="row_1_9_19_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1ScanOp.html" target="_self">ScanOp</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1ScanOpNode.html" title="Symbolic scan. ">ScanOpNode</a> </td></tr>
-<tr id="row_1_9_20_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1ScanOpNode.html" target="_self">ScanOpNode</a></td><td class="desc">Symbolic scan </td></tr>
-<tr id="row_1_9_21_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1Schedule.html" target="_self">Schedule</a></td><td class="desc">Global schedule container For operations and all the operations they depend on. The schedule per <a class="el" href="classtvm_1_1te_1_1Operation.html" title="Operation that produces tensors. ">Operation [...]
-<tr id="row_1_9_22_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1ScheduleNode.html" target="_self">ScheduleNode</a></td><td class="desc">Node container for schedule </td></tr>
-<tr id="row_1_9_23_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1Singleton.html" target="_self">Singleton</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1SingletonNode.html" title="Singleton iterator [0, 1) ">SingletonNode</a> </td></tr>
-<tr id="row_1_9_24_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1SingletonNode.html" target="_self">SingletonNode</a></td><td class="desc"><a class="el" href="classtvm_1_1te_1_1Singleton.html" title="Managed reference to SingletonNode. ">Singleton</a> iterator [0, 1) </td></tr>
-<tr id="row_1_9_25_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1SpecializedCondition.html" target="_self">SpecializedCondition</a></td><td class="desc">Specialized condition to enable op specialization </td></tr>
-<tr id="row_1_9_26_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1SpecializedConditionNode.html" target="_self">SpecializedConditionNode</a></td><td class="desc">Container for specialization conditions </td></tr>
-<tr id="row_1_9_27_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1Split.html" target="_self">Split</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1SplitNode.html" title="Split the parent domain into product of outer and iter. ">SplitNode</a> </td></tr>
-<tr id="row_1_9_28_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1SplitNode.html" target="_self">SplitNode</a></td><td class="desc"><a class="el" href="classtvm_1_1te_1_1Split.html" title="Managed reference to SplitNode. ">Split</a> the parent domain into product of outer and iter </td></tr>
-<tr id="row_1_9_29_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1Stage.html" target="_self">Stage</a></td><td class="desc"><a class="el" href="classtvm_1_1te_1_1Stage.html" title="Stage, contains scheduling for a stage of computation. ">Stage</a>, contains scheduling for a stage of computation </td></tr>
-<tr id="row_1_9_30_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1StageNode.html" target="_self">StageNode</a></td><td class="desc"><a class="el" href="classtvm_1_1te_1_1Stage.html" title="Stage, contains scheduling for a stage of computation. ">Stage</a> </td></tr>
-<tr id="row_1_9_31_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span id="arr_1_9_31_" class="arrow" onclick="toggleFolder('1_9_31_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1Tensor.html" target="_self">Tensor</a></td><td class="desc"><a class="el" href="classtvm_1_1te_1_1Tensor.html" title="Tensor structure representing a possible input, or intermediate [...]
-<tr id="row_1_9_31_0_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1Tensor_1_1Slice.html" target="_self">Slice</a></td><td class="desc">Data structure to represent a slice that fixes first k coordinates. This is used to enable syntax sugar of <a class="el" href="classtvm_1_1te_1_1Tensor.html" title="Tensor structure representing a [...]
-<tr id="row_1_9_32_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1TensorComputeOp.html" target="_self">TensorComputeOp</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1TensorComputeOpNode.html" title="A TenorCompute op that compute a tensor with an tensor intrinsic. ">TensorComputeOpNode</a> </td></tr>
-<tr id="row_1_9_33_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1TensorComputeOpNode.html" target="_self">TensorComputeOpNode</a></td><td class="desc">A TenorCompute op that compute a tensor with an tensor intrinsic </td></tr>
-<tr id="row_1_9_34_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1te_1_1TensorDom.html" target="_self">TensorDom</a></td><td class="desc">Temporary data structure to store union of bounds of each axis of <a class="el" href="classtvm_1_1te_1_1Tensor.html" title="Tensor structure representing a possible input, or intermediate computation [...]
-<tr id="row_1_9_35_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1TensorIntrin.html" target="_self">TensorIntrin</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1TensorIntrinNode.html" title="Node to represent a Tensor intrinsic operator. ">TensorIntrinNode</a> </td></tr>
-<tr id="row_1_9_36_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1TensorIntrinCall.html" target="_self">TensorIntrinCall</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1TensorIntrinCallNode.html">TensorIntrinCallNode</a> </td></tr>
-<tr id="row_1_9_37_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1TensorIntrinCallNode.html" target="_self">TensorIntrinCallNode</a></td><td class="desc"></td></tr>
-<tr id="row_1_9_38_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1TensorIntrinNode.html" target="_self">TensorIntrinNode</a></td><td class="desc">Node to represent a <a class="el" href="classtvm_1_1te_1_1Tensor.html" title="Tensor structure representing a possible input, or intermediate computation result. ">Tensor</a> intrinsic o [...]
-<tr id="row_1_9_39_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1TensorNode.html" target="_self">TensorNode</a></td><td class="desc">Node to represent a tensor </td></tr>
-<tr id="row_1_9_40_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1Transform.html" target="_self">Transform</a></td><td class="desc"></td></tr>
-<tr id="row_1_9_41_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1TransformNode.html" target="_self">TransformNode</a></td><td class="desc"><a class="el" href="classtvm_1_1te_1_1Transform.html">Transform</a> iterator according to some arbitrary expression </td></tr>
-<tr id="row_1_10_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span id="arr_1_10_" class="arrow" onclick="toggleFolder('1_10_')">►</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacetvm_1_1tir.html" target="_self">tir</a></td><td class="desc"></td></tr>
-<tr id="row_1_10_0_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span id="arr_1_10_0_" class="arrow" onclick="toggleFolder('1_10_0_')">►</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacetvm_1_1tir_1_1usmp.html" target="_self">usmp</a></td><td class="desc"></td></tr>
-<tr id="row_1_10_0_0_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span id="arr_1_10_0_0_" class="arrow" onclick="toggleFolder('1_10_0_0_')">►</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacetvm_1_1tir_1_1usmp_1_1algo.html" target="_self">algo</a></td><td class="desc"></td></tr>
-<tr id="row_1_10_0_0_0_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1usmp_1_1algo_1_1GreedyBase.html" target="_self">GreedyBase</a></td><td class="desc">This is the base class for Greedy Algorithms where the sorting is specialized in the extended classes based on the greedy criteria </td></tr>
-<tr id="row_1_10_0_1_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1usmp_1_1AllocatedPoolInfo.html" target="_self">AllocatedPoolInfo</a></td><td class="desc"></td></tr>
-<tr id="row_1_10_0_2_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1tir_1_1usmp_1_1AllocatedPoolInfoNode.html" target="_self">AllocatedPoolInfoNode</a></td><td class="desc">This object contains information post-allocation for <a class="el" href="classtvm_1_1PoolInfo.html" title="Base class for WorkspacePoolInfo and ConstantPoolInfo. "> [...]
-<tr id="row_1_10_0_3_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1usmp_1_1BufferInfo.html" target="_self">BufferInfo</a></td><td class="desc"></td></tr>
-<tr id="row_1_10_0_4_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1usmp_1_1BufferInfoAnalysis.html" target="_self">BufferInfoAnalysis</a></td><td class="desc"></td></tr>
-<tr id="row_1_10_0_5_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1tir_1_1usmp_1_1BufferInfoAnalysisNode.html" target="_self">BufferInfoAnalysisNode</a></td><td class="desc">This is a composite node that is produced by extract_buffer_info analysis pass that contains useful global information that could be useful for memory planning al [...]
-<tr id="row_1_10_0_6_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1tir_1_1usmp_1_1BufferInfoNode.html" target="_self">BufferInfoNode</a></td><td class="desc">Describes an abstract memory buffer that will get allocated inside a pool. The actual memory buffer in represented by <a class="el" href="structtvm_1_1tir_1_1usmp_1_1PoolAllocati [...]
-<tr id="row_1_10_0_7_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1usmp_1_1PoolAllocation.html" target="_self">PoolAllocation</a></td><td class="desc"></td></tr>
-<tr id="row_1_10_0_8_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1tir_1_1usmp_1_1PoolAllocationNode.html" target="_self">PoolAllocationNode</a></td><td class="desc">The pool allocation produced after the USMP algorithm </td></tr>
-<tr id="row_1_10_1_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Add.html" target="_self">Add</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1AddNode.html" title="a + b ">AddNode</a> </td></tr>
-<tr id="row_1_10_2_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1AddNode.html" target="_self">AddNode</a></td><td class="desc"><ul>
+<tr id="row_1_8_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span id="arr_1_8_" class="arrow" onclick="toggleFolder('1_8_')">►</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacetvm_1_1script.html" target="_self">script</a></td><td class="desc"></td></tr>
+<tr id="row_1_8_0_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span id="arr_1_8_0_" class="arrow" onclick="toggleFolder('1_8_0_')">►</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacetvm_1_1script_1_1printer.html" target="_self">printer</a></td><td class="desc"></td></tr>
+<tr id="row_1_8_0_0_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1script_1_1printer_1_1Doc.html" target="_self">Doc</a></td><td class="desc">Reference type of <a class="el" href="classtvm_1_1script_1_1printer_1_1DocNode.html" title="The base class of all Doc. ">DocNode</a> </td></tr>
+<tr id="row_1_8_0_1_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1script_1_1printer_1_1DocNode.html" target="_self">DocNode</a></td><td class="desc">The base class of all <a class="el" href="classtvm_1_1script_1_1printer_1_1Doc.html" title="Reference type of DocNode. ">Doc</a> </td></tr>
+<tr id="row_1_8_0_2_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1script_1_1printer_1_1ExprDoc.html" target="_self">ExprDoc</a></td><td class="desc">Reference type of <a class="el" href="classtvm_1_1script_1_1printer_1_1ExprDocNode.html" title="The base class of expression doc. ">ExprDocNode</a> </td></tr>
+<tr id="row_1_8_0_3_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1script_1_1printer_1_1ExprDocNode.html" target="_self">ExprDocNode</a></td><td class="desc">The base class of expression doc </td></tr>
+<tr id="row_1_8_0_4_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1script_1_1printer_1_1LiteralDoc.html" target="_self">LiteralDoc</a></td><td class="desc">Reference type of <a class="el" href="classtvm_1_1script_1_1printer_1_1LiteralDocNode.html" title="Doc that represents literal value. ">LiteralDocNode</a> </td></tr>
+<tr id="row_1_8_0_5_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1script_1_1printer_1_1LiteralDocNode.html" target="_self">LiteralDocNode</a></td><td class="desc"><a class="el" href="classtvm_1_1script_1_1printer_1_1Doc.html" title="Reference type of DocNode. ">Doc</a> that represents literal value </td></tr>
+<tr id="row_1_9_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span id="arr_1_9_" class="arrow" onclick="toggleFolder('1_9_')">►</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacetvm_1_1support.html" target="_self">support</a></td><td class="desc"></td></tr>
+<tr id="row_1_9_0_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1support_1_1LinearCongruentialEngine.html" target="_self">LinearCongruentialEngine</a></td><td class="desc">This linear congruential engine is a drop-in replacement for std::minstd_rand. It strictly corresponds to std::minstd_rand and is designed to be platform-independent [...]
+<tr id="row_1_9_1_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span id="arr_1_9_1_" class="arrow" onclick="toggleFolder('1_9_1_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1support_1_1Span.html" target="_self">Span</a></td><td class="desc">A partial implementation of the C++20 std::span </td></tr>
+<tr id="row_1_9_1_0_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1support_1_1Span_1_1iterator__base.html" target="_self">iterator_base</a></td><td class="desc"></td></tr>
+<tr id="row_1_10_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span id="arr_1_10_" class="arrow" onclick="toggleFolder('1_10_')">►</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacetvm_1_1te.html" target="_self">te</a></td><td class="desc"><a class="el" href="classtvm_1_1te_1_1Tensor.html" title="Tensor structure representing a possible input, or intermediate computation res [...]
+<tr id="row_1_10_0_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1BaseComputeOpNode.html" target="_self">BaseComputeOpNode</a></td><td class="desc">A Compute op that compute a tensor on certain domain. This is the base class for <a class="el" href="classtvm_1_1te_1_1ComputeOp.html" title="Managed reference to ComputeOpNode. ">Comp [...]
+<tr id="row_1_10_1_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1ComputeOp.html" target="_self">ComputeOp</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1ComputeOpNode.html" title="A Compute op that compute a tensor on certain domain. ">ComputeOpNode</a> </td></tr>
+<tr id="row_1_10_2_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1ComputeOpNode.html" target="_self">ComputeOpNode</a></td><td class="desc">A Compute op that compute a tensor on certain domain </td></tr>
+<tr id="row_1_10_3_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1ExternOp.html" target="_self">ExternOp</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1ExternOpNode.html" title="External computation that cannot be splitted. ">ExternOpNode</a> </td></tr>
+<tr id="row_1_10_4_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1ExternOpNode.html" target="_self">ExternOpNode</a></td><td class="desc">External computation that cannot be splitted </td></tr>
+<tr id="row_1_10_5_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1Fuse.html" target="_self">Fuse</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1FuseNode.html" title="Fuse two domains into one domain. ">FuseNode</a> </td></tr>
+<tr id="row_1_10_6_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1FuseNode.html" target="_self">FuseNode</a></td><td class="desc"><a class="el" href="classtvm_1_1te_1_1Fuse.html" title="Managed reference to FuseNode. ">Fuse</a> two domains into one domain </td></tr>
+<tr id="row_1_10_7_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1HybridOp.html" target="_self">HybridOp</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1HybridOpNode.html" title="A computation operator that generated by hybrid script. ">HybridOpNode</a> </td></tr>
+<tr id="row_1_10_8_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1HybridOpNode.html" target="_self">HybridOpNode</a></td><td class="desc">A computation operator that generated by hybrid script </td></tr>
+<tr id="row_1_10_9_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1IterVarAttr.html" target="_self">IterVarAttr</a></td><td class="desc">Additional scheduable attributes about IterVar </td></tr>
+<tr id="row_1_10_10_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1IterVarAttrNode.html" target="_self">IterVarAttrNode</a></td><td class="desc">Node container for IterVar attr </td></tr>
+<tr id="row_1_10_11_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1IterVarRelation.html" target="_self">IterVarRelation</a></td><td class="desc">The schedule relation between IterVars can be <a class="el" href="classtvm_1_1te_1_1Split.html" title="Managed reference to SplitNode. ">Split</a>, <a class="el" href="classtvm_1_1te_1_1F [...]
+<tr id="row_1_10_12_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1IterVarRelationNode.html" target="_self">IterVarRelationNode</a></td><td class="desc">Base node of iteration var </td></tr>
+<tr id="row_1_10_13_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1Operation.html" target="_self">Operation</a></td><td class="desc"><a class="el" href="classtvm_1_1te_1_1Operation.html" title="Operation that produces tensors. ">Operation</a> that produces tensors </td></tr>
+<tr id="row_1_10_14_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1OperationNode.html" target="_self">OperationNode</a></td><td class="desc">Base class of all operation nodes </td></tr>
+<tr id="row_1_10_15_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1PlaceholderOp.html" target="_self">PlaceholderOp</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1PlaceholderOpNode.html" title="A placeholder op represents an input placeholder. ">PlaceholderOpNode</a> </td></tr>
+<tr id="row_1_10_16_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1PlaceholderOpNode.html" target="_self">PlaceholderOpNode</a></td><td class="desc">A placeholder op represents an input placeholder </td></tr>
+<tr id="row_1_10_17_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1Rebase.html" target="_self">Rebase</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1RebaseNode.html" title="Rebase the iteration to make min to be 0. This is useful to normalize the Schedule to make every leaf...">RebaseNode</a> < [...]
+<tr id="row_1_10_18_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1RebaseNode.html" target="_self">RebaseNode</a></td><td class="desc"><a class="el" href="classtvm_1_1te_1_1Rebase.html" title="Managed reference to RebaseNode. ">Rebase</a> the iteration to make min to be 0. This is useful to normalize the <a class="el" href="classt [...]
+<tr id="row_1_10_19_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1ScanOp.html" target="_self">ScanOp</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1ScanOpNode.html" title="Symbolic scan. ">ScanOpNode</a> </td></tr>
+<tr id="row_1_10_20_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1ScanOpNode.html" target="_self">ScanOpNode</a></td><td class="desc">Symbolic scan </td></tr>
+<tr id="row_1_10_21_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1Schedule.html" target="_self">Schedule</a></td><td class="desc">Global schedule container For operations and all the operations they depend on. The schedule per <a class="el" href="classtvm_1_1te_1_1Operation.html" title="Operation that produces tensors. ">Operatio [...]
+<tr id="row_1_10_22_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1ScheduleNode.html" target="_self">ScheduleNode</a></td><td class="desc">Node container for schedule </td></tr>
+<tr id="row_1_10_23_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1Singleton.html" target="_self">Singleton</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1SingletonNode.html" title="Singleton iterator [0, 1) ">SingletonNode</a> </td></tr>
+<tr id="row_1_10_24_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1SingletonNode.html" target="_self">SingletonNode</a></td><td class="desc"><a class="el" href="classtvm_1_1te_1_1Singleton.html" title="Managed reference to SingletonNode. ">Singleton</a> iterator [0, 1) </td></tr>
+<tr id="row_1_10_25_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1SpecializedCondition.html" target="_self">SpecializedCondition</a></td><td class="desc">Specialized condition to enable op specialization </td></tr>
+<tr id="row_1_10_26_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1SpecializedConditionNode.html" target="_self">SpecializedConditionNode</a></td><td class="desc">Container for specialization conditions </td></tr>
+<tr id="row_1_10_27_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1Split.html" target="_self">Split</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1SplitNode.html" title="Split the parent domain into product of outer and iter. ">SplitNode</a> </td></tr>
+<tr id="row_1_10_28_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1SplitNode.html" target="_self">SplitNode</a></td><td class="desc"><a class="el" href="classtvm_1_1te_1_1Split.html" title="Managed reference to SplitNode. ">Split</a> the parent domain into product of outer and iter </td></tr>
+<tr id="row_1_10_29_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1Stage.html" target="_self">Stage</a></td><td class="desc"><a class="el" href="classtvm_1_1te_1_1Stage.html" title="Stage, contains scheduling for a stage of computation. ">Stage</a>, contains scheduling for a stage of computation </td></tr>
+<tr id="row_1_10_30_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1StageNode.html" target="_self">StageNode</a></td><td class="desc"><a class="el" href="classtvm_1_1te_1_1Stage.html" title="Stage, contains scheduling for a stage of computation. ">Stage</a> </td></tr>
+<tr id="row_1_10_31_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span id="arr_1_10_31_" class="arrow" onclick="toggleFolder('1_10_31_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1Tensor.html" target="_self">Tensor</a></td><td class="desc"><a class="el" href="classtvm_1_1te_1_1Tensor.html" title="Tensor structure representing a possible input, or intermedi [...]
+<tr id="row_1_10_31_0_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1Tensor_1_1Slice.html" target="_self">Slice</a></td><td class="desc">Data structure to represent a slice that fixes first k coordinates. This is used to enable syntax sugar of <a class="el" href="classtvm_1_1te_1_1Tensor.html" title="Tensor structure representing [...]
+<tr id="row_1_10_32_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1TensorComputeOp.html" target="_self">TensorComputeOp</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1TensorComputeOpNode.html" title="A TenorCompute op that compute a tensor with an tensor intrinsic. ">TensorComputeOpNode</a> </td></tr>
+<tr id="row_1_10_33_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1TensorComputeOpNode.html" target="_self">TensorComputeOpNode</a></td><td class="desc">A TenorCompute op that compute a tensor with an tensor intrinsic </td></tr>
+<tr id="row_1_10_34_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1te_1_1TensorDom.html" target="_self">TensorDom</a></td><td class="desc">Temporary data structure to store union of bounds of each axis of <a class="el" href="classtvm_1_1te_1_1Tensor.html" title="Tensor structure representing a possible input, or intermediate computatio [...]
+<tr id="row_1_10_35_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1TensorIntrin.html" target="_self">TensorIntrin</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1TensorIntrinNode.html" title="Node to represent a Tensor intrinsic operator. ">TensorIntrinNode</a> </td></tr>
+<tr id="row_1_10_36_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1TensorIntrinCall.html" target="_self">TensorIntrinCall</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1te_1_1TensorIntrinCallNode.html">TensorIntrinCallNode</a> </td></tr>
+<tr id="row_1_10_37_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1TensorIntrinCallNode.html" target="_self">TensorIntrinCallNode</a></td><td class="desc"></td></tr>
+<tr id="row_1_10_38_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1TensorIntrinNode.html" target="_self">TensorIntrinNode</a></td><td class="desc">Node to represent a <a class="el" href="classtvm_1_1te_1_1Tensor.html" title="Tensor structure representing a possible input, or intermediate computation result. ">Tensor</a> intrinsic [...]
+<tr id="row_1_10_39_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1TensorNode.html" target="_self">TensorNode</a></td><td class="desc">Node to represent a tensor </td></tr>
+<tr id="row_1_10_40_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1Transform.html" target="_self">Transform</a></td><td class="desc"></td></tr>
+<tr id="row_1_10_41_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1te_1_1TransformNode.html" target="_self">TransformNode</a></td><td class="desc"><a class="el" href="classtvm_1_1te_1_1Transform.html">Transform</a> iterator according to some arbitrary expression </td></tr>
+<tr id="row_1_11_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span id="arr_1_11_" class="arrow" onclick="toggleFolder('1_11_')">►</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacetvm_1_1tir.html" target="_self">tir</a></td><td class="desc"></td></tr>
+<tr id="row_1_11_0_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span id="arr_1_11_0_" class="arrow" onclick="toggleFolder('1_11_0_')">►</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacetvm_1_1tir_1_1usmp.html" target="_self">usmp</a></td><td class="desc"></td></tr>
+<tr id="row_1_11_0_0_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span id="arr_1_11_0_0_" class="arrow" onclick="toggleFolder('1_11_0_0_')">►</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacetvm_1_1tir_1_1usmp_1_1algo.html" target="_self">algo</a></td><td class="desc"></td></tr>
+<tr id="row_1_11_0_0_0_" class="even" style="display:none;"><td class="entry"><span style="width:80px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1usmp_1_1algo_1_1GreedyBase.html" target="_self">GreedyBase</a></td><td class="desc">This is the base class for Greedy Algorithms where the sorting is specialized in the extended classes based on the greedy criteria </td></tr>
+<tr id="row_1_11_0_1_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1usmp_1_1AllocatedPoolInfo.html" target="_self">AllocatedPoolInfo</a></td><td class="desc"></td></tr>
+<tr id="row_1_11_0_2_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1tir_1_1usmp_1_1AllocatedPoolInfoNode.html" target="_self">AllocatedPoolInfoNode</a></td><td class="desc">This object contains information post-allocation for <a class="el" href="classtvm_1_1PoolInfo.html" title="Base class for WorkspacePoolInfo and ConstantPoolInfo. "> [...]
+<tr id="row_1_11_0_3_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1usmp_1_1BufferInfo.html" target="_self">BufferInfo</a></td><td class="desc"></td></tr>
+<tr id="row_1_11_0_4_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1usmp_1_1BufferInfoAnalysis.html" target="_self">BufferInfoAnalysis</a></td><td class="desc"></td></tr>
+<tr id="row_1_11_0_5_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1tir_1_1usmp_1_1BufferInfoAnalysisNode.html" target="_self">BufferInfoAnalysisNode</a></td><td class="desc">This is a composite node that is produced by extract_buffer_info analysis pass that contains useful global information that could be useful for memory planning al [...]
+<tr id="row_1_11_0_6_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1tir_1_1usmp_1_1BufferInfoNode.html" target="_self">BufferInfoNode</a></td><td class="desc">Describes an abstract memory buffer that will get allocated inside a pool. The actual memory buffer in represented by <a class="el" href="structtvm_1_1tir_1_1usmp_1_1PoolAllocati [...]
+<tr id="row_1_11_0_7_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1usmp_1_1PoolAllocation.html" target="_self">PoolAllocation</a></td><td class="desc"></td></tr>
+<tr id="row_1_11_0_8_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1tir_1_1usmp_1_1PoolAllocationNode.html" target="_self">PoolAllocationNode</a></td><td class="desc">The pool allocation produced after the USMP algorithm </td></tr>
+<tr id="row_1_11_1_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Add.html" target="_self">Add</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1AddNode.html" title="a + b ">AddNode</a> </td></tr>
+<tr id="row_1_11_2_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1AddNode.html" target="_self">AddNode</a></td><td class="desc"><ul>
<li>b </li>
</ul>
</td></tr>
-<tr id="row_1_10_3_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Allocate.html" target="_self">Allocate</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1AllocateNode.html" title="Allocate a buffer that can be used in body. ">AllocateNode</a> </td></tr>
-<tr id="row_1_10_4_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1AllocateConst.html" target="_self">AllocateConst</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1AllocateConstNode.html" title="Allocate a buffer that can be used in body. ">AllocateConstNode</a> </td></tr>
-<tr id="row_1_10_5_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1AllocateConstNode.html" target="_self">AllocateConstNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Allocate.html" title="Managed reference to AllocateNode. ">Allocate</a> a buffer that can be used in body </td></tr>
-<tr id="row_1_10_6_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1AllocateNode.html" target="_self">AllocateNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Allocate.html" title="Managed reference to AllocateNode. ">Allocate</a> a buffer that can be used in body </td></tr>
-<tr id="row_1_10_7_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1And.html" target="_self">And</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1AndNode.html" title="a && b ">AndNode</a> </td></tr>
-<tr id="row_1_10_8_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1AndNode.html" target="_self">AndNode</a></td><td class="desc">&& b </td></tr>
-<tr id="row_1_10_9_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Any.html" target="_self">Any</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1AnyNode.html" title="Any shape. ">AnyNode</a> </td></tr>
-<tr id="row_1_10_10_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1AnyNode.html" target="_self">AnyNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Any.html" title="Managed reference to AnyNode. ">Any</a> shape </td></tr>
-<tr id="row_1_10_11_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1AssertStmt.html" target="_self">AssertStmt</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1AssertStmtNode.html" title="Assert condition, if an error occurs, return the error message. ">AssertStmtNode</a> </td></tr>
-<tr id="row_1_10_12_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1AssertStmtNode.html" target="_self">AssertStmtNode</a></td><td class="desc">Assert condition, if an error occurs, return the error message </td></tr>
-<tr id="row_1_10_13_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1AttrStmt.html" target="_self">AttrStmt</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1AttrStmtNode.html" title="Define certain auxiliary attribute for the body to be a symbolic value. This provide auxiliary inform...">AttrStmt [...]
-<tr id="row_1_10_14_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1AttrStmtNode.html" target="_self">AttrStmtNode</a></td><td class="desc">Define certain auxiliary attribute for the body to be a symbolic value. This provide auxiliary information for IR passes that transforms body </td></tr>
-<tr id="row_1_10_15_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BijectiveLayout.html" target="_self">BijectiveLayout</a></td><td class="desc">Bijective function mapping for data layout transformation. Given two <a class="el" href="classtvm_1_1tir_1_1Layout.html" title="Managed reference to LayoutNode. ">Layout</a>, <a class="e [...]
-<tr id="row_1_10_16_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BijectiveLayoutNode.html" target="_self">BijectiveLayoutNode</a></td><td class="desc"></td></tr>
-<tr id="row_1_10_17_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BinaryOpNode.html" target="_self">BinaryOpNode</a></td><td class="desc">Base template to implement binary ops </td></tr>
-<tr id="row_1_10_18_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Block.html" target="_self">Block</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1BlockNode.html" title="A block is a basic schedule unit in TIR. ">BlockNode</a> </td></tr>
-<tr id="row_1_10_19_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1tir_1_1BlockInfo.html" target="_self">BlockInfo</a></td><td class="desc">The information about a TensorIR block, it contains two categories of information 1) Info on the block scope rooted at a specific block, including dependency tracking, flags indicating if the scope [...]
-<tr id="row_1_10_20_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BlockNode.html" target="_self">BlockNode</a></td><td class="desc">A block is a basic schedule unit in TIR </td></tr>
-<tr id="row_1_10_21_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BlockRealize.html" target="_self">BlockRealize</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1BlockRealizeNode.html" title="A block realization node represents execution of the block at the binding values. ...">BlockRealizeNod [...]
-<tr id="row_1_10_22_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BlockRealizeNode.html" target="_self">BlockRealizeNode</a></td><td class="desc">A block realization node represents execution of the block at the binding values </td></tr>
-<tr id="row_1_10_23_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BlockRV.html" target="_self">BlockRV</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1BlockRVNode.html" title="A random variable that evaluates to a TensorIR block. ">BlockRVNode</a> </td></tr>
-<tr id="row_1_10_24_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BlockRVNode.html" target="_self">BlockRVNode</a></td><td class="desc">A random variable that evaluates to a TensorIR block </td></tr>
-<tr id="row_1_10_25_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BlockScope.html" target="_self">BlockScope</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1BlockScopeNode.html" title="An object with 1-to-1 correspondence with each block reference in the sref tree. This data structure ...">Bl [...]
-<tr id="row_1_10_26_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BlockScopeNode.html" target="_self">BlockScopeNode</a></td><td class="desc">An object with 1-to-1 correspondence with each block reference in the sref tree. This data structure is used to track the producer-consumer dependencies between blocks. <a class="el" href= [...]
-<tr id="row_1_10_27_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Broadcast.html" target="_self">Broadcast</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1BroadcastNode.html" title="Create a vector where all the elements are value. ">BroadcastNode</a> </td></tr>
-<tr id="row_1_10_28_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BroadcastNode.html" target="_self">BroadcastNode</a></td><td class="desc">Create a vector where all the elements are value </td></tr>
-<tr id="row_1_10_29_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Buffer.html" target="_self">Buffer</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Buffer.html" title="Buffer is a symbolic n-darray structure. It is a composition of primitive symbolic types...">Buffer</a> is a symbolic n-darray structure. It is a [...]
-<tr id="row_1_10_30_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BufferLoad.html" target="_self">BufferLoad</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1BufferLoadNode.html" title="Load value from the high dimension buffer. ">BufferLoadNode</a> </td></tr>
-<tr id="row_1_10_31_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BufferLoadNode.html" target="_self">BufferLoadNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Load.html" title="Managed reference to LoadNode. ">Load</a> value from the high dimension buffer </td></tr>
-<tr id="row_1_10_32_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BufferNode.html" target="_self">BufferNode</a></td><td class="desc">Node to represent a buffer </td></tr>
-<tr id="row_1_10_33_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BufferRealize.html" target="_self">BufferRealize</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1BufferRealizeNode.html" title="Annotate the region where the buffer need to be read and write in the body. We only need to allocat [...]
-<tr id="row_1_10_34_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BufferRealizeNode.html" target="_self">BufferRealizeNode</a></td><td class="desc">Annotate the region where the buffer need to be read and write in the body. We only need to allocate the space for the corresponding region </td></tr>
-<tr id="row_1_10_35_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BufferRegion.html" target="_self">BufferRegion</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1BufferRegionNode.html" title="Representing the region of multi-dimensional buffer access. ">BufferRegionNode</a> </td></tr>
-<tr id="row_1_10_36_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BufferRegionNode.html" target="_self">BufferRegionNode</a></td><td class="desc">Representing the region of multi-dimensional buffer access </td></tr>
-<tr id="row_1_10_37_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BufferStore.html" target="_self">BufferStore</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1BufferStoreNode.html" title="Store value to the high dimension buffer. ">BufferStoreNode</a> </td></tr>
-<tr id="row_1_10_38_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BufferStoreNode.html" target="_self">BufferStoreNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Store.html" title="Managed reference to StoreNode. ">Store</a> value to the high dimension buffer </td></tr>
-<tr id="row_1_10_39_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Call.html" target="_self">Call</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1CallNode.html" title="Call node. ">CallNode</a> </td></tr>
-<tr id="row_1_10_40_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1CallNode.html" target="_self">CallNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Call.html" title="Managed reference to CallNode. ">Call</a> node </td></tr>
-<tr id="row_1_10_41_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Cast.html" target="_self">Cast</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1CastNode.html" title="Cast value from one data type to another. ">CastNode</a> </td></tr>
-<tr id="row_1_10_42_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1CastNode.html" target="_self">CastNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Cast.html" title="Managed reference to CastNode. ">Cast</a> value from one data type to another </td></tr>
-<tr id="row_1_10_43_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1CmpOpNode.html" target="_self">CmpOpNode</a></td><td class="desc">Base template to implement comparison ops </td></tr>
-<tr id="row_1_10_44_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1CommReducer.html" target="_self">CommReducer</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1CommReducerNode.html" title="A commutative reducer node to represent a commutative binary operator with identity element...">CommReduc [...]
-<tr id="row_1_10_45_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1CommReducerNode.html" target="_self">CommReducerNode</a></td><td class="desc">A commutative reducer node to represent a commutative binary operator with identity element </td></tr>
-<tr id="row_1_10_46_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1DataProducer.html" target="_self">DataProducer</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1DataProducerNode.html" title="Base node for data producers. ">DataProducerNode</a> </td></tr>
-<tr id="row_1_10_47_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1DataProducerNode.html" target="_self">DataProducerNode</a></td><td class="desc">Base node for data producers </td></tr>
-<tr id="row_1_10_48_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Dependency.html" target="_self">Dependency</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1DependencyNode.html" title="A tuple (src, dst, kind) representing certain types of dependency. For example, (A, B, kRAW) means block B d [...]
-<tr id="row_1_10_49_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1DependencyNode.html" target="_self">DependencyNode</a></td><td class="desc">A tuple (src, dst, kind) representing certain types of dependency. <a class="el" href="classtvm_1_1tir_1_1For.html" title="Managed reference to ForNode. ">For</a> example, (A, B, kRAW) mea [...]
-<tr id="row_1_10_50_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Div.html" target="_self">Div</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1DivNode.html" title="a / b in the C semnatics. ">DivNode</a> </td></tr>
-<tr id="row_1_10_51_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1DivNode.html" target="_self">DivNode</a></td><td class="desc">/ b in the C semnatics </td></tr>
-<tr id="row_1_10_52_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1EQ.html" target="_self">EQ</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1EQNode.html" title="a == b ">EQNode</a> </td></tr>
-<tr id="row_1_10_53_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1EQNode.html" target="_self">EQNode</a></td><td class="desc">== b </td></tr>
-<tr id="row_1_10_54_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Evaluate.html" target="_self">Evaluate</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1EvaluateNode.html" title="Evaluates an expression. This is mostly used for putting a Call node into Stmt. ">EvaluateNode</a> </td></tr>
-<tr id="row_1_10_55_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1EvaluateNode.html" target="_self">EvaluateNode</a></td><td class="desc">Evaluates an expression. This is mostly used for putting a <a class="el" href="classtvm_1_1tir_1_1Call.html" title="Managed reference to CallNode. ">Call</a> node into <a class="el" href="clas [...]
-<tr id="row_1_10_56_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1tir_1_1ExprDeepEqual.html" target="_self">ExprDeepEqual</a></td><td class="desc">Compare two expressions recursively and check if they are equal to each other without var remapping </td></tr>
-<tr id="row_1_10_57_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ExprFunctor.html" target="_self">ExprFunctor</a></td><td class="desc">A dynamical functor that dispatches on in the first Expr argument. You can use this as a more powerful Visitor, since it allows you to define function signatures of Visit Function </td></tr>
-<tr id="row_1_10_58_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ExprFunctor_3_01R_07const_01PrimExpr_01_6n_00_01Args_8_8_8_08_4.html" target="_self">ExprFunctor< R(const PrimExpr &n, Args...)></a></td><td class="desc"></td></tr>
-<tr id="row_1_10_59_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ExprMutator.html" target="_self">ExprMutator</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1ExprMutator.html" title="ExprMutator that mutates expressions. ">ExprMutator</a> that mutates expressions </td></tr>
-<tr id="row_1_10_60_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ExprVisitor.html" target="_self">ExprVisitor</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1ExprVisitor.html" title="ExprVisitor. ">ExprVisitor</a> </td></tr>
-<tr id="row_1_10_61_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1FloorDiv.html" target="_self">FloorDiv</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1FloorDivNode.html" title="Floor division, floor(a/b) ">FloorDivNode</a> </td></tr>
-<tr id="row_1_10_62_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1FloorDivNode.html" target="_self">FloorDivNode</a></td><td class="desc">Floor division, floor(a/b) </td></tr>
-<tr id="row_1_10_63_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1FloorMod.html" target="_self">FloorMod</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1FloorModNode.html" title="The remainder of the floordiv. ">FloorModNode</a> </td></tr>
-<tr id="row_1_10_64_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1FloorModNode.html" target="_self">FloorModNode</a></td><td class="desc">The remainder of the floordiv </td></tr>
-<tr id="row_1_10_65_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1For.html" target="_self">For</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1ForNode.html" title="A for loop, with poissible type annotations. ">ForNode</a> </td></tr>
-<tr id="row_1_10_66_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ForNode.html" target="_self">ForNode</a></td><td class="desc">A for loop, with poissible type annotations </td></tr>
-<tr id="row_1_10_67_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1GE.html" target="_self">GE</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1GENode.html" title="a >= b ">GENode</a> </td></tr>
-<tr id="row_1_10_68_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1GENode.html" target="_self">GENode</a></td><td class="desc">>= b </td></tr>
-<tr id="row_1_10_69_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1GT.html" target="_self">GT</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1GTNode.html" title="a > b ">GTNode</a> </td></tr>
-<tr id="row_1_10_70_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1GTNode.html" target="_self">GTNode</a></td><td class="desc">> b </td></tr>
-<tr id="row_1_10_71_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1IfThenElse.html" target="_self">IfThenElse</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1IfThenElseNode.html" title="IfThenElse statment. ">IfThenElseNode</a> </td></tr>
-<tr id="row_1_10_72_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1IfThenElseNode.html" target="_self">IfThenElseNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1IfThenElse.html" title="Managed reference to IfThenElseNode. ">IfThenElse</a> statment </td></tr>
-<tr id="row_1_10_73_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1IndexMap.html" target="_self">IndexMap</a></td><td class="desc"></td></tr>
-<tr id="row_1_10_74_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1IndexMapNode.html" target="_self">IndexMapNode</a></td><td class="desc">Defines a mapping between two representations of indices into a buffer </td></tr>
-<tr id="row_1_10_75_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Instruction.html" target="_self">Instruction</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1InstructionNode.html" title="Schedule instructions each corresponds to a schedule primitive. ">InstructionNode</a> </td></tr>
-<tr id="row_1_10_76_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1InstructionKind.html" target="_self">InstructionKind</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1InstructionKindNode.html" title="Kind of an instruction, e.g. Split, Reorder, etc. Besides the name, every kind of instruction [...]
-<tr id="row_1_10_77_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1InstructionKindNode.html" target="_self">InstructionKindNode</a></td><td class="desc">Kind of an instruction, e.g. Split, Reorder, etc. Besides the name, every kind of instruction has its own properties, including: 1) A boolean indicating if the instruction is pur [...]
-<tr id="row_1_10_78_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1InstructionKindRegEntry.html" target="_self">InstructionKindRegEntry</a></td><td class="desc">An entry in the registry of <a class="el" href="classtvm_1_1tir_1_1InstructionKind.html" title="Managed reference to InstructionKindNode. ">InstructionKind</a> </td></tr>
-<tr id="row_1_10_79_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1InstructionNode.html" target="_self">InstructionNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Schedule.html" title="Managed reference to ScheduleNode. ">Schedule</a> instructions each corresponds to a schedule primitive </td></tr>
-<tr id="row_1_10_80_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1IterVar.html" target="_self">IterVar</a></td><td class="desc">Iteration Variable, represents an iteration over an integer interval </td></tr>
-<tr id="row_1_10_81_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1IterVarNode.html" target="_self">IterVarNode</a></td><td class="desc">An iteration variable representing an iteration over a one dimensional interval </td></tr>
-<tr id="row_1_10_82_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Layout.html" target="_self">Layout</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1LayoutNode.html" title="Layout is to describe how data is organized within an N-dimention tensor. It is composed of upper cas...">LayoutNode</a> [...]
-<tr id="row_1_10_83_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1LayoutAxis.html" target="_self">LayoutAxis</a></td><td class="desc"></td></tr>
-<tr id="row_1_10_84_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1LayoutNode.html" target="_self">LayoutNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Layout.html" title="Managed reference to LayoutNode. ">Layout</a> is to describe how data is organized within an N-dimention tensor. It is composed of upper [...]
-<tr id="row_1_10_85_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1LE.html" target="_self">LE</a></td><td class="desc">Managed reference to <a class="el" href="structtvm_1_1tir_1_1LENode.html" title="a <= b ">LENode</a> </td></tr>
-<tr id="row_1_10_86_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1tir_1_1LENode.html" target="_self">LENode</a></td><td class="desc"><= b </td></tr>
-<tr id="row_1_10_87_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Let.html" target="_self">Let</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1LetNode.html" title="Let binding. Bind var to value then evaluate body. ">LetNode</a> </td></tr>
-<tr id="row_1_10_88_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1LetNode.html" target="_self">LetNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Let.html" title="Managed reference to LetNode. ">Let</a> binding. Bind var to value then evaluate body </td></tr>
-<tr id="row_1_10_89_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1LetStmt.html" target="_self">LetStmt</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1LetStmtNode.html" title="Let binding, bind var to value, then run body. ">LetStmtNode</a> </td></tr>
-<tr id="row_1_10_90_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1LetStmtNode.html" target="_self">LetStmtNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Let.html" title="Managed reference to LetNode. ">Let</a> binding, bind var to value, then run body </td></tr>
-<tr id="row_1_10_91_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Load.html" target="_self">Load</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1LoadNode.html" title="Load the value from buffer_var. ">LoadNode</a> </td></tr>
-<tr id="row_1_10_92_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1LoadNode.html" target="_self">LoadNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Load.html" title="Managed reference to LoadNode. ">Load</a> the value from buffer_var </td></tr>
-<tr id="row_1_10_93_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1LoopRV.html" target="_self">LoopRV</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1LoopRVNode.html" title="A random variable that evaluates to a TensorIR for loop. ">LoopRVNode</a> </td></tr>
-<tr id="row_1_10_94_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1LoopRVNode.html" target="_self">LoopRVNode</a></td><td class="desc">A random variable that evaluates to a TensorIR for loop </td></tr>
-<tr id="row_1_10_95_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1LT.html" target="_self">LT</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1LTNode.html" title="a < b ">LTNode</a> </td></tr>
-<tr id="row_1_10_96_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1LTNode.html" target="_self">LTNode</a></td><td class="desc">< b </td></tr>
-<tr id="row_1_10_97_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1MatchBufferRegion.html" target="_self">MatchBufferRegion</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1MatchBufferRegionNode.html" title="Match introduces a constraint that the source buffer region can be remapped to the data [...]
-<tr id="row_1_10_98_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1MatchBufferRegionNode.html" target="_self">MatchBufferRegionNode</a></td><td class="desc">Match introduces a constraint that the source buffer region can be remapped to the data layout specified by the buffer field. The constraint can be checked in later part of l [...]
-<tr id="row_1_10_99_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Max.html" target="_self">Max</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1MaxNode.html" title="max(a, b) ">MaxNode</a> </td></tr>
-<tr id="row_1_10_100_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1MaxNode.html" target="_self">MaxNode</a></td><td class="desc">Max(a, b) </td></tr>
-<tr id="row_1_10_101_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Min.html" target="_self">Min</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1MinNode.html" title="min(a, b) ">MinNode</a> </td></tr>
-<tr id="row_1_10_102_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1MinNode.html" target="_self">MinNode</a></td><td class="desc">Min(a, b) </td></tr>
-<tr id="row_1_10_103_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Mod.html" target="_self">Mod</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1ModNode.html" title="a % b in the C semnatics. ">ModNode</a> </td></tr>
-<tr id="row_1_10_104_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ModNode.html" target="_self">ModNode</a></td><td class="desc">% b in the C semnatics </td></tr>
-<tr id="row_1_10_105_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Mul.html" target="_self">Mul</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1MulNode.html" title="a * b ">MulNode</a> </td></tr>
-<tr id="row_1_10_106_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1MulNode.html" target="_self">MulNode</a></td><td class="desc"><ul>
+<tr id="row_1_11_3_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Allocate.html" target="_self">Allocate</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1AllocateNode.html" title="Allocate a buffer that can be used in body. ">AllocateNode</a> </td></tr>
+<tr id="row_1_11_4_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1AllocateConst.html" target="_self">AllocateConst</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1AllocateConstNode.html" title="Allocate a buffer that can be used in body. ">AllocateConstNode</a> </td></tr>
+<tr id="row_1_11_5_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1AllocateConstNode.html" target="_self">AllocateConstNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Allocate.html" title="Managed reference to AllocateNode. ">Allocate</a> a buffer that can be used in body </td></tr>
+<tr id="row_1_11_6_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1AllocateNode.html" target="_self">AllocateNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Allocate.html" title="Managed reference to AllocateNode. ">Allocate</a> a buffer that can be used in body </td></tr>
+<tr id="row_1_11_7_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1And.html" target="_self">And</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1AndNode.html" title="a && b ">AndNode</a> </td></tr>
+<tr id="row_1_11_8_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1AndNode.html" target="_self">AndNode</a></td><td class="desc">&& b </td></tr>
+<tr id="row_1_11_9_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Any.html" target="_self">Any</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1AnyNode.html" title="Any shape. ">AnyNode</a> </td></tr>
+<tr id="row_1_11_10_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1AnyNode.html" target="_self">AnyNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Any.html" title="Managed reference to AnyNode. ">Any</a> shape </td></tr>
+<tr id="row_1_11_11_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1AssertStmt.html" target="_self">AssertStmt</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1AssertStmtNode.html" title="Assert condition, if an error occurs, return the error message. ">AssertStmtNode</a> </td></tr>
+<tr id="row_1_11_12_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1AssertStmtNode.html" target="_self">AssertStmtNode</a></td><td class="desc">Assert condition, if an error occurs, return the error message </td></tr>
+<tr id="row_1_11_13_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1AttrStmt.html" target="_self">AttrStmt</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1AttrStmtNode.html" title="Define certain auxiliary attribute for the body to be a symbolic value. This provide auxiliary inform...">AttrStmt [...]
+<tr id="row_1_11_14_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1AttrStmtNode.html" target="_self">AttrStmtNode</a></td><td class="desc">Define certain auxiliary attribute for the body to be a symbolic value. This provide auxiliary information for IR passes that transforms body </td></tr>
+<tr id="row_1_11_15_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BijectiveLayout.html" target="_self">BijectiveLayout</a></td><td class="desc">Bijective function mapping for data layout transformation. Given two <a class="el" href="classtvm_1_1tir_1_1Layout.html" title="Managed reference to LayoutNode. ">Layout</a>, <a class="e [...]
+<tr id="row_1_11_16_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BijectiveLayoutNode.html" target="_self">BijectiveLayoutNode</a></td><td class="desc"></td></tr>
+<tr id="row_1_11_17_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BinaryOpNode.html" target="_self">BinaryOpNode</a></td><td class="desc">Base template to implement binary ops </td></tr>
+<tr id="row_1_11_18_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Block.html" target="_self">Block</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1BlockNode.html" title="A block is a basic schedule unit in TIR. ">BlockNode</a> </td></tr>
+<tr id="row_1_11_19_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1tir_1_1BlockInfo.html" target="_self">BlockInfo</a></td><td class="desc">The information about a TensorIR block, it contains two categories of information 1) Info on the block scope rooted at a specific block, including dependency tracking, flags indicating if the scope [...]
+<tr id="row_1_11_20_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BlockNode.html" target="_self">BlockNode</a></td><td class="desc">A block is a basic schedule unit in TIR </td></tr>
+<tr id="row_1_11_21_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BlockRealize.html" target="_self">BlockRealize</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1BlockRealizeNode.html" title="A block realization node represents execution of the block at the binding values. ...">BlockRealizeNod [...]
+<tr id="row_1_11_22_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BlockRealizeNode.html" target="_self">BlockRealizeNode</a></td><td class="desc">A block realization node represents execution of the block at the binding values </td></tr>
+<tr id="row_1_11_23_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BlockRV.html" target="_self">BlockRV</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1BlockRVNode.html" title="A random variable that evaluates to a TensorIR block. ">BlockRVNode</a> </td></tr>
+<tr id="row_1_11_24_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BlockRVNode.html" target="_self">BlockRVNode</a></td><td class="desc">A random variable that evaluates to a TensorIR block </td></tr>
+<tr id="row_1_11_25_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BlockScope.html" target="_self">BlockScope</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1BlockScopeNode.html" title="An object with 1-to-1 correspondence with each block reference in the sref tree. This data structure ...">Bl [...]
+<tr id="row_1_11_26_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BlockScopeNode.html" target="_self">BlockScopeNode</a></td><td class="desc">An object with 1-to-1 correspondence with each block reference in the sref tree. This data structure is used to track the producer-consumer dependencies between blocks. <a class="el" href= [...]
+<tr id="row_1_11_27_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Broadcast.html" target="_self">Broadcast</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1BroadcastNode.html" title="Create a vector where all the elements are value. ">BroadcastNode</a> </td></tr>
+<tr id="row_1_11_28_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BroadcastNode.html" target="_self">BroadcastNode</a></td><td class="desc">Create a vector where all the elements are value </td></tr>
+<tr id="row_1_11_29_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Buffer.html" target="_self">Buffer</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Buffer.html" title="Buffer is a symbolic n-darray structure. It is a composition of primitive symbolic types...">Buffer</a> is a symbolic n-darray structure. It is a [...]
+<tr id="row_1_11_30_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BufferLoad.html" target="_self">BufferLoad</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1BufferLoadNode.html" title="Load value from the high dimension buffer. ">BufferLoadNode</a> </td></tr>
+<tr id="row_1_11_31_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BufferLoadNode.html" target="_self">BufferLoadNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Load.html" title="Managed reference to LoadNode. ">Load</a> value from the high dimension buffer </td></tr>
+<tr id="row_1_11_32_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BufferNode.html" target="_self">BufferNode</a></td><td class="desc">Node to represent a buffer </td></tr>
+<tr id="row_1_11_33_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BufferRealize.html" target="_self">BufferRealize</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1BufferRealizeNode.html" title="Annotate the region where the buffer need to be read and write in the body. We only need to allocat [...]
+<tr id="row_1_11_34_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BufferRealizeNode.html" target="_self">BufferRealizeNode</a></td><td class="desc">Annotate the region where the buffer need to be read and write in the body. We only need to allocate the space for the corresponding region </td></tr>
+<tr id="row_1_11_35_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BufferRegion.html" target="_self">BufferRegion</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1BufferRegionNode.html" title="Representing the region of multi-dimensional buffer access. ">BufferRegionNode</a> </td></tr>
+<tr id="row_1_11_36_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BufferRegionNode.html" target="_self">BufferRegionNode</a></td><td class="desc">Representing the region of multi-dimensional buffer access </td></tr>
+<tr id="row_1_11_37_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BufferStore.html" target="_self">BufferStore</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1BufferStoreNode.html" title="Store value to the high dimension buffer. ">BufferStoreNode</a> </td></tr>
+<tr id="row_1_11_38_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1BufferStoreNode.html" target="_self">BufferStoreNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Store.html" title="Managed reference to StoreNode. ">Store</a> value to the high dimension buffer </td></tr>
+<tr id="row_1_11_39_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Call.html" target="_self">Call</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1CallNode.html" title="Call node. ">CallNode</a> </td></tr>
+<tr id="row_1_11_40_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1CallNode.html" target="_self">CallNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Call.html" title="Managed reference to CallNode. ">Call</a> node </td></tr>
+<tr id="row_1_11_41_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Cast.html" target="_self">Cast</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1CastNode.html" title="Cast value from one data type to another. ">CastNode</a> </td></tr>
+<tr id="row_1_11_42_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1CastNode.html" target="_self">CastNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Cast.html" title="Managed reference to CastNode. ">Cast</a> value from one data type to another </td></tr>
+<tr id="row_1_11_43_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1CmpOpNode.html" target="_self">CmpOpNode</a></td><td class="desc">Base template to implement comparison ops </td></tr>
+<tr id="row_1_11_44_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1CommReducer.html" target="_self">CommReducer</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1CommReducerNode.html" title="A commutative reducer node to represent a commutative binary operator with identity element...">CommReduc [...]
+<tr id="row_1_11_45_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1CommReducerNode.html" target="_self">CommReducerNode</a></td><td class="desc">A commutative reducer node to represent a commutative binary operator with identity element </td></tr>
+<tr id="row_1_11_46_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1DataProducer.html" target="_self">DataProducer</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1DataProducerNode.html" title="Base node for data producers. ">DataProducerNode</a> </td></tr>
+<tr id="row_1_11_47_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1DataProducerNode.html" target="_self">DataProducerNode</a></td><td class="desc">Base node for data producers </td></tr>
+<tr id="row_1_11_48_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Dependency.html" target="_self">Dependency</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1DependencyNode.html" title="A tuple (src, dst, kind) representing certain types of dependency. For example, (A, B, kRAW) means block B d [...]
+<tr id="row_1_11_49_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1DependencyNode.html" target="_self">DependencyNode</a></td><td class="desc">A tuple (src, dst, kind) representing certain types of dependency. <a class="el" href="classtvm_1_1tir_1_1For.html" title="Managed reference to ForNode. ">For</a> example, (A, B, kRAW) mea [...]
+<tr id="row_1_11_50_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Div.html" target="_self">Div</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1DivNode.html" title="a / b in the C semnatics. ">DivNode</a> </td></tr>
+<tr id="row_1_11_51_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1DivNode.html" target="_self">DivNode</a></td><td class="desc">/ b in the C semnatics </td></tr>
+<tr id="row_1_11_52_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1EQ.html" target="_self">EQ</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1EQNode.html" title="a == b ">EQNode</a> </td></tr>
+<tr id="row_1_11_53_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1EQNode.html" target="_self">EQNode</a></td><td class="desc">== b </td></tr>
+<tr id="row_1_11_54_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Evaluate.html" target="_self">Evaluate</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1EvaluateNode.html" title="Evaluates an expression. This is mostly used for putting a Call node into Stmt. ">EvaluateNode</a> </td></tr>
+<tr id="row_1_11_55_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1EvaluateNode.html" target="_self">EvaluateNode</a></td><td class="desc">Evaluates an expression. This is mostly used for putting a <a class="el" href="classtvm_1_1tir_1_1Call.html" title="Managed reference to CallNode. ">Call</a> node into <a class="el" href="clas [...]
+<tr id="row_1_11_56_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1tir_1_1ExprDeepEqual.html" target="_self">ExprDeepEqual</a></td><td class="desc">Compare two expressions recursively and check if they are equal to each other without var remapping </td></tr>
+<tr id="row_1_11_57_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ExprFunctor.html" target="_self">ExprFunctor</a></td><td class="desc">A dynamical functor that dispatches on in the first Expr argument. You can use this as a more powerful Visitor, since it allows you to define function signatures of Visit Function </td></tr>
+<tr id="row_1_11_58_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ExprFunctor_3_01R_07const_01PrimExpr_01_6n_00_01Args_8_8_8_08_4.html" target="_self">ExprFunctor< R(const PrimExpr &n, Args...)></a></td><td class="desc"></td></tr>
+<tr id="row_1_11_59_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ExprMutator.html" target="_self">ExprMutator</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1ExprMutator.html" title="ExprMutator that mutates expressions. ">ExprMutator</a> that mutates expressions </td></tr>
+<tr id="row_1_11_60_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ExprVisitor.html" target="_self">ExprVisitor</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1ExprVisitor.html" title="ExprVisitor. ">ExprVisitor</a> </td></tr>
+<tr id="row_1_11_61_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1FloorDiv.html" target="_self">FloorDiv</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1FloorDivNode.html" title="Floor division, floor(a/b) ">FloorDivNode</a> </td></tr>
+<tr id="row_1_11_62_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1FloorDivNode.html" target="_self">FloorDivNode</a></td><td class="desc">Floor division, floor(a/b) </td></tr>
+<tr id="row_1_11_63_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1FloorMod.html" target="_self">FloorMod</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1FloorModNode.html" title="The remainder of the floordiv. ">FloorModNode</a> </td></tr>
+<tr id="row_1_11_64_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1FloorModNode.html" target="_self">FloorModNode</a></td><td class="desc">The remainder of the floordiv </td></tr>
+<tr id="row_1_11_65_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1For.html" target="_self">For</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1ForNode.html" title="A for loop, with poissible type annotations. ">ForNode</a> </td></tr>
+<tr id="row_1_11_66_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ForNode.html" target="_self">ForNode</a></td><td class="desc">A for loop, with poissible type annotations </td></tr>
+<tr id="row_1_11_67_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1GE.html" target="_self">GE</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1GENode.html" title="a >= b ">GENode</a> </td></tr>
+<tr id="row_1_11_68_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1GENode.html" target="_self">GENode</a></td><td class="desc">>= b </td></tr>
+<tr id="row_1_11_69_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1GT.html" target="_self">GT</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1GTNode.html" title="a > b ">GTNode</a> </td></tr>
+<tr id="row_1_11_70_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1GTNode.html" target="_self">GTNode</a></td><td class="desc">> b </td></tr>
+<tr id="row_1_11_71_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1IfThenElse.html" target="_self">IfThenElse</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1IfThenElseNode.html" title="IfThenElse statment. ">IfThenElseNode</a> </td></tr>
+<tr id="row_1_11_72_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1IfThenElseNode.html" target="_self">IfThenElseNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1IfThenElse.html" title="Managed reference to IfThenElseNode. ">IfThenElse</a> statment </td></tr>
+<tr id="row_1_11_73_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1IndexMap.html" target="_self">IndexMap</a></td><td class="desc"></td></tr>
+<tr id="row_1_11_74_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1IndexMapNode.html" target="_self">IndexMapNode</a></td><td class="desc">Defines a mapping between two representations of indices into a buffer </td></tr>
+<tr id="row_1_11_75_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Instruction.html" target="_self">Instruction</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1InstructionNode.html" title="Schedule instructions each corresponds to a schedule primitive. ">InstructionNode</a> </td></tr>
+<tr id="row_1_11_76_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1InstructionKind.html" target="_self">InstructionKind</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1InstructionKindNode.html" title="Kind of an instruction, e.g. Split, Reorder, etc. Besides the name, every kind of instruction [...]
+<tr id="row_1_11_77_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1InstructionKindNode.html" target="_self">InstructionKindNode</a></td><td class="desc">Kind of an instruction, e.g. Split, Reorder, etc. Besides the name, every kind of instruction has its own properties, including: 1) A boolean indicating if the instruction is pur [...]
+<tr id="row_1_11_78_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1InstructionKindRegEntry.html" target="_self">InstructionKindRegEntry</a></td><td class="desc">An entry in the registry of <a class="el" href="classtvm_1_1tir_1_1InstructionKind.html" title="Managed reference to InstructionKindNode. ">InstructionKind</a> </td></tr>
+<tr id="row_1_11_79_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1InstructionNode.html" target="_self">InstructionNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Schedule.html" title="Managed reference to ScheduleNode. ">Schedule</a> instructions each corresponds to a schedule primitive </td></tr>
+<tr id="row_1_11_80_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1IterVar.html" target="_self">IterVar</a></td><td class="desc">Iteration Variable, represents an iteration over an integer interval </td></tr>
+<tr id="row_1_11_81_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1IterVarNode.html" target="_self">IterVarNode</a></td><td class="desc">An iteration variable representing an iteration over a one dimensional interval </td></tr>
+<tr id="row_1_11_82_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Layout.html" target="_self">Layout</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1LayoutNode.html" title="Layout is to describe how data is organized within an N-dimention tensor. It is composed of upper cas...">LayoutNode</a> [...]
+<tr id="row_1_11_83_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1LayoutAxis.html" target="_self">LayoutAxis</a></td><td class="desc"></td></tr>
+<tr id="row_1_11_84_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1LayoutNode.html" target="_self">LayoutNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Layout.html" title="Managed reference to LayoutNode. ">Layout</a> is to describe how data is organized within an N-dimention tensor. It is composed of upper [...]
+<tr id="row_1_11_85_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1LE.html" target="_self">LE</a></td><td class="desc">Managed reference to <a class="el" href="structtvm_1_1tir_1_1LENode.html" title="a <= b ">LENode</a> </td></tr>
+<tr id="row_1_11_86_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1tir_1_1LENode.html" target="_self">LENode</a></td><td class="desc"><= b </td></tr>
+<tr id="row_1_11_87_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Let.html" target="_self">Let</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1LetNode.html" title="Let binding. Bind var to value then evaluate body. ">LetNode</a> </td></tr>
+<tr id="row_1_11_88_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1LetNode.html" target="_self">LetNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Let.html" title="Managed reference to LetNode. ">Let</a> binding. Bind var to value then evaluate body </td></tr>
+<tr id="row_1_11_89_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1LetStmt.html" target="_self">LetStmt</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1LetStmtNode.html" title="Let binding, bind var to value, then run body. ">LetStmtNode</a> </td></tr>
+<tr id="row_1_11_90_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1LetStmtNode.html" target="_self">LetStmtNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Let.html" title="Managed reference to LetNode. ">Let</a> binding, bind var to value, then run body </td></tr>
+<tr id="row_1_11_91_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Load.html" target="_self">Load</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1LoadNode.html" title="Load the value from buffer_var. ">LoadNode</a> </td></tr>
+<tr id="row_1_11_92_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1LoadNode.html" target="_self">LoadNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Load.html" title="Managed reference to LoadNode. ">Load</a> the value from buffer_var </td></tr>
+<tr id="row_1_11_93_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1LoopRV.html" target="_self">LoopRV</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1LoopRVNode.html" title="A random variable that evaluates to a TensorIR for loop. ">LoopRVNode</a> </td></tr>
+<tr id="row_1_11_94_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1LoopRVNode.html" target="_self">LoopRVNode</a></td><td class="desc">A random variable that evaluates to a TensorIR for loop </td></tr>
+<tr id="row_1_11_95_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1LT.html" target="_self">LT</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1LTNode.html" title="a < b ">LTNode</a> </td></tr>
+<tr id="row_1_11_96_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1LTNode.html" target="_self">LTNode</a></td><td class="desc">< b </td></tr>
+<tr id="row_1_11_97_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1MatchBufferRegion.html" target="_self">MatchBufferRegion</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1MatchBufferRegionNode.html" title="Match introduces a constraint that the source buffer region can be remapped to the data [...]
+<tr id="row_1_11_98_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1MatchBufferRegionNode.html" target="_self">MatchBufferRegionNode</a></td><td class="desc">Match introduces a constraint that the source buffer region can be remapped to the data layout specified by the buffer field. The constraint can be checked in later part of l [...]
+<tr id="row_1_11_99_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Max.html" target="_self">Max</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1MaxNode.html" title="max(a, b) ">MaxNode</a> </td></tr>
+<tr id="row_1_11_100_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1MaxNode.html" target="_self">MaxNode</a></td><td class="desc">Max(a, b) </td></tr>
+<tr id="row_1_11_101_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Min.html" target="_self">Min</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1MinNode.html" title="min(a, b) ">MinNode</a> </td></tr>
+<tr id="row_1_11_102_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1MinNode.html" target="_self">MinNode</a></td><td class="desc">Min(a, b) </td></tr>
+<tr id="row_1_11_103_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Mod.html" target="_self">Mod</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1ModNode.html" title="a % b in the C semnatics. ">ModNode</a> </td></tr>
+<tr id="row_1_11_104_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ModNode.html" target="_self">ModNode</a></td><td class="desc">% b in the C semnatics </td></tr>
+<tr id="row_1_11_105_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Mul.html" target="_self">Mul</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1MulNode.html" title="a * b ">MulNode</a> </td></tr>
+<tr id="row_1_11_106_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1MulNode.html" target="_self">MulNode</a></td><td class="desc"><ul>
<li>b </li>
</ul>
</td></tr>
-<tr id="row_1_10_107_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1NE.html" target="_self">NE</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1NENode.html" title="a != b ">NENode</a> </td></tr>
-<tr id="row_1_10_108_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1NENode.html" target="_self">NENode</a></td><td class="desc">!= b </td></tr>
-<tr id="row_1_10_109_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Not.html" target="_self">Not</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1NotNode.html" title="!a ">NotNode</a> </td></tr>
-<tr id="row_1_10_110_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1NotNode.html" target="_self">NotNode</a></td><td class="desc">!a </td></tr>
-<tr id="row_1_10_111_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Or.html" target="_self">Or</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1OrNode.html" title="a || b ">OrNode</a> </td></tr>
-<tr id="row_1_10_112_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1OrNode.html" target="_self">OrNode</a></td><td class="desc">|| b </td></tr>
-<tr id="row_1_10_113_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Prefetch.html" target="_self">Prefetch</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1PrefetchNode.html" title="A prefetch hint for a buffer. ">PrefetchNode</a> </td></tr>
-<tr id="row_1_10_114_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1PrefetchNode.html" target="_self">PrefetchNode</a></td><td class="desc">A prefetch hint for a buffer </td></tr>
-<tr id="row_1_10_115_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1PrimFunc.html" target="_self">PrimFunc</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1PrimFuncNode.html" title="Primitive functions that contains TIR statements. ">PrimFuncNode</a> </td></tr>
-<tr id="row_1_10_116_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1PrimFuncNode.html" target="_self">PrimFuncNode</a></td><td class="desc">Primitive functions that contains TIR statements </td></tr>
-<tr id="row_1_10_117_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ProducerLoad.html" target="_self">ProducerLoad</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1ProducerLoadNode.html" title="Load value from the result produced by the producer. ">ProducerLoadNode</a> </td></tr>
-<tr id="row_1_10_118_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ProducerLoadNode.html" target="_self">ProducerLoadNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Load.html" title="Managed reference to LoadNode. ">Load</a> value from the result produced by the producer </td></tr>
-<tr id="row_1_10_119_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ProducerRealize.html" target="_self">ProducerRealize</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1ProducerRealizeNode.html" title="Annotate the bounds where the data produced by the producer need to be written and read in b [...]
-<tr id="row_1_10_120_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ProducerRealizeNode.html" target="_self">ProducerRealizeNode</a></td><td class="desc">Annotate the bounds where the data produced by the producer need to be written and read in body. We will need to allocate space for the corresponding regions </td></tr>
-<tr id="row_1_10_121_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ProducerStore.html" target="_self">ProducerStore</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1ProducerStoreNode.html" title="Store value into mult-dimensional array that will be read by the consumer of the producer. ">Produ [...]
-<tr id="row_1_10_122_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ProducerStoreNode.html" target="_self">ProducerStoreNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Store.html" title="Managed reference to StoreNode. ">Store</a> value into mult-dimensional array that will be read by the consumer of the prod [...]
-<tr id="row_1_10_123_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Ramp.html" target="_self">Ramp</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1RampNode.html" title="Construct a vector with lanes elements where its i-th element equals base + i * stride. This is useful to construct a index f [...]
-<tr id="row_1_10_124_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1RampNode.html" target="_self">RampNode</a></td><td class="desc">Construct a vector with lanes elements where its i-th element equals base + i * stride. This is useful to construct a index for a continuous vector load </td></tr>
-<tr id="row_1_10_125_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Reduce.html" target="_self">Reduce</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1ReduceNode.html" title="Reduction operator operator. ">ReduceNode</a> </td></tr>
-<tr id="row_1_10_126_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ReduceNode.html" target="_self">ReduceNode</a></td><td class="desc">Reduction operator operator </td></tr>
-<tr id="row_1_10_127_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Schedule.html" target="_self">Schedule</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1ScheduleNode.html" title="The user-facing schedule class. ">ScheduleNode</a> </td></tr>
-<tr id="row_1_10_128_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ScheduleNode.html" target="_self">ScheduleNode</a></td><td class="desc">The user-facing schedule class </td></tr>
-<tr id="row_1_10_129_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ScheduleState.html" target="_self">ScheduleState</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1ScheduleStateNode.html" title="The state of scheduling, which exposes a Replace method as the primary interface for all the sched [...]
-<tr id="row_1_10_130_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ScheduleStateNode.html" target="_self">ScheduleStateNode</a></td><td class="desc">The state of scheduling, which exposes a <code>Replace</code> method as the primary interface for all the scheduling primitives to manipulate the TensorIR </td></tr>
-<tr id="row_1_10_131_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Select.html" target="_self">Select</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1SelectNode.html" title="return true_value if condition is true, otherwise return false_value. ">SelectNode</a> </td></tr>
-<tr id="row_1_10_132_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1SelectNode.html" target="_self">SelectNode</a></td><td class="desc">Return true_value if condition is true, otherwise return false_value </td></tr>
-<tr id="row_1_10_133_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span id="arr_1_10_133_" class="arrow" onclick="toggleFolder('1_10_133_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1SeqStmt.html" target="_self">SeqStmt</a></td><td class="desc">Sequence statement </td></tr>
-<tr id="row_1_10_133_0_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1SeqStmt_1_1Flattener.html" target="_self">Flattener</a></td><td class="desc">Helper class to flatten sequence of arguments into Array </td></tr>
-<tr id="row_1_10_134_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1SeqStmtNode.html" target="_self">SeqStmtNode</a></td><td class="desc">The container of seq statement. Represent a sequence of statements </td></tr>
-<tr id="row_1_10_135_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Shuffle.html" target="_self">Shuffle</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1ShuffleNode.html" title="Shuffle instruction. vec = concat(vectors) result = (vec[indices[0]], vec[indices[1]] ...">ShuffleNode</a> </td></tr>
-<tr id="row_1_10_136_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ShuffleNode.html" target="_self">ShuffleNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Shuffle.html" title="Managed reference to ShuffleNode. ">Shuffle</a> instruction. vec = concat(vectors) result = (vec[indices[0]], vec[indices[1]] ...) </ [...]
-<tr id="row_1_10_137_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1SizeVar.html" target="_self">SizeVar</a></td><td class="desc">Named variable represents a tensor index size </td></tr>
-<tr id="row_1_10_138_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1SizeVarNode.html" target="_self">SizeVarNode</a></td><td class="desc">A variable node represent a tensor index size, whose value must be non-negative </td></tr>
-<tr id="row_1_10_139_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Stmt.html" target="_self">Stmt</a></td><td class="desc">Container of all statements </td></tr>
-<tr id="row_1_10_140_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StmtExprMutator.html" target="_self">StmtExprMutator</a></td><td class="desc">Mutator that recursively mutates stmts and exprs on them </td></tr>
-<tr id="row_1_10_141_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StmtExprVisitor.html" target="_self">StmtExprVisitor</a></td><td class="desc">Visitor that recursively visit stmts and exprs on them </td></tr>
-<tr id="row_1_10_142_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StmtFunctor.html" target="_self">StmtFunctor</a></td><td class="desc">Same as <a class="el" href="classtvm_1_1tir_1_1ExprFunctor.html" title="A dynamical functor that dispatches on in the first Expr argument. You can use this as a more powerfu...">ExprFunctor</a> [...]
-<tr id="row_1_10_143_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StmtFunctor_3_01R_07const_01Stmt_01_6n_00_01Args_8_8_8_01args_08_4.html" target="_self">StmtFunctor< R(const Stmt &n, Args... args)></a></td><td class="desc"></td></tr>
-<tr id="row_1_10_144_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StmtMutator.html" target="_self">StmtMutator</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1StmtMutator.html" title="StmtMutator that mutates the statements. ">StmtMutator</a> that mutates the statements </td></tr>
-<tr id="row_1_10_145_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StmtNode.html" target="_self">StmtNode</a></td><td class="desc">Base node of all statements </td></tr>
-<tr id="row_1_10_146_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StmtSRef.html" target="_self">StmtSRef</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1StmtSRefNode.html" title="An object that refers to schedulable elements (block/for-loop) in TensorIR, aka "sref". ">StmtSRefNode< [...]
-<tr id="row_1_10_147_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StmtSRefNode.html" target="_self">StmtSRefNode</a></td><td class="desc">An object that refers to schedulable elements (block/for-loop) in TensorIR, aka "sref" </td></tr>
-<tr id="row_1_10_148_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StmtVisitor.html" target="_self">StmtVisitor</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1StmtVisitor.html" title="StmtVisitor. ">StmtVisitor</a> </td></tr>
-<tr id="row_1_10_149_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Store.html" target="_self">Store</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1StoreNode.html" title="Store value to the buffer. ">StoreNode</a> </td></tr>
-<tr id="row_1_10_150_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StoreNode.html" target="_self">StoreNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Store.html" title="Managed reference to StoreNode. ">Store</a> value to the buffer </td></tr>
-<tr id="row_1_10_151_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StringImm.html" target="_self">StringImm</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1StringImmNode.html" title="String constants, only used in asserts. ">StringImmNode</a> </td></tr>
-<tr id="row_1_10_152_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StringImmNode.html" target="_self">StringImmNode</a></td><td class="desc">String constants, only used in asserts </td></tr>
-<tr id="row_1_10_153_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Sub.html" target="_self">Sub</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1SubNode.html" title="a - b ">SubNode</a> </td></tr>
-<tr id="row_1_10_154_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1SubNode.html" target="_self">SubNode</a></td><td class="desc"><ul>
+<tr id="row_1_11_107_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1NE.html" target="_self">NE</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1NENode.html" title="a != b ">NENode</a> </td></tr>
+<tr id="row_1_11_108_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1NENode.html" target="_self">NENode</a></td><td class="desc">!= b </td></tr>
+<tr id="row_1_11_109_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Not.html" target="_self">Not</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1NotNode.html" title="!a ">NotNode</a> </td></tr>
+<tr id="row_1_11_110_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1NotNode.html" target="_self">NotNode</a></td><td class="desc">!a </td></tr>
+<tr id="row_1_11_111_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Or.html" target="_self">Or</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1OrNode.html" title="a || b ">OrNode</a> </td></tr>
+<tr id="row_1_11_112_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1OrNode.html" target="_self">OrNode</a></td><td class="desc">|| b </td></tr>
+<tr id="row_1_11_113_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Prefetch.html" target="_self">Prefetch</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1PrefetchNode.html" title="A prefetch hint for a buffer. ">PrefetchNode</a> </td></tr>
+<tr id="row_1_11_114_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1PrefetchNode.html" target="_self">PrefetchNode</a></td><td class="desc">A prefetch hint for a buffer </td></tr>
+<tr id="row_1_11_115_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1PrimFunc.html" target="_self">PrimFunc</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1PrimFuncNode.html" title="Primitive functions that contains TIR statements. ">PrimFuncNode</a> </td></tr>
+<tr id="row_1_11_116_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1PrimFuncNode.html" target="_self">PrimFuncNode</a></td><td class="desc">Primitive functions that contains TIR statements </td></tr>
+<tr id="row_1_11_117_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ProducerLoad.html" target="_self">ProducerLoad</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1ProducerLoadNode.html" title="Load value from the result produced by the producer. ">ProducerLoadNode</a> </td></tr>
+<tr id="row_1_11_118_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ProducerLoadNode.html" target="_self">ProducerLoadNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Load.html" title="Managed reference to LoadNode. ">Load</a> value from the result produced by the producer </td></tr>
+<tr id="row_1_11_119_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ProducerRealize.html" target="_self">ProducerRealize</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1ProducerRealizeNode.html" title="Annotate the bounds where the data produced by the producer need to be written and read in b [...]
+<tr id="row_1_11_120_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ProducerRealizeNode.html" target="_self">ProducerRealizeNode</a></td><td class="desc">Annotate the bounds where the data produced by the producer need to be written and read in body. We will need to allocate space for the corresponding regions </td></tr>
+<tr id="row_1_11_121_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ProducerStore.html" target="_self">ProducerStore</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1ProducerStoreNode.html" title="Store value into mult-dimensional array that will be read by the consumer of the producer. ">Produ [...]
+<tr id="row_1_11_122_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ProducerStoreNode.html" target="_self">ProducerStoreNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Store.html" title="Managed reference to StoreNode. ">Store</a> value into mult-dimensional array that will be read by the consumer of the prod [...]
+<tr id="row_1_11_123_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Ramp.html" target="_self">Ramp</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1RampNode.html" title="Construct a vector with lanes elements where its i-th element equals base + i * stride. This is useful to construct a index f [...]
+<tr id="row_1_11_124_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1RampNode.html" target="_self">RampNode</a></td><td class="desc">Construct a vector with lanes elements where its i-th element equals base + i * stride. This is useful to construct a index for a continuous vector load </td></tr>
+<tr id="row_1_11_125_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Reduce.html" target="_self">Reduce</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1ReduceNode.html" title="Reduction operator operator. ">ReduceNode</a> </td></tr>
+<tr id="row_1_11_126_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ReduceNode.html" target="_self">ReduceNode</a></td><td class="desc">Reduction operator operator </td></tr>
+<tr id="row_1_11_127_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Schedule.html" target="_self">Schedule</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1ScheduleNode.html" title="The user-facing schedule class. ">ScheduleNode</a> </td></tr>
+<tr id="row_1_11_128_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ScheduleNode.html" target="_self">ScheduleNode</a></td><td class="desc">The user-facing schedule class </td></tr>
+<tr id="row_1_11_129_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ScheduleState.html" target="_self">ScheduleState</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1ScheduleStateNode.html" title="The state of scheduling, which exposes a Replace method as the primary interface for all the sched [...]
+<tr id="row_1_11_130_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ScheduleStateNode.html" target="_self">ScheduleStateNode</a></td><td class="desc">The state of scheduling, which exposes a <code>Replace</code> method as the primary interface for all the scheduling primitives to manipulate the TensorIR </td></tr>
+<tr id="row_1_11_131_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Select.html" target="_self">Select</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1SelectNode.html" title="return true_value if condition is true, otherwise return false_value. ">SelectNode</a> </td></tr>
+<tr id="row_1_11_132_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1SelectNode.html" target="_self">SelectNode</a></td><td class="desc">Return true_value if condition is true, otherwise return false_value </td></tr>
+<tr id="row_1_11_133_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span id="arr_1_11_133_" class="arrow" onclick="toggleFolder('1_11_133_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1SeqStmt.html" target="_self">SeqStmt</a></td><td class="desc">Sequence statement </td></tr>
+<tr id="row_1_11_133_0_" class="even" style="display:none;"><td class="entry"><span style="width:64px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1SeqStmt_1_1Flattener.html" target="_self">Flattener</a></td><td class="desc">Helper class to flatten sequence of arguments into Array </td></tr>
+<tr id="row_1_11_134_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1SeqStmtNode.html" target="_self">SeqStmtNode</a></td><td class="desc">The container of seq statement. Represent a sequence of statements </td></tr>
+<tr id="row_1_11_135_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Shuffle.html" target="_self">Shuffle</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1ShuffleNode.html" title="Shuffle instruction. vec = concat(vectors) result = (vec[indices[0]], vec[indices[1]] ...">ShuffleNode</a> </td></tr>
+<tr id="row_1_11_136_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1ShuffleNode.html" target="_self">ShuffleNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Shuffle.html" title="Managed reference to ShuffleNode. ">Shuffle</a> instruction. vec = concat(vectors) result = (vec[indices[0]], vec[indices[1]] ...) </ [...]
+<tr id="row_1_11_137_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1SizeVar.html" target="_self">SizeVar</a></td><td class="desc">Named variable represents a tensor index size </td></tr>
+<tr id="row_1_11_138_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1SizeVarNode.html" target="_self">SizeVarNode</a></td><td class="desc">A variable node represent a tensor index size, whose value must be non-negative </td></tr>
+<tr id="row_1_11_139_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Stmt.html" target="_self">Stmt</a></td><td class="desc">Container of all statements </td></tr>
+<tr id="row_1_11_140_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StmtExprMutator.html" target="_self">StmtExprMutator</a></td><td class="desc">Mutator that recursively mutates stmts and exprs on them </td></tr>
+<tr id="row_1_11_141_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StmtExprVisitor.html" target="_self">StmtExprVisitor</a></td><td class="desc">Visitor that recursively visit stmts and exprs on them </td></tr>
+<tr id="row_1_11_142_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StmtFunctor.html" target="_self">StmtFunctor</a></td><td class="desc">Same as <a class="el" href="classtvm_1_1tir_1_1ExprFunctor.html" title="A dynamical functor that dispatches on in the first Expr argument. You can use this as a more powerfu...">ExprFunctor</a> [...]
+<tr id="row_1_11_143_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StmtFunctor_3_01R_07const_01Stmt_01_6n_00_01Args_8_8_8_01args_08_4.html" target="_self">StmtFunctor< R(const Stmt &n, Args... args)></a></td><td class="desc"></td></tr>
+<tr id="row_1_11_144_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StmtMutator.html" target="_self">StmtMutator</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1StmtMutator.html" title="StmtMutator that mutates the statements. ">StmtMutator</a> that mutates the statements </td></tr>
+<tr id="row_1_11_145_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StmtNode.html" target="_self">StmtNode</a></td><td class="desc">Base node of all statements </td></tr>
+<tr id="row_1_11_146_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StmtSRef.html" target="_self">StmtSRef</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1StmtSRefNode.html" title="An object that refers to schedulable elements (block/for-loop) in TensorIR, aka "sref". ">StmtSRefNode< [...]
+<tr id="row_1_11_147_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StmtSRefNode.html" target="_self">StmtSRefNode</a></td><td class="desc">An object that refers to schedulable elements (block/for-loop) in TensorIR, aka "sref" </td></tr>
+<tr id="row_1_11_148_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StmtVisitor.html" target="_self">StmtVisitor</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1StmtVisitor.html" title="StmtVisitor. ">StmtVisitor</a> </td></tr>
+<tr id="row_1_11_149_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Store.html" target="_self">Store</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1StoreNode.html" title="Store value to the buffer. ">StoreNode</a> </td></tr>
+<tr id="row_1_11_150_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StoreNode.html" target="_self">StoreNode</a></td><td class="desc"><a class="el" href="classtvm_1_1tir_1_1Store.html" title="Managed reference to StoreNode. ">Store</a> value to the buffer </td></tr>
+<tr id="row_1_11_151_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StringImm.html" target="_self">StringImm</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1StringImmNode.html" title="String constants, only used in asserts. ">StringImmNode</a> </td></tr>
+<tr id="row_1_11_152_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1StringImmNode.html" target="_self">StringImmNode</a></td><td class="desc">String constants, only used in asserts </td></tr>
+<tr id="row_1_11_153_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Sub.html" target="_self">Sub</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1SubNode.html" title="a - b ">SubNode</a> </td></tr>
+<tr id="row_1_11_154_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1SubNode.html" target="_self">SubNode</a></td><td class="desc"><ul>
<li>b </li>
</ul>
</td></tr>
-<tr id="row_1_10_155_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1TensorIntrin.html" target="_self">TensorIntrin</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1TensorIntrinNode.html" title="Tensor intrinsics for tensorization. ">TensorIntrinNode</a> </td></tr>
-<tr id="row_1_10_156_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1TensorIntrinNode.html" target="_self">TensorIntrinNode</a></td><td class="desc">Tensor intrinsics for tensorization </td></tr>
-<tr id="row_1_10_157_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Trace.html" target="_self">Trace</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1TraceNode.html" title="An execution trace of a scheduling program. ">TraceNode</a> </td></tr>
-<tr id="row_1_10_158_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1TraceNode.html" target="_self">TraceNode</a></td><td class="desc">An execution trace of a scheduling program </td></tr>
-<tr id="row_1_10_159_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Var.html" target="_self">Var</a></td><td class="desc">Named variable in TIR </td></tr>
-<tr id="row_1_10_160_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1VarNode.html" target="_self">VarNode</a></td><td class="desc">A variable node in the IR </td></tr>
-<tr id="row_1_10_161_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1While.html" target="_self">While</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1WhileNode.html" title="A While loop. ">WhileNode</a> </td></tr>
-<tr id="row_1_10_162_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1WhileNode.html" target="_self">WhileNode</a></td><td class="desc">A <a class="el" href="classtvm_1_1tir_1_1While.html" title="Managed reference to WhileNode. ">While</a> loop </td></tr>
-<tr id="row_1_11_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span id="arr_1_11_" class="arrow" onclick="toggleFolder('1_11_')">►</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacetvm_1_1topi.html" target="_self">topi</a></td><td class="desc"></td></tr>
-<tr id="row_1_12_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span id="arr_1_12_" class="arrow" onclick="toggleFolder('1_12_')">►</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacetvm_1_1transform.html" target="_self">transform</a></td><td class="desc"></td></tr>
-<tr id="row_1_12_0_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1transform_1_1Pass.html" target="_self">Pass</a></td><td class="desc"></td></tr>
-<tr id="row_1_12_1_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1transform_1_1PassContext.html" target="_self">PassContext</a></td><td class="desc"><a class="el" href="classtvm_1_1transform_1_1PassContext.html" title="PassContext that is used to configure the pass behavior. ">PassContext</a> that is used to configure the pass behavior [...]
-<tr id="row_1_12_2_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1transform_1_1PassContextNode.html" target="_self">PassContextNode</a></td><td class="desc"><a class="el" href="classtvm_1_1transform_1_1PassContextNode.html" title="PassContextNode contains the information that a pass can rely on, such as analysis results...">PassContextN [...]
-<tr id="row_1_12_3_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1transform_1_1PassInfo.html" target="_self">PassInfo</a></td><td class="desc">Managed reference class for <a class="el" href="classtvm_1_1transform_1_1PassInfoNode.html" title="Meta data that will be used to help optimization and analysis. ">PassInfoNode</a> </td></tr>
-<tr id="row_1_12_4_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1transform_1_1PassInfoNode.html" target="_self">PassInfoNode</a></td><td class="desc">Meta data that will be used to help optimization and analysis </td></tr>
-<tr id="row_1_12_5_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1transform_1_1PassNode.html" target="_self">PassNode</a></td><td class="desc"><a class="el" href="classtvm_1_1transform_1_1PassNode.html" title="PassNode is the base type of differnt types of optimization passes. It is designed as a pure class an...">PassNode</a> is the ba [...]
-<tr id="row_1_12_6_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1transform_1_1Sequential.html" target="_self">Sequential</a></td><td class="desc"></td></tr>
-<tr id="row_1_12_7_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1transform_1_1SequentialNode.html" target="_self">SequentialNode</a></td><td class="desc">The <a class="el" href="classtvm_1_1transform_1_1SequentialNode.html" title="The SequentialNode contains a set of passes that transform Relay programs from one AST to another sem..."> [...]
-<tr id="row_1_13_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1AffineType.html" target="_self">AffineType</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1AffineTypeNode.html" title="AffineType representation. ">AffineTypeNode</a> </td></tr>
-<tr id="row_1_14_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1AffineTypeNode.html" target="_self">AffineTypeNode</a></td><td class="desc"><a class="el" href="classtvm_1_1AffineType.html" title="Managed reference to AffineTypeNode. ">AffineType</a> representation </td></tr>
-<tr id="row_1_15_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1AttrError.html" target="_self">AttrError</a></td><td class="desc">Error thrown during attribute checking </td></tr>
-<tr id="row_1_16_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1AttrFieldInfo.html" target="_self">AttrFieldInfo</a></td><td class="desc"><a class="el" href="classtvm_1_1AttrFieldInfo.html" title="AttrFieldInfo. ">AttrFieldInfo</a> </td></tr>
-<tr id="row_1_17_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1AttrFieldInfoNode.html" target="_self">AttrFieldInfoNode</a></td><td class="desc">Information about attribute fields in string representations </td></tr>
-<tr id="row_1_18_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1AttrRegistry.html" target="_self">AttrRegistry</a></td><td class="desc"></td></tr>
-<tr id="row_1_19_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1AttrRegistryMap.html" target="_self">AttrRegistryMap</a></td><td class="desc">Map<Key, ValueType> used to store meta-data </td></tr>
-<tr id="row_1_20_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1AttrRegistryMapContainerMap.html" target="_self">AttrRegistryMapContainerMap</a></td><td class="desc">Generic attribute map </td></tr>
-<tr id="row_1_21_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1Attrs.html" target="_self">Attrs</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1BaseAttrsNode.html" title="Base class of all attribute class. ">BaseAttrsNode</a> </td></tr>
-<tr id="row_1_22_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1AttrsNode.html" target="_self">AttrsNode</a></td><td class="desc">The base class of the all the Use "curiously recurring template pattern" </td></tr>
-<tr id="row_1_23_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1AttrVisitor.html" target="_self">AttrVisitor</a></td><td class="desc">Visitor class to get the attributes of an AST/IR node. The content is going to be called for each field </td></tr>
-<tr id="row_1_24_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1BaseAttrsNode.html" target="_self">BaseAttrsNode</a></td><td class="desc">Base class of all attribute class </td></tr>
-<tr id="row_1_25_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1BaseExpr.html" target="_self">BaseExpr</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1BaseExprNode.html" title="Base type of all the expressions. ">BaseExprNode</a> </td></tr>
-<tr id="row_1_26_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1BaseExprNode.html" target="_self">BaseExprNode</a></td><td class="desc">Base type of all the expressions </td></tr>
-<tr id="row_1_27_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1BaseFunc.html" target="_self">BaseFunc</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1BaseFuncNode.html" title="Base node of all functions. ">BaseFuncNode</a> </td></tr>
-<tr id="row_1_28_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1BaseFuncNode.html" target="_self">BaseFuncNode</a></td><td class="desc">Base node of all functions </td></tr>
-<tr id="row_1_29_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1BaseTensorType.html" target="_self">BaseTensorType</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1BaseTensorTypeNode.html" title="Base of all Tensor types This container can hold TensorType or GenericTensorType. ...">BaseTensorTypeNode</a> </td></tr>
-<tr id="row_1_30_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1BaseTensorTypeNode.html" target="_self">BaseTensorTypeNode</a></td><td class="desc">Base of all Tensor types This container can hold <a class="el" href="classtvm_1_1TensorType.html" title="Managed reference to TensorTypeNode. ">TensorType</a> or GenericTensorType </td></tr>
-<tr id="row_1_31_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1BaseValueEqual.html" target="_self">BaseValueEqual</a></td><td class="desc">Equality definition of base value class </td></tr>
-<tr id="row_1_32_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1BaseValueHash.html" target="_self">BaseValueHash</a></td><td class="desc">Hash definition of base value classes </td></tr>
-<tr id="row_1_33_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1Bool.html" target="_self">Bool</a></td><td class="desc">Boolean constant </td></tr>
-<tr id="row_1_34_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1CompilationConfig.html" target="_self">CompilationConfig</a></td><td class="desc">Managed reference class to <code><a class="el" href="classtvm_1_1CompilationConfig.html" title="Managed reference class to CompilationConfig. ">CompilationConfig</a></code> </td></tr>
-<tr id="row_1_35_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1CompilationConfigNode.html" target="_self">CompilationConfigNode</a></td><td class="desc">Gathers the <code>Targets</code> and distinguished <code>VirtualDevices</code> in canonical form needed to compile a Relay module for execution over possibly heterogeneous devices. Cen [...]
-<tr id="row_1_36_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1CompileError.html" target="_self">CompileError</a></td><td class="desc">Custom Error class to be thrown during compilation </td></tr>
-<tr id="row_1_37_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1ConstantInfo.html" target="_self">ConstantInfo</a></td><td class="desc"></td></tr>
-<tr id="row_1_38_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1ConstantInfoNode.html" target="_self">ConstantInfoNode</a></td><td class="desc"></td></tr>
-<tr id="row_1_39_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1ConstantMemoryPools.html" target="_self">ConstantMemoryPools</a></td><td class="desc"></td></tr>
-<tr id="row_1_40_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1ConstantMemoryPoolsNode.html" target="_self">ConstantMemoryPoolsNode</a></td><td class="desc"></td></tr>
-<tr id="row_1_41_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1ConstantPoolInfo.html" target="_self">ConstantPoolInfo</a></td><td class="desc"></td></tr>
-<tr id="row_1_42_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1ConstantPoolInfoNode.html" target="_self">ConstantPoolInfoNode</a></td><td class="desc"></td></tr>
-<tr id="row_1_43_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1Constructor.html" target="_self">Constructor</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1ConstructorNode.html" title="ADT constructor. Constructors compare by pointer equality. ">ConstructorNode</a> </td></tr>
-<tr id="row_1_44_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1ConstructorNode.html" target="_self">ConstructorNode</a></td><td class="desc">ADT constructor. Constructors compare by pointer equality </td></tr>
-<tr id="row_1_45_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1Diagnostic.html" target="_self">Diagnostic</a></td><td class="desc"></td></tr>
-<tr id="row_1_46_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1DiagnosticBuilder.html" target="_self">DiagnosticBuilder</a></td><td class="desc">A wrapper around std::stringstream to build a diagnostic </td></tr>
-<tr id="row_1_47_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1DiagnosticContext.html" target="_self">DiagnosticContext</a></td><td class="desc"></td></tr>
-<tr id="row_1_48_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1DiagnosticContextNode.html" target="_self">DiagnosticContextNode</a></td><td class="desc"></td></tr>
-<tr id="row_1_49_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1DiagnosticNode.html" target="_self">DiagnosticNode</a></td><td class="desc">A compiler diagnostic message </td></tr>
-<tr id="row_1_50_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1DiagnosticRenderer.html" target="_self">DiagnosticRenderer</a></td><td class="desc"></td></tr>
-<tr id="row_1_51_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1DiagnosticRendererNode.html" target="_self">DiagnosticRendererNode</a></td><td class="desc">Display diagnostics in a given display format </td></tr>
-<tr id="row_1_52_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1DictAttrs.html" target="_self">DictAttrs</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1DictAttrsNode.html" title="Specialized attribute type that is backed by a map. The DictAttrsNode implements the Attrs behavior...">DictAttrsNode</a> </td></tr>
-<tr id="row_1_53_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1DictAttrsNode.html" target="_self">DictAttrsNode</a></td><td class="desc">Specialized attribute type that is backed by a map. The <a class="el" href="classtvm_1_1DictAttrsNode.html" title="Specialized attribute type that is backed by a map. The DictAttrsNode implements the [...]
-<tr id="row_1_54_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1EnvFunc.html" target="_self">EnvFunc</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1EnvFuncNode.html" title="A serializable function backed by TVM's global environment. ">EnvFuncNode</a> </td></tr>
-<tr id="row_1_55_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1EnvFuncNode.html" target="_self">EnvFuncNode</a></td><td class="desc">A serializable function backed by TVM's global environment </td></tr>
-<tr id="row_1_56_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1ErrorBuilder.html" target="_self">ErrorBuilder</a></td><td class="desc">A wrapper around std::stringstream to build error </td></tr>
-<tr id="row_1_57_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1ErrorReporter.html" target="_self">ErrorReporter</a></td><td class="desc">An abstraction around how errors are stored and reported. Designed to be opaque to users, so we can support a robust and simpler error reporting mode, as well as a more complex mode </td></tr>
-<tr id="row_1_58_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1FloatImm.html" target="_self">FloatImm</a></td><td class="desc">Managed reference class to <a class="el" href="classtvm_1_1FloatImmNode.html" title="Constant floating point literals in the program. ">FloatImmNode</a> </td></tr>
-<tr id="row_1_59_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1FloatImmNode.html" target="_self">FloatImmNode</a></td><td class="desc">Constant floating point literals in the program </td></tr>
-<tr id="row_1_60_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1FuncType.html" target="_self">FuncType</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1FuncTypeNode.html" title="Function type. ">FuncTypeNode</a> </td></tr>
-<tr id="row_1_61_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1FuncTypeNode.html" target="_self">FuncTypeNode</a></td><td class="desc">Function type </td></tr>
-<tr id="row_1_62_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1GenericFunc.html" target="_self">GenericFunc</a></td><td class="desc">Generic function that can be specialized on a per-target basis </td></tr>
-<tr id="row_1_63_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1GenericFuncNode.html" target="_self">GenericFuncNode</a></td><td class="desc">Represents a generic function that can be specialized on a per-target basis </td></tr>
-<tr id="row_1_64_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1GlobalTypeVar.html" target="_self">GlobalTypeVar</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1GlobalTypeVarNode.html" title="A global type variable that is used for defining new types or type aliases. ">GlobalTypeVarNode</a> </td></tr>
-<tr id="row_1_65_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1GlobalTypeVarNode.html" target="_self">GlobalTypeVarNode</a></td><td class="desc">A global type variable that is used for defining new types or type aliases </td></tr>
-<tr id="row_1_66_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1GlobalVar.html" target="_self">GlobalVar</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1GlobalVarNode.html" title="Global variable that lives in the top-level module. ">GlobalVarNode</a> </td></tr>
-<tr id="row_1_67_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1GlobalVarNode.html" target="_self">GlobalVarNode</a></td><td class="desc">Global variable that lives in the top-level module </td></tr>
-<tr id="row_1_68_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1IncompleteType.html" target="_self">IncompleteType</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1IncompleteTypeNode.html" title="Intermediate values that is used to indicate incomplete type during type inference. ">IncompleteTypeNode</a> </td></tr>
-<tr id="row_1_69_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1IncompleteTypeNode.html" target="_self">IncompleteTypeNode</a></td><td class="desc">Intermediate values that is used to indicate incomplete type during type inference </td></tr>
-<tr id="row_1_70_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1Integer.html" target="_self">Integer</a></td><td class="desc">Container of constant int that adds more constructors </td></tr>
-<tr id="row_1_71_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1IntImm.html" target="_self">IntImm</a></td><td class="desc">Managed reference class to <a class="el" href="classtvm_1_1IntImmNode.html" title="Constant integer literals in the program. ">IntImmNode</a> </td></tr>
-<tr id="row_1_72_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1IntImmNode.html" target="_self">IntImmNode</a></td><td class="desc">Constant integer literals in the program </td></tr>
-<tr id="row_1_73_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1IRModule.html" target="_self">IRModule</a></td><td class="desc">Managed reference class to <a class="el" href="classtvm_1_1IRModuleNode.html" title="IRModule that holds functions and type definitions. ">IRModuleNode</a> </td></tr>
-<tr id="row_1_74_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1IRModuleNode.html" target="_self">IRModuleNode</a></td><td class="desc"><a class="el" href="classtvm_1_1IRModule.html" title="Managed reference class to IRModuleNode. ">IRModule</a> that holds functions and type definitions </td></tr>
-<tr id="row_1_75_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1MemoryInfo.html" target="_self">MemoryInfo</a></td><td class="desc">Defines memory info </td></tr>
-<tr id="row_1_76_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1MemoryInfoNode.html" target="_self">MemoryInfoNode</a></td><td class="desc">Memory information of special memory region. Use <a class="el" href="classtvm_1_1MemoryInfo.html" title="Defines memory info. ">MemoryInfo</a> as its container type </td></tr>
-<tr id="row_1_77_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1NDArrayContainerTrait.html" target="_self">NDArrayContainerTrait</a></td><td class="desc"></td></tr>
-<tr id="row_1_78_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1NodeFunctor.html" target="_self">NodeFunctor</a></td><td class="desc">A dynamically dispatched functor on the type of the first argument </td></tr>
-<tr id="row_1_79_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1NodeFunctor_3_01R_07const_01ObjectRef_01_6n_00_01Args_8_8_8_08_4.html" target="_self">NodeFunctor< R(const ObjectRef &n, Args...)></a></td><td class="desc"></td></tr>
-<tr id="row_1_80_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1Op.html" target="_self">Op</a></td><td class="desc">Managed reference class to <a class="el" href="classtvm_1_1OpNode.html" title="Primitive Op(builtin intrinsics) ">OpNode</a> </td></tr>
-<tr id="row_1_81_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1OpAttrMap.html" target="_self">OpAttrMap</a></td><td class="desc">Map<Op,ValueType> used to store meta-information about <a class="el" href="classtvm_1_1Op.html" title="Managed reference class to OpNode. ">Op</a> </td></tr>
-<tr id="row_1_82_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1OpNode.html" target="_self">OpNode</a></td><td class="desc">Primitive Op(builtin intrinsics) </td></tr>
-<tr id="row_1_83_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1OpRegEntry.html" target="_self">OpRegEntry</a></td><td class="desc">Helper structure to register operators </td></tr>
-<tr id="row_1_84_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1PointerType.html" target="_self">PointerType</a></td><td class="desc"></td></tr>
-<tr id="row_1_85_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1PointerTypeNode.html" target="_self">PointerTypeNode</a></td><td class="desc">Low-level raw pointer type </td></tr>
-<tr id="row_1_86_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1PoolInfo.html" target="_self">PoolInfo</a></td><td class="desc">Base class for <a class="el" href="classtvm_1_1WorkspacePoolInfo.html">WorkspacePoolInfo</a> and <a class="el" href="classtvm_1_1ConstantPoolInfo.html">ConstantPoolInfo</a> </td></tr>
-<tr id="row_1_87_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1PoolInfoNode.html" target="_self">PoolInfoNode</a></td><td class="desc">Describes a pool of memory accessible by one or more targets </td></tr>
-<tr id="row_1_88_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1PoolInfoProperties.html" target="_self">PoolInfoProperties</a></td><td class="desc"></td></tr>
-<tr id="row_1_89_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1PoolInfoPropertiesNode.html" target="_self">PoolInfoPropertiesNode</a></td><td class="desc">Describes a pool of memory properties </td></tr>
-<tr id="row_1_90_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1PrimExpr.html" target="_self">PrimExpr</a></td><td class="desc">Reference to <a class="el" href="classtvm_1_1PrimExprNode.html" title="Base node of all primitive expressions. ">PrimExprNode</a> </td></tr>
-<tr id="row_1_91_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1PrimExprNode.html" target="_self">PrimExprNode</a></td><td class="desc">Base node of all primitive expressions </td></tr>
-<tr id="row_1_92_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1PrimType.html" target="_self">PrimType</a></td><td class="desc"></td></tr>
-<tr id="row_1_93_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1PrimTypeNode.html" target="_self">PrimTypeNode</a></td><td class="desc">Primitive data types used in the low-level IR </td></tr>
-<tr id="row_1_94_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1Range.html" target="_self">Range</a></td><td class="desc"><a class="el" href="classtvm_1_1Range.html" title="Range constainer. ">Range</a> constainer </td></tr>
-<tr id="row_1_95_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1RangeNode.html" target="_self">RangeNode</a></td><td class="desc"><a class="el" href="classtvm_1_1Range.html" title="Range constainer. ">Range</a> over one dimension </td></tr>
-<tr id="row_1_96_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span id="arr_1_96_" class="arrow" onclick="toggleFolder('1_96_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1ReflectionVTable.html" target="_self">ReflectionVTable</a></td><td class="desc">Virtual function table to support IR/AST node reflection </td></tr>
-<tr id="row_1_96_0_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1ReflectionVTable_1_1Registry.html" target="_self">Registry</a></td><td class="desc"><a class="el" href="classtvm_1_1ReflectionVTable_1_1Registry.html" title="Registry of a reflection table. ">Registry</a> of a reflection table </td></tr>
-<tr id="row_1_97_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1RelayExpr.html" target="_self">RelayExpr</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1RelayExprNode.html" title="Base node of all non-primitive expressions. ">RelayExprNode</a> </td></tr>
-<tr id="row_1_98_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1RelayExprNode.html" target="_self">RelayExprNode</a></td><td class="desc">Base node of all non-primitive expressions </td></tr>
-<tr id="row_1_99_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1RelayRefType.html" target="_self">RelayRefType</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1RelayRefTypeNode.html" title="Reference Type High-level Relay IR. ">RelayRefTypeNode</a> </td></tr>
-<tr id="row_1_100_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1RelayRefTypeNode.html" target="_self">RelayRefTypeNode</a></td><td class="desc">Reference <a class="el" href="classtvm_1_1Type.html" title="Managed reference to TypeNode. ">Type</a> High-level Relay IR </td></tr>
-<tr id="row_1_101_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1ReprPrinter.html" target="_self">ReprPrinter</a></td><td class="desc">A printer class to print the AST/IR nodes </td></tr>
-<tr id="row_1_102_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span id="arr_1_102_" class="arrow" onclick="toggleFolder('1_102_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1SEqualReducer.html" target="_self">SEqualReducer</a></td><td class="desc">A Reducer class to reduce the structural equality result of two objects </td></tr>
-<tr id="row_1_102_0_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1SEqualReducer_1_1Handler.html" target="_self">Handler</a></td><td class="desc">Internal handler that defines custom behaviors. </td></tr>
-<tr id="row_1_103_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span id="arr_1_103_" class="arrow" onclick="toggleFolder('1_103_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1SHashReducer.html" target="_self">SHashReducer</a></td><td class="desc">A Reducer class to reduce the structural hash value </td></tr>
-<tr id="row_1_103_0_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1SHashReducer_1_1Handler.html" target="_self">Handler</a></td><td class="desc">Internal handler that defines custom behaviors </td></tr>
-<tr id="row_1_104_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1SourceName.html" target="_self">SourceName</a></td><td class="desc">The source name of a file span </td></tr>
-<tr id="row_1_105_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1SourceNameNode.html" target="_self">SourceNameNode</a></td><td class="desc">The name of a source fragment </td></tr>
-<tr id="row_1_106_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1Span.html" target="_self">Span</a></td><td class="desc"></td></tr>
-<tr id="row_1_107_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1SpanNode.html" target="_self">SpanNode</a></td><td class="desc">Stores locations in frontend source that generated a node </td></tr>
-<tr id="row_1_108_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1StructuralEqual.html" target="_self">StructuralEqual</a></td><td class="desc">Content-aware structural equality comparator for objects </td></tr>
-<tr id="row_1_109_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1StructuralHash.html" target="_self">StructuralHash</a></td><td class="desc">Content-aware structural hasing </td></tr>
-<tr id="row_1_110_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1Target.html" target="_self">Target</a></td><td class="desc">Managed reference class to <a class="el" href="classtvm_1_1TargetNode.html" title="Compilation target. ">TargetNode</a> </td></tr>
-<tr id="row_1_111_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TargetKind.html" target="_self">TargetKind</a></td><td class="desc">Managed reference class to <a class="el" href="classtvm_1_1TargetKindNode.html" title="Target kind, specifies the kind of the target. ">TargetKindNode</a> </td></tr>
-<tr id="row_1_112_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TargetKindAttrMap.html" target="_self">TargetKindAttrMap</a></td><td class="desc">Map<TargetKind, ValueType> used to store meta-information about <a class="el" href="classtvm_1_1TargetKind.html" title="Managed reference class to TargetKindNode. ">TargetKind</a> </td></tr>
-<tr id="row_1_113_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TargetKindNode.html" target="_self">TargetKindNode</a></td><td class="desc"><a class="el" href="classtvm_1_1Target.html" title="Managed reference class to TargetNode. ">Target</a> kind, specifies the kind of the target </td></tr>
-<tr id="row_1_114_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TargetKindRegEntry.html" target="_self">TargetKindRegEntry</a></td><td class="desc">Helper structure to register <a class="el" href="classtvm_1_1TargetKind.html" title="Managed reference class to TargetKindNode. ">TargetKind</a> </td></tr>
-<tr id="row_1_115_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TargetNode.html" target="_self">TargetNode</a></td><td class="desc">Compilation target </td></tr>
-<tr id="row_1_116_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TargetTag.html" target="_self">TargetTag</a></td><td class="desc">Managed reference class to <a class="el" href="classtvm_1_1TargetTagNode.html" title="A target tag. ">TargetTagNode</a> </td></tr>
-<tr id="row_1_117_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TargetTagNode.html" target="_self">TargetTagNode</a></td><td class="desc">A target tag </td></tr>
-<tr id="row_1_118_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TargetTagRegEntry.html" target="_self">TargetTagRegEntry</a></td><td class="desc"></td></tr>
-<tr id="row_1_119_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TensorAffineType.html" target="_self">TensorAffineType</a></td><td class="desc">Managed reference to AffineTypes </td></tr>
-<tr id="row_1_120_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TensorAffineTypeNode.html" target="_self">TensorAffineTypeNode</a></td><td class="desc"><a class="el" href="classtvm_1_1TensorAffineType.html" title="Managed reference to AffineTypes. ">TensorAffineType</a> representation </td></tr>
-<tr id="row_1_121_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TensorType.html" target="_self">TensorType</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1TensorTypeNode.html" title="This is the most commonly used type in relay. TensorType have a fixed dimension, data type...">TensorTypeNode</a> </td></tr>
-<tr id="row_1_122_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TensorTypeNode.html" target="_self">TensorTypeNode</a></td><td class="desc">This is the most commonly used type in relay. <a class="el" href="classtvm_1_1TensorType.html" title="Managed reference to TensorTypeNode. ">TensorType</a> have a fixed dimension, data type </td></tr>
-<tr id="row_1_123_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TupleAffineType.html" target="_self">TupleAffineType</a></td><td class="desc">Managed reference to TupleAffineTypes </td></tr>
-<tr id="row_1_124_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TupleAffineTypeNode.html" target="_self">TupleAffineTypeNode</a></td><td class="desc"><a class="el" href="classtvm_1_1TupleAffineType.html" title="Managed reference to TupleAffineTypes. ">TupleAffineType</a> representation </td></tr>
-<tr id="row_1_125_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TupleType.html" target="_self">TupleType</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1TupleTypeNode.html" title="The type of tuple values. ">TupleTypeNode</a> </td></tr>
-<tr id="row_1_126_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TupleTypeNode.html" target="_self">TupleTypeNode</a></td><td class="desc">The type of tuple values </td></tr>
-<tr id="row_1_127_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1Type.html" target="_self">Type</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1TypeNode.html" title="Type is the base type of all types. ">TypeNode</a> </td></tr>
-<tr id="row_1_128_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeCall.html" target="_self">TypeCall</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1TypeCallNode.html" title="Type function application. ">TypeCallNode</a> </td></tr>
-<tr id="row_1_129_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeCallNode.html" target="_self">TypeCallNode</a></td><td class="desc"><a class="el" href="classtvm_1_1Type.html" title="Managed reference to TypeNode. ">Type</a> function application </td></tr>
-<tr id="row_1_130_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeConstraint.html" target="_self">TypeConstraint</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1TypeConstraintNode.html" title="Potential Constraints in a function. ">TypeConstraintNode</a> </td></tr>
-<tr id="row_1_131_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeConstraintNode.html" target="_self">TypeConstraintNode</a></td><td class="desc">Potential Constraints in a function </td></tr>
-<tr id="row_1_132_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeData.html" target="_self">TypeData</a></td><td class="desc">Stores all data for an Algebraic Data <a class="el" href="classtvm_1_1Type.html" title="Managed reference to TypeNode. ">Type</a> (ADT) </td></tr>
-<tr id="row_1_133_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeDataNode.html" target="_self">TypeDataNode</a></td><td class="desc"><a class="el" href="classtvm_1_1TypeData.html" title="Stores all data for an Algebraic Data Type (ADT). ">TypeData</a> container node </td></tr>
-<tr id="row_1_134_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypedEnvFunc.html" target="_self">TypedEnvFunc</a></td><td class="desc">Please refer to <a class="el" href="classtvm_1_1TypedEnvFunc_3_01R_07Args_8_8_8_08_4.html#TypedEnvFuncAnchor">TypedEnvFunc<R(Args..)></a> </td></tr>
-<tr id="row_1_135_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypedEnvFunc_3_01R_07Args_8_8_8_08_4.html" target="_self">TypedEnvFunc< R(Args...)></a></td><td class="desc">A typed version of <a class="el" href="classtvm_1_1EnvFunc.html" title="Managed reference to EnvFuncNode. ">EnvFunc</a>. It is backed by a GlobalFuncNode inte [...]
-<tr id="row_1_136_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeFunctor.html" target="_self">TypeFunctor</a></td><td class="desc"></td></tr>
-<tr id="row_1_137_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeFunctor_3_01R_07const_01Type_01_6n_00_01Args_8_8_8_08_4.html" target="_self">TypeFunctor< R(const Type &n, Args...)></a></td><td class="desc"></td></tr>
-<tr id="row_1_138_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeMutator.html" target="_self">TypeMutator</a></td><td class="desc"><a class="el" href="classtvm_1_1TypeMutator.html" title="TypeMutator that mutates expressions. ">TypeMutator</a> that mutates expressions </td></tr>
-<tr id="row_1_139_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeNode.html" target="_self">TypeNode</a></td><td class="desc"><a class="el" href="classtvm_1_1Type.html" title="Managed reference to TypeNode. ">Type</a> is the base type of all types </td></tr>
-<tr id="row_1_140_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeRelation.html" target="_self">TypeRelation</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1TypeRelationNode.html" title="User defined type relation, it is an input-output relation on types. ">TypeRelationNode</a> </td></tr>
-<tr id="row_1_141_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeRelationNode.html" target="_self">TypeRelationNode</a></td><td class="desc">User defined type relation, it is an input-output relation on types </td></tr>
-<tr id="row_1_142_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeReporter.html" target="_self">TypeReporter</a></td><td class="desc">Container class of <a class="el" href="classtvm_1_1TypeReporter.html" title="Container class of TypeReporter. ">TypeReporter</a> </td></tr>
-<tr id="row_1_143_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeReporterNode.html" target="_self">TypeReporterNode</a></td><td class="desc">Reporter that reports back to the type resolution information </td></tr>
-<tr id="row_1_144_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeVar.html" target="_self">TypeVar</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1TypeVarNode.html" title="Type parameter in functions. ">TypeVarNode</a> </td></tr>
-<tr id="row_1_145_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeVarNode.html" target="_self">TypeVarNode</a></td><td class="desc"><a class="el" href="classtvm_1_1Type.html" title="Managed reference to TypeNode. ">Type</a> parameter in functions </td></tr>
-<tr id="row_1_146_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeVisitor.html" target="_self">TypeVisitor</a></td><td class="desc">A type visitor that recursively visit types </td></tr>
-<tr id="row_1_147_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1VirtualDevice.html" target="_self">VirtualDevice</a></td><td class="desc">Managed reference class to <code><a class="el" href="classtvm_1_1VirtualDeviceNode.html" title="Describes at compile time the constraints on where data is to be stored at runtime down to the (virtu.. [...]
-<tr id="row_1_148_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1VirtualDeviceCache.html" target="_self">VirtualDeviceCache</a></td><td class="desc">A cache of <code>VirtualDevices</code>. This can be used: </td></tr>
-<tr id="row_1_149_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1VirtualDeviceNode.html" target="_self">VirtualDeviceNode</a></td><td class="desc">Describes at compile time the constraints on where data is to be stored at runtime down to the (virtual) device and memory scope level, and how to compile code to compute that data. Used by t [...]
-<tr id="row_1_150_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1With.html" target="_self">With</a></td><td class="desc">RAII wrapper function to enter and exit a context object similar to python's with syntax </td></tr>
-<tr id="row_1_151_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1WorkspaceMemoryPools.html" target="_self">WorkspaceMemoryPools</a></td><td class="desc"></td></tr>
-<tr id="row_1_152_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1WorkspaceMemoryPoolsNode.html" target="_self">WorkspaceMemoryPoolsNode</a></td><td class="desc"></td></tr>
-<tr id="row_1_153_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1WorkspacePoolInfo.html" target="_self">WorkspacePoolInfo</a></td><td class="desc"></td></tr>
-<tr id="row_1_154_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1WorkspacePoolInfoNode.html" target="_self">WorkspacePoolInfoNode</a></td><td class="desc"></td></tr>
+<tr id="row_1_11_155_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1TensorIntrin.html" target="_self">TensorIntrin</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1TensorIntrinNode.html" title="Tensor intrinsics for tensorization. ">TensorIntrinNode</a> </td></tr>
+<tr id="row_1_11_156_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1TensorIntrinNode.html" target="_self">TensorIntrinNode</a></td><td class="desc">Tensor intrinsics for tensorization </td></tr>
+<tr id="row_1_11_157_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Trace.html" target="_self">Trace</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1TraceNode.html" title="An execution trace of a scheduling program. ">TraceNode</a> </td></tr>
+<tr id="row_1_11_158_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1TraceNode.html" target="_self">TraceNode</a></td><td class="desc">An execution trace of a scheduling program </td></tr>
+<tr id="row_1_11_159_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1Var.html" target="_self">Var</a></td><td class="desc">Named variable in TIR </td></tr>
+<tr id="row_1_11_160_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1VarNode.html" target="_self">VarNode</a></td><td class="desc">A variable node in the IR </td></tr>
+<tr id="row_1_11_161_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1While.html" target="_self">While</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1tir_1_1WhileNode.html" title="A While loop. ">WhileNode</a> </td></tr>
+<tr id="row_1_11_162_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1tir_1_1WhileNode.html" target="_self">WhileNode</a></td><td class="desc">A <a class="el" href="classtvm_1_1tir_1_1While.html" title="Managed reference to WhileNode. ">While</a> loop </td></tr>
+<tr id="row_1_12_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span id="arr_1_12_" class="arrow" onclick="toggleFolder('1_12_')">►</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacetvm_1_1topi.html" target="_self">topi</a></td><td class="desc"></td></tr>
+<tr id="row_1_13_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span id="arr_1_13_" class="arrow" onclick="toggleFolder('1_13_')">►</span><span class="icona"><span class="icon">N</span></span><a class="el" href="namespacetvm_1_1transform.html" target="_self">transform</a></td><td class="desc"></td></tr>
+<tr id="row_1_13_0_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1transform_1_1Pass.html" target="_self">Pass</a></td><td class="desc"></td></tr>
+<tr id="row_1_13_1_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1transform_1_1PassContext.html" target="_self">PassContext</a></td><td class="desc"><a class="el" href="classtvm_1_1transform_1_1PassContext.html" title="PassContext that is used to configure the pass behavior. ">PassContext</a> that is used to configure the pass behavior [...]
+<tr id="row_1_13_2_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1transform_1_1PassContextNode.html" target="_self">PassContextNode</a></td><td class="desc"><a class="el" href="classtvm_1_1transform_1_1PassContextNode.html" title="PassContextNode contains the information that a pass can rely on, such as analysis results...">PassContextN [...]
+<tr id="row_1_13_3_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1transform_1_1PassInfo.html" target="_self">PassInfo</a></td><td class="desc">Managed reference class for <a class="el" href="classtvm_1_1transform_1_1PassInfoNode.html" title="Meta data that will be used to help optimization and analysis. ">PassInfoNode</a> </td></tr>
+<tr id="row_1_13_4_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1transform_1_1PassInfoNode.html" target="_self">PassInfoNode</a></td><td class="desc">Meta data that will be used to help optimization and analysis </td></tr>
+<tr id="row_1_13_5_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1transform_1_1PassNode.html" target="_self">PassNode</a></td><td class="desc"><a class="el" href="classtvm_1_1transform_1_1PassNode.html" title="PassNode is the base type of differnt types of optimization passes. It is designed as a pure class an...">PassNode</a> is the ba [...]
+<tr id="row_1_13_6_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1transform_1_1Sequential.html" target="_self">Sequential</a></td><td class="desc"></td></tr>
+<tr id="row_1_13_7_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1transform_1_1SequentialNode.html" target="_self">SequentialNode</a></td><td class="desc">The <a class="el" href="classtvm_1_1transform_1_1SequentialNode.html" title="The SequentialNode contains a set of passes that transform Relay programs from one AST to another sem..."> [...]
+<tr id="row_1_14_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1AffineType.html" target="_self">AffineType</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1AffineTypeNode.html" title="AffineType representation. ">AffineTypeNode</a> </td></tr>
+<tr id="row_1_15_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1AffineTypeNode.html" target="_self">AffineTypeNode</a></td><td class="desc"><a class="el" href="classtvm_1_1AffineType.html" title="Managed reference to AffineTypeNode. ">AffineType</a> representation </td></tr>
+<tr id="row_1_16_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1AttrError.html" target="_self">AttrError</a></td><td class="desc">Error thrown during attribute checking </td></tr>
+<tr id="row_1_17_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1AttrFieldInfo.html" target="_self">AttrFieldInfo</a></td><td class="desc"><a class="el" href="classtvm_1_1AttrFieldInfo.html" title="AttrFieldInfo. ">AttrFieldInfo</a> </td></tr>
+<tr id="row_1_18_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1AttrFieldInfoNode.html" target="_self">AttrFieldInfoNode</a></td><td class="desc">Information about attribute fields in string representations </td></tr>
+<tr id="row_1_19_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1AttrRegistry.html" target="_self">AttrRegistry</a></td><td class="desc"></td></tr>
+<tr id="row_1_20_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1AttrRegistryMap.html" target="_self">AttrRegistryMap</a></td><td class="desc">Map<Key, ValueType> used to store meta-data </td></tr>
+<tr id="row_1_21_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1AttrRegistryMapContainerMap.html" target="_self">AttrRegistryMapContainerMap</a></td><td class="desc">Generic attribute map </td></tr>
+<tr id="row_1_22_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1Attrs.html" target="_self">Attrs</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1BaseAttrsNode.html" title="Base class of all attribute class. ">BaseAttrsNode</a> </td></tr>
+<tr id="row_1_23_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1AttrsNode.html" target="_self">AttrsNode</a></td><td class="desc">The base class of the all the Use "curiously recurring template pattern" </td></tr>
+<tr id="row_1_24_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1AttrVisitor.html" target="_self">AttrVisitor</a></td><td class="desc">Visitor class to get the attributes of an AST/IR node. The content is going to be called for each field </td></tr>
+<tr id="row_1_25_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1BaseAttrsNode.html" target="_self">BaseAttrsNode</a></td><td class="desc">Base class of all attribute class </td></tr>
+<tr id="row_1_26_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1BaseExpr.html" target="_self">BaseExpr</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1BaseExprNode.html" title="Base type of all the expressions. ">BaseExprNode</a> </td></tr>
+<tr id="row_1_27_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1BaseExprNode.html" target="_self">BaseExprNode</a></td><td class="desc">Base type of all the expressions </td></tr>
+<tr id="row_1_28_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1BaseFunc.html" target="_self">BaseFunc</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1BaseFuncNode.html" title="Base node of all functions. ">BaseFuncNode</a> </td></tr>
+<tr id="row_1_29_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1BaseFuncNode.html" target="_self">BaseFuncNode</a></td><td class="desc">Base node of all functions </td></tr>
+<tr id="row_1_30_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1BaseTensorType.html" target="_self">BaseTensorType</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1BaseTensorTypeNode.html" title="Base of all Tensor types This container can hold TensorType or GenericTensorType. ...">BaseTensorTypeNode</a> </td></tr>
+<tr id="row_1_31_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1BaseTensorTypeNode.html" target="_self">BaseTensorTypeNode</a></td><td class="desc">Base of all Tensor types. This container can hold <a class="el" href="classtvm_1_1TensorType.html" title="Managed reference to TensorTypeNode. ">TensorType</a> or GenericTensorType </td></tr>
+<tr id="row_1_32_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1BaseValueEqual.html" target="_self">BaseValueEqual</a></td><td class="desc">Equality definition of base value class </td></tr>
+<tr id="row_1_33_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1BaseValueHash.html" target="_self">BaseValueHash</a></td><td class="desc">Hash definition of base value classes </td></tr>
+<tr id="row_1_34_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1Bool.html" target="_self">Bool</a></td><td class="desc">Boolean constant </td></tr>
+<tr id="row_1_35_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1CompilationConfig.html" target="_self">CompilationConfig</a></td><td class="desc">Managed reference class to <code><a class="el" href="classtvm_1_1CompilationConfig.html" title="Managed reference class to CompilationConfig. ">CompilationConfig</a></code> </td></tr>
+<tr id="row_1_36_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1CompilationConfigNode.html" target="_self">CompilationConfigNode</a></td><td class="desc">Gathers the <code>Targets</code> and distinguished <code>VirtualDevices</code> in canonical form needed to compile a Relay module for execution over possibly heterogeneous devices. Cen [...]
+<tr id="row_1_37_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1CompileError.html" target="_self">CompileError</a></td><td class="desc">Custom Error class to be thrown during compilation </td></tr>
+<tr id="row_1_38_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1ConstantInfo.html" target="_self">ConstantInfo</a></td><td class="desc"></td></tr>
+<tr id="row_1_39_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1ConstantInfoNode.html" target="_self">ConstantInfoNode</a></td><td class="desc"></td></tr>
+<tr id="row_1_40_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1ConstantMemoryPools.html" target="_self">ConstantMemoryPools</a></td><td class="desc"></td></tr>
+<tr id="row_1_41_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1ConstantMemoryPoolsNode.html" target="_self">ConstantMemoryPoolsNode</a></td><td class="desc"></td></tr>
+<tr id="row_1_42_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1ConstantPoolInfo.html" target="_self">ConstantPoolInfo</a></td><td class="desc"></td></tr>
+<tr id="row_1_43_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1ConstantPoolInfoNode.html" target="_self">ConstantPoolInfoNode</a></td><td class="desc"></td></tr>
+<tr id="row_1_44_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1Constructor.html" target="_self">Constructor</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1ConstructorNode.html" title="ADT constructor. Constructors compare by pointer equality. ">ConstructorNode</a> </td></tr>
+<tr id="row_1_45_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1ConstructorNode.html" target="_self">ConstructorNode</a></td><td class="desc">ADT constructor. Constructors compare by pointer equality </td></tr>
+<tr id="row_1_46_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1Diagnostic.html" target="_self">Diagnostic</a></td><td class="desc"></td></tr>
+<tr id="row_1_47_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1DiagnosticBuilder.html" target="_self">DiagnosticBuilder</a></td><td class="desc">A wrapper around std::stringstream to build a diagnostic </td></tr>
+<tr id="row_1_48_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1DiagnosticContext.html" target="_self">DiagnosticContext</a></td><td class="desc"></td></tr>
+<tr id="row_1_49_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1DiagnosticContextNode.html" target="_self">DiagnosticContextNode</a></td><td class="desc"></td></tr>
+<tr id="row_1_50_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1DiagnosticNode.html" target="_self">DiagnosticNode</a></td><td class="desc">A compiler diagnostic message </td></tr>
+<tr id="row_1_51_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1DiagnosticRenderer.html" target="_self">DiagnosticRenderer</a></td><td class="desc"></td></tr>
+<tr id="row_1_52_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1DiagnosticRendererNode.html" target="_self">DiagnosticRendererNode</a></td><td class="desc">Display diagnostics in a given display format </td></tr>
+<tr id="row_1_53_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1DictAttrs.html" target="_self">DictAttrs</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1DictAttrsNode.html" title="Specialized attribute type that is backed by a map. The DictAttrsNode implements the Attrs behavior...">DictAttrsNode</a> </td></tr>
+<tr id="row_1_54_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1DictAttrsNode.html" target="_self">DictAttrsNode</a></td><td class="desc">Specialized attribute type that is backed by a map. The <a class="el" href="classtvm_1_1DictAttrsNode.html" title="Specialized attribute type that is backed by a map. The DictAttrsNode implements the [...]
+<tr id="row_1_55_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1EnvFunc.html" target="_self">EnvFunc</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1EnvFuncNode.html" title="A serializable function backed by TVM's global environment. ">EnvFuncNode</a> </td></tr>
+<tr id="row_1_56_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1EnvFuncNode.html" target="_self">EnvFuncNode</a></td><td class="desc">A serializable function backed by TVM's global environment </td></tr>
+<tr id="row_1_57_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1ErrorBuilder.html" target="_self">ErrorBuilder</a></td><td class="desc">A wrapper around std::stringstream to build error </td></tr>
+<tr id="row_1_58_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1ErrorReporter.html" target="_self">ErrorReporter</a></td><td class="desc">An abstraction around how errors are stored and reported. Designed to be opaque to users, so we can support a robust and simpler error reporting mode, as well as a more complex mode </td></tr>
+<tr id="row_1_59_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1FloatImm.html" target="_self">FloatImm</a></td><td class="desc">Managed reference class to <a class="el" href="classtvm_1_1FloatImmNode.html" title="Constant floating point literals in the program. ">FloatImmNode</a> </td></tr>
+<tr id="row_1_60_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1FloatImmNode.html" target="_self">FloatImmNode</a></td><td class="desc">Constant floating point literals in the program </td></tr>
+<tr id="row_1_61_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1FuncType.html" target="_self">FuncType</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1FuncTypeNode.html" title="Function type. ">FuncTypeNode</a> </td></tr>
+<tr id="row_1_62_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1FuncTypeNode.html" target="_self">FuncTypeNode</a></td><td class="desc">Function type </td></tr>
+<tr id="row_1_63_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1GenericFunc.html" target="_self">GenericFunc</a></td><td class="desc">Generic function that can be specialized on a per-target basis </td></tr>
+<tr id="row_1_64_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1GenericFuncNode.html" target="_self">GenericFuncNode</a></td><td class="desc">Represents a generic function that can be specialized on a per-target basis </td></tr>
+<tr id="row_1_65_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1GlobalTypeVar.html" target="_self">GlobalTypeVar</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1GlobalTypeVarNode.html" title="A global type variable that is used for defining new types or type aliases. ">GlobalTypeVarNode</a> </td></tr>
+<tr id="row_1_66_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1GlobalTypeVarNode.html" target="_self">GlobalTypeVarNode</a></td><td class="desc">A global type variable that is used for defining new types or type aliases </td></tr>
+<tr id="row_1_67_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1GlobalVar.html" target="_self">GlobalVar</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1GlobalVarNode.html" title="Global variable that lives in the top-level module. ">GlobalVarNode</a> </td></tr>
+<tr id="row_1_68_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1GlobalVarNode.html" target="_self">GlobalVarNode</a></td><td class="desc">Global variable that lives in the top-level module </td></tr>
+<tr id="row_1_69_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1IncompleteType.html" target="_self">IncompleteType</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1IncompleteTypeNode.html" title="Intermediate values that is used to indicate incomplete type during type inference. ">IncompleteTypeNode</a> </td></tr>
+<tr id="row_1_70_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1IncompleteTypeNode.html" target="_self">IncompleteTypeNode</a></td><td class="desc">Intermediate value that is used to indicate an incomplete type during type inference </td></tr>
+<tr id="row_1_71_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1Integer.html" target="_self">Integer</a></td><td class="desc">Container of constant int that adds more constructors </td></tr>
+<tr id="row_1_72_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1IntImm.html" target="_self">IntImm</a></td><td class="desc">Managed reference class to <a class="el" href="classtvm_1_1IntImmNode.html" title="Constant integer literals in the program. ">IntImmNode</a> </td></tr>
+<tr id="row_1_73_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1IntImmNode.html" target="_self">IntImmNode</a></td><td class="desc">Constant integer literals in the program </td></tr>
+<tr id="row_1_74_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1IRModule.html" target="_self">IRModule</a></td><td class="desc">Managed reference class to <a class="el" href="classtvm_1_1IRModuleNode.html" title="IRModule that holds functions and type definitions. ">IRModuleNode</a> </td></tr>
+<tr id="row_1_75_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1IRModuleNode.html" target="_self">IRModuleNode</a></td><td class="desc"><a class="el" href="classtvm_1_1IRModule.html" title="Managed reference class to IRModuleNode. ">IRModule</a> that holds functions and type definitions </td></tr>
+<tr id="row_1_76_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1MemoryInfo.html" target="_self">MemoryInfo</a></td><td class="desc">Defines memory info </td></tr>
+<tr id="row_1_77_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1MemoryInfoNode.html" target="_self">MemoryInfoNode</a></td><td class="desc">Memory information of special memory region. Use <a class="el" href="classtvm_1_1MemoryInfo.html" title="Defines memory info. ">MemoryInfo</a> as its container type </td></tr>
+<tr id="row_1_78_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1NDArrayContainerTrait.html" target="_self">NDArrayContainerTrait</a></td><td class="desc"></td></tr>
+<tr id="row_1_79_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1NodeFunctor.html" target="_self">NodeFunctor</a></td><td class="desc">A dynamically dispatched functor on the type of the first argument </td></tr>
+<tr id="row_1_80_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1NodeFunctor_3_01R_07const_01ObjectRef_01_6n_00_01Args_8_8_8_08_4.html" target="_self">NodeFunctor&lt; R(const ObjectRef &amp;n, Args...)&gt;</a></td><td class="desc"></td></tr>
+<tr id="row_1_81_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1Op.html" target="_self">Op</a></td><td class="desc">Managed reference class to <a class="el" href="classtvm_1_1OpNode.html" title="Primitive Op(builtin intrinsics) ">OpNode</a> </td></tr>
+<tr id="row_1_82_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1OpAttrMap.html" target="_self">OpAttrMap</a></td><td class="desc">Map&lt;Op,ValueType&gt; used to store meta-information about <a class="el" href="classtvm_1_1Op.html" title="Managed reference class to OpNode. ">Op</a> </td></tr>
+<tr id="row_1_83_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1OpNode.html" target="_self">OpNode</a></td><td class="desc">Primitive Op(builtin intrinsics) </td></tr>
+<tr id="row_1_84_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1OpRegEntry.html" target="_self">OpRegEntry</a></td><td class="desc">Helper structure to register operators </td></tr>
+<tr id="row_1_85_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1PointerType.html" target="_self">PointerType</a></td><td class="desc"></td></tr>
+<tr id="row_1_86_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1PointerTypeNode.html" target="_self">PointerTypeNode</a></td><td class="desc">Low-level raw pointer type </td></tr>
+<tr id="row_1_87_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1PoolInfo.html" target="_self">PoolInfo</a></td><td class="desc">Base class for <a class="el" href="classtvm_1_1WorkspacePoolInfo.html">WorkspacePoolInfo</a> and <a class="el" href="classtvm_1_1ConstantPoolInfo.html">ConstantPoolInfo</a> </td></tr>
+<tr id="row_1_88_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1PoolInfoNode.html" target="_self">PoolInfoNode</a></td><td class="desc">Describes a pool of memory accessible by one or more targets </td></tr>
+<tr id="row_1_89_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1PoolInfoProperties.html" target="_self">PoolInfoProperties</a></td><td class="desc"></td></tr>
+<tr id="row_1_90_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1PoolInfoPropertiesNode.html" target="_self">PoolInfoPropertiesNode</a></td><td class="desc">Describes a pool of memory properties </td></tr>
+<tr id="row_1_91_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1PrimExpr.html" target="_self">PrimExpr</a></td><td class="desc">Reference to <a class="el" href="classtvm_1_1PrimExprNode.html" title="Base node of all primitive expressions. ">PrimExprNode</a> </td></tr>
+<tr id="row_1_92_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1PrimExprNode.html" target="_self">PrimExprNode</a></td><td class="desc">Base node of all primitive expressions </td></tr>
+<tr id="row_1_93_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1PrimType.html" target="_self">PrimType</a></td><td class="desc"></td></tr>
+<tr id="row_1_94_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1PrimTypeNode.html" target="_self">PrimTypeNode</a></td><td class="desc">Primitive data types used in the low-level IR </td></tr>
+<tr id="row_1_95_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1Range.html" target="_self">Range</a></td><td class="desc"><a class="el" href="classtvm_1_1Range.html" title="Range container. ">Range</a> container </td></tr>
+<tr id="row_1_96_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1RangeNode.html" target="_self">RangeNode</a></td><td class="desc"><a class="el" href="classtvm_1_1Range.html" title="Range container. ">Range</a> over one dimension </td></tr>
+<tr id="row_1_97_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span id="arr_1_97_" class="arrow" onclick="toggleFolder('1_97_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1ReflectionVTable.html" target="_self">ReflectionVTable</a></td><td class="desc">Virtual function table to support IR/AST node reflection </td></tr>
+<tr id="row_1_97_0_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1ReflectionVTable_1_1Registry.html" target="_self">Registry</a></td><td class="desc"><a class="el" href="classtvm_1_1ReflectionVTable_1_1Registry.html" title="Registry of a reflection table. ">Registry</a> of a reflection table </td></tr>
+<tr id="row_1_98_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1RelayExpr.html" target="_self">RelayExpr</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1RelayExprNode.html" title="Base node of all non-primitive expressions. ">RelayExprNode</a> </td></tr>
+<tr id="row_1_99_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1RelayExprNode.html" target="_self">RelayExprNode</a></td><td class="desc">Base node of all non-primitive expressions </td></tr>
+<tr id="row_1_100_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1RelayRefType.html" target="_self">RelayRefType</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1RelayRefTypeNode.html" title="Reference Type High-level Relay IR. ">RelayRefTypeNode</a> </td></tr>
+<tr id="row_1_101_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1RelayRefTypeNode.html" target="_self">RelayRefTypeNode</a></td><td class="desc">Reference <a class="el" href="classtvm_1_1Type.html" title="Managed reference to TypeNode. ">Type</a> High-level Relay IR </td></tr>
+<tr id="row_1_102_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1ReprPrinter.html" target="_self">ReprPrinter</a></td><td class="desc">A printer class to print the AST/IR nodes </td></tr>
+<tr id="row_1_103_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span id="arr_1_103_" class="arrow" onclick="toggleFolder('1_103_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1SEqualReducer.html" target="_self">SEqualReducer</a></td><td class="desc">A Reducer class to reduce the structural equality result of two objects </td></tr>
+<tr id="row_1_103_0_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1SEqualReducer_1_1Handler.html" target="_self">Handler</a></td><td class="desc">Internal handler that defines custom behaviors </td></tr>
+<tr id="row_1_104_" class="even" style="display:none;"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span id="arr_1_104_" class="arrow" onclick="toggleFolder('1_104_')">►</span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1SHashReducer.html" target="_self">SHashReducer</a></td><td class="desc">A Reducer class to reduce the structural hash value </td></tr>
+<tr id="row_1_104_0_" class="even" style="display:none;"><td class="entry"><span style="width:48px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1SHashReducer_1_1Handler.html" target="_self">Handler</a></td><td class="desc">Internal handler that defines custom behaviors </td></tr>
+<tr id="row_1_105_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1SourceName.html" target="_self">SourceName</a></td><td class="desc">The source name of a file span </td></tr>
+<tr id="row_1_106_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1SourceNameNode.html" target="_self">SourceNameNode</a></td><td class="desc">The name of a source fragment </td></tr>
+<tr id="row_1_107_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1Span.html" target="_self">Span</a></td><td class="desc"></td></tr>
+<tr id="row_1_108_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1SpanNode.html" target="_self">SpanNode</a></td><td class="desc">Stores locations in frontend source that generated a node </td></tr>
+<tr id="row_1_109_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1StructuralEqual.html" target="_self">StructuralEqual</a></td><td class="desc">Content-aware structural equality comparator for objects </td></tr>
+<tr id="row_1_110_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1StructuralHash.html" target="_self">StructuralHash</a></td><td class="desc">Content-aware structural hasing </td></tr>
+<tr id="row_1_111_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1Target.html" target="_self">Target</a></td><td class="desc">Managed reference class to <a class="el" href="classtvm_1_1TargetNode.html" title="Compilation target. ">TargetNode</a> </td></tr>
+<tr id="row_1_112_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TargetKind.html" target="_self">TargetKind</a></td><td class="desc">Managed reference class to <a class="el" href="classtvm_1_1TargetKindNode.html" title="Target kind, specifies the kind of the target. ">TargetKindNode</a> </td></tr>
+<tr id="row_1_113_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TargetKindAttrMap.html" target="_self">TargetKindAttrMap</a></td><td class="desc">Map<TargetKind, ValueType> used to store meta-information about <a class="el" href="classtvm_1_1TargetKind.html" title="Managed reference class to TargetKindNode. ">TargetKind</a> </td></tr>
+<tr id="row_1_114_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TargetKindNode.html" target="_self">TargetKindNode</a></td><td class="desc"><a class="el" href="classtvm_1_1Target.html" title="Managed reference class to TargetNode. ">Target</a> kind, specifies the kind of the target </td></tr>
+<tr id="row_1_115_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TargetKindRegEntry.html" target="_self">TargetKindRegEntry</a></td><td class="desc">Helper structure to register <a class="el" href="classtvm_1_1TargetKind.html" title="Managed reference class to TargetKindNode. ">TargetKind</a> </td></tr>
+<tr id="row_1_116_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TargetNode.html" target="_self">TargetNode</a></td><td class="desc">Compilation target </td></tr>
+<tr id="row_1_117_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TargetTag.html" target="_self">TargetTag</a></td><td class="desc">Managed reference class to <a class="el" href="classtvm_1_1TargetTagNode.html" title="A target tag. ">TargetTagNode</a> </td></tr>
+<tr id="row_1_118_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TargetTagNode.html" target="_self">TargetTagNode</a></td><td class="desc">A target tag </td></tr>
+<tr id="row_1_119_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TargetTagRegEntry.html" target="_self">TargetTagRegEntry</a></td><td class="desc"></td></tr>
+<tr id="row_1_120_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TensorAffineType.html" target="_self">TensorAffineType</a></td><td class="desc">Managed reference to AffineTypes </td></tr>
+<tr id="row_1_121_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TensorAffineTypeNode.html" target="_self">TensorAffineTypeNode</a></td><td class="desc"><a class="el" href="classtvm_1_1TensorAffineType.html" title="Managed reference to AffineTypes. ">TensorAffineType</a> representation </td></tr>
+<tr id="row_1_122_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TensorType.html" target="_self">TensorType</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1TensorTypeNode.html" title="This is the most commonly used type in relay. TensorType have a fixed dimension, data type...">TensorTypeNode</a> </td></tr>
+<tr id="row_1_123_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TensorTypeNode.html" target="_self">TensorTypeNode</a></td><td class="desc">This is the most commonly used type in relay. <a class="el" href="classtvm_1_1TensorType.html" title="Managed reference to TensorTypeNode. ">TensorType</a> have a fixed dimension, data type </td></tr>
+<tr id="row_1_124_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TupleAffineType.html" target="_self">TupleAffineType</a></td><td class="desc">Managed reference to TupleAffineTypes </td></tr>
+<tr id="row_1_125_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TupleAffineTypeNode.html" target="_self">TupleAffineTypeNode</a></td><td class="desc"><a class="el" href="classtvm_1_1TupleAffineType.html" title="Managed reference to TupleAffineTypes. ">TupleAffineType</a> representation </td></tr>
+<tr id="row_1_126_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TupleType.html" target="_self">TupleType</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1TupleTypeNode.html" title="The type of tuple values. ">TupleTypeNode</a> </td></tr>
+<tr id="row_1_127_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TupleTypeNode.html" target="_self">TupleTypeNode</a></td><td class="desc">The type of tuple values </td></tr>
+<tr id="row_1_128_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1Type.html" target="_self">Type</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1TypeNode.html" title="Type is the base type of all types. ">TypeNode</a> </td></tr>
+<tr id="row_1_129_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeCall.html" target="_self">TypeCall</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1TypeCallNode.html" title="Type function application. ">TypeCallNode</a> </td></tr>
+<tr id="row_1_130_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeCallNode.html" target="_self">TypeCallNode</a></td><td class="desc"><a class="el" href="classtvm_1_1Type.html" title="Managed reference to TypeNode. ">Type</a> function application </td></tr>
+<tr id="row_1_131_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeConstraint.html" target="_self">TypeConstraint</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1TypeConstraintNode.html" title="Potential Constraints in a function. ">TypeConstraintNode</a> </td></tr>
+<tr id="row_1_132_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeConstraintNode.html" target="_self">TypeConstraintNode</a></td><td class="desc">Potential Constraints in a function </td></tr>
+<tr id="row_1_133_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeData.html" target="_self">TypeData</a></td><td class="desc">Stores all data for an Algebraic Data <a class="el" href="classtvm_1_1Type.html" title="Managed reference to TypeNode. ">Type</a> (ADT) </td></tr>
+<tr id="row_1_134_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeDataNode.html" target="_self">TypeDataNode</a></td><td class="desc"><a class="el" href="classtvm_1_1TypeData.html" title="Stores all data for an Algebraic Data Type (ADT). ">TypeData</a> container node </td></tr>
+<tr id="row_1_135_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypedEnvFunc.html" target="_self">TypedEnvFunc</a></td><td class="desc">Please refer to <a class="el" href="classtvm_1_1TypedEnvFunc_3_01R_07Args_8_8_8_08_4.html#TypedEnvFuncAnchor">TypedEnvFunc<R(Args..)></a> </td></tr>
+<tr id="row_1_136_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypedEnvFunc_3_01R_07Args_8_8_8_08_4.html" target="_self">TypedEnvFunc< R(Args...)></a></td><td class="desc">A typed version of <a class="el" href="classtvm_1_1EnvFunc.html" title="Managed reference to EnvFuncNode. ">EnvFunc</a>. It is backed by a GlobalFuncNode inte [...]
+<tr id="row_1_137_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeFunctor.html" target="_self">TypeFunctor</a></td><td class="desc"></td></tr>
+<tr id="row_1_138_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeFunctor_3_01R_07const_01Type_01_6n_00_01Args_8_8_8_08_4.html" target="_self">TypeFunctor< R(const Type &n, Args...)></a></td><td class="desc"></td></tr>
+<tr id="row_1_139_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeMutator.html" target="_self">TypeMutator</a></td><td class="desc"><a class="el" href="classtvm_1_1TypeMutator.html" title="TypeMutator that mutates expressions. ">TypeMutator</a> that mutates expressions </td></tr>
+<tr id="row_1_140_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeNode.html" target="_self">TypeNode</a></td><td class="desc"><a class="el" href="classtvm_1_1Type.html" title="Managed reference to TypeNode. ">Type</a> is the base type of all types </td></tr>
+<tr id="row_1_141_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeRelation.html" target="_self">TypeRelation</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1TypeRelationNode.html" title="User defined type relation, it is an input-output relation on types. ">TypeRelationNode</a> </td></tr>
+<tr id="row_1_142_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeRelationNode.html" target="_self">TypeRelationNode</a></td><td class="desc">User defined type relation, it is an input-output relation on types </td></tr>
+<tr id="row_1_143_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeReporter.html" target="_self">TypeReporter</a></td><td class="desc">Container class of <a class="el" href="classtvm_1_1TypeReporter.html" title="Container class of TypeReporter. ">TypeReporter</a> </td></tr>
+<tr id="row_1_144_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeReporterNode.html" target="_self">TypeReporterNode</a></td><td class="desc">Reporter that reports back to the type resolution information </td></tr>
+<tr id="row_1_145_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeVar.html" target="_self">TypeVar</a></td><td class="desc">Managed reference to <a class="el" href="classtvm_1_1TypeVarNode.html" title="Type parameter in functions. ">TypeVarNode</a> </td></tr>
+<tr id="row_1_146_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeVarNode.html" target="_self">TypeVarNode</a></td><td class="desc"><a class="el" href="classtvm_1_1Type.html" title="Managed reference to TypeNode. ">Type</a> parameter in functions </td></tr>
+<tr id="row_1_147_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1TypeVisitor.html" target="_self">TypeVisitor</a></td><td class="desc">A type visitor that recursively visit types </td></tr>
+<tr id="row_1_148_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1VirtualDevice.html" target="_self">VirtualDevice</a></td><td class="desc">Managed reference class to <code><a class="el" href="classtvm_1_1VirtualDeviceNode.html" title="Describes at compile time the constraints on where data is to be stored at runtime down to the (virtu.. [...]
+<tr id="row_1_149_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1VirtualDeviceCache.html" target="_self">VirtualDeviceCache</a></td><td class="desc">A cache of <code>VirtualDevices</code>. This can be used: </td></tr>
+<tr id="row_1_150_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1VirtualDeviceNode.html" target="_self">VirtualDeviceNode</a></td><td class="desc">Describes at compile time the constraints on where data is to be stored at runtime down to the (virtual) device and memory scope level, and how to compile code to compute that data. Used by t [...]
+<tr id="row_1_151_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1With.html" target="_self">With</a></td><td class="desc">RAII wrapper function to enter and exit a context object similar to python's with syntax </td></tr>
+<tr id="row_1_152_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1WorkspaceMemoryPools.html" target="_self">WorkspaceMemoryPools</a></td><td class="desc"></td></tr>
+<tr id="row_1_153_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1WorkspaceMemoryPoolsNode.html" target="_self">WorkspaceMemoryPoolsNode</a></td><td class="desc"></td></tr>
+<tr id="row_1_154_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="classtvm_1_1WorkspacePoolInfo.html" target="_self">WorkspacePoolInfo</a></td><td class="desc"></td></tr>
+<tr id="row_1_155_" class="even" style="display:none;"><td class="entry"><span style="width:32px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm_1_1WorkspacePoolInfoNode.html" target="_self">WorkspacePoolInfoNode</a></td><td class="desc"></td></tr>
<tr id="row_2_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structMemoryManagerInterface.html" target="_self">MemoryManagerInterface</a></td><td class="desc"></td></tr>
<tr id="row_3_"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structtvm__workspace__t.html" target="_self">tvm_workspace_t</a></td><td class="desc"></td></tr>
<tr id="row_4_" class="even"><td class="entry"><span style="width:16px;display:inline-block;"> </span><span class="icona"><span class="icon">C</span></span><a class="el" href="structTVMAotExecutor.html" target="_self">TVMAotExecutor</a></td><td class="desc"></td></tr>
diff --git a/docs/reference/api/doxygen/array_8h__dep__incl.svg b/docs/reference/api/doxygen/array_8h__dep__incl.svg
index 5a2b86bc1..8a0ba0eae 100644
--- a/docs/reference/api/doxygen/array_8h__dep__incl.svg
+++ b/docs/reference/api/doxygen/array_8h__dep__incl.svg
@@ -331,39 +331,39 @@
<path fill="none" stroke="#191970" d="M2102.2873,-828.1246C2107.8551,-802.3215 2115.2886,-761.1107 2115.2886,-725 2115.2886,-725 2115.2886,-725 2115.2886,-602 2115.2886,-497.3462 2014.9229,-401.6258 1972.1891,-365.613"/>
<polygon fill="#191970" stroke="#191970" points="2098.8055,-827.6575 2100.0424,-838.1799 2105.6373,-829.1828 2098.8055,-827.6575"/>
</g>
-<!-- Node179 -->
+<!-- Node181 -->
<g id="node45" class="node">
-<title>Node179</title>
+<title>Node181</title>
<g id="a_node45"><a xlink:href="buffer_8h.html" target="_top" xlink:title="Symbolic n-dimensional array, to represent a memory buffer. ">
<polygon fill="#ffffff" stroke="#ff0000" points="1415.2886,-592.5 1415.2886,-611.5 1543.2886,-611.5 1543.2886,-592.5 1415.2886,-592.5"/>
<text text-anchor="middle" x="1479.2886" y="-599.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/buffer.h</text>
</a>
</g>
</g>
-<!-- Node19->Node179 -->
+<!-- Node19->Node181 -->
<g id="edge159" class="edge">
-<title>Node19->Node179</title>
+<title>Node19->Node181</title>
<path fill="none" stroke="#191970" d="M2028.2659,-843.8C2000.3807,-836.5745 1969.8841,-823.9729 1949.2886,-802 1900.6912,-750.1525 1960.5066,-693.9378 1906.2886,-648 1879.1961,-625.0451 1655.9756,-610.7813 1543.5231,-604.9849"/>
<polygon fill="#191970" stroke="#191970" points="2027.4589,-847.2056 2037.9998,-846.138 2029.0938,-840.3992 2027.4589,-847.2056"/>
</g>
-<!-- Node180 -->
+<!-- Node182 -->
<g id="node46" class="node">
-<title>Node180</title>
+<title>Node182</title>
<g id="a_node46"><a xlink:href="tir_2expr_8h.html" target="_top" xlink:title="TIR expressions. ">
<polygon fill="#ffffff" stroke="#ff0000" points="1509.7886,-536.5 1509.7886,-555.5 1630.7886,-555.5 1630.7886,-536.5 1509.7886,-536.5"/>
<text text-anchor="middle" x="1570.2886" y="-543.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/expr.h</text>
</a>
</g>
</g>
-<!-- Node19->Node180 -->
+<!-- Node19->Node182 -->
<g id="edge160" class="edge">
-<title>Node19->Node180</title>
+<title>Node19->Node182</title>
<path fill="none" stroke="#191970" d="M2034.9259,-834.4251C2017.6997,-826.6345 2000.1761,-816.059 1987.2886,-802 1937.9724,-748.201 1983.8776,-697.5443 1930.2886,-648 1885.9289,-606.9884 1715.9879,-571.7267 1626.3681,-555.5093"/>
<polygon fill="#191970" stroke="#191970" points="2033.8622,-837.7761 2044.4342,-838.4712 2036.6031,-831.335 2033.8622,-837.7761"/>
</g>
-<!-- Node185 -->
+<!-- Node187 -->
<g id="node47" class="node">
-<title>Node185</title>
+<title>Node187</title>
<g id="a_node47"><a xlink:href="index__map_8h.html" target="_top" xlink:title="Defines a remapping of buffer indices. ">
<polygon fill="#ffffff" stroke="#ff0000" points="1779.2886,-648.5 1779.2886,-678.5 1897.2886,-678.5 1897.2886,-648.5 1779.2886,-648.5"/>
<text text-anchor="start" x="1787.2886" y="-666.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/tir/index</text>
@@ -371,15 +371,15 @@
</a>
</g>
</g>
-<!-- Node19->Node185 -->
+<!-- Node19->Node187 -->
<g id="edge161" class="edge">
-<title>Node19->Node185</title>
+<title>Node19->Node187</title>
<path fill="none" stroke="#191970" d="M2027.8697,-840.3669C1983.0179,-830.6617 1929.4449,-816.6407 1911.2886,-802 1870.4525,-769.0709 1849.6982,-707.0728 1841.9564,-678.5524"/>
<polygon fill="#191970" stroke="#191970" points="2027.3683,-843.8384 2037.8774,-842.4933 2028.8231,-836.9913 2027.3683,-843.8384"/>
</g>
-<!-- Node189 -->
+<!-- Node191 -->
<g id="node48" class="node">
-<title>Node189</title>
+<title>Node191</title>
<g id="a_node48"><a xlink:href="structural__equal_8h.html" target="_top" xlink:title="Structural equality comparison. ">
<polygon fill="#ffffff" stroke="#ff0000" points="3279.7886,-771.5 3279.7886,-801.5 3430.7886,-801.5 3430.7886,-771.5 3279.7886,-771.5"/>
<text text-anchor="start" x="3287.7886" y="-789.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/node/structural</text>
@@ -387,15 +387,15 @@
</a>
</g>
</g>
-<!-- Node19->Node189 -->
+<!-- Node19->Node191 -->
<g id="edge140" class="edge">
-<title>Node19->Node189</title>
+<title>Node19->Node191</title>
<path fill="none" stroke="#191970" d="M2164.7839,-851.6229C2381.3728,-845.5074 3049.7955,-825.2089 3265.2886,-802 3270.0178,-801.4907 3274.8745,-800.8878 3279.7639,-800.2204"/>
<polygon fill="#191970" stroke="#191970" points="2164.4077,-848.132 2154.5101,-851.9119 2164.6046,-855.1292 2164.4077,-848.132"/>
</g>
-<!-- Node198 -->
... 68864 lines suppressed ...