You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tvm.apache.org by tq...@apache.org on 2023/01/25 12:41:03 UTC
[tvm-site] branch asf-site updated: deploying docs (apache/tvm@56926009616e5f28bb42dfb9d136474e2bafde15)
This is an automated email from the ASF dual-hosted git repository.
tqchen pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/tvm-site.git
The following commit(s) were added to refs/heads/asf-site by this push:
new 3709ff9259 deploying docs (apache/tvm@56926009616e5f28bb42dfb9d136474e2bafde15)
3709ff9259 is described below
commit 3709ff925925f34512f7e96c023304ed05dc0417
Author: tvm-bot <95...@users.noreply.github.com>
AuthorDate: Wed Jan 25 12:40:52 2023 +0000
deploying docs (apache/tvm@56926009616e5f28bb42dfb9d136474e2bafde15)
---
docs/_images/sphx_glr_micro_train_001.png | Bin 302810 -> 306809 bytes
docs/_images/sphx_glr_micro_train_thumb.png | Bin 22617 -> 22544 bytes
.../how_to/compile_models/from_darknet.rst.txt | 2 +-
.../how_to/compile_models/from_keras.rst.txt | 2 +-
.../how_to/compile_models/from_mxnet.rst.txt | 2 +-
.../how_to/compile_models/from_oneflow.rst.txt | 2 +-
.../how_to/compile_models/from_pytorch.rst.txt | 2 +-
.../how_to/compile_models/from_tensorflow.rst.txt | 2 +-
.../compile_models/sg_execution_times.rst.txt | 22 +-
.../deploy_models/deploy_model_on_adreno.rst.txt | 2 +-
.../deploy_models/deploy_model_on_android.rst.txt | 2 +-
.../deploy_object_detection_pytorch.rst.txt | 4 +-
.../deploy_models/deploy_prequantized.rst.txt | 6 +-
.../deploy_prequantized_tflite.rst.txt | 4 +-
.../how_to/deploy_models/deploy_quantized.rst.txt | 2 +-
.../deploy_models/deploy_ssd_gluoncv.rst.txt | 4 +-
.../deploy_models/sg_execution_times.rst.txt | 20 +-
.../extend_tvm/bring_your_own_datatypes.rst.txt | 2 +-
.../how_to/extend_tvm/sg_execution_times.rst.txt | 8 +-
.../how_to/extend_tvm/use_pass_instrument.rst.txt | 16 +-
.../optimize_operators/opt_conv_cuda.rst.txt | 2 +-
.../optimize_operators/opt_conv_tensorcore.rst.txt | 2 +-
.../how_to/optimize_operators/opt_gemm.rst.txt | 16 +-
.../optimize_operators/sg_execution_times.rst.txt | 8 +-
.../sg_execution_times.rst.txt | 14 +-
.../tune_conv2d_layer_cuda.rst.txt | 1583 +++--
.../tune_network_cuda.rst.txt | 4 +-
.../tune_network_x86.rst.txt | 4 +-
.../tune_sparse_x86.rst.txt | 90 +-
.../tune_with_autotvm/sg_execution_times.rst.txt | 6 +-
.../tune_with_autotvm/tune_conv2d_cuda.rst.txt | 357 +-
.../work_with_microtvm/micro_autotune.rst.txt | 16 +-
.../work_with_microtvm/micro_pytorch.rst.txt | 4 +-
.../how_to/work_with_microtvm/micro_train.rst.txt | 18 +-
.../work_with_microtvm/sg_execution_times.rst.txt | 12 +-
.../work_with_relay/sg_execution_times.rst.txt | 8 +-
.../how_to/work_with_schedules/intrin_math.rst.txt | 2 +-
.../work_with_schedules/sg_execution_times.rst.txt | 14 +-
.../how_to/work_with_schedules/tensorize.rst.txt | 2 +-
.../tutorials/autotvm/sg_execution_times.rst.txt | 4 +-
.../frontend/deploy_classification.rst.txt | 2 +-
.../tutorials/frontend/deploy_detection.rst.txt | 2 +-
.../tutorials/frontend/sg_execution_times.rst.txt | 6 +-
.../tutorials/optimize/sg_execution_times.rst.txt | 6 +-
.../topic/vta/tutorials/sg_execution_times.rst.txt | 6 +-
.../tutorial/auto_scheduler_matmul_x86.rst.txt | 4 +-
docs/_sources/tutorial/autotvm_matmul_x86.rst.txt | 20 +-
docs/_sources/tutorial/autotvm_relay_x86.rst.txt | 57 +-
.../tutorial/cross_compilation_and_rpc.rst.txt | 2 +-
docs/_sources/tutorial/intro_topi.rst.txt | 2 +-
docs/_sources/tutorial/sg_execution_times.rst.txt | 18 +-
.../tutorial/tensor_expr_get_started.rst.txt | 45 +-
docs/commit_hash | 2 +-
docs/how_to/compile_models/from_darknet.html | 2 +-
docs/how_to/compile_models/from_keras.html | 2 +-
docs/how_to/compile_models/from_mxnet.html | 2 +-
docs/how_to/compile_models/from_oneflow.html | 13 +-
docs/how_to/compile_models/from_pytorch.html | 12 +-
docs/how_to/compile_models/from_tensorflow.html | 2 +-
docs/how_to/compile_models/sg_execution_times.html | 26 +-
.../deploy_models/deploy_model_on_adreno.html | 2 +-
.../deploy_models/deploy_model_on_android.html | 2 +-
.../deploy_object_detection_pytorch.html | 41 +-
docs/how_to/deploy_models/deploy_prequantized.html | 8 +-
.../deploy_models/deploy_prequantized_tflite.html | 4 +-
docs/how_to/deploy_models/deploy_quantized.html | 2 +-
docs/how_to/deploy_models/deploy_ssd_gluoncv.html | 36 +-
docs/how_to/deploy_models/sg_execution_times.html | 20 +-
.../extend_tvm/bring_your_own_datatypes.html | 2 +-
docs/how_to/extend_tvm/sg_execution_times.html | 8 +-
docs/how_to/extend_tvm/use_pass_instrument.html | 16 +-
docs/how_to/optimize_operators/opt_conv_cuda.html | 2 +-
.../optimize_operators/opt_conv_tensorcore.html | 2 +-
docs/how_to/optimize_operators/opt_gemm.html | 16 +-
.../optimize_operators/sg_execution_times.html | 8 +-
.../sg_execution_times.html | 14 +-
.../tune_conv2d_layer_cuda.html | 1583 +++--
.../tune_with_autoscheduler/tune_network_cuda.html | 4 +-
.../tune_with_autoscheduler/tune_network_x86.html | 4 +-
.../tune_with_autoscheduler/tune_sparse_x86.html | 90 +-
.../tune_with_autotvm/sg_execution_times.html | 6 +-
.../how_to/tune_with_autotvm/tune_conv2d_cuda.html | 357 +-
docs/how_to/work_with_microtvm/micro_autotune.html | 16 +-
docs/how_to/work_with_microtvm/micro_pytorch.html | 6 +-
docs/how_to/work_with_microtvm/micro_train.html | 16 +-
.../work_with_microtvm/sg_execution_times.html | 12 +-
.../how_to/work_with_relay/sg_execution_times.html | 8 +-
docs/how_to/work_with_schedules/intrin_math.html | 2 +-
.../work_with_schedules/sg_execution_times.html | 14 +-
docs/how_to/work_with_schedules/tensorize.html | 2 +-
docs/install/nnpack.html | 12 +-
docs/reference/api/doxygen/affine__type_8h.html | 2 +-
.../api/doxygen/affine__type_8h__incl.svg | 1288 ++--
.../api/doxygen/affine__type_8h_source.html | 2 +-
docs/reference/api/doxygen/algorithm_8h.html | 2 +-
docs/reference/api/doxygen/algorithm_8h__incl.svg | 1872 +++---
docs/reference/api/doxygen/algorithms_8h.html | 2 +-
docs/reference/api/doxygen/algorithms_8h__incl.svg | 1531 ++---
docs/reference/api/doxygen/analyzer_8h.html | 2 +-
.../api/doxygen/analyzer_8h__dep__incl.svg | 652 +-
docs/reference/api/doxygen/analyzer_8h__incl.svg | 1892 +++---
docs/reference/api/doxygen/annotated.html | 1789 +++---
docs/reference/api/doxygen/annotation_8h.html | 2 +-
docs/reference/api/doxygen/annotation_8h__incl.svg | 1172 ++--
docs/reference/api/doxygen/arg__info_8h.html | 2 +-
.../api/doxygen/arg__info_8h__dep__incl.svg | 128 +-
docs/reference/api/doxygen/arg__info_8h__incl.svg | 1726 +++---
docs/reference/api/doxygen/array_8h.html | 2 +-
docs/reference/api/doxygen/array_8h__dep__incl.svg | 1691 +++--
docs/reference/api/doxygen/array__utils_8h.html | 2 +-
.../api/doxygen/array__utils_8h__dep__incl.svg | 44 +-
.../api/doxygen/array__utils_8h__incl.svg | 1638 ++---
.../api/doxygen/auto__schedule_8h__incl.svg | 204 +-
.../auto__scheduler_2cost__model_8h__incl.svg | 176 +-
.../doxygen/auto__scheduler_2feature_8h__incl.svg | 416 +-
docs/reference/api/doxygen/autodiff_8h.html | 2 +-
docs/reference/api/doxygen/autodiff_8h__incl.svg | 1851 +++---
docs/reference/api/doxygen/bias__add_8h__incl.svg | 672 +-
docs/reference/api/doxygen/bitserial_8h.html | 2 +-
docs/reference/api/doxygen/bitserial_8h__incl.svg | 1738 +++---
docs/reference/api/doxygen/block__scope_8h.html | 2 +-
.../api/doxygen/block__scope_8h__dep__incl.svg | 180 +-
.../api/doxygen/block__scope_8h__incl.svg | 1378 +++--
docs/reference/api/doxygen/bound_8h.html | 2 +-
docs/reference/api/doxygen/bound_8h__dep__incl.svg | 628 +-
docs/reference/api/doxygen/bound_8h__incl.svg | 1922 +++---
docs/reference/api/doxygen/broadcast_8h.html | 2 +-
.../api/doxygen/broadcast_8h__dep__incl.svg | 92 +-
docs/reference/api/doxygen/broadcast_8h__incl.svg | 2095 ++++---
docs/reference/api/doxygen/buffer_8h.html | 2 +-
.../reference/api/doxygen/buffer_8h__dep__incl.svg | 640 +-
docs/reference/api/doxygen/buffer_8h__incl.svg | 1328 ++--
docs/reference/api/doxygen/buffer_8h_source.html | 2 +-
docs/reference/api/doxygen/builder_8h.html | 2 +-
.../api/doxygen/builder_8h__dep__incl.svg | 40 +-
docs/reference/api/doxygen/builder_8h__incl.svg | 1553 +++--
docs/reference/api/doxygen/builtin_8h.html | 2 +-
.../api/doxygen/builtin_8h__dep__incl.svg | 108 +-
docs/reference/api/doxygen/builtin_8h__incl.svg | 1546 ++---
docs/reference/api/doxygen/c__runtime__api_8h.html | 2 +-
.../api/doxygen/c__runtime__api_8h__dep__incl.svg | 1339 ++--
docs/reference/api/doxygen/call_8h.html | 2 +-
docs/reference/api/doxygen/call_8h__incl.svg | 1172 ++--
docs/reference/api/doxygen/classes.html | 40 +-
.../api/doxygen/classtvm_1_1IRModule-members.html | 2 +-
.../api/doxygen/classtvm_1_1IRModule.html | 12 +-
.../doxygen/classtvm_1_1IRModuleNode-members.html | 2 +-
.../api/doxygen/classtvm_1_1IRModuleNode.html | 14 +-
.../classtvm_1_1IRModuleNode__coll__graph.svg | 402 +-
...embers.html => classtvm_1_1Source-members.html} | 12 +-
...rser_1_1Source.html => classtvm_1_1Source.html} | 52 +-
...ers.html => classtvm_1_1SourceMap-members.html} | 18 +-
..._1SourceMap.html => classtvm_1_1SourceMap.html} | 98 +-
...html => classtvm_1_1SourceMapNode-members.html} | 16 +-
...MapNode.html => classtvm_1_1SourceMapNode.html} | 76 +-
... => classtvm_1_1SourceMapNode__coll__graph.svg} | 10 +-
... classtvm_1_1SourceMapNode__inherit__graph.svg} | 6 +-
....svg => classtvm_1_1SourceMap__coll__graph.svg} | 6 +-
...g => classtvm_1_1SourceMap__inherit__graph.svg} | 6 +-
.../api/doxygen/classtvm_1_1SourceName.html | 4 +-
.../api/doxygen/classtvm_1_1SourceNameNode.html | 4 +-
...rs.html => classtvm_1_1SourceNode-members.html} | 18 +-
...SourceNode.html => classtvm_1_1SourceNode.html} | 86 +-
...svg => classtvm_1_1SourceNode__coll__graph.svg} | 6 +-
... => classtvm_1_1SourceNode__inherit__graph.svg} | 6 +-
...aph.svg => classtvm_1_1Source__coll__graph.svg} | 6 +-
....svg => classtvm_1_1Source__inherit__graph.svg} | 6 +-
docs/reference/api/doxygen/classtvm_1_1Span.html | 4 +-
.../api/doxygen/classtvm_1_1SpanNode.html | 4 +-
.../doxygen/classtvm_1_1runtime_1_1ArrayNode.html | 4 +-
.../api/doxygen/classtvm_1_1runtime_1_1Object.html | 2 +-
.../doxygen/classtvm_1_1runtime_1_1ObjectRef.html | 2 +-
.../api/doxygen/classtvm_1_1support_1_1Span.html | 4 +-
...sstvm_1_1support_1_1Span_1_1iterator__base.html | 4 +-
docs/reference/api/doxygen/codegen_8h.html | 2 +-
docs/reference/api/doxygen/codegen_8h__incl.svg | 2267 ++++---
.../api/doxygen/compilation__config_8h.html | 2 +-
.../doxygen/compilation__config_8h__dep__incl.svg | 20 +-
.../api/doxygen/compilation__config_8h__incl.svg | 1814 +++---
docs/reference/api/doxygen/compute__dag_8h.html | 2 +-
.../api/doxygen/compute__dag_8h__dep__incl.svg | 76 +-
.../api/doxygen/compute__dag_8h__incl.svg | 1500 ++---
docs/reference/api/doxygen/constant__utils_8h.html | 2 +-
.../api/doxygen/constant__utils_8h__dep__incl.svg | 168 +-
.../api/doxygen/constant__utils_8h__incl.svg | 2467 ++++----
docs/reference/api/doxygen/cublas_8h.html | 2 +-
.../reference/api/doxygen/cublas_8h__dep__incl.svg | 20 +-
docs/reference/api/doxygen/cublas_8h__incl.svg | 1718 ++---
.../api/doxygen/cuda_2dense_8h__dep__incl.svg | 12 +-
.../reference/api/doxygen/cuda_2dense_8h__incl.svg | 560 +-
.../api/doxygen/cuda_2injective_8h__dep__incl.svg | 12 +-
.../api/doxygen/cuda_2injective_8h__incl.svg | 556 +-
.../api/doxygen/cuda_2pooling_8h__dep__incl.svg | 12 +-
.../api/doxygen/cuda_2pooling_8h__incl.svg | 528 +-
.../api/doxygen/cuda_2reduction_8h__dep__incl.svg | 12 +-
.../api/doxygen/cuda_2reduction_8h__incl.svg | 556 +-
.../api/doxygen/cuda_2softmax_8h__dep__incl.svg | 12 +-
.../api/doxygen/cuda_2softmax_8h__incl.svg | 556 +-
docs/reference/api/doxygen/data__layout_8h.html | 2 +-
.../api/doxygen/data__layout_8h__dep__incl.svg | 112 +-
.../api/doxygen/data__layout_8h__incl.svg | 1862 +++---
docs/reference/api/doxygen/data__type_8h.html | 2 +-
.../api/doxygen/data__type_8h__dep__incl.svg | 1392 +++--
.../api/doxygen/data__type__rewriter_8h.html | 2 +-
.../api/doxygen/data__type__rewriter_8h__incl.svg | 1360 ++--
docs/reference/api/doxygen/database_8h.html | 2 +-
.../api/doxygen/database_8h__dep__incl.svg | 44 +-
docs/reference/api/doxygen/database_8h__incl.svg | 1563 ++---
.../api/doxygen/dataflow__matcher_8h.html | 2 +-
.../api/doxygen/dataflow__matcher_8h__incl.svg | 2094 +++----
.../api/doxygen/dataflow__pattern_8h.html | 2 +-
.../doxygen/dataflow__pattern_8h__dep__incl.svg | 24 +-
.../api/doxygen/dataflow__pattern_8h__incl.svg | 2131 ++++---
.../api/doxygen/dataflow__pattern__functor_8h.html | 2 +-
.../dataflow__pattern__functor_8h__dep__incl.svg | 12 +-
.../dataflow__pattern__functor_8h__incl.svg | 2104 +++----
.../api/doxygen/detail_2broadcast_8h.html | 2 +-
.../doxygen/detail_2broadcast_8h__dep__incl.svg | 104 +-
.../api/doxygen/detail_2broadcast_8h__incl.svg | 1988 +++---
docs/reference/api/doxygen/detail_2extern_8h.html | 2 +-
.../api/doxygen/detail_2extern_8h__dep__incl.svg | 40 +-
.../api/doxygen/detail_2extern_8h__incl.svg | 2006 +++---
docs/reference/api/doxygen/device__copy_8h.html | 2 +-
.../api/doxygen/device__copy_8h__incl.svg | 1791 +++---
docs/reference/api/doxygen/diagnostic_8h.html | 3 +-
.../api/doxygen/diagnostic_8h__dep__incl.svg | 539 +-
docs/reference/api/doxygen/diagnostic_8h__incl.svg | 2044 +++---
.../api/doxygen/diagnostic_8h_source.html | 66 +-
docs/reference/api/doxygen/dilate_8h.html | 2 +-
docs/reference/api/doxygen/dilate_8h__incl.svg | 2071 ++++---
...r_000002_000013.html => dir_000002_000030.html} | 0
...r_000003_000020.html => dir_000003_000019.html} | 0
...r_000003_000032.html => dir_000003_000031.html} | 0
docs/reference/api/doxygen/dir_000003_000032.html | 4 +-
docs/reference/api/doxygen/dir_000003_000033.html | 73 -
...r_000004_000020.html => dir_000004_000019.html} | 0
...r_000004_000013.html => dir_000004_000030.html} | 0
...r_000004_000032.html => dir_000004_000031.html} | 0
docs/reference/api/doxygen/dir_000005_000007.html | 2 +-
docs/reference/api/doxygen/dir_000005_000008.html | 2 +-
...r_000005_000014.html => dir_000005_000013.html} | 0
...r_000005_000020.html => dir_000005_000019.html} | 0
...r_000005_000032.html => dir_000005_000031.html} | 0
docs/reference/api/doxygen/dir_000005_000032.html | 4 +-
docs/reference/api/doxygen/dir_000005_000033.html | 73 -
...r_000006_000013.html => dir_000006_000030.html} | 0
...r_000006_000032.html => dir_000006_000031.html} | 0
docs/reference/api/doxygen/dir_000006_000032.html | 4 +-
docs/reference/api/doxygen/dir_000006_000033.html | 73 -
docs/reference/api/doxygen/dir_000007_000008.html | 2 +-
docs/reference/api/doxygen/dir_000007_000009.html | 73 -
...r_000007_000020.html => dir_000007_000019.html} | 2 +-
docs/reference/api/doxygen/dir_000007_000021.html | 73 -
...r_000007_000013.html => dir_000007_000030.html} | 0
...r_000007_000032.html => dir_000007_000031.html} | 0
docs/reference/api/doxygen/dir_000008_000007.html | 73 -
...r_000008_000013.html => dir_000008_000030.html} | 2 +-
...r_000011_000020.html => dir_000011_000019.html} | 0
...r_000011_000013.html => dir_000011_000030.html} | 0
...r_000011_000032.html => dir_000011_000031.html} | 0
...r_000014_000007.html => dir_000013_000007.html} | 0
...r_000014_000011.html => dir_000013_000011.html} | 0
...r_000014_000032.html => dir_000013_000031.html} | 0
...r_000015_000002.html => dir_000014_000002.html} | 0
docs/reference/api/doxygen/dir_000014_000007.html | 6 +-
...r_000015_000008.html => dir_000014_000008.html} | 0
...r_000015_000032.html => dir_000014_000031.html} | 0
docs/reference/api/doxygen/dir_000015_000002.html | 6 +-
docs/reference/api/doxygen/dir_000015_000007.html | 6 +-
docs/reference/api/doxygen/dir_000015_000008.html | 6 +-
...r_000016_000011.html => dir_000015_000011.html} | 0
...r_000016_000032.html => dir_000015_000031.html} | 0
docs/reference/api/doxygen/dir_000015_000032.html | 6 +-
docs/reference/api/doxygen/dir_000016_000002.html | 73 -
docs/reference/api/doxygen/dir_000016_000007.html | 73 -
docs/reference/api/doxygen/dir_000016_000008.html | 73 -
docs/reference/api/doxygen/dir_000016_000011.html | 6 +-
docs/reference/api/doxygen/dir_000016_000033.html | 73 -
docs/reference/api/doxygen/dir_000017_000011.html | 6 +-
docs/reference/api/doxygen/dir_000018_000011.html | 6 +-
...r_000020_000008.html => dir_000019_000008.html} | 0
docs/reference/api/doxygen/dir_000019_000011.html | 73 -
...r_000022_000009.html => dir_000020_000009.html} | 0
docs/reference/api/doxygen/dir_000021_000007.html | 6 +-
docs/reference/api/doxygen/dir_000021_000008.html | 6 +-
...r_000023_000011.html => dir_000021_000011.html} | 0
...r_000023_000020.html => dir_000021_000019.html} | 0
...r_000024_000007.html => dir_000022_000007.html} | 0
...r_000024_000011.html => dir_000022_000011.html} | 0
...r_000024_000020.html => dir_000022_000019.html} | 0
docs/reference/api/doxygen/dir_000023_000007.html | 73 -
docs/reference/api/doxygen/dir_000023_000008.html | 73 -
...r_000025_000009.html => dir_000023_000009.html} | 0
...r_000029_000007.html => dir_000027_000007.html} | 0
...r_000029_000020.html => dir_000027_000019.html} | 0
...r_000030_000011.html => dir_000028_000011.html} | 0
...r_000030_000029.html => dir_000028_000027.html} | 0
docs/reference/api/doxygen/dir_000029_000007.html | 6 +-
...r_000031_000008.html => dir_000029_000008.html} | 0
...r_000031_000020.html => dir_000029_000019.html} | 0
...r_000013_000008.html => dir_000030_000008.html} | 0
docs/reference/api/doxygen/dir_000031_000007.html | 6 +-
docs/reference/api/doxygen/dir_000031_000008.html | 6 +-
...r_000032_000011.html => dir_000031_000011.html} | 0
...r_000032_000020.html => dir_000031_000019.html} | 0
...r_000032_000013.html => dir_000031_000030.html} | 0
...r_000033_000002.html => dir_000032_000002.html} | 0
docs/reference/api/doxygen/dir_000032_000007.html | 73 -
docs/reference/api/doxygen/dir_000032_000008.html | 6 +-
docs/reference/api/doxygen/dir_000032_000011.html | 6 +-
...r_000033_000013.html => dir_000032_000030.html} | 0
...r_000034_000007.html => dir_000033_000007.html} | 0
docs/reference/api/doxygen/dir_000033_000008.html | 73 -
docs/reference/api/doxygen/dir_000033_000011.html | 73 -
...r_000034_000020.html => dir_000033_000019.html} | 0
...r_000034_000013.html => dir_000033_000030.html} | 0
...r_000035_000002.html => dir_000034_000002.html} | 0
...r_000035_000008.html => dir_000034_000008.html} | 0
docs/reference/api/doxygen/dir_000035_000002.html | 6 +-
...r_000036_000011.html => dir_000035_000011.html} | 0
...r_000036_000033.html => dir_000035_000032.html} | 0
docs/reference/api/doxygen/dir_000036_000002.html | 73 -
...r_000037_000033.html => dir_000036_000032.html} | 0
...r_000037_000036.html => dir_000036_000035.html} | 0
...r_000038_000032.html => dir_000037_000031.html} | 0
...r_000038_000033.html => dir_000037_000032.html} | 0
...r_000038_000036.html => dir_000037_000035.html} | 0
docs/reference/api/doxygen/dir_000037_000036.html | 6 +-
...r_000038_000039.html => dir_000037_000038.html} | 0
...r_000038_000041.html => dir_000037_000040.html} | 0
...r_000039_000002.html => dir_000038_000002.html} | 0
docs/reference/api/doxygen/dir_000038_000032.html | 6 +-
...r_000039_000036.html => dir_000038_000035.html} | 0
docs/reference/api/doxygen/dir_000038_000037.html | 73 -
...r_000040_000032.html => dir_000039_000031.html} | 0
...r_000040_000033.html => dir_000039_000032.html} | 0
docs/reference/api/doxygen/dir_000039_000033.html | 73 -
...r_000040_000036.html => dir_000039_000035.html} | 0
docs/reference/api/doxygen/dir_000039_000036.html | 6 +-
...r_000040_000038.html => dir_000039_000037.html} | 0
...r_000040_000039.html => dir_000039_000038.html} | 0
...r_000040_000041.html => dir_000039_000040.html} | 0
...r_000041_000032.html => dir_000040_000031.html} | 0
docs/reference/api/doxygen/dir_000040_000032.html | 6 +-
...r_000041_000036.html => dir_000040_000035.html} | 0
docs/reference/api/doxygen/dir_000040_000037.html | 73 -
...r_000042_000032.html => dir_000041_000031.html} | 0
docs/reference/api/doxygen/dir_000041_000032.html | 6 +-
docs/reference/api/doxygen/dir_000041_000033.html | 73 -
...r_000042_000036.html => dir_000041_000035.html} | 0
docs/reference/api/doxygen/dir_000042_000032.html | 6 +-
docs/reference/api/doxygen/dir_000042_000033.html | 73 -
...r_000043_000036.html => dir_000042_000035.html} | 0
docs/reference/api/doxygen/dir_000043_000033.html | 73 -
.../dir_006b1f4ac353a18abb55f74cc4796db6_dep.svg | 10 +-
.../dir_02be2c9d68e402f80df60bd528724ee5.html | 2 +-
.../dir_02be2c9d68e402f80df60bd528724ee5_dep.svg | 236 +-
.../dir_05ffda4d144d7985f926507abde48dbb.html | 2 +-
.../dir_05ffda4d144d7985f926507abde48dbb_dep.svg | 126 +-
.../dir_194ecda214f05a38134392ac6a69b970_dep.svg | 6 +-
.../dir_1f1b12d204a071c9e67e47fcbb552b86_dep.svg | 10 +-
.../dir_2b0ef9f1c86b565a92e96353e1195b2c_dep.svg | 8 +-
.../dir_3a038e7bfa2370c6aee2a5aecd5d3ef1_dep.svg | 12 +-
.../dir_404558507ed35459f0d7a6d81d8c508d_dep.svg | 81 +-
.../dir_4378f18824ae7d4ad48f8d7785cd7ac8.html | 2 +-
.../dir_4378f18824ae7d4ad48f8d7785cd7ac8_dep.svg | 254 +-
.../dir_437a885699bf6787e92bcac6040bb86f_dep.svg | 2 +-
.../dir_519be2d4a83a987dbf989f1de527b870_dep.svg | 14 +-
.../dir_536029070df27a3ee03a4230630922c5_dep.svg | 2 +-
.../dir_54983dd6d74c59f67ee9e8e5a50aafc4_dep.svg | 42 +-
.../dir_5baffeed82c1190bfdf7a4f918ab5ac6_dep.svg | 2 +-
.../dir_5da96592f3a7c442b838b075c58254c2.html | 2 +-
.../dir_5da96592f3a7c442b838b075c58254c2_dep.svg | 202 +-
.../dir_63946bee875c6d52bce55e72a67a86ad.html | 4 +-
.../dir_63946bee875c6d52bce55e72a67a86ad_dep.svg | 250 +-
.../dir_67fdee7a5e0396034822418fa5baa4b4_dep.svg | 4 +-
.../dir_6cd4295f6ad5aa17e5b568d0e5b190e5_dep.svg | 2 +-
.../dir_72c2f11201cd7636dc7624de0754daa5.html | 2 +-
.../dir_72c2f11201cd7636dc7624de0754daa5_dep.svg | 258 +-
.../dir_8395ded0a3205c0748976a0d4487d38d_dep.svg | 8 +-
.../dir_84875704194fd544d29fe0c7fedd8939.html | 2 +-
.../dir_84875704194fd544d29fe0c7fedd8939_dep.svg | 160 +-
.../dir_8e4e25e66b8623d88c5b5dd2040bca97.html | 2 +-
.../dir_8e4e25e66b8623d88c5b5dd2040bca97_dep.svg | 488 +-
.../dir_9e615ec4a59e46584bcc4e2226e148a2_dep.svg | 12 +-
.../dir_a59a89c7dd2e4e6561fe59bf359ce2f3.html | 2 +-
.../dir_a59a89c7dd2e4e6561fe59bf359ce2f3_dep.svg | 88 +-
.../dir_a98464176f1216e334ac3bbacd433085_dep.svg | 22 +-
.../dir_ac57496531ccbad72f774fa62e6de987_dep.svg | 28 +-
.../dir_af4961563c20a83bf971a498792e6dee_dep.svg | 4 +-
.../dir_b4c7d8e826c599ba55146c099a14beb5.html | 4 +-
.../dir_b4c7d8e826c599ba55146c099a14beb5_dep.svg | 639 +-
.../dir_c20c9fad5dedbc870b2ada04754d1b9b_dep.svg | 2 +-
.../dir_d331277d4303e21ded95616eb56c1a9e_dep.svg | 6 +-
.../dir_d3953cf7eb33eca56fc6850c0e98447d_dep.svg | 6 +-
.../dir_d4a54fa981698f72ef4cd62f8b9e1a8f_dep.svg | 4 +-
.../dir_d523279167051dc3aad9a40981221f4d.html | 2 +-
.../dir_d523279167051dc3aad9a40981221f4d_dep.svg | 62 +-
.../dir_dc867ff9a37cad1764f1670dc7eba6c1.html | 6 +-
.../dir_dc867ff9a37cad1764f1670dc7eba6c1_dep.svg | 181 +-
.../dir_e4a1a856a30057b9b1543256279fc7a1_dep.svg | 6 +-
.../dir_f97d855a3173728370e632aa77170e34_dep.svg | 16 +-
.../dir_fafc18f54a755f417c55c769623cbfef.html | 93 -
.../dir_fafc18f54a755f417c55c769623cbfef_dep.svg | 98 -
.../dir_fb1b1bc11a768ab8cf63a96a73170118.html | 4 +-
.../dir_fb1b1bc11a768ab8cf63a96a73170118_dep.svg | 4 +-
docs/reference/api/doxygen/doc_8h.html | 2 +-
docs/reference/api/doxygen/doc_8h__dep__incl.svg | 12 +-
docs/reference/api/doxygen/doc_8h__incl.svg | 1251 ++--
docs/reference/api/doxygen/driver__api_8h.html | 2 +-
.../reference/api/doxygen/driver__api_8h__incl.svg | 2489 ++++----
docs/reference/api/doxygen/einsum_8h.html | 2 +-
docs/reference/api/doxygen/einsum_8h__incl.svg | 1990 +++---
docs/reference/api/doxygen/elemwise_8h.html | 2 +-
.../api/doxygen/elemwise_8h__dep__incl.svg | 48 +-
docs/reference/api/doxygen/elemwise_8h__incl.svg | 1460 ++---
docs/reference/api/doxygen/error_8h.html | 5 +-
docs/reference/api/doxygen/error_8h__dep__incl.svg | 20 +-
docs/reference/api/doxygen/error_8h__incl.svg | 2058 +++---
docs/reference/api/doxygen/error_8h_source.html | 31 +-
docs/reference/api/doxygen/executor_8h.html | 2 +-
docs/reference/api/doxygen/executor_8h__incl.svg | 1397 +++--
docs/reference/api/doxygen/extracted__task_8h.html | 2 +-
.../api/doxygen/extracted__task_8h__incl.svg | 1835 +++---
.../api/doxygen/feature__extractor_8h__incl.svg | 88 +-
docs/reference/api/doxygen/files.html | 442 +-
docs/reference/api/doxygen/flatten_8h.html | 2 +-
docs/reference/api/doxygen/flatten_8h__incl.svg | 1998 +++---
docs/reference/api/doxygen/functions__.html | 4 +-
docs/reference/api/doxygen/functions_a.html | 2 +-
docs/reference/api/doxygen/functions_func_a.html | 2 +-
docs/reference/api/doxygen/functions_func_g.html | 2 +-
docs/reference/api/doxygen/functions_func_i.html | 2 +-
docs/reference/api/doxygen/functions_func_o.html | 8 +-
docs/reference/api/doxygen/functions_func_s.html | 12 +-
docs/reference/api/doxygen/functions_func_t.html | 14 +-
docs/reference/api/doxygen/functions_func_v.html | 20 +-
docs/reference/api/doxygen/functions_g.html | 2 +-
docs/reference/api/doxygen/functions_i.html | 2 +-
docs/reference/api/doxygen/functions_l.html | 2 +-
docs/reference/api/doxygen/functions_o.html | 2 +-
docs/reference/api/doxygen/functions_s.html | 20 +-
docs/reference/api/doxygen/functions_t.html | 14 +-
docs/reference/api/doxygen/functions_v.html | 28 +-
docs/reference/api/doxygen/functions_vars.html | 4 +-
docs/reference/api/doxygen/functions_vars_l.html | 2 +-
docs/reference/api/doxygen/functions_vars_s.html | 8 +-
.../api/doxygen/functor_8h__dep__incl.svg | 297 +-
docs/reference/api/doxygen/fuse_8h.html | 2 +-
docs/reference/api/doxygen/fuse_8h__dep__incl.svg | 156 +-
docs/reference/api/doxygen/fuse_8h__incl.svg | 1638 ++---
.../api/doxygen/generic_2default_8h__incl.svg | 556 +-
.../api/doxygen/generic_2extern_8h__dep__incl.svg | 24 +-
.../api/doxygen/generic_2extern_8h__incl.svg | 544 +-
.../doxygen/generic_2injective_8h__dep__incl.svg | 32 +-
.../api/doxygen/generic_2injective_8h__incl.svg | 556 +-
docs/reference/api/doxygen/generic__func_8h.html | 2 +-
.../api/doxygen/generic__func_8h__dep__incl.svg | 196 +-
.../api/doxygen/generic__func_8h__incl.svg | 1417 ++---
.../api/doxygen/global__var__supply_8h.html | 2 +-
.../doxygen/global__var__supply_8h__dep__incl.svg | 12 +-
.../api/doxygen/global__var__supply_8h__incl.svg | 2097 +++----
docs/reference/api/doxygen/greedy_8h.html | 2 +-
docs/reference/api/doxygen/greedy_8h__incl.svg | 2080 ++++---
docs/reference/api/doxygen/hierarchy.html | 476 +-
docs/reference/api/doxygen/image_8h.html | 2 +-
docs/reference/api/doxygen/image_8h__incl.svg | 1738 +++---
docs/reference/api/doxygen/index__map_8h.html | 2 +-
.../api/doxygen/index__map_8h__dep__incl.svg | 600 +-
docs/reference/api/doxygen/index__map_8h__incl.svg | 1322 ++--
docs/reference/api/doxygen/inherit_graph_112.svg | 6542 ++++++++++----------
docs/reference/api/doxygen/inherit_graph_121.svg | 1958 +++---
docs/reference/api/doxygen/inherit_graph_74.svg | 2 +-
docs/reference/api/doxygen/inherits.html | 2 +-
.../api/doxygen/instrument_8h__dep__incl.svg | 11 +-
docs/reference/api/doxygen/int__set_8h.html | 2 +-
.../api/doxygen/int__set_8h__dep__incl.svg | 656 +-
docs/reference/api/doxygen/int__set_8h__incl.svg | 1644 ++---
docs/reference/api/doxygen/int__solver_8h.html | 2 +-
.../reference/api/doxygen/int__solver_8h__incl.svg | 1759 +++---
docs/reference/api/doxygen/interpreter_8h.html | 2 +-
.../reference/api/doxygen/interpreter_8h__incl.svg | 1943 +++---
docs/reference/api/doxygen/ir_2adt_8h.html | 2 +-
.../api/doxygen/ir_2adt_8h__dep__incl.svg | 883 ++-
docs/reference/api/doxygen/ir_2adt_8h__incl.svg | 1318 ++--
docs/reference/api/doxygen/ir_2attrs_8h.html | 2 +-
.../api/doxygen/ir_2attrs_8h__dep__incl.svg | 568 +-
docs/reference/api/doxygen/ir_2attrs_8h__incl.svg | 1120 ++--
docs/reference/api/doxygen/ir_2expr_8h.html | 6 +-
.../api/doxygen/ir_2expr_8h__dep__incl.svg | 1370 ++--
docs/reference/api/doxygen/ir_2expr_8h__incl.svg | 1256 ++--
docs/reference/api/doxygen/ir_2expr_8h_source.html | 6 +-
docs/reference/api/doxygen/ir_2frame_8h.html | 2 +-
.../api/doxygen/ir_2frame_8h__dep__incl.svg | 28 +-
docs/reference/api/doxygen/ir_2frame_8h__incl.svg | 1382 ++---
docs/reference/api/doxygen/ir_2function_8h.html | 2 +-
.../api/doxygen/ir_2function_8h__dep__incl.svg | 1013 ++-
.../api/doxygen/ir_2function_8h__incl.svg | 1249 ++--
docs/reference/api/doxygen/ir_2ir_8h.html | 2 +-
docs/reference/api/doxygen/ir_2ir_8h__incl.svg | 1612 +++--
docs/reference/api/doxygen/ir_2module_8h.html | 6 +-
.../api/doxygen/ir_2module_8h__dep__incl.svg | 1241 ++--
docs/reference/api/doxygen/ir_2module_8h__incl.svg | 1431 +++--
.../api/doxygen/ir_2module_8h_source.html | 6 +-
docs/reference/api/doxygen/ir_2op_8h.html | 2 +-
.../reference/api/doxygen/ir_2op_8h__dep__incl.svg | 572 +-
docs/reference/api/doxygen/ir_2op_8h__incl.svg | 1391 ++---
docs/reference/api/doxygen/ir_2span_8h.html | 123 -
.../api/doxygen/ir_2span_8h__dep__incl.svg | 1224 ----
docs/reference/api/doxygen/ir_2span_8h__incl.svg | 1294 ----
docs/reference/api/doxygen/ir_2span_8h_source.html | 107 -
docs/reference/api/doxygen/ir_2transform_8h.html | 2 +-
.../api/doxygen/ir_2transform_8h__dep__incl.svg | 559 +-
.../api/doxygen/ir_2transform_8h__incl.svg | 2074 ++++---
docs/reference/api/doxygen/ir_2type_8h.html | 6 +-
.../api/doxygen/ir_2type_8h__dep__incl.svg | 1727 +++---
docs/reference/api/doxygen/ir_2type_8h__incl.svg | 1141 ++--
docs/reference/api/doxygen/ir_2type_8h_source.html | 6 +-
docs/reference/api/doxygen/ir__docsifier_8h.html | 2 +-
.../api/doxygen/ir__docsifier_8h__incl.svg | 1883 +++---
.../api/doxygen/iter__affine__map_8h.html | 2 +-
.../api/doxygen/iter__affine__map_8h__incl.svg | 1642 ++---
docs/reference/api/doxygen/layer__norm_8h.html | 2 +-
.../reference/api/doxygen/layer__norm_8h__incl.svg | 1672 ++---
.../api/doxygen/local__response__norm_8h.html | 2 +-
.../api/doxygen/local__response__norm_8h__incl.svg | 1672 ++---
.../api/doxygen/loop__state_8h__dep__incl.svg | 88 +-
.../reference/api/doxygen/loop__state_8h__incl.svg | 200 +-
docs/reference/api/doxygen/map_8h.html | 2 +-
docs/reference/api/doxygen/map_8h__dep__incl.svg | 1113 ++--
docs/reference/api/doxygen/mapping_8h.html | 2 +-
docs/reference/api/doxygen/mapping_8h__incl.svg | 1672 ++---
.../api/doxygen/measure_8h__dep__incl.svg | 48 +-
docs/reference/api/doxygen/measure_8h__incl.svg | 236 +-
.../api/doxygen/measure__callback_8h.html | 2 +-
.../doxygen/measure__callback_8h__dep__incl.svg | 12 +-
.../api/doxygen/measure__callback_8h__incl.svg | 2586 ++++----
.../api/doxygen/measure__candidate_8h.html | 2 +-
.../doxygen/measure__candidate_8h__dep__incl.svg | 72 +-
.../api/doxygen/measure__candidate_8h__incl.svg | 1576 ++---
.../api/doxygen/measure__record_8h__incl.svg | 240 +-
docs/reference/api/doxygen/memory__pools_8h.html | 2 +-
.../api/doxygen/memory__pools_8h__dep__incl.svg | 44 +-
.../api/doxygen/memory__pools_8h__incl.svg | 1620 ++---
.../doxygen/meta__schedule_2cost__model_8h.html | 2 +-
.../meta__schedule_2cost__model_8h__dep__incl.svg | 48 +-
.../meta__schedule_2cost__model_8h__incl.svg | 1474 ++---
docs/reference/api/doxygen/metadata_8h.html | 4 +-
docs/reference/api/doxygen/metadata_8h__incl.svg | 1731 ++----
docs/reference/api/doxygen/metadata_8h_source.html | 10 +-
docs/reference/api/doxygen/metadata__base_8h.html | 7 +-
.../api/doxygen/metadata__base_8h__dep__incl.svg | 12 +-
.../api/doxygen/metadata__base_8h__incl.svg | 1787 ++----
.../api/doxygen/metadata__base_8h_source.html | 75 +-
.../api/doxygen/mutator_8h__dep__incl.svg | 40 +-
docs/reference/api/doxygen/mutator_8h__incl.svg | 64 +-
docs/reference/api/doxygen/name__supply_8h.html | 2 +-
.../api/doxygen/name__supply_8h__dep__incl.svg | 20 +-
.../api/doxygen/name__supply_8h__incl.svg | 1278 ++--
docs/reference/api/doxygen/namespacemembers.html | 2 +-
.../api/doxygen/namespacemembers_func.html | 2 +-
.../api/doxygen/namespacemembers_func_p.html | 2 +-
docs/reference/api/doxygen/namespacemembers_m.html | 2 +-
docs/reference/api/doxygen/namespacemembers_p.html | 2 +-
.../api/doxygen/namespacemembers_type.html | 2 +-
docs/reference/api/doxygen/namespaces.html | 79 +-
docs/reference/api/doxygen/namespacetvm.html | 11 +-
.../api/doxygen/namespacetvm_1_1parser.html | 182 -
.../api/doxygen/namespacetvm_1_1relay.html | 82 +-
docs/reference/api/doxygen/ndarray_8h.html | 2 +-
.../api/doxygen/ndarray_8h__dep__incl.svg | 1275 ++--
docs/reference/api/doxygen/nn_2bnn_8h.html | 2 +-
docs/reference/api/doxygen/nn_2bnn_8h__incl.svg | 2470 ++++----
docs/reference/api/doxygen/nn_2dense_8h.html | 2 +-
.../api/doxygen/nn_2dense_8h__dep__incl.svg | 24 +-
docs/reference/api/doxygen/nn_2dense_8h__incl.svg | 1672 ++---
docs/reference/api/doxygen/nn_2pooling_8h.html | 2 +-
.../reference/api/doxygen/nn_2pooling_8h__incl.svg | 2248 +++----
docs/reference/api/doxygen/nn_2softmax_8h.html | 2 +-
.../reference/api/doxygen/nn_2softmax_8h__incl.svg | 2164 +++----
docs/reference/api/doxygen/node_8h.html | 2 +-
docs/reference/api/doxygen/node_8h__dep__incl.svg | 1037 ++--
docs/reference/api/doxygen/object_8h.html | 2 +-
.../reference/api/doxygen/object_8h__dep__incl.svg | 1763 +++---
.../api/doxygen/object__path_8h__dep__incl.svg | 201 +-
docs/reference/api/doxygen/on__device_8h.html | 2 +-
docs/reference/api/doxygen/on__device_8h__incl.svg | 1791 +++---
docs/reference/api/doxygen/op__strategy_8h.html | 2 +-
.../api/doxygen/op__strategy_8h__incl.svg | 2470 ++++----
docs/reference/api/doxygen/operation_8h.html | 2 +-
.../api/doxygen/operation_8h__dep__incl.svg | 600 +-
docs/reference/api/doxygen/operation_8h__incl.svg | 1658 ++---
docs/reference/api/doxygen/optional_8h.html | 2 +-
.../api/doxygen/optional_8h__dep__incl.svg | 1678 +++--
docs/reference/api/doxygen/packed__func_8h.html | 2 +-
.../api/doxygen/packed__func_8h__dep__incl.svg | 995 ++-
docs/reference/api/doxygen/pad__utils_8h.html | 2 +-
.../api/doxygen/pad__utils_8h__dep__incl.svg | 12 +-
docs/reference/api/doxygen/pad__utils_8h__incl.svg | 1658 ++---
docs/reference/api/doxygen/parser_8h.html | 30 +-
docs/reference/api/doxygen/parser_8h__incl.svg | 1790 +++---
docs/reference/api/doxygen/parser_8h_source.html | 12 +-
docs/reference/api/doxygen/pattern_8h.html | 2 +-
docs/reference/api/doxygen/pattern_8h__incl.svg | 1636 ++---
.../reference/api/doxygen/pattern__functor_8h.html | 2 +-
.../api/doxygen/pattern__functor_8h__incl.svg | 2168 +++----
.../api/doxygen/postproc_8h__dep__incl.svg | 40 +-
docs/reference/api/doxygen/postproc_8h__incl.svg | 112 +-
docs/reference/api/doxygen/profiler_8h.html | 2 +-
docs/reference/api/doxygen/profiler_8h__incl.svg | 2110 +++----
docs/reference/api/doxygen/random_8h.html | 2 +-
docs/reference/api/doxygen/random_8h__incl.svg | 1166 ++--
docs/reference/api/doxygen/ravel__unravel_8h.html | 2 +-
.../api/doxygen/ravel__unravel_8h__dep__incl.svg | 84 +-
.../api/doxygen/ravel__unravel_8h__incl.svg | 1644 ++---
docs/reference/api/doxygen/reduce_8h.html | 2 +-
docs/reference/api/doxygen/reduce_8h__incl.svg | 1172 ++--
.../api/doxygen/reduction_8h__dep__incl.svg | 40 +-
docs/reference/api/doxygen/reduction_8h__incl.svg | 756 +--
.../reference/api/doxygen/reduction_8h_source.html | 2 +-
.../api/doxygen/reflection_8h__dep__incl.svg | 185 +-
docs/reference/api/doxygen/registry_8h.html | 2 +-
.../api/doxygen/registry_8h__dep__incl.svg | 1030 +--
docs/reference/api/doxygen/relay_2adt_8h.html | 2 +-
.../api/doxygen/relay_2adt_8h__dep__incl.svg | 28 +-
docs/reference/api/doxygen/relay_2adt_8h__incl.svg | 1856 +++---
.../api/doxygen/relay_2adt_8h_source.html | 2 +-
docs/reference/api/doxygen/relay_2analysis_8h.html | 2 +-
.../api/doxygen/relay_2analysis_8h__incl.svg | 2169 ++++---
.../api/doxygen/relay_2attrs_2debug_8h.html | 2 +-
.../api/doxygen/relay_2attrs_2debug_8h__incl.svg | 1261 ++--
.../api/doxygen/relay_2attrs_2memory_8h.html | 2 +-
.../api/doxygen/relay_2attrs_2memory_8h__incl.svg | 2112 +++----
.../reference/api/doxygen/relay_2attrs_2nn_8h.html | 2 +-
.../api/doxygen/relay_2attrs_2nn_8h__incl.svg | 1736 +++---
.../api/doxygen/relay_2attrs_2transform_8h.html | 2 +-
.../relay_2attrs_2transform_8h__dep__incl.svg | 20 +-
.../doxygen/relay_2attrs_2transform_8h__incl.svg | 1906 +++---
.../reference/api/doxygen/relay_2attrs_2vm_8h.html | 2 +-
.../api/doxygen/relay_2attrs_2vm_8h__incl.svg | 1164 ++--
docs/reference/api/doxygen/relay_2base_8h.html | 4 +-
.../api/doxygen/relay_2base_8h__dep__incl.svg | 284 +-
.../reference/api/doxygen/relay_2base_8h__incl.svg | 1626 ++---
.../api/doxygen/relay_2base_8h_source.html | 10 +-
docs/reference/api/doxygen/relay_2expr_8h.html | 2 +-
.../api/doxygen/relay_2expr_8h__dep__incl.svg | 200 +-
.../reference/api/doxygen/relay_2expr_8h__incl.svg | 2017 +++---
.../api/doxygen/relay_2expr_8h_source.html | 2 +-
.../api/doxygen/relay_2expr__functor_8h.html | 2 +-
.../api/doxygen/relay_2expr__functor_8h__incl.svg | 2165 ++++---
.../doxygen/relay_2expr__functor_8h_source.html | 2 +-
docs/reference/api/doxygen/relay_2feature_8h.html | 2 +-
.../api/doxygen/relay_2feature_8h__incl.svg | 1851 +++---
docs/reference/api/doxygen/relay_2function_8h.html | 2 +-
.../api/doxygen/relay_2function_8h__dep__incl.svg | 36 +-
.../api/doxygen/relay_2function_8h__incl.svg | 2028 +++---
.../api/doxygen/relay_2function_8h_source.html | 2 +-
docs/reference/api/doxygen/relay_2op_8h.html | 2 +-
.../api/doxygen/relay_2op_8h__dep__incl.svg | 36 +-
docs/reference/api/doxygen/relay_2op_8h__incl.svg | 2095 ++++---
.../api/doxygen/relay_2op__attr__types_8h.html | 2 +-
.../relay_2op__attr__types_8h__dep__incl.svg | 28 +-
.../doxygen/relay_2op__attr__types_8h__incl.svg | 2347 ++++---
.../api/doxygen/relay_2qnn_2attrs_8h.html | 2 +-
.../api/doxygen/relay_2qnn_2attrs_8h__incl.svg | 1172 ++--
.../api/doxygen/relay_2qnn_2transform_8h.html | 2 +-
.../api/doxygen/relay_2qnn_2transform_8h__incl.svg | 2002 +++---
.../reference/api/doxygen/relay_2transform_8h.html | 2 +-
.../api/doxygen/relay_2transform_8h__dep__incl.svg | 12 +-
.../api/doxygen/relay_2transform_8h__incl.svg | 2088 +++----
docs/reference/api/doxygen/relay_2type_8h.html | 2 +-
.../api/doxygen/relay_2type_8h__dep__incl.svg | 228 +-
.../reference/api/doxygen/relay_2type_8h__incl.svg | 1677 +++--
docs/reference/api/doxygen/reorg_8h__incl.svg | 800 +--
docs/reference/api/doxygen/repr__printer_8h.html | 2 +-
.../api/doxygen/repr__printer_8h__dep__incl.svg | 1082 ++--
docs/reference/api/doxygen/rocblas_8h.html | 2 +-
.../api/doxygen/rocblas_8h__dep__incl.svg | 12 +-
docs/reference/api/doxygen/rocblas_8h__incl.svg | 1718 ++---
.../reference/api/doxygen/rocm_2dense_8h__incl.svg | 572 +-
.../api/doxygen/rocm_2injective_8h__incl.svg | 556 +-
.../api/doxygen/rocm_2pooling_8h__incl.svg | 532 +-
.../api/doxygen/rocm_2reduction_8h__incl.svg | 556 +-
.../api/doxygen/rocm_2softmax_8h__incl.svg | 556 +-
docs/reference/api/doxygen/runner_8h.html | 2 +-
.../reference/api/doxygen/runner_8h__dep__incl.svg | 72 +-
docs/reference/api/doxygen/runner_8h__incl.svg | 2305 ++++---
.../runtime_2container_2adt_8h__dep__incl.svg | 431 +-
.../api/doxygen/runtime_2container_2base_8h.html | 2 +-
.../runtime_2container_2base_8h__dep__incl.svg | 1660 ++---
.../api/doxygen/runtime_2memory_8h__dep__incl.svg | 465 +-
docs/reference/api/doxygen/runtime_2module_8h.html | 2 +-
.../api/doxygen/runtime_2module_8h__dep__incl.svg | 979 ++-
docs/reference/api/doxygen/runtime_8h.html | 2 +-
docs/reference/api/doxygen/runtime_8h__incl.svg | 1397 +++--
docs/reference/api/doxygen/schedule__pass_8h.html | 2 +-
.../api/doxygen/schedule__pass_8h__dep__incl.svg | 136 +-
.../api/doxygen/schedule__pass_8h__incl.svg | 1666 ++---
docs/reference/api/doxygen/schedule__rule_8h.html | 2 +-
.../api/doxygen/schedule__rule_8h__dep__incl.svg | 40 +-
.../api/doxygen/schedule__rule_8h__incl.svg | 1459 ++---
.../api/doxygen/script_2ir__builder_2base_8h.html | 2 +-
.../script_2ir__builder_2base_8h__dep__incl.svg | 44 +-
.../doxygen/script_2ir__builder_2base_8h__incl.svg | 1423 ++---
docs/reference/api/doxygen/script__printer_8h.html | 2 +-
.../api/doxygen/script__printer_8h__dep__incl.svg | 1035 ++--
docs/reference/api/doxygen/search/all_1.js | 2 +-
docs/reference/api/doxygen/search/all_10.js | 2 +-
docs/reference/api/doxygen/search/all_11.js | 2 +-
docs/reference/api/doxygen/search/all_13.js | 4 +-
docs/reference/api/doxygen/search/all_14.js | 32 +-
docs/reference/api/doxygen/search/all_15.js | 13 +-
docs/reference/api/doxygen/search/all_17.js | 4 +-
docs/reference/api/doxygen/search/all_2.js | 4 +-
docs/reference/api/doxygen/search/all_8.js | 2 +-
docs/reference/api/doxygen/search/all_a.js | 4 +-
docs/reference/api/doxygen/search/all_d.js | 2 +-
docs/reference/api/doxygen/search/all_e.js | 4 +-
docs/reference/api/doxygen/search/classes_10.js | 16 +-
docs/reference/api/doxygen/search/classes_11.js | 4 +-
docs/reference/api/doxygen/search/classes_13.js | 2 +-
docs/reference/api/doxygen/search/classes_8.js | 2 +-
docs/reference/api/doxygen/search/classes_a.js | 2 +-
docs/reference/api/doxygen/search/files_e.js | 2 +-
docs/reference/api/doxygen/search/functions_1.js | 4 +-
docs/reference/api/doxygen/search/functions_10.js | 2 +-
docs/reference/api/doxygen/search/functions_13.js | 12 +-
docs/reference/api/doxygen/search/functions_14.js | 6 +-
docs/reference/api/doxygen/search/functions_16.js | 2 +-
docs/reference/api/doxygen/search/functions_7.js | 2 +-
docs/reference/api/doxygen/search/functions_9.js | 2 +-
docs/reference/api/doxygen/search/functions_f.js | 2 +-
docs/reference/api/doxygen/search/namespaces_1.js | 1 -
docs/reference/api/doxygen/search/typedefs_a.js | 2 +-
docs/reference/api/doxygen/search/variables_0.js | 2 +-
docs/reference/api/doxygen/search/variables_11.js | 6 +-
docs/reference/api/doxygen/search/variables_b.js | 2 +-
.../api/doxygen/search__policy_8h__dep__incl.svg | 12 +-
.../api/doxygen/search__policy_8h__incl.svg | 232 +-
.../reference/api/doxygen/search__strategy_8h.html | 2 +-
.../api/doxygen/search__strategy_8h__dep__incl.svg | 36 +-
.../api/doxygen/search__strategy_8h__incl.svg | 1768 +++---
docs/reference/api/doxygen/search__task_8h.html | 2 +-
.../api/doxygen/search__task_8h__dep__incl.svg | 60 +-
.../api/doxygen/search__task_8h__incl.svg | 1314 ++--
docs/reference/api/doxygen/serializer_8h.html | 2 +-
.../api/doxygen/serializer_8h__dep__incl.svg | 1231 ++--
docs/reference/api/doxygen/shape__tuple_8h.html | 2 +-
.../api/doxygen/shape__tuple_8h__dep__incl.svg | 1267 ++--
docs/reference/api/doxygen/source__map_8h.html | 34 +-
.../api/doxygen/source__map_8h__dep__incl.svg | 1864 +++---
.../reference/api/doxygen/source__map_8h__incl.svg | 1927 +++---
.../api/doxygen/source__map_8h_source.html | 59 +-
.../reference/api/doxygen/space__generator_8h.html | 2 +-
.../api/doxygen/space__generator_8h__dep__incl.svg | 32 +-
.../api/doxygen/space__generator_8h__incl.svg | 1812 +++---
.../{support_2span_8h.html => span_8h.html} | 6 +-
...an_8h__dep__incl.svg => span_8h__dep__incl.svg} | 0
...upport_2span_8h__incl.svg => span_8h__incl.svg} | 0
docs/reference/api/doxygen/span_8h_source.html | 101 +
docs/reference/api/doxygen/state_8h.html | 2 +-
docs/reference/api/doxygen/state_8h__dep__incl.svg | 172 +-
docs/reference/api/doxygen/state_8h__incl.svg | 2125 ++++---
docs/reference/api/doxygen/stmt_8h.html | 2 +-
docs/reference/api/doxygen/stmt_8h__dep__incl.svg | 592 +-
docs/reference/api/doxygen/stmt_8h__incl.svg | 1430 ++---
docs/reference/api/doxygen/stmt_8h_source.html | 2 +-
docs/reference/api/doxygen/stmt__functor_8h.html | 2 +-
.../api/doxygen/stmt__functor_8h__dep__incl.svg | 20 +-
.../api/doxygen/stmt__functor_8h__incl.svg | 1368 ++--
docs/reference/api/doxygen/strided__slice_8h.html | 2 +-
.../api/doxygen/strided__slice_8h__dep__incl.svg | 72 +-
.../api/doxygen/strided__slice_8h__incl.svg | 1464 ++---
docs/reference/api/doxygen/string_8h.html | 2 +-
.../reference/api/doxygen/string_8h__dep__incl.svg | 1006 +--
.../structtvm_1_1relay_1_1ErrorBuilder.html | 5 +-
.../doxygen/structural__equal_8h__dep__incl.svg | 225 +-
.../api/doxygen/structural__hash_8h__dep__incl.svg | 225 +-
.../api/doxygen/support_2span_8h_source.html | 101 -
docs/reference/api/doxygen/tag_8h.html | 2 +-
docs/reference/api/doxygen/tag_8h__incl.svg | 1268 ++--
docs/reference/api/doxygen/target_8h.html | 2 +-
.../reference/api/doxygen/target_8h__dep__incl.svg | 504 +-
docs/reference/api/doxygen/target_8h__incl.svg | 2248 +++----
docs/reference/api/doxygen/target__info_8h.html | 2 +-
.../api/doxygen/target__info_8h__incl.svg | 1284 ++--
docs/reference/api/doxygen/target__kind_8h.html | 2 +-
.../api/doxygen/target__kind_8h__dep__incl.svg | 504 +-
.../api/doxygen/target__kind_8h__incl.svg | 1344 ++--
docs/reference/api/doxygen/task__scheduler_8h.html | 2 +-
.../api/doxygen/task__scheduler_8h__incl.svg | 2700 ++++----
docs/reference/api/doxygen/te_2schedule_8h.html | 2 +-
.../api/doxygen/te_2schedule_8h__dep__incl.svg | 640 +-
.../api/doxygen/te_2schedule_8h__incl.svg | 1446 ++---
docs/reference/api/doxygen/tensor_8h.html | 2 +-
.../reference/api/doxygen/tensor_8h__dep__incl.svg | 628 +-
docs/reference/api/doxygen/tensor_8h__incl.svg | 1628 ++---
docs/reference/api/doxygen/tensor__intrin_8h.html | 2 +-
.../api/doxygen/tensor__intrin_8h__dep__incl.svg | 636 +-
.../api/doxygen/tensor__intrin_8h__incl.svg | 1520 ++---
docs/reference/api/doxygen/tensor__type_8h.html | 2 +-
.../api/doxygen/tensor__type_8h__dep__incl.svg | 244 +-
.../api/doxygen/tensor__type_8h__incl.svg | 1288 ++--
docs/reference/api/doxygen/tensor__utils_8h.html | 2 +-
.../api/doxygen/tensor__utils_8h__dep__incl.svg | 80 +-
.../api/doxygen/tensor__utils_8h__incl.svg | 1644 ++---
docs/reference/api/doxygen/thread__bind_8h.html | 2 +-
.../api/doxygen/thread__bind_8h__incl.svg | 1978 +++---
docs/reference/api/doxygen/tir_2analysis_8h.html | 2 +-
.../api/doxygen/tir_2analysis_8h__dep__incl.svg | 176 +-
.../api/doxygen/tir_2analysis_8h__incl.svg | 2121 ++++---
docs/reference/api/doxygen/tir_2expr_8h.html | 2 +-
.../api/doxygen/tir_2expr_8h__dep__incl.svg | 612 +-
docs/reference/api/doxygen/tir_2expr_8h__incl.svg | 1418 ++---
.../reference/api/doxygen/tir_2expr_8h_source.html | 2 +-
.../api/doxygen/tir_2expr__functor_8h.html | 2 +-
.../doxygen/tir_2expr__functor_8h__dep__incl.svg | 28 +-
.../api/doxygen/tir_2expr__functor_8h__incl.svg | 1418 ++---
docs/reference/api/doxygen/tir_2frame_8h.html | 2 +-
.../api/doxygen/tir_2frame_8h__dep__incl.svg | 12 +-
docs/reference/api/doxygen/tir_2frame_8h__incl.svg | 2074 ++++---
docs/reference/api/doxygen/tir_2function_8h.html | 2 +-
.../api/doxygen/tir_2function_8h__dep__incl.svg | 524 +-
.../api/doxygen/tir_2function_8h__incl.svg | 1348 ++--
.../api/doxygen/tir_2function_8h_source.html | 2 +-
docs/reference/api/doxygen/tir_2ir_8h.html | 2 +-
docs/reference/api/doxygen/tir_2ir_8h__incl.svg | 2104 ++++---
docs/reference/api/doxygen/tir_2op_8h.html | 2 +-
.../api/doxygen/tir_2op_8h__dep__incl.svg | 648 +-
docs/reference/api/doxygen/tir_2op_8h__incl.svg | 1735 +++---
docs/reference/api/doxygen/tir_2op_8h_source.html | 2 +-
.../api/doxygen/tir_2op__attr__types_8h.html | 2 +-
.../doxygen/tir_2op__attr__types_8h__dep__incl.svg | 184 +-
.../api/doxygen/tir_2op__attr__types_8h__incl.svg | 1088 ++--
.../api/doxygen/tir_2schedule_2schedule_8h.html | 2 +-
.../tir_2schedule_2schedule_8h__dep__incl.svg | 164 +-
.../doxygen/tir_2schedule_2schedule_8h__incl.svg | 1850 +++---
docs/reference/api/doxygen/tir_2transform_8h.html | 2 +-
.../api/doxygen/tir_2transform_8h__incl.svg | 1710 ++---
.../api/doxygen/tir_2usmp_2analysis_8h.html | 2 +-
.../api/doxygen/tir_2usmp_2analysis_8h__incl.svg | 1804 +++---
.../api/doxygen/tir_2usmp_2transform_8h.html | 2 +-
.../api/doxygen/tir_2usmp_2transform_8h__incl.svg | 1531 ++---
.../reference/api/doxygen/tir_2usmp_2utils_8h.html | 2 +-
.../api/doxygen/tir_2usmp_2utils_8h__dep__incl.svg | 36 +-
.../api/doxygen/tir_2usmp_2utils_8h__incl.svg | 1540 ++---
docs/reference/api/doxygen/topi_2nn_8h.html | 2 +-
.../api/doxygen/topi_2nn_8h__dep__incl.svg | 12 +-
docs/reference/api/doxygen/topi_2nn_8h__incl.svg | 2172 +++----
docs/reference/api/doxygen/topi_2nn_8h_source.html | 2 +-
docs/reference/api/doxygen/topi_2transform_8h.html | 2 +-
.../api/doxygen/topi_2transform_8h__dep__incl.svg | 64 +-
.../api/doxygen/topi_2transform_8h__incl.svg | 1962 +++---
docs/reference/api/doxygen/topi_2utils_8h.html | 2 +-
.../reference/api/doxygen/topi_2utils_8h__incl.svg | 1646 ++---
.../api/doxygen/transform__step_8h__dep__incl.svg | 96 +-
.../api/doxygen/transform__step_8h__incl.svg | 200 +-
docs/reference/api/doxygen/tune__context_8h.html | 2 +-
.../api/doxygen/tune__context_8h__dep__incl.svg | 24 +-
.../api/doxygen/tune__context_8h__incl.svg | 2191 ++++---
docs/reference/api/doxygen/type__functor_8h.html | 2 +-
.../api/doxygen/type__functor_8h__incl.svg | 1663 +++--
docs/reference/api/doxygen/type__relation_8h.html | 2 +-
.../api/doxygen/type__relation_8h__dep__incl.svg | 548 +-
.../api/doxygen/type__relation_8h__incl.svg | 1713 +++--
.../api/doxygen/type__relation_8h_source.html | 4 +-
docs/reference/api/doxygen/var_8h.html | 2 +-
docs/reference/api/doxygen/var_8h__dep__incl.svg | 644 +-
docs/reference/api/doxygen/var_8h__incl.svg | 1249 ++--
docs/reference/api/doxygen/var_8h_source.html | 2 +-
docs/reference/api/doxygen/virtual__device_8h.html | 2 +-
.../api/doxygen/virtual__device_8h__dep__incl.svg | 244 +-
.../api/doxygen/virtual__device_8h__incl.svg | 1870 +++---
docs/reference/api/doxygen/vision_8h.html | 2 +-
docs/reference/api/doxygen/vision_8h__incl.svg | 1738 +++---
docs/reference/api/doxygen/winograd_8h.html | 2 +-
docs/reference/api/doxygen/winograd_8h__incl.svg | 1962 +++---
docs/reference/api/doxygen/with_8h__dep__incl.svg | 69 +-
docs/reference/api/doxygen/x86_2bnn_8h__incl.svg | 552 +-
.../api/doxygen/x86_2default_8h__incl.svg | 556 +-
.../api/doxygen/x86_2injective_8h__incl.svg | 552 +-
docs/reference/api/python/auto_scheduler.html | 4 +-
docs/reference/api/python/ir.html | 4 +-
docs/reference/api/python/relay/index.html | 305 +-
.../api/typedoc/classes/bytestreamreader.html | 12 +-
.../api/typedoc/classes/cachedcallstack.html | 34 +-
docs/reference/api/typedoc/classes/dldatatype.html | 12 +-
docs/reference/api/typedoc/classes/dldevice.html | 10 +-
.../reference/api/typedoc/classes/environment.html | 12 +-
docs/reference/api/typedoc/classes/ffilibrary.html | 20 +-
.../api/typedoc/classes/graphexecutor.html | 16 +-
docs/reference/api/typedoc/classes/instance.html | 40 +-
docs/reference/api/typedoc/classes/memory.html | 34 +-
docs/reference/api/typedoc/classes/module.html | 10 +-
docs/reference/api/typedoc/classes/ndarray.html | 22 +-
.../api/typedoc/classes/packedfunccell.html | 6 +-
docs/reference/api/typedoc/classes/rpcserver.html | 14 +-
docs/reference/api/typedoc/classes/scalar.html | 6 +-
.../api/typedoc/classes/webgpucontext.html | 12 +-
docs/reference/api/typedoc/enums/argtypecode.html | 30 +-
.../api/typedoc/enums/aynccallbackcode.html | 4 +-
.../api/typedoc/enums/dldatatypecode.html | 8 +-
.../api/typedoc/enums/rpcserverstate.html | 12 +-
docs/reference/api/typedoc/enums/sizeof.html | 18 +-
docs/reference/api/typedoc/index.html | 112 +-
.../api/typedoc/interfaces/disposable.html | 2 +-
.../api/typedoc/interfaces/functioninfo.html | 6 +-
.../api/typedoc/interfaces/libraryprovider.html | 4 +-
docs/searchindex.js | 2 +-
.../vta/tutorials/autotvm/sg_execution_times.html | 4 +-
.../tutorials/frontend/deploy_classification.html | 2 +-
.../vta/tutorials/frontend/deploy_detection.html | 2 +-
.../vta/tutorials/frontend/sg_execution_times.html | 10 +-
.../vta/tutorials/optimize/sg_execution_times.html | 6 +-
docs/topic/vta/tutorials/sg_execution_times.html | 6 +-
docs/tutorial/auto_scheduler_matmul_x86.html | 4 +-
docs/tutorial/autotvm_matmul_x86.html | 20 +-
docs/tutorial/autotvm_relay_x86.html | 267 +-
docs/tutorial/cross_compilation_and_rpc.html | 2 +-
docs/tutorial/intro_topi.html | 2 +-
docs/tutorial/sg_execution_times.html | 22 +-
docs/tutorial/tensor_expr_get_started.html | 41 +-
922 files changed, 175172 insertions(+), 179149 deletions(-)
diff --git a/docs/_images/sphx_glr_micro_train_001.png b/docs/_images/sphx_glr_micro_train_001.png
index 1a78ef007a..bbd9b6f736 100644
Binary files a/docs/_images/sphx_glr_micro_train_001.png and b/docs/_images/sphx_glr_micro_train_001.png differ
diff --git a/docs/_images/sphx_glr_micro_train_thumb.png b/docs/_images/sphx_glr_micro_train_thumb.png
index 8e911753f3..d75c481972 100644
Binary files a/docs/_images/sphx_glr_micro_train_thumb.png and b/docs/_images/sphx_glr_micro_train_thumb.png differ
diff --git a/docs/_sources/how_to/compile_models/from_darknet.rst.txt b/docs/_sources/how_to/compile_models/from_darknet.rst.txt
index 653d8be090..18bead6d8b 100644
--- a/docs/_sources/how_to/compile_models/from_darknet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_darknet.rst.txt
@@ -318,7 +318,7 @@ The process is no different from other examples.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 17.754 seconds)
+ **Total running time of the script:** ( 1 minutes 16.940 seconds)
.. _sphx_glr_download_how_to_compile_models_from_darknet.py:
diff --git a/docs/_sources/how_to/compile_models/from_keras.rst.txt b/docs/_sources/how_to/compile_models/from_keras.rst.txt
index 8b8a06881e..24fa306f7a 100644
--- a/docs/_sources/how_to/compile_models/from_keras.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_keras.rst.txt
@@ -232,7 +232,7 @@ Look up prediction top 1 index in 1000 class synset.
.. code-block:: none
Relay top-1 id: 285, class name: Egyptian cat
-
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 1s 925ms/step
+
1/1 [==============================] - ETA: 0s
1/1 [==============================] - 1s 922ms/step
Keras top-1 id: 285, class name: Egyptian cat
diff --git a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
index 6cb300a904..0f5e79f8ee 100644
--- a/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_mxnet.rst.txt
@@ -116,7 +116,7 @@ In this section, we download a pretrained imagenet model and classify an image.
.. code-block:: none
- Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zipe3176ba3-0fe9-43f6-b092-3fe63054207d from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+ Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip31278884-f624-4b28-a605-c808c85aa157 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
x (1, 3, 224, 224)
diff --git a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
index 789dfc22d5..34f4de8252 100644
--- a/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_oneflow.rst.txt
@@ -121,7 +121,7 @@ Load a pretrained OneFlow model and save model
.. code-block:: none
Downloading: "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip" to /workspace/.oneflow/flowvision_cache/resnet18.zip
-
0%| | 0.00/41.5M [00:00<?, ?B/s]
19%|#9 | 7.99M/41.5M [00:00<00:00, 48.1MB/s]
39%|###8 | 16.0M/41.5M [00:00<00:00, 56.6MB/s]
58%|#####7 | 24.0M/41.5M [00:00<00:00, 54.2MB/s]
77%|#######7 | 32.0M/41.5M [00:00<00:00, 60.0MB/s]
91%|#########1| 37.9M/41.5M [00:00<00:00, 55.1MB/s]
100%|##########| 41.5M/41.5M [00:00<00:00, 48.8MB/s]
+
0%| | 0.00/41.5M [00:00<?, ?B/s]
17%|#7 | 7.20M/41.5M [00:00<00:00, 75.4MB/s]
35%|###4 | 14.4M/41.5M [00:00<00:00, 65.8MB/s]
50%|####9 | 20.7M/41.5M [00:00<00:00, 60.0MB/s]
64%|######3 | 26.5M/41.5M [00:00<00:00, 52.7MB/s]
80%|#######9 | 33.0M/41.5M [00:00<00:00, 57.3MB/s]
93%|#########3| 38.6M/41.5M [00:00<00:00, 44.8MB/s]
100%|##########| 41.5M/41.5M [00:00<00:00, 51.9MB/s]
diff --git a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
index aac095191c..8aa132b917 100644
--- a/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_pytorch.rst.txt
@@ -101,7 +101,7 @@ Load a pretrained PyTorch model
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=ResNet18_Weights.IMAGENET1K_V1`. You can also use `weights=ResNet18_Weights.DEFAULT` to get the most up-to-date weights.
warnings.warn(msg)
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
-
0%| | 0.00/44.7M [00:00<?, ?B/s]
21%|##1 | 9.53M/44.7M [00:00<00:00, 99.9MB/s]
43%|####2 | 19.1M/44.7M [00:00<00:00, 86.0MB/s]
72%|#######1 | 32.0M/44.7M [00:00<00:00, 97.2MB/s]
92%|#########2| 41.3M/44.7M [00:00<00:00, 94.7MB/s]
100%|##########| 44.7M/44.7M [00:00<00:00, 93.2MB/s]
+
0%| | 0.00/44.7M [00:00<?, ?B/s]
18%|#7 | 7.99M/44.7M [00:00<00:00, 49.2MB/s]
32%|###2 | 14.3M/44.7M [00:00<00:00, 35.4MB/s]
40%|###9 | 17.8M/44.7M [00:00<00:00, 33.5MB/s]
54%|#####3 | 24.0M/44.7M [00:00<00:00, 41.6MB/s]
72%|#######1 | 32.0M/44.7M [00:00<00:00, 45.6MB/s]
90%|########9 | 40.0M/44.7M [00:00<00:00, 52.3MB/s]
100%|##########| 44.7M/44.7M [00:00<00:00, 50.1MB/s]
diff --git a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
index 60652a1a5b..251c56474c 100644
--- a/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
+++ b/docs/_sources/how_to/compile_models/from_tensorflow.rst.txt
@@ -424,7 +424,7 @@ Run the corresponding model on tensorflow
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 20.985 seconds)
+ **Total running time of the script:** ( 1 minutes 20.604 seconds)
.. _sphx_glr_download_how_to_compile_models_from_tensorflow.py:
diff --git a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
index b90cbabac2..6d1875b142 100644
--- a/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/compile_models/sg_execution_times.rst.txt
@@ -5,26 +5,26 @@
Computation times
=================
-**06:22.455** total execution time for **how_to_compile_models** files:
+**06:17.784** total execution time for **how_to_compile_models** files:
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:20.985 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tensorflow.py` (``from_tensorflow.py``) | 01:20.604 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``) | 01:17.754 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_darknet.py` (``from_darknet.py``) | 01:16.940 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``) | 00:52.358 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_paddle.py` (``from_paddle.py``) | 00:51.291 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``) | 00:35.752 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_oneflow.py` (``from_oneflow.py``) | 00:35.095 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``) | 00:30.291 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``) | 00:30.553 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_coreml.py` (``from_coreml.py``) | 00:30.194 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_mxnet.py` (``from_mxnet.py``) | 00:29.430 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``) | 00:27.749 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_tflite.py` (``from_tflite.py``) | 00:26.452 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``) | 00:24.724 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_pytorch.py` (``from_pytorch.py``) | 00:24.396 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``) | 00:20.012 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_keras.py` (``from_keras.py``) | 00:20.355 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``) | 00:02.636 | 0.0 MB |
+| :ref:`sphx_glr_how_to_compile_models_from_onnx.py` (``from_onnx.py``) | 00:02.670 | 0.0 MB |
+-----------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/deploy_models/deploy_model_on_adreno.rst.txt b/docs/_sources/how_to/deploy_models/deploy_model_on_adreno.rst.txt
index da2f1eaf9c..68a10af4e9 100644
--- a/docs/_sources/how_to/deploy_models/deploy_model_on_adreno.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_model_on_adreno.rst.txt
@@ -727,7 +727,7 @@ well as provides information about the model's performance
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 2541.9046 2542.1988 2544.5307 2539.4577 1.8976
+ 2545.0897 2544.5280 2548.3965 2543.5946 1.3722
diff --git a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
index 68b9c80f89..a03eaa3df6 100644
--- a/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_model_on_android.rst.txt
@@ -437,7 +437,7 @@ Execute on TVM
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 16.0990 16.0548 16.5096 15.7118 0.2785
+ 16.0066 16.0084 16.1286 15.8646 0.0826
diff --git a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
index b5890d7893..bcd4f270b8 100644
--- a/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_object_detection_pytorch.rst.txt
@@ -130,7 +130,7 @@ Load pre-trained maskrcnn from torchvision and do tracing
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=MaskRCNN_ResNet50_FPN_Weights.COCO_V1`. You can also use `weights=MaskRCNN_ResNet50_FPN_Weights.DEFAULT` to get the most up-to-date weights.
warnings.warn(msg)
Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
-
0%| | 0.00/170M [00:00<?, ?B/s]
3%|3 | 5.18M/170M [00:00<00:03, 49.3MB/s]
6%|5 | 9.89M/170M [00:00<00:04, 33.6MB/s]
9%|9 | 16.0M/170M [00:00<00:04, 33.9MB/s]
14%|#4 | 24.0M/170M [00:00<00:04, 30.9MB/s]
19%|#8 | 31.7M/170M [00:00<00:03, 36.7MB/s]
21%|## | 35.4M/170M [00:01<00:06, 20.5MB/s]
24%|##3 | 40.0M/170M [00:01<00:06, 22.6MB/s]
28%|##8 | 48.0M/170M [00:01<00:04, 30.3MB/s]
34%|###3 | 57.4M/170M [00:01<00:02, 42.2MB/s]
38%|###7 | 64.0M/170M [00:01<00:02, 46.6MB/s]
42%|####2 | 72.0M/170M [00:02<00:02, 45.7MB/s]
48%|####7 | 81.4M/170M [00:02<00:01, 56.7MB/s]
52%|#####1 | 88.0M/170M [00:02<00:01, 53.2MB/s]
55%|#####5 | 93.8M/170M [00:02<00:01, 48.6MB/s]
58%|#####8 | 99.0M/170M [00:02<00:01, 45.8MB/s]
61%|######1 | 104M/170M [00:02<00:01, 43.6MB/s]
66%|######5 | 112M/170M [00:02<00:01, 45.1MB/s
]
71%|####### | 120M/170M [00:03<00:00, 52.7MB/s]
75%|#######5 | 128M/170M [00:03<00:00, 55.6MB/s]
80%|######## | 136M/170M [00:03<00:00, 55.8MB/s]
86%|########5 | 146M/170M [00:03<00:00, 66.6MB/s]
91%|######### | 154M/170M [00:03<00:00, 67.1MB/s]
95%|#########4| 161M/170M [00:03<00:00, 55.1MB/s]
100%|##########| 170M/170M [00:03<00:00, 45.7MB/s]
+
0%| | 0.00/170M [00:00<?, ?B/s]
7%|6 | 11.7M/170M [00:00<00:01, 123MB/s]
14%|#3 | 23.5M/170M [00:00<00:01, 123MB/s]
21%|## | 35.3M/170M [00:00<00:01, 83.0MB/s]
28%|##8 | 48.0M/170M [00:00<00:01, 89.2MB/s]
37%|###7 | 63.7M/170M [00:00<00:01, 110MB/s]
44%|####4 | 75.2M/170M [00:00<00:00, 102MB/s]
52%|#####1 | 88.0M/170M [00:00<00:00, 96.3MB/s]
59%|#####9 | 100M/170M [00:01<00:00, 105MB/s]
66%|######5 | 112M/170M [00:01<00:00, 100MB/s]
72%|#######2 | 123M/170M [00:01<00:00, 98.6MB/s]
80%|######## | 136M/170M [00:01<00:00, 99.1MB/s]
87%|########7 | 148M/170M [00:01<00:00, 106MB/s]
94%|#########3| 159M/170M [00:01<00:00, 108MB/s]
100%|#########9| 169M/170M [00:01<00:00, 98.4MB/s]
100%|##########| 170M/170M [00:01<00:00, 101MB/s]
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torch/nn/functional.py:3897: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
for i in range(dim)
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/detection/anchor_utils.py:124: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
@@ -299,7 +299,7 @@ Get boxes with score larger than 0.9
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 3 minutes 24.958 seconds)
+ **Total running time of the script:** ( 3 minutes 27.556 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_object_detection_pytorch.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
index 348b9f2db6..c43b054a62 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized.rst.txt
@@ -227,7 +227,7 @@ training. Other models require a full post training calibration.
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and will be removed in 0.15. The current behavior is equivalent to passing `weights=MobileNet_V2_Weights.IMAGENET1K_V1`. You can also use `weights=MobileNet_V2_Weights.DEFAULT` to get the most up-to-date weights.
warnings.warn(msg)
Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
-
0%| | 0.00/13.6M [00:00<?, ?B/s]
59%|#####8 | 7.99M/13.6M [00:00<00:00, 61.5MB/s]
100%|##########| 13.6M/13.6M [00:00<00:00, 70.2MB/s]
+
0%| | 0.00/13.6M [00:00<?, ?B/s]
59%|#####8 | 7.99M/13.6M [00:00<00:00, 64.6MB/s]
100%|##########| 13.6M/13.6M [00:00<00:00, 93.8MB/s]
@@ -409,7 +409,7 @@ Here we give an example of how to measure performance of TVM compiled models.
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 90.3890 90.2207 94.9386 89.9815 0.5996
+ 90.4616 90.3165 94.6651 90.0549 0.5140
@@ -458,7 +458,7 @@ TODO
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 13.851 seconds)
+ **Total running time of the script:** ( 1 minutes 13.799 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_prequantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
index a30a6e4bc0..2a76e89d6f 100644
--- a/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_prequantized_tflite.rst.txt
@@ -423,7 +423,7 @@ Here we give an example of how to measure performance of TVM compiled models.
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 118.6146 118.4709 122.3098 117.2891 0.7489
+ 119.6743 119.5340 125.3106 118.7616 0.7213
@@ -460,7 +460,7 @@ Here we give an example of how to measure performance of TVM compiled models.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 2 minutes 36.525 seconds)
+ **Total running time of the script:** ( 2 minutes 30.960 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_prequantized_tflite.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
index 0156131a8e..1c0425862d 100644
--- a/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_quantized.rst.txt
@@ -257,7 +257,7 @@ We create a Relay VM to build and execute the model.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 43.203 seconds)
+ **Total running time of the script:** ( 1 minutes 40.857 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_quantized.py:
diff --git a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
index fb10beb474..73e3afe95e 100644
--- a/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
+++ b/docs/_sources/how_to/deploy_models/deploy_ssd_gluoncv.rst.txt
@@ -170,7 +170,7 @@ Convert and compile model for CPU.
data: None
input_sym_arg_type = in_param.infer_type()[0]
Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
-
0%| | 0/132723 [00:00<?, ?KB/s]
4%|4 | 5363/132723 [00:00<00:02, 53626.60KB/s]
9%|8 | 11773/132723 [00:00<00:02, 59781.93KB/s]
13%|#3 | 17752/132723 [00:00<00:02, 42451.97KB/s]
19%|#9 | 25526/132723 [00:00<00:01, 53620.40KB/s]
25%|##5 | 33283/132723 [00:00<00:01, 61077.20KB/s]
30%|### | 39857/132723 [00:00<00:01, 62164.31KB/s]
36%|###6 | 47815/132723 [00:00<00:01, 67461.80KB/s]
42%|####2 | 55744/132723 [00:00<00:01, 71044.07KB/s]
48%|####8 | 63714/132723 [00:00<00:00, 73657.84KB/s]
54%|#####4 | 71757/132723 [00:01<00:00, 75694.75KB/s]
60%|###### | 79750/132723 [00:01<00:00, 76966.32KB/s]
66%|######6 | 87810/132723 [00:01<00:00, 78055.76KB/s]
72%|#######2 | 95814/132723 [00:01<00:00, 78645.29KB/s]
78%|#######8 | 103809/132723 [00:01<00:00, 79029.89KB/s]
84%|########4 | 111789/132723 [00:01<00:00, 79257.25KB/s]
90%|#########
| 119756/132723 [00:01<00:00, 79380.24KB/s]
96%|#########6| 127739/132723 [00:01<00:00, 79513.47KB/s]
100%|##########| 132723/132723 [00:01<00:00, 71599.06KB/s]
+
0%| | 0/132723 [00:00<?, ?KB/s]
5%|5 | 7045/132723 [00:00<00:01, 70438.82KB/s]
12%|#1 | 15891/132723 [00:00<00:01, 81035.00KB/s]
19%|#8 | 24732/132723 [00:00<00:01, 84400.02KB/s]
25%|##4 | 33173/132723 [00:00<00:01, 64674.57KB/s]
32%|###1 | 41996/132723 [00:00<00:01, 71781.26KB/s]
38%|###8 | 50852/132723 [00:00<00:01, 76831.11KB/s]
45%|####4 | 59620/132723 [00:00<00:00, 80091.26KB/s]
52%|#####1 | 68471/132723 [00:00<00:00, 82619.56KB/s]
58%|#####8 | 77329/132723 [00:00<00:00, 84403.74KB/s]
65%|######4 | 86215/132723 [00:01<00:00, 85740.83KB/s]
72%|#######1 | 95059/132723 [00:01<00:00, 86548.74KB/s]
78%|#######8 | 103969/132723 [00:01<00:00, 87310.83KB/s]
85%|########5 | 112861/132723 [00:01<00:00, 87792.60KB/s]
92%|#########1| 121707/132723 [00:01<00:00, 87990.67KB/s]
98%|#########8| 130579/132723 [00:01<00:00, 88206.36KB/s]
100%|#######
###| 132723/132723 [00:01<00:00, 82765.82KB/s]
@@ -246,7 +246,7 @@ Display result
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 3 minutes 31.458 seconds)
+ **Total running time of the script:** ( 3 minutes 35.155 seconds)
.. _sphx_glr_download_how_to_deploy_models_deploy_ssd_gluoncv.py:
diff --git a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
index 37eb9726d4..b6c92b34f2 100644
--- a/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/deploy_models/sg_execution_times.rst.txt
@@ -5,26 +5,26 @@
Computation times
=================
-**14:58.194** total execution time for **how_to_deploy_models** files:
+**14:56.410** total execution time for **how_to_deploy_models** files:
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``) | 03:31.458 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_ssd_gluoncv.py` (``deploy_ssd_gluoncv.py``) | 03:35.155 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 03:24.958 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_object_detection_pytorch.py` (``deploy_object_detection_pytorch.py``) | 03:27.556 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``) | 02:36.525 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized_tflite.py` (``deploy_prequantized_tflite.py``) | 02:30.960 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``) | 01:43.203 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_quantized.py` (``deploy_quantized.py``) | 01:40.857 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``) | 01:13.851 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_prequantized.py` (``deploy_prequantized.py``) | 01:13.799 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_adreno.py` (``deploy_model_on_adreno.py``) | 00:53.577 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_adreno.py` (``deploy_model_on_adreno.py``) | 00:53.705 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``) | 00:40.151 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_android.py` (``deploy_model_on_android.py``) | 00:40.621 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_nano.py` (``deploy_model_on_nano.py``) | 00:27.322 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_nano.py` (``deploy_model_on_nano.py``) | 00:27.093 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``) | 00:27.143 | 0.0 MB |
+| :ref:`sphx_glr_how_to_deploy_models_deploy_model_on_rasp.py` (``deploy_model_on_rasp.py``) | 00:26.659 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_deploy_models_deploy_sparse.py` (``deploy_sparse.py``) | 00:00.006 | 0.0 MB |
+------------------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
index 9c2f387889..a29a24609c 100644
--- a/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/bring_your_own_datatypes.rst.txt
@@ -463,7 +463,7 @@ First let us define two helper functions to get the mobilenet model and a cat im
.. code-block:: none
- Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip837d4979-2338-4b33-abd0-3108602ea2f1 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+ Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zipc284f576-75fc-4dd1-87fd-ddce03ce8b37 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
diff --git a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
index dd42b56630..c800216e72 100644
--- a/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
Computation times
=================
-**00:52.320** total execution time for **how_to_extend_tvm** files:
+**00:52.733** total execution time for **how_to_extend_tvm** files:
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:48.568 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_bring_your_own_datatypes.py` (``bring_your_own_datatypes.py``) | 00:49.000 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``) | 00:02.702 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_instrument.py` (``use_pass_instrument.py``) | 00:02.660 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``) | 00:01.043 | 0.0 MB |
+| :ref:`sphx_glr_how_to_extend_tvm_use_pass_infra.py` (``use_pass_infra.py``) | 00:01.066 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_extend_tvm_low_level_custom_pass.py` (``low_level_custom_pass.py``) | 00:00.007 | 0.0 MB |
+-------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
index 31add8c62a..ec0fab780f 100644
--- a/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
+++ b/docs/_sources/how_to/extend_tvm/use_pass_instrument.rst.txt
@@ -220,10 +220,10 @@ profile the execution time of each passes.
.. code-block:: none
Printing results of timing profile...
- InferType: 20645us [20645us] (48.79%; 48.79%)
- FoldScaleAxis: 21669us [7us] (51.21%; 51.21%)
- FoldConstant: 21662us [1649us] (51.19%; 99.97%)
- InferType: 20013us [20013us] (47.30%; 92.39%)
+ InferType: 21251us [21251us] (48.53%; 48.53%)
+ FoldScaleAxis: 22538us [9us] (51.47%; 51.47%)
+ FoldConstant: 22529us [1710us] (51.45%; 99.96%)
+ InferType: 20819us [20819us] (47.54%; 92.41%)
@@ -262,10 +262,10 @@ Refer to following sections and :py:func:`tvm.instrument.pass_instrument` for th
.. code-block:: none
Printing results of timing profile...
- InferType: 20069us [20069us] (48.20%; 48.20%)
- FoldScaleAxis: 21565us [5us] (51.80%; 51.80%)
- FoldConstant: 21560us [1675us] (51.78%; 99.98%)
- InferType: 19885us [19885us] (47.76%; 92.23%)
+ InferType: 20727us [20727us] (48.14%; 48.14%)
+ FoldScaleAxis: 22330us [6us] (51.86%; 51.86%)
+ FoldConstant: 22325us [1732us] (51.85%; 99.97%)
+ InferType: 20593us [20593us] (47.83%; 92.24%)
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
index 5f97f5ff7d..0861c6afaf 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_cuda.rst.txt
@@ -331,7 +331,7 @@ latency of convolution.
.. code-block:: none
- Convolution: 54.177120 ms
+ Convolution: 39.232894 ms
diff --git a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
index 3d4a22fefd..90a5c9657e 100644
--- a/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_conv_tensorcore.rst.txt
@@ -602,7 +602,7 @@ be able to run on our build server
.. code-block:: none
- conv2d with tensor core: 6.674224 ms
+ conv2d with tensor core: 13.368719 ms
diff --git a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
index 15eb67a5bc..0a2d8867b9 100644
--- a/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/opt_gemm.rst.txt
@@ -134,8 +134,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
.. code-block:: none
- Numpy running time: 0.017826
- Baseline: 3.292331
+ Numpy running time: 0.018515
+ Baseline: 3.232754
@@ -224,7 +224,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
.. code-block:: none
- Opt1: 0.305006
+ Opt1: 0.304784
@@ -312,7 +312,7 @@ In this tutorial, we chose to vectorize the inner loop row data since it is cach
.. code-block:: none
- Opt2: 0.339556
+ Opt2: 0.335927
@@ -397,7 +397,7 @@ the access pattern for A matrix is more cache friendly.
.. code-block:: none
- Opt3: 0.116267
+ Opt3: 0.115428
@@ -511,7 +511,7 @@ flattening.
.. code-block:: none
- Opt4: 0.108496
+ Opt4: 0.109501
@@ -620,7 +620,7 @@ write to C when all the block results are ready.
.. code-block:: none
- Opt5: 0.111046
+ Opt5: 0.111203
@@ -730,7 +730,7 @@ Furthermore, we can also utilize multi-core processors to do the thread-level pa
.. code-block:: none
- Opt6: 0.147583
+ Opt6: 0.146464
diff --git a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
index 0e1d8f906d..b776d719a2 100644
--- a/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/optimize_operators/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
Computation times
=================
-**00:34.615** total execution time for **how_to_optimize_operators** files:
+**00:34.352** total execution time for **how_to_optimize_operators** files:
+-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``) | 00:32.086 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_gemm.py` (``opt_gemm.py``) | 00:31.727 | 0.0 MB |
+-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.443 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_tensorcore.py` (``opt_conv_tensorcore.py``) | 00:01.563 | 0.0 MB |
+-----------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``) | 00:01.086 | 0.0 MB |
+| :ref:`sphx_glr_how_to_optimize_operators_opt_conv_cuda.py` (``opt_conv_cuda.py``) | 00:01.062 | 0.0 MB |
+-----------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
index b3d5fda921..8e9dc4512d 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/sg_execution_times.rst.txt
@@ -5,18 +5,18 @@
Computation times
=================
-**09:18.552** total execution time for **how_to_tune_with_autoscheduler** files:
+**09:28.269** total execution time for **how_to_tune_with_autoscheduler** files:
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 05:38.300 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py` (``tune_conv2d_layer_cuda.py``) | 05:34.610 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``) | 01:39.258 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_x86.py` (``tune_network_x86.py``) | 01:39.417 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``) | 01:05.499 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_cuda.py` (``tune_network_cuda.py``) | 01:06.112 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``) | 00:28.685 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_sparse_x86.py` (``tune_sparse_x86.py``) | 00:40.800 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``) | 00:13.915 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_arm.py` (``tune_network_arm.py``) | 00:14.207 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``) | 00:12.895 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autoscheduler_tune_network_mali.py` (``tune_network_mali.py``) | 00:13.123 | 0.0 MB |
+----------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
index e2c7175ff4..263028b588 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.rst.txt
@@ -241,481 +241,356 @@ cooperative fetching, unrolling and operator fusion.
def main(data: T.Buffer((1, 512, 7, 7), "float32"), kernel: T.Buffer((512, 512, 3, 3), "float32"), bias: T.Buffer((1, 512, 1, 1), "float32"), compute: T.Buffer((1, 512, 7, 7), "float32")):
T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
blockIdx_x = T.env_thread("blockIdx.x")
- T.launch_thread(blockIdx_x, 28)
+ T.launch_thread(blockIdx_x, 32)
conv2d_nchw = T.allocate([14], "float32", "local")
- pad_temp_shared = T.allocate([72], "float32", "shared")
- kernel_shared = T.allocate([3072], "float32", "shared")
+ pad_temp_shared = T.allocate([1008], "float32", "shared")
+ kernel_shared = T.allocate([768], "float32", "shared")
threadIdx_x = T.env_thread("threadIdx.x")
- T.launch_thread(threadIdx_x, 64)
- conv2d_nchw_1 = T.Buffer((14,), data=conv2d_nchw, scope="local", align=32)
+ T.launch_thread(threadIdx_x, 56)
+ conv2d_nchw_1 = T.Buffer((4,), data=conv2d_nchw, scope="local", align=8)
conv2d_nchw_1[0] = T.float32(0)
- conv2d_nchw_1[1] = T.float32(0)
conv2d_nchw_1[2] = T.float32(0)
- conv2d_nchw_1[3] = T.float32(0)
conv2d_nchw_1[4] = T.float32(0)
- conv2d_nchw_1[5] = T.float32(0)
conv2d_nchw_1[6] = T.float32(0)
- conv2d_nchw_1[7] = T.float32(0)
conv2d_nchw_1[8] = T.float32(0)
- conv2d_nchw_1[9] = T.float32(0)
conv2d_nchw_1[10] = T.float32(0)
- conv2d_nchw_1[11] = T.float32(0)
conv2d_nchw_1[12] = T.float32(0)
+ conv2d_nchw_1[1] = T.float32(0)
+ conv2d_nchw_1[3] = T.float32(0)
+ conv2d_nchw_1[5] = T.float32(0)
+ conv2d_nchw_1[7] = T.float32(0)
+ conv2d_nchw_1[9] = T.float32(0)
+ conv2d_nchw_1[11] = T.float32(0)
conv2d_nchw_1[13] = T.float32(0)
- for rc_outer_outer, ry_outer_outer in T.grid(64, 3):
- cse_var_2: T.int32 = rc_outer_outer * 72
- cse_var_1: T.int32 = ry_outer_outer * 3
+ for rc_outer_outer, rx_outer_outer in T.grid(32, 3):
+ cse_var_1: T.int32 = rc_outer_outer * 144
threadIdx_x_1 = T.env_thread("threadIdx.x")
- pad_temp_shared_1 = T.Buffer((72,), data=pad_temp_shared, scope="shared")
- with T.launch_thread(threadIdx_x_1, 64):
+ pad_temp_shared_1 = T.Buffer((1008,), data=pad_temp_shared, scope="shared")
+ with T.launch_thread(threadIdx_x_1, 56):
data_1 = T.Buffer((25088,), data=data.data)
if T.likely(threadIdx_x_1 < 18):
- pad_temp_shared_1[threadIdx_x_1 * 4] = T.if_then_else(1 <= ry_outer_outer + blockIdx_x % 7 and ry_outer_outer + blockIdx_x % 7 < 8 and 1 <= threadIdx_x_1 * 4 % 9 and threadIdx_x_1 * 4 % 9 < 8, data_1[rc_outer_outer * 392 + threadIdx_x_1 * 4 // 9 * 49 + ry_outer_outer * 7 + blockIdx_x % 7 * 7 + threadIdx_x_1 * 4 % 9 - 8], T.float32(0))
+ pad_temp_shared_1[threadIdx_x_1 * 56] = T.if_then_else(1 <= threadIdx_x_1 * 8 % 9 and threadIdx_x_1 * 8 % 9 < 8 and 1 <= rx_outer_outer, data_1[rc_outer_outer * 784 + threadIdx_x_1 * 8 // 9 * 49 + threadIdx_x_1 * 8 % 9 * 7 + rx_outer_outer - 8], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 1] = T.if_then_else(1 <= threadIdx_x_1 * 8 % 9 and threadIdx_x_1 * 8 % 9 < 8, data_1[rc_outer_outer * 784 + threadIdx_x_1 * 8 // 9 * 49 + threadIdx_x_1 * 8 % 9 * 7 + rx_outer_outer - 7], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 2] = T.if_then_else(1 <= threadIdx_x_1 * 8 % 9 and threadIdx_x_1 * 8 % 9 < 8, data_1[rc_outer_outer * 784 + threadIdx_x_1 * 8 // 9 * 49 + threadIdx_x_1 * 8 % 9 * 7 + rx_outer_outer - 6], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 3] = T.if_then_else(1 <= threadIdx_x_1 * 8 % 9 and threadIdx_x_1 * 8 % 9 < 8, data_1[rc_outer_outer * 784 + threadIdx_x_1 * 8 // 9 * 49 + threadIdx_x_1 * 8 % 9 * 7 + rx_outer_outer - 5], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 4] = T.if_then_else(1 <= threadIdx_x_1 * 8 % 9 and threadIdx_x_1 * 8 % 9 < 8, data_1[rc_outer_outer * 784 + threadIdx_x_1 * 8 // 9 * 49 + threadIdx_x_1 * 8 % 9 * 7 + rx_outer_outer - 4], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 5] = T.if_then_else(1 <= threadIdx_x_1 * 8 % 9 and threadIdx_x_1 * 8 % 9 < 8, data_1[rc_outer_outer * 784 + threadIdx_x_1 * 8 // 9 * 49 + threadIdx_x_1 * 8 % 9 * 7 + rx_outer_outer - 3], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 6] = T.if_then_else(1 <= threadIdx_x_1 * 8 % 9 and threadIdx_x_1 * 8 % 9 < 8 and rx_outer_outer < 2, data_1[rc_outer_outer * 784 + threadIdx_x_1 * 8 // 9 * 49 + threadIdx_x_1 * 8 % 9 * 7 + rx_outer_outer - 2], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 7] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 1) % 9 and (threadIdx_x_1 * 8 + 1) % 9 < 8 and 1 <= rx_outer_outer, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 1) // 9 * 49 + (threadIdx_x_1 * 8 + 1) % 9 * 7 + rx_outer_outer - 8], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 8] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 1) % 9 and (threadIdx_x_1 * 8 + 1) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 1) // 9 * 49 + (threadIdx_x_1 * 8 + 1) % 9 * 7 + rx_outer_outer - 7], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 9] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 1) % 9 and (threadIdx_x_1 * 8 + 1) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 1) // 9 * 49 + (threadIdx_x_1 * 8 + 1) % 9 * 7 + rx_outer_outer - 6], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 10] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 1) % 9 and (threadIdx_x_1 * 8 + 1) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 1) // 9 * 49 + (threadIdx_x_1 * 8 + 1) % 9 * 7 + rx_outer_outer - 5], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 11] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 1) % 9 and (threadIdx_x_1 * 8 + 1) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 1) // 9 * 49 + (threadIdx_x_1 * 8 + 1) % 9 * 7 + rx_outer_outer - 4], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 12] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 1) % 9 and (threadIdx_x_1 * 8 + 1) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 1) // 9 * 49 + (threadIdx_x_1 * 8 + 1) % 9 * 7 + rx_outer_outer - 3], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 13] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 1) % 9 and (threadIdx_x_1 * 8 + 1) % 9 < 8 and rx_outer_outer < 2, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 1) // 9 * 49 + (threadIdx_x_1 * 8 + 1) % 9 * 7 + rx_outer_outer - 2], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 14] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 2) % 9 and (threadIdx_x_1 * 8 + 2) % 9 < 8 and 1 <= rx_outer_outer, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 2) // 9 * 49 + (threadIdx_x_1 * 8 + 2) % 9 * 7 + rx_outer_outer - 8], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 15] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 2) % 9 and (threadIdx_x_1 * 8 + 2) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 2) // 9 * 49 + (threadIdx_x_1 * 8 + 2) % 9 * 7 + rx_outer_outer - 7], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 16] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 2) % 9 and (threadIdx_x_1 * 8 + 2) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 2) // 9 * 49 + (threadIdx_x_1 * 8 + 2) % 9 * 7 + rx_outer_outer - 6], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 17] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 2) % 9 and (threadIdx_x_1 * 8 + 2) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 2) // 9 * 49 + (threadIdx_x_1 * 8 + 2) % 9 * 7 + rx_outer_outer - 5], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 18] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 2) % 9 and (threadIdx_x_1 * 8 + 2) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 2) // 9 * 49 + (threadIdx_x_1 * 8 + 2) % 9 * 7 + rx_outer_outer - 4], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 19] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 2) % 9 and (threadIdx_x_1 * 8 + 2) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 2) // 9 * 49 + (threadIdx_x_1 * 8 + 2) % 9 * 7 + rx_outer_outer - 3], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 20] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 2) % 9 and (threadIdx_x_1 * 8 + 2) % 9 < 8 and rx_outer_outer < 2, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 2) // 9 * 49 + (threadIdx_x_1 * 8 + 2) % 9 * 7 + rx_outer_outer - 2], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 21] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 3) % 9 and (threadIdx_x_1 * 8 + 3) % 9 < 8 and 1 <= rx_outer_outer, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 3) // 9 * 49 + (threadIdx_x_1 * 8 + 3) % 9 * 7 + rx_outer_outer - 8], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 22] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 3) % 9 and (threadIdx_x_1 * 8 + 3) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 3) // 9 * 49 + (threadIdx_x_1 * 8 + 3) % 9 * 7 + rx_outer_outer - 7], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 23] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 3) % 9 and (threadIdx_x_1 * 8 + 3) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 3) // 9 * 49 + (threadIdx_x_1 * 8 + 3) % 9 * 7 + rx_outer_outer - 6], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 24] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 3) % 9 and (threadIdx_x_1 * 8 + 3) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 3) // 9 * 49 + (threadIdx_x_1 * 8 + 3) % 9 * 7 + rx_outer_outer - 5], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 25] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 3) % 9 and (threadIdx_x_1 * 8 + 3) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 3) // 9 * 49 + (threadIdx_x_1 * 8 + 3) % 9 * 7 + rx_outer_outer - 4], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 26] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 3) % 9 and (threadIdx_x_1 * 8 + 3) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 3) // 9 * 49 + (threadIdx_x_1 * 8 + 3) % 9 * 7 + rx_outer_outer - 3], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 27] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 3) % 9 and (threadIdx_x_1 * 8 + 3) % 9 < 8 and rx_outer_outer < 2, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 3) // 9 * 49 + (threadIdx_x_1 * 8 + 3) % 9 * 7 + rx_outer_outer - 2], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 28] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 4) % 9 and (threadIdx_x_1 * 8 + 4) % 9 < 8 and 1 <= rx_outer_outer, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 4) // 9 * 49 + (threadIdx_x_1 * 8 + 4) % 9 * 7 + rx_outer_outer - 8], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 29] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 4) % 9 and (threadIdx_x_1 * 8 + 4) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 4) // 9 * 49 + (threadIdx_x_1 * 8 + 4) % 9 * 7 + rx_outer_outer - 7], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 30] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 4) % 9 and (threadIdx_x_1 * 8 + 4) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 4) // 9 * 49 + (threadIdx_x_1 * 8 + 4) % 9 * 7 + rx_outer_outer - 6], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 31] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 4) % 9 and (threadIdx_x_1 * 8 + 4) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 4) // 9 * 49 + (threadIdx_x_1 * 8 + 4) % 9 * 7 + rx_outer_outer - 5], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 32] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 4) % 9 and (threadIdx_x_1 * 8 + 4) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 4) // 9 * 49 + (threadIdx_x_1 * 8 + 4) % 9 * 7 + rx_outer_outer - 4], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 33] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 4) % 9 and (threadIdx_x_1 * 8 + 4) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 4) // 9 * 49 + (threadIdx_x_1 * 8 + 4) % 9 * 7 + rx_outer_outer - 3], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 34] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 4) % 9 and (threadIdx_x_1 * 8 + 4) % 9 < 8 and rx_outer_outer < 2, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 4) // 9 * 49 + (threadIdx_x_1 * 8 + 4) % 9 * 7 + rx_outer_outer - 2], T.float32(0))
if T.likely(threadIdx_x_1 < 18):
- pad_temp_shared_1[threadIdx_x_1 * 4 + 1] = T.if_then_else(1 <= ry_outer_outer + blockIdx_x % 7 and ry_outer_outer + blockIdx_x % 7 < 8 and 1 <= (threadIdx_x_1 * 4 + 1) % 9 and (threadIdx_x_1 * 4 + 1) % 9 < 8, data_1[rc_outer_outer * 392 + (threadIdx_x_1 * 4 + 1) // 9 * 49 + ry_outer_outer * 7 + blockIdx_x % 7 * 7 + (threadIdx_x_1 * 4 + 1) % 9 - 8], T.float32(0))
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 35] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 5) % 9 and (threadIdx_x_1 * 8 + 5) % 9 < 8 and 1 <= rx_outer_outer, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 5) // 9 * 49 + (threadIdx_x_1 * 8 + 5) % 9 * 7 + rx_outer_outer - 8], T.float32(0))
if T.likely(threadIdx_x_1 < 18):
- pad_temp_shared_1[threadIdx_x_1 * 4 + 2] = T.if_then_else(1 <= ry_outer_outer + blockIdx_x % 7 and ry_outer_outer + blockIdx_x % 7 < 8 and 1 <= (threadIdx_x_1 * 4 + 2) % 9 and (threadIdx_x_1 * 4 + 2) % 9 < 8, data_1[rc_outer_outer * 392 + (threadIdx_x_1 * 4 + 2) // 9 * 49 + ry_outer_outer * 7 + blockIdx_x % 7 * 7 + (threadIdx_x_1 * 4 + 2) % 9 - 8], T.float32(0))
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 36] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 5) % 9 and (threadIdx_x_1 * 8 + 5) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 5) // 9 * 49 + (threadIdx_x_1 * 8 + 5) % 9 * 7 + rx_outer_outer - 7], T.float32(0))
if T.likely(threadIdx_x_1 < 18):
- pad_temp_shared_1[threadIdx_x_1 * 4 + 3] = T.if_then_else(1 <= ry_outer_outer + blockIdx_x % 7 and ry_outer_outer + blockIdx_x % 7 < 8 and 1 <= (threadIdx_x_1 * 4 + 3) % 9 and (threadIdx_x_1 * 4 + 3) % 9 < 8, data_1[rc_outer_outer * 392 + (threadIdx_x_1 * 4 + 3) // 9 * 49 + ry_outer_outer * 7 + blockIdx_x % 7 * 7 + (threadIdx_x_1 * 4 + 3) % 9 - 8], T.float32(0))
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 37] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 5) % 9 and (threadIdx_x_1 * 8 + 5) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 5) // 9 * 49 + (threadIdx_x_1 * 8 + 5) % 9 * 7 + rx_outer_outer - 6], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 38] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 5) % 9 and (threadIdx_x_1 * 8 + 5) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 5) // 9 * 49 + (threadIdx_x_1 * 8 + 5) % 9 * 7 + rx_outer_outer - 5], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 39] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 5) % 9 and (threadIdx_x_1 * 8 + 5) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 5) // 9 * 49 + (threadIdx_x_1 * 8 + 5) % 9 * 7 + rx_outer_outer - 4], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 40] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 5) % 9 and (threadIdx_x_1 * 8 + 5) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 5) // 9 * 49 + (threadIdx_x_1 * 8 + 5) % 9 * 7 + rx_outer_outer - 3], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 41] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 5) % 9 and (threadIdx_x_1 * 8 + 5) % 9 < 8 and rx_outer_outer < 2, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 5) // 9 * 49 + (threadIdx_x_1 * 8 + 5) % 9 * 7 + rx_outer_outer - 2], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 42] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 6) % 9 and (threadIdx_x_1 * 8 + 6) % 9 < 8 and 1 <= rx_outer_outer, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 6) // 9 * 49 + (threadIdx_x_1 * 8 + 6) % 9 * 7 + rx_outer_outer - 8], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 43] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 6) % 9 and (threadIdx_x_1 * 8 + 6) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 6) // 9 * 49 + (threadIdx_x_1 * 8 + 6) % 9 * 7 + rx_outer_outer - 7], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 44] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 6) % 9 and (threadIdx_x_1 * 8 + 6) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 6) // 9 * 49 + (threadIdx_x_1 * 8 + 6) % 9 * 7 + rx_outer_outer - 6], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 45] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 6) % 9 and (threadIdx_x_1 * 8 + 6) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 6) // 9 * 49 + (threadIdx_x_1 * 8 + 6) % 9 * 7 + rx_outer_outer - 5], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 46] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 6) % 9 and (threadIdx_x_1 * 8 + 6) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 6) // 9 * 49 + (threadIdx_x_1 * 8 + 6) % 9 * 7 + rx_outer_outer - 4], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 47] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 6) % 9 and (threadIdx_x_1 * 8 + 6) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 6) // 9 * 49 + (threadIdx_x_1 * 8 + 6) % 9 * 7 + rx_outer_outer - 3], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 48] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 6) % 9 and (threadIdx_x_1 * 8 + 6) % 9 < 8 and rx_outer_outer < 2, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 6) // 9 * 49 + (threadIdx_x_1 * 8 + 6) % 9 * 7 + rx_outer_outer - 2], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 49] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 7) % 9 and (threadIdx_x_1 * 8 + 7) % 9 < 8 and 1 <= rx_outer_outer, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 7) // 9 * 49 + (threadIdx_x_1 * 8 + 7) % 9 * 7 + rx_outer_outer - 8], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 50] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 7) % 9 and (threadIdx_x_1 * 8 + 7) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 7) // 9 * 49 + (threadIdx_x_1 * 8 + 7) % 9 * 7 + rx_outer_outer - 7], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 51] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 7) % 9 and (threadIdx_x_1 * 8 + 7) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 7) // 9 * 49 + (threadIdx_x_1 * 8 + 7) % 9 * 7 + rx_outer_outer - 6], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 52] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 7) % 9 and (threadIdx_x_1 * 8 + 7) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 7) // 9 * 49 + (threadIdx_x_1 * 8 + 7) % 9 * 7 + rx_outer_outer - 5], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 53] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 7) % 9 and (threadIdx_x_1 * 8 + 7) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 7) // 9 * 49 + (threadIdx_x_1 * 8 + 7) % 9 * 7 + rx_outer_outer - 4], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 54] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 7) % 9 and (threadIdx_x_1 * 8 + 7) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 7) // 9 * 49 + (threadIdx_x_1 * 8 + 7) % 9 * 7 + rx_outer_outer - 3], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 55] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 7) % 9 and (threadIdx_x_1 * 8 + 7) % 9 < 8 and rx_outer_outer < 2, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 7) // 9 * 49 + (threadIdx_x_1 * 8 + 7) % 9 * 7 + rx_outer_outer - 2], T.float32(0))
threadIdx_x_2 = T.env_thread("threadIdx.x")
- kernel_shared_1 = T.Buffer((3072,), data=kernel_shared, scope="shared")
+ kernel_shared_1 = T.Buffer((768,), data=kernel_shared, scope="shared")
kernel_1 = T.Buffer((2359296,), data=kernel.data)
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 64] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 64) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 128] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 128) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 192] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 36864]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 256] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 256) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 320] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 320) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 384] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 73728]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 448] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 448) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 512] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 512) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 576] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 110592]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 640] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 640) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 704] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 704) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 768] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 147456]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 832] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 832) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 896] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 896) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 960] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 184320]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1024] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1024) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1088] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1088) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1152] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 221184]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1216] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1216) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1280] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1280) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1344] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 258048]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1408] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1408) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1472] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1472) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1536] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 294912]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1600] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1600) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1664] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1664) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1728] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 331776]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1792] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1792) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1856] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1856) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1920] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 368640]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1984] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1984) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2048] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2048) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2112] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 405504]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2176] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2176) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2240] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2240) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2304] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 442368]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2368] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2368) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2432] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2432) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2496] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 479232]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2560] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2560) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2624] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2624) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2688] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 516096]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2752] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2752) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2816] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2816) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2880] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 552960]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2944] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2944) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 3008] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 3008) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[0] * kernel_shared_1[threadIdx_x * 48]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[9] * kernel_shared_1[threadIdx_x * 48 + 3]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[1] * kernel_shared_1[threadIdx_x * 48]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[10] * kernel_shared_1[threadIdx_x * 48 + 3]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 3]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 3]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 3]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 3]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 3]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[0] * kernel_shared_1[threadIdx_x * 48 + 24]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[9] * kernel_shared_1[threadIdx_x * 48 + 27]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[1] * kernel_shared_1[threadIdx_x * 48 + 24]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[10] * kernel_shared_1[threadIdx_x * 48 + 27]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48 + 24]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 27]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48 + 24]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 27]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48 + 24]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 27]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48 + 24]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 27]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48 + 24]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 27]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[1] * kernel_shared_1[threadIdx_x * 48 + 1]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[10] * kernel_shared_1[threadIdx_x * 48 + 4]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48 + 1]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 4]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48 + 1]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 4]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48 + 1]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 4]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48 + 1]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 4]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48 + 1]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 4]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[7] * kernel_shared_1[threadIdx_x * 48 + 1]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[16] * kernel_shared_1[threadIdx_x * 48 + 4]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[1] * kernel_shared_1[threadIdx_x * 48 + 25]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[10] * kernel_shared_1[threadIdx_x * 48 + 28]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48 + 25]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 28]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48 + 25]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 28]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48 + 25]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 28]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48 + 25]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 28]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48 + 25]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 28]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[7] * kernel_shared_1[threadIdx_x * 48 + 25]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[16] * kernel_shared_1[threadIdx_x * 48 + 28]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48 + 2]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 5]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48 + 2]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 5]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48 + 2]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 5]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48 + 2]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 5]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48 + 2]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 5]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[7] * kernel_shared_1[threadIdx_x * 48 + 2]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[16] * kernel_shared_1[threadIdx_x * 48 + 5]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[8] * kernel_shared_1[threadIdx_x * 48 + 2]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[17] * kernel_shared_1[threadIdx_x * 48 + 5]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48 + 26]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 29]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48 + 26]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 29]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48 + 26]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 29]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48 + 26]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 29]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48 + 26]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 29]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[7] * kernel_shared_1[threadIdx_x * 48 + 26]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[16] * kernel_shared_1[threadIdx_x * 48 + 29]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[8] * kernel_shared_1[threadIdx_x * 48 + 26]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[17] * kernel_shared_1[threadIdx_x * 48 + 29]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[18] * kernel_shared_1[threadIdx_x * 48 + 6]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[27] * kernel_shared_1[threadIdx_x * 48 + 9]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[19] * kernel_shared_1[threadIdx_x * 48 + 6]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[28] * kernel_shared_1[threadIdx_x * 48 + 9]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 6]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 9]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 6]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 9]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 6]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 9]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 6]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 9]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 6]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 9]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[18] * kernel_shared_1[threadIdx_x * 48 + 30]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[27] * kernel_shared_1[threadIdx_x * 48 + 33]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[19] * kernel_shared_1[threadIdx_x * 48 + 30]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[28] * kernel_shared_1[threadIdx_x * 48 + 33]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 30]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 33]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 30]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 33]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 30]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 33]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 30]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 33]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 30]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 33]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[19] * kernel_shared_1[threadIdx_x * 48 + 7]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[28] * kernel_shared_1[threadIdx_x * 48 + 10]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 7]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 10]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 7]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 10]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 7]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 10]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 7]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 10]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 7]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 10]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[25] * kernel_shared_1[threadIdx_x * 48 + 7]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[34] * kernel_shared_1[threadIdx_x * 48 + 10]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[19] * kernel_shared_1[threadIdx_x * 48 + 31]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[28] * kernel_shared_1[threadIdx_x * 48 + 34]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 31]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 34]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 31]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 34]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 31]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 34]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 31]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 34]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 31]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 34]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[25] * kernel_shared_1[threadIdx_x * 48 + 31]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[34] * kernel_shared_1[threadIdx_x * 48 + 34]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 8]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 11]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 8]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 11]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 8]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 11]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 8]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 11]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 8]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 11]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[25] * kernel_shared_1[threadIdx_x * 48 + 8]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[34] * kernel_shared_1[threadIdx_x * 48 + 11]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[26] * kernel_shared_1[threadIdx_x * 48 + 8]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[35] * kernel_shared_1[threadIdx_x * 48 + 11]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 32]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 35]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 32]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 35]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 32]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 35]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 32]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 35]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 32]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 35]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[25] * kernel_shared_1[threadIdx_x * 48 + 32]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[34] * kernel_shared_1[threadIdx_x * 48 + 35]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[26] * kernel_shared_1[threadIdx_x * 48 + 32]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[35] * kernel_shared_1[threadIdx_x * 48 + 35]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[36] * kernel_shared_1[threadIdx_x * 48 + 12]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[45] * kernel_shared_1[threadIdx_x * 48 + 15]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[37] * kernel_shared_1[threadIdx_x * 48 + 12]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[46] * kernel_shared_1[threadIdx_x * 48 + 15]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 12]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 15]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 12]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 15]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 12]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 15]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 12]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 15]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 12]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 15]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[36] * kernel_shared_1[threadIdx_x * 48 + 36]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[45] * kernel_shared_1[threadIdx_x * 48 + 39]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[37] * kernel_shared_1[threadIdx_x * 48 + 36]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[46] * kernel_shared_1[threadIdx_x * 48 + 39]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 36]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 39]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 36]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 39]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 36]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 39]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 36]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 39]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 36]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 39]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[37] * kernel_shared_1[threadIdx_x * 48 + 13]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[46] * kernel_shared_1[threadIdx_x * 48 + 16]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 13]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 16]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 13]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 16]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 13]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 16]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 13]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 16]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 13]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 16]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[43] * kernel_shared_1[threadIdx_x * 48 + 13]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[52] * kernel_shared_1[threadIdx_x * 48 + 16]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[37] * kernel_shared_1[threadIdx_x * 48 + 37]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[46] * kernel_shared_1[threadIdx_x * 48 + 40]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 37]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 40]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 37]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 40]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 37]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 40]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 37]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 40]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 37]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 40]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[43] * kernel_shared_1[threadIdx_x * 48 + 37]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[52] * kernel_shared_1[threadIdx_x * 48 + 40]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 14]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 17]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 14]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 17]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 14]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 17]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 14]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 17]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 14]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 17]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[43] * kernel_shared_1[threadIdx_x * 48 + 14]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[52] * kernel_shared_1[threadIdx_x * 48 + 17]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[44] * kernel_shared_1[threadIdx_x * 48 + 14]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[53] * kernel_shared_1[threadIdx_x * 48 + 17]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 38]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 41]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 38]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 41]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 38]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 41]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 38]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 41]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 38]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 41]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[43] * kernel_shared_1[threadIdx_x * 48 + 38]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[52] * kernel_shared_1[threadIdx_x * 48 + 41]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[44] * kernel_shared_1[threadIdx_x * 48 + 38]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[53] * kernel_shared_1[threadIdx_x * 48 + 41]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[54] * kernel_shared_1[threadIdx_x * 48 + 18]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[63] * kernel_shared_1[threadIdx_x * 48 + 21]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[55] * kernel_shared_1[threadIdx_x * 48 + 18]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[64] * kernel_shared_1[threadIdx_x * 48 + 21]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 18]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 21]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 18]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 21]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 18]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 21]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 18]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 21]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 18]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 21]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[54] * kernel_shared_1[threadIdx_x * 48 + 42]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[63] * kernel_shared_1[threadIdx_x * 48 + 45]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[55] * kernel_shared_1[threadIdx_x * 48 + 42]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[64] * kernel_shared_1[threadIdx_x * 48 + 45]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 42]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 45]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 42]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 45]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 42]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 45]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 42]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 45]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 42]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 45]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[55] * kernel_shared_1[threadIdx_x * 48 + 19]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[64] * kernel_shared_1[threadIdx_x * 48 + 22]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 19]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 22]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 19]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 22]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 19]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 22]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 19]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 22]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 19]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 22]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[61] * kernel_shared_1[threadIdx_x * 48 + 19]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[70] * kernel_shared_1[threadIdx_x * 48 + 22]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[55] * kernel_shared_1[threadIdx_x * 48 + 43]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[64] * kernel_shared_1[threadIdx_x * 48 + 46]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 43]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 46]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 43]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 46]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 43]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 46]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 43]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 46]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 43]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 46]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[61] * kernel_shared_1[threadIdx_x * 48 + 43]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[70] * kernel_shared_1[threadIdx_x * 48 + 46]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 20]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 23]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 20]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 23]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 20]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 23]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 20]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 23]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 20]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 23]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[61] * kernel_shared_1[threadIdx_x * 48 + 20]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[70] * kernel_shared_1[threadIdx_x * 48 + 23]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[62] * kernel_shared_1[threadIdx_x * 48 + 20]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[71] * kernel_shared_1[threadIdx_x * 48 + 23]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 44]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 47]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 44]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 47]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 44]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 47]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 44]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 47]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 44]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 47]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[61] * kernel_shared_1[threadIdx_x * 48 + 44]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[70] * kernel_shared_1[threadIdx_x * 48 + 47]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[62] * kernel_shared_1[threadIdx_x * 48 + 44]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[71] * kernel_shared_1[threadIdx_x * 48 + 47]
- for i1_inner, i3_inner in T.grid(2, 7):
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2] = kernel_1[blockIdx_x * 73728 + threadIdx_x_2 // 48 * 4608 + cse_var_1 + threadIdx_x_2 % 48 * 3 + rx_outer_outer]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 56] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 56) // 48 * 4608 + cse_var_1 + (threadIdx_x_2 + 8) % 48 // 3 * 9 + (threadIdx_x_2 + 2) % 3 * 3 + rx_outer_outer]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 112] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 112) // 48 * 4608 + cse_var_1 + (threadIdx_x_2 + 16) % 48 // 3 * 9 + (threadIdx_x_2 + 1) % 3 * 3 + rx_outer_outer]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 168] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 168) // 48 * 4608 + cse_var_1 + (threadIdx_x_2 // 3 + 8) % 16 * 9 + threadIdx_x_2 % 3 * 3 + rx_outer_outer]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 224] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 224) // 48 * 4608 + cse_var_1 + (threadIdx_x_2 + 32) % 48 // 3 * 9 + (threadIdx_x_2 + 2) % 3 * 3 + rx_outer_outer]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 280] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 280) // 48 * 4608 + cse_var_1 + (threadIdx_x_2 + 40) % 48 // 3 * 9 + (threadIdx_x_2 + 1) % 3 * 3 + rx_outer_outer]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 336] = kernel_1[blockIdx_x * 73728 + threadIdx_x_2 // 48 * 4608 + cse_var_1 + threadIdx_x_2 % 48 * 3 + rx_outer_outer + 32256]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 392] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 392) // 48 * 4608 + cse_var_1 + (threadIdx_x_2 + 8) % 48 // 3 * 9 + (threadIdx_x_2 + 2) % 3 * 3 + rx_outer_outer]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 448] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 448) // 48 * 4608 + cse_var_1 + (threadIdx_x_2 + 16) % 48 // 3 * 9 + (threadIdx_x_2 + 1) % 3 * 3 + rx_outer_outer]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 504] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 504) // 48 * 4608 + cse_var_1 + (threadIdx_x_2 // 3 + 8) % 16 * 9 + threadIdx_x_2 % 3 * 3 + rx_outer_outer]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 560] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 560) // 48 * 4608 + cse_var_1 + (threadIdx_x_2 + 32) % 48 // 3 * 9 + (threadIdx_x_2 + 2) % 3 * 3 + rx_outer_outer]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 616] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 616) // 48 * 4608 + cse_var_1 + (threadIdx_x_2 + 40) % 48 // 3 * 9 + (threadIdx_x_2 + 1) % 3 * 3 + rx_outer_outer]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 672] = kernel_1[blockIdx_x * 73728 + threadIdx_x_2 // 48 * 4608 + cse_var_1 + threadIdx_x_2 % 48 * 3 + rx_outer_outer + 64512]
+ with T.launch_thread(threadIdx_x_2, 56):
+ if T.likely(threadIdx_x_2 < 40):
+ kernel_shared_1[threadIdx_x_2 + 728] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 728) // 48 * 4608 + cse_var_1 + (threadIdx_x_2 + 8) % 48 // 3 * 9 + (threadIdx_x_2 + 2) % 3 * 3 + rx_outer_outer]
+ for rc_outer_inner in range(4):
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 1] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 2] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 3] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 4] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 5] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 6] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12]
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 63] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 3]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 64] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 3]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 65] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 3]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 66] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 3]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 67] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 3]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 68] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 3]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 69] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 3]
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 126] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 6]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 127] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 6]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 128] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 6]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 129] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 6]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 130] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 6]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 131] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 6]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 132] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 6]
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 189] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 9]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 190] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 9]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 191] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 9]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 192] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 9]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 193] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 9]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 194] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 9]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 195] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 9]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 48]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 1] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 48]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 2] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 48]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 3] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 48]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 4] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 48]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 5] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 48]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 6] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 48]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 63] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 51]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 64] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 51]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 65] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 51]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 66] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 51]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 67] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 51]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 68] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 51]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 69] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 51]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 126] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 54]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 127] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 54]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 128] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 54]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 129] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 54]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 130] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 54]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 131] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 54]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 132] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 54]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 189] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 57]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 190] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 57]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 191] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 57]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 192] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 57]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 193] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 57]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 194] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 57]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 195] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 57]
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 7] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 1]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 8] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 1]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 9] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 1]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 10] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 1]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 11] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 1]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 12] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 1]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 13] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 1]
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 70] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 4]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 71] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 4]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 72] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 4]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 73] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 4]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 74] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 4]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 75] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 4]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 76] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 4]
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 133] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 7]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 134] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 7]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 135] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 7]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 136] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 7]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 137] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 7]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 138] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 7]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 139] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 7]
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 196] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 10]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 197] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 10]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 198] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 10]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 199] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 10]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 200] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 10]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 201] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 10]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 202] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 10]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 7] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 49]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 8] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 49]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 9] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 49]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 10] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 49]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 11] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 49]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 12] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 49]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 13] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 49]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 70] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 52]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 71] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 52]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 72] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 52]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 73] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 52]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 74] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 52]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 75] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 52]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 76] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 52]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 133] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 55]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 134] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 55]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 135] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 55]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 136] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 55]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 137] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 55]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 138] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 55]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 139] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 55]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 196] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 58]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 197] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 58]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 198] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 58]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 199] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 58]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 200] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 58]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 201] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 58]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 202] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 58]
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 14] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 2]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 15] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 2]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 16] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 2]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 17] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 2]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 18] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 2]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 19] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 2]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 20] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 2]
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 77] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 5]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 78] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 5]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 79] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 5]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 80] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 5]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 81] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 5]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 82] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 5]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 83] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 5]
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 140] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 8]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 141] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 8]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 142] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 8]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 143] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 8]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 144] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 8]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 145] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 8]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 146] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 8]
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 203] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 11]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 204] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 11]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 205] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 11]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 206] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 11]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 207] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 11]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 208] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 11]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 209] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 11]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 14] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 50]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 15] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 50]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 16] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 50]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 17] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 50]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 18] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 50]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 19] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 50]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 20] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 50]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 77] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 53]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 78] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 53]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 79] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 53]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 80] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 53]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 81] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 53]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 82] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 53]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 83] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 53]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 140] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 56]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 141] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 56]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 142] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 56]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 143] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 56]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 144] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 56]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 145] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 56]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 146] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 56]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 203] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 59]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 204] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 59]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 205] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 59]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 206] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 59]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 207] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 59]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 208] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 59]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 209] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 59]
+ for i1_inner in range(2):
compute_1 = T.Buffer((25088,), data=compute.data)
bias_1 = T.Buffer((512,), data=bias.data)
- compute_1[blockIdx_x // 7 * 6272 + threadIdx_x * 98 + i1_inner * 49 + blockIdx_x % 7 * 7 + i3_inner] = T.max(conv2d_nchw_1[i1_inner * 7 + i3_inner] + bias_1[blockIdx_x // 7 * 128 + threadIdx_x * 2 + i1_inner], T.float32(0))
+ compute_1[blockIdx_x * 784 + threadIdx_x // 7 * 98 + i1_inner * 49 + threadIdx_x % 7 * 7] = T.max(conv2d_nchw_1[i1_inner] + bias_1[blockIdx_x * 16 + threadIdx_x // 7 * 2 + i1_inner], T.float32(0))
+ compute_1[blockIdx_x * 784 + threadIdx_x // 7 * 98 + i1_inner * 49 + threadIdx_x % 7 * 7 + 1] = T.max(conv2d_nchw_1[i1_inner + 2] + bias_1[blockIdx_x * 16 + threadIdx_x // 7 * 2 + i1_inner], T.float32(0))
+ compute_1[blockIdx_x * 784 + threadIdx_x // 7 * 98 + i1_inner * 49 + threadIdx_x % 7 * 7 + 2] = T.max(conv2d_nchw_1[i1_inner + 4] + bias_1[blockIdx_x * 16 + threadIdx_x // 7 * 2 + i1_inner], T.float32(0))
+ compute_1[blockIdx_x * 784 + threadIdx_x // 7 * 98 + i1_inner * 49 + threadIdx_x % 7 * 7 + 3] = T.max(conv2d_nchw_1[i1_inner + 6] + bias_1[blockIdx_x * 16 + threadIdx_x // 7 * 2 + i1_inner], T.float32(0))
+ compute_1[blockIdx_x * 784 + threadIdx_x // 7 * 98 + i1_inner * 49 + threadIdx_x % 7 * 7 + 4] = T.max(conv2d_nchw_1[i1_inner + 8] + bias_1[blockIdx_x * 16 + threadIdx_x // 7 * 2 + i1_inner], T.float32(0))
+ compute_1[blockIdx_x * 784 + threadIdx_x // 7 * 98 + i1_inner * 49 + threadIdx_x % 7 * 7 + 5] = T.max(conv2d_nchw_1[i1_inner + 10] + bias_1[blockIdx_x * 16 + threadIdx_x // 7 * 2 + i1_inner], T.float32(0))
+ compute_1[blockIdx_x * 784 + threadIdx_x // 7 * 98 + i1_inner * 49 + threadIdx_x % 7 * 7 + 6] = T.max(conv2d_nchw_1[i1_inner + 12] + bias_1[blockIdx_x * 16 + threadIdx_x // 7 * 2 + i1_inner], T.float32(0))
@@ -765,7 +640,7 @@ We build the binary and check its correctness and performance.
.. code-block:: none
- Execution time of this operator: 0.352 ms
+ Execution time of this operator: 0.383 ms
@@ -815,35 +690,35 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=2)
- conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=64)
+ conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=8)
conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
- conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=1)
+ conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=7)
conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
- conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=7)
+ conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
- conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
- conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
+ conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=7)
+ conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=4)
conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=4)
conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
- conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
+ conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=3)
conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1)
- conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=3)
+ conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=1)
s[conv2d_nchw].reorder(conv2d_nchw_nn_o_o_o_o, conv2d_nchw_ff_o_o_o_o, conv2d_nchw_yy_o_o_o_o, conv2d_nchw_xx_o_o_o_o, conv2d_nchw_nn_o_o_o_i, conv2d_nchw_ff_o_o_o_i, conv2d_nchw_yy_o_o_o_i, conv2d_nchw_xx_o_o_o_i, conv2d_nchw_nn_o_o_i, conv2d_nchw_ff_o_o_i, conv2d_nchw_yy_o_o_i, conv2d_nchw_xx_o_o_i, conv2d_nchw_rc_o_o, conv2d_nchw_ry_o_o, conv2d_nchw_rx_o_o, conv2d_nchw_rc_o_i, conv2d_nchw_ry_o_i, conv2d_nchw_rx_o_i, conv2d_nchw_nn_o_i, conv2d_nchw_ff_o_i, conv2d_nchw_yy_o_i, conv2 [...]
compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
- compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=64)
+ compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=8)
compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
- compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
+ compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=7)
compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
- compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
+ compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
- compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
+ compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=7)
s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
kernel_shared = s.cache_read(kernel, "shared", [conv2d_nchw])
@@ -862,12 +737,12 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
- kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+ kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=56)
s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
- pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
+ pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=56)
s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
- pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+ pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=56)
s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 512)
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True)
@@ -887,430 +762,392 @@ They can be used for debugging and learning the behavior of the auto-scheduler.
#define int64_t long long
#define uint64_t unsigned long long
#endif
- extern "C" __global__ void __launch_bounds__(64) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+ extern "C" __global__ void __launch_bounds__(56) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
float conv2d_nchw[14];
- __shared__ float pad_temp_shared[72];
- __shared__ float kernel_shared[3072];
+ __shared__ float pad_temp_shared[1008];
+ __shared__ float kernel_shared[768];
conv2d_nchw[0] = 0.000000e+00f;
- conv2d_nchw[1] = 0.000000e+00f;
conv2d_nchw[2] = 0.000000e+00f;
- conv2d_nchw[3] = 0.000000e+00f;
conv2d_nchw[4] = 0.000000e+00f;
- conv2d_nchw[5] = 0.000000e+00f;
conv2d_nchw[6] = 0.000000e+00f;
- conv2d_nchw[7] = 0.000000e+00f;
conv2d_nchw[8] = 0.000000e+00f;
- conv2d_nchw[9] = 0.000000e+00f;
conv2d_nchw[10] = 0.000000e+00f;
- conv2d_nchw[11] = 0.000000e+00f;
conv2d_nchw[12] = 0.000000e+00f;
+ conv2d_nchw[1] = 0.000000e+00f;
+ conv2d_nchw[3] = 0.000000e+00f;
+ conv2d_nchw[5] = 0.000000e+00f;
+ conv2d_nchw[7] = 0.000000e+00f;
+ conv2d_nchw[9] = 0.000000e+00f;
+ conv2d_nchw[11] = 0.000000e+00f;
conv2d_nchw[13] = 0.000000e+00f;
- for (int rc_outer_outer = 0; rc_outer_outer < 64; ++rc_outer_outer) {
- for (int ry_outer_outer = 0; ry_outer_outer < 3; ++ry_outer_outer) {
+ for (int rc_outer_outer = 0; rc_outer_outer < 32; ++rc_outer_outer) {
+ for (int rx_outer_outer = 0; rx_outer_outer < 3; ++rx_outer_outer) {
__syncthreads();
if (((int)threadIdx.x) < 18) {
- pad_temp_shared[(((int)threadIdx.x) * 4)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) * 4) % 9))) && (((((int)threadIdx.x) * 4) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + (((((int)threadIdx.x) * 4) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) * 4) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) * 56)] = ((((1 <= ((((int)threadIdx.x) * 8) % 9)) && (((((int)threadIdx.x) * 8) % 9) < 8)) && (1 <= rx_outer_outer)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) * 8) / 9) * 49)) + (((((int)threadIdx.x) * 8) % 9) * 7)) + rx_outer_outer) - 8)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 1)] = (((1 <= ((((int)threadIdx.x) * 8) % 9)) && (((((int)threadIdx.x) * 8) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) * 8) / 9) * 49)) + (((((int)threadIdx.x) * 8) % 9) * 7)) + rx_outer_outer) - 7)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 2)] = (((1 <= ((((int)threadIdx.x) * 8) % 9)) && (((((int)threadIdx.x) * 8) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) * 8) / 9) * 49)) + (((((int)threadIdx.x) * 8) % 9) * 7)) + rx_outer_outer) - 6)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 3)] = (((1 <= ((((int)threadIdx.x) * 8) % 9)) && (((((int)threadIdx.x) * 8) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) * 8) / 9) * 49)) + (((((int)threadIdx.x) * 8) % 9) * 7)) + rx_outer_outer) - 5)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 4)] = (((1 <= ((((int)threadIdx.x) * 8) % 9)) && (((((int)threadIdx.x) * 8) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) * 8) / 9) * 49)) + (((((int)threadIdx.x) * 8) % 9) * 7)) + rx_outer_outer) - 4)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 5)] = (((1 <= ((((int)threadIdx.x) * 8) % 9)) && (((((int)threadIdx.x) * 8) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) * 8) / 9) * 49)) + (((((int)threadIdx.x) * 8) % 9) * 7)) + rx_outer_outer) - 3)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 6)] = ((((1 <= ((((int)threadIdx.x) * 8) % 9)) && (((((int)threadIdx.x) * 8) % 9) < 8)) && (rx_outer_outer < 2)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) * 8) / 9) * 49)) + (((((int)threadIdx.x) * 8) % 9) * 7)) + rx_outer_outer) - 2)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 7)] = ((((1 <= (((((int)threadIdx.x) * 8) + 1) % 9)) && ((((((int)threadIdx.x) * 8) + 1) % 9) < 8)) && (1 <= rx_outer_outer)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 1) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 1) % 9) * 7)) + rx_outer_outer) - 8)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 8)] = (((1 <= (((((int)threadIdx.x) * 8) + 1) % 9)) && ((((((int)threadIdx.x) * 8) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 1) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 1) % 9) * 7)) + rx_outer_outer) - 7)] : 0.000000e+00f);
}
if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 1)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 1) % 9))) && ((((((int)threadIdx.x) * 4) + 1) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 1) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 1) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 9)] = (((1 <= (((((int)threadIdx.x) * 8) + 1) % 9)) && ((((((int)threadIdx.x) * 8) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 1) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 1) % 9) * 7)) + rx_outer_outer) - 6)] : 0.000000e+00f);
}
if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 2)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 2) % 9))) && ((((((int)threadIdx.x) * 4) + 2) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 2) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 2) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 10)] = (((1 <= (((((int)threadIdx.x) * 8) + 1) % 9)) && ((((((int)threadIdx.x) * 8) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 1) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 1) % 9) * 7)) + rx_outer_outer) - 5)] : 0.000000e+00f);
}
if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 3)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 3) % 9))) && ((((((int)threadIdx.x) * 4) + 3) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 3) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 3) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 11)] = (((1 <= (((((int)threadIdx.x) * 8) + 1) % 9)) && ((((((int)threadIdx.x) * 8) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 1) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 1) % 9) * 7)) + rx_outer_outer) - 4)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 12)] = (((1 <= (((((int)threadIdx.x) * 8) + 1) % 9)) && ((((((int)threadIdx.x) * 8) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 1) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 1) % 9) * 7)) + rx_outer_outer) - 3)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 13)] = ((((1 <= (((((int)threadIdx.x) * 8) + 1) % 9)) && ((((((int)threadIdx.x) * 8) + 1) % 9) < 8)) && (rx_outer_outer < 2)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 1) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 1) % 9) * 7)) + rx_outer_outer) - 2)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 14)] = ((((1 <= (((((int)threadIdx.x) * 8) + 2) % 9)) && ((((((int)threadIdx.x) * 8) + 2) % 9) < 8)) && (1 <= rx_outer_outer)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 2) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 2) % 9) * 7)) + rx_outer_outer) - 8)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 15)] = (((1 <= (((((int)threadIdx.x) * 8) + 2) % 9)) && ((((((int)threadIdx.x) * 8) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 2) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 2) % 9) * 7)) + rx_outer_outer) - 7)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 16)] = (((1 <= (((((int)threadIdx.x) * 8) + 2) % 9)) && ((((((int)threadIdx.x) * 8) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 2) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 2) % 9) * 7)) + rx_outer_outer) - 6)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 17)] = (((1 <= (((((int)threadIdx.x) * 8) + 2) % 9)) && ((((((int)threadIdx.x) * 8) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 2) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 2) % 9) * 7)) + rx_outer_outer) - 5)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 18)] = (((1 <= (((((int)threadIdx.x) * 8) + 2) % 9)) && ((((((int)threadIdx.x) * 8) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 2) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 2) % 9) * 7)) + rx_outer_outer) - 4)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 19)] = (((1 <= (((((int)threadIdx.x) * 8) + 2) % 9)) && ((((((int)threadIdx.x) * 8) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 2) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 2) % 9) * 7)) + rx_outer_outer) - 3)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 20)] = ((((1 <= (((((int)threadIdx.x) * 8) + 2) % 9)) && ((((((int)threadIdx.x) * 8) + 2) % 9) < 8)) && (rx_outer_outer < 2)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 2) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 2) % 9) * 7)) + rx_outer_outer) - 2)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 21)] = ((((1 <= (((((int)threadIdx.x) * 8) + 3) % 9)) && ((((((int)threadIdx.x) * 8) + 3) % 9) < 8)) && (1 <= rx_outer_outer)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 3) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 3) % 9) * 7)) + rx_outer_outer) - 8)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 22)] = (((1 <= (((((int)threadIdx.x) * 8) + 3) % 9)) && ((((((int)threadIdx.x) * 8) + 3) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 3) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 3) % 9) * 7)) + rx_outer_outer) - 7)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 23)] = (((1 <= (((((int)threadIdx.x) * 8) + 3) % 9)) && ((((((int)threadIdx.x) * 8) + 3) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 3) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 3) % 9) * 7)) + rx_outer_outer) - 6)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 24)] = (((1 <= (((((int)threadIdx.x) * 8) + 3) % 9)) && ((((((int)threadIdx.x) * 8) + 3) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 3) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 3) % 9) * 7)) + rx_outer_outer) - 5)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 25)] = (((1 <= (((((int)threadIdx.x) * 8) + 3) % 9)) && ((((((int)threadIdx.x) * 8) + 3) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 3) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 3) % 9) * 7)) + rx_outer_outer) - 4)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 26)] = (((1 <= (((((int)threadIdx.x) * 8) + 3) % 9)) && ((((((int)threadIdx.x) * 8) + 3) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 3) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 3) % 9) * 7)) + rx_outer_outer) - 3)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 27)] = ((((1 <= (((((int)threadIdx.x) * 8) + 3) % 9)) && ((((((int)threadIdx.x) * 8) + 3) % 9) < 8)) && (rx_outer_outer < 2)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 3) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 3) % 9) * 7)) + rx_outer_outer) - 2)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 28)] = ((((1 <= (((((int)threadIdx.x) * 8) + 4) % 9)) && ((((((int)threadIdx.x) * 8) + 4) % 9) < 8)) && (1 <= rx_outer_outer)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 4) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 4) % 9) * 7)) + rx_outer_outer) - 8)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 29)] = (((1 <= (((((int)threadIdx.x) * 8) + 4) % 9)) && ((((((int)threadIdx.x) * 8) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 4) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 4) % 9) * 7)) + rx_outer_outer) - 7)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 30)] = (((1 <= (((((int)threadIdx.x) * 8) + 4) % 9)) && ((((((int)threadIdx.x) * 8) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 4) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 4) % 9) * 7)) + rx_outer_outer) - 6)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 31)] = (((1 <= (((((int)threadIdx.x) * 8) + 4) % 9)) && ((((((int)threadIdx.x) * 8) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 4) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 4) % 9) * 7)) + rx_outer_outer) - 5)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 32)] = (((1 <= (((((int)threadIdx.x) * 8) + 4) % 9)) && ((((((int)threadIdx.x) * 8) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 4) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 4) % 9) * 7)) + rx_outer_outer) - 4)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 33)] = (((1 <= (((((int)threadIdx.x) * 8) + 4) % 9)) && ((((((int)threadIdx.x) * 8) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 4) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 4) % 9) * 7)) + rx_outer_outer) - 3)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 34)] = ((((1 <= (((((int)threadIdx.x) * 8) + 4) % 9)) && ((((((int)threadIdx.x) * 8) + 4) % 9) < 8)) && (rx_outer_outer < 2)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 4) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 4) % 9) * 7)) + rx_outer_outer) - 2)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 35)] = ((((1 <= (((((int)threadIdx.x) * 8) + 5) % 9)) && ((((((int)threadIdx.x) * 8) + 5) % 9) < 8)) && (1 <= rx_outer_outer)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 5) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 5) % 9) * 7)) + rx_outer_outer) - 8)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 36)] = (((1 <= (((((int)threadIdx.x) * 8) + 5) % 9)) && ((((((int)threadIdx.x) * 8) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 5) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 5) % 9) * 7)) + rx_outer_outer) - 7)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 37)] = (((1 <= (((((int)threadIdx.x) * 8) + 5) % 9)) && ((((((int)threadIdx.x) * 8) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 5) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 5) % 9) * 7)) + rx_outer_outer) - 6)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 38)] = (((1 <= (((((int)threadIdx.x) * 8) + 5) % 9)) && ((((((int)threadIdx.x) * 8) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 5) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 5) % 9) * 7)) + rx_outer_outer) - 5)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 39)] = (((1 <= (((((int)threadIdx.x) * 8) + 5) % 9)) && ((((((int)threadIdx.x) * 8) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 5) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 5) % 9) * 7)) + rx_outer_outer) - 4)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 40)] = (((1 <= (((((int)threadIdx.x) * 8) + 5) % 9)) && ((((((int)threadIdx.x) * 8) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 5) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 5) % 9) * 7)) + rx_outer_outer) - 3)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 41)] = ((((1 <= (((((int)threadIdx.x) * 8) + 5) % 9)) && ((((((int)threadIdx.x) * 8) + 5) % 9) < 8)) && (rx_outer_outer < 2)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 5) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 5) % 9) * 7)) + rx_outer_outer) - 2)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 42)] = ((((1 <= (((((int)threadIdx.x) * 8) + 6) % 9)) && ((((((int)threadIdx.x) * 8) + 6) % 9) < 8)) && (1 <= rx_outer_outer)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 6) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 6) % 9) * 7)) + rx_outer_outer) - 8)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 43)] = (((1 <= (((((int)threadIdx.x) * 8) + 6) % 9)) && ((((((int)threadIdx.x) * 8) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 6) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 6) % 9) * 7)) + rx_outer_outer) - 7)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 44)] = (((1 <= (((((int)threadIdx.x) * 8) + 6) % 9)) && ((((((int)threadIdx.x) * 8) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 6) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 6) % 9) * 7)) + rx_outer_outer) - 6)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 45)] = (((1 <= (((((int)threadIdx.x) * 8) + 6) % 9)) && ((((((int)threadIdx.x) * 8) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 6) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 6) % 9) * 7)) + rx_outer_outer) - 5)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 46)] = (((1 <= (((((int)threadIdx.x) * 8) + 6) % 9)) && ((((((int)threadIdx.x) * 8) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 6) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 6) % 9) * 7)) + rx_outer_outer) - 4)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 47)] = (((1 <= (((((int)threadIdx.x) * 8) + 6) % 9)) && ((((((int)threadIdx.x) * 8) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 6) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 6) % 9) * 7)) + rx_outer_outer) - 3)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 48)] = ((((1 <= (((((int)threadIdx.x) * 8) + 6) % 9)) && ((((((int)threadIdx.x) * 8) + 6) % 9) < 8)) && (rx_outer_outer < 2)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 6) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 6) % 9) * 7)) + rx_outer_outer) - 2)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 49)] = ((((1 <= (((((int)threadIdx.x) * 8) + 7) % 9)) && ((((((int)threadIdx.x) * 8) + 7) % 9) < 8)) && (1 <= rx_outer_outer)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 7) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 7) % 9) * 7)) + rx_outer_outer) - 8)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 50)] = (((1 <= (((((int)threadIdx.x) * 8) + 7) % 9)) && ((((((int)threadIdx.x) * 8) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 7) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 7) % 9) * 7)) + rx_outer_outer) - 7)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 51)] = (((1 <= (((((int)threadIdx.x) * 8) + 7) % 9)) && ((((((int)threadIdx.x) * 8) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 7) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 7) % 9) * 7)) + rx_outer_outer) - 6)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 52)] = (((1 <= (((((int)threadIdx.x) * 8) + 7) % 9)) && ((((((int)threadIdx.x) * 8) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 7) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 7) % 9) * 7)) + rx_outer_outer) - 5)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 53)] = (((1 <= (((((int)threadIdx.x) * 8) + 7) % 9)) && ((((((int)threadIdx.x) * 8) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 7) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 7) % 9) * 7)) + rx_outer_outer) - 4)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 54)] = (((1 <= (((((int)threadIdx.x) * 8) + 7) % 9)) && ((((((int)threadIdx.x) * 8) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 7) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 7) % 9) * 7)) + rx_outer_outer) - 3)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 55)] = ((((1 <= (((((int)threadIdx.x) * 8) + 7) % 9)) && ((((((int)threadIdx.x) * 8) + 7) % 9) < 8)) && (rx_outer_outer < 2)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 7) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 7) % 9) * 7)) + rx_outer_outer) - 2)] : 0.000000e+00f);
+ }
+ kernel_shared[((int)threadIdx.x)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 48) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 56)] = kernel[((((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 56) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 8) % 48) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 112)] = kernel[((((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 112) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 16) % 48) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 168)] = kernel[((((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 168) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 8) & 15) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 224)] = kernel[((((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 224) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 32) % 48) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 280)] = kernel[((((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 280) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 40) % 48) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 336)] = kernel[((((((((int)blockIdx.x) * 73728) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 48) * 3)) + rx_outer_outer) + 32256)];
+ kernel_shared[(((int)threadIdx.x) + 392)] = kernel[((((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 392) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 8) % 48) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 448)] = kernel[((((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 448) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 16) % 48) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 504)] = kernel[((((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 504) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 8) & 15) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 560)] = kernel[((((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 560) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 32) % 48) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 616)] = kernel[((((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 616) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 40) % 48) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 672)] = kernel[((((((((int)blockIdx.x) * 73728) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 48) * 3)) + rx_outer_outer) + 64512)];
+ if (((int)threadIdx.x) < 40) {
+ kernel_shared[(((int)threadIdx.x) + 728)] = kernel[((((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 728) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 8) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
}
- kernel_shared[((int)threadIdx.x)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3))];
- kernel_shared[(((int)threadIdx.x) + 64)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 64) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 128)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 128) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 192)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 36864)];
- kernel_shared[(((int)threadIdx.x) + 256)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 256) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 320)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 320) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 384)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 73728)];
- kernel_shared[(((int)threadIdx.x) + 448)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 448) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 512)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 512) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 576)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 110592)];
- kernel_shared[(((int)threadIdx.x) + 640)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 640) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 704)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 704) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 768)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 147456)];
- kernel_shared[(((int)threadIdx.x) + 832)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 832) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 896)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 896) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 960)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 184320)];
- kernel_shared[(((int)threadIdx.x) + 1024)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1024) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1088)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1088) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1152)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 221184)];
- kernel_shared[(((int)threadIdx.x) + 1216)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1216) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1280)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1280) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 258048)];
- kernel_shared[(((int)threadIdx.x) + 1408)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1408) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1472)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1472) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1536)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 294912)];
- kernel_shared[(((int)threadIdx.x) + 1600)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1600) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1664)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1664) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1728)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 331776)];
- kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1792) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1856)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1856) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1920)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 368640)];
- kernel_shared[(((int)threadIdx.x) + 1984)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1984) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2048)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2048) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2112)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 405504)];
- kernel_shared[(((int)threadIdx.x) + 2176)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2176) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2240) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2304)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 442368)];
- kernel_shared[(((int)threadIdx.x) + 2368)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2368) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2432)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2432) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2496)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 479232)];
- kernel_shared[(((int)threadIdx.x) + 2560)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2560) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2624)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2624) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 516096)];
- kernel_shared[(((int)threadIdx.x) + 2752)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2752) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2816)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2816) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2880)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 552960)];
- kernel_shared[(((int)threadIdx.x) + 2944)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2944) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 3008)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3008) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
__syncthreads();
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[0] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[1] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[2] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[3] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[4] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[5] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[6] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[0] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+ for (int rc_outer_inner = 0; rc_outer_inner < 4; ++rc_outer_inner) {
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7))] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12))]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 1)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12))]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 2)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12))]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12))]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 4)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12))]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 5)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12))]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 6)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12))]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 64)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 65)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 66)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 67)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 68)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 69)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 126)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 127)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 128)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 129)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 130)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 131)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 132)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 189)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 190)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 191)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 192)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 193)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 194)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 195)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7))] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 48)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 1)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 48)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 48)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 3)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 48)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 4)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 48)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 5)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 48)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 6)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 48)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 51)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 64)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 51)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 65)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 51)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 66)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 51)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 67)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 51)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 68)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 51)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 69)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 51)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 126)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 54)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 127)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 54)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 128)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 54)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 129)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 54)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 130)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 54)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 131)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 54)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 132)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 54)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 189)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 57)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 190)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 57)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 191)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 57)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 192)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 57)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 193)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 57)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 194)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 57)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 195)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 57)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 7)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 8)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 9)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 10)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 12)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 13)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 70)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 71)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 72)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 73)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 74)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 75)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 76)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 133)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 134)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 135)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 136)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 137)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 138)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 139)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 196)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 197)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 198)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 199)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 200)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 201)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 202)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 7)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 49)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 8)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 49)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 9)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 49)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 10)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 49)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 49)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 12)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 49)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 13)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 49)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 70)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 52)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 71)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 52)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 72)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 52)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 73)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 52)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 74)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 52)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 75)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 52)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 76)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 52)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 133)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 55)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 134)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 55)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 135)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 55)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 136)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 55)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 137)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 55)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 138)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 55)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 139)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 55)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 196)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 58)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 197)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 58)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 198)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 58)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 199)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 58)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 200)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 58)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 201)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 58)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 202)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 58)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 15)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 16)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 17)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 18)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 77)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 78)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 79)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 80)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 81)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 82)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 83)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 140)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 141)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 142)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 143)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 144)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 145)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 146)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 203)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 204)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 205)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 206)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 207)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 208)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 209)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 50)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 15)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 50)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 16)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 50)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 17)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 50)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 18)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 50)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 50)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 50)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 77)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 53)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 78)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 53)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 79)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 53)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 80)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 53)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 81)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 53)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 82)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 53)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 83)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 53)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 140)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 56)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 141)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 56)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 142)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 56)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 143)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 56)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 144)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 56)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 145)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 56)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 146)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 56)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 203)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 59)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 204)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 59)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 205)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 59)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 206)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 59)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 207)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 59)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 208)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 59)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 209)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 59)]));
+ }
}
}
for (int i1_inner = 0; i1_inner < 2; ++i1_inner) {
- for (int i3_inner = 0; i3_inner < 7; ++i3_inner) {
- compute[((((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 98)) + (i1_inner * 49)) + ((((int)blockIdx.x) % 7) * 7)) + i3_inner)] = max((conv2d_nchw[((i1_inner * 7) + i3_inner)] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) * 2)) + i1_inner)]), 0.000000e+00f);
- }
+ compute[((((((int)blockIdx.x) * 784) + ((((int)threadIdx.x) / 7) * 98)) + (i1_inner * 49)) + ((((int)threadIdx.x) % 7) * 7))] = max((conv2d_nchw[i1_inner] + bias[(((((int)blockIdx.x) * 16) + ((((int)threadIdx.x) / 7) * 2)) + i1_inner)]), 0.000000e+00f);
+ compute[(((((((int)blockIdx.x) * 784) + ((((int)threadIdx.x) / 7) * 98)) + (i1_inner * 49)) + ((((int)threadIdx.x) % 7) * 7)) + 1)] = max((conv2d_nchw[(i1_inner + 2)] + bias[(((((int)blockIdx.x) * 16) + ((((int)threadIdx.x) / 7) * 2)) + i1_inner)]), 0.000000e+00f);
+ compute[(((((((int)blockIdx.x) * 784) + ((((int)threadIdx.x) / 7) * 98)) + (i1_inner * 49)) + ((((int)threadIdx.x) % 7) * 7)) + 2)] = max((conv2d_nchw[(i1_inner + 4)] + bias[(((((int)blockIdx.x) * 16) + ((((int)threadIdx.x) / 7) * 2)) + i1_inner)]), 0.000000e+00f);
+ compute[(((((((int)blockIdx.x) * 784) + ((((int)threadIdx.x) / 7) * 98)) + (i1_inner * 49)) + ((((int)threadIdx.x) % 7) * 7)) + 3)] = max((conv2d_nchw[(i1_inner + 6)] + bias[(((((int)blockIdx.x) * 16) + ((((int)threadIdx.x) / 7) * 2)) + i1_inner)]), 0.000000e+00f);
+ compute[(((((((int)blockIdx.x) * 784) + ((((int)threadIdx.x) / 7) * 98)) + (i1_inner * 49)) + ((((int)threadIdx.x) % 7) * 7)) + 4)] = max((conv2d_nchw[(i1_inner + 8)] + bias[(((((int)blockIdx.x) * 16) + ((((int)threadIdx.x) / 7) * 2)) + i1_inner)]), 0.000000e+00f);
+ compute[(((((((int)blockIdx.x) * 784) + ((((int)threadIdx.x) / 7) * 98)) + (i1_inner * 49)) + ((((int)threadIdx.x) % 7) * 7)) + 5)] = max((conv2d_nchw[(i1_inner + 10)] + bias[(((((int)blockIdx.x) * 16) + ((((int)threadIdx.x) / 7) * 2)) + i1_inner)]), 0.000000e+00f);
+ compute[(((((((int)blockIdx.x) * 784) + ((((int)threadIdx.x) / 7) * 98)) + (i1_inner * 49)) + ((((int)threadIdx.x) % 7) * 7)) + 6)] = max((conv2d_nchw[(i1_inner + 12)] + bias[(((((int)blockIdx.x) * 16) + ((((int)threadIdx.x) / 7) * 2)) + i1_inner)]), 0.000000e+00f);
}
}
@@ -1372,7 +1209,7 @@ In the example below we resume the status and do more 5 trials.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 5 minutes 38.300 seconds)
+ **Total running time of the script:** ( 5 minutes 34.610 seconds)
.. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_conv2d_layer_cuda.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
index ea466638a6..beb58297f0 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_cuda.rst.txt
@@ -647,7 +647,7 @@ so we can read the log file and load the best schedules.
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 7.8796 7.8807 7.8808 7.8772 0.0017
+ 7.8650 7.8617 7.8742 7.8589 0.0066
@@ -675,7 +675,7 @@ Other Tips
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 5.499 seconds)
+ **Total running time of the script:** ( 1 minutes 6.112 seconds)
.. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_network_cuda.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
index 111b4667e9..a0e0077e96 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_network_x86.rst.txt
@@ -666,7 +666,7 @@ so we can read the log file and load the best schedules.
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 754.1366 754.1049 754.7571 753.5476 0.4943
+ 747.3662 747.1039 748.5371 746.4578 0.8689
@@ -694,7 +694,7 @@ Other Tips
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 39.258 seconds)
+ **Total running time of the script:** ( 1 minutes 39.417 seconds)
.. _sphx_glr_download_how_to_tune_with_autoscheduler_tune_network_x86.py:
diff --git a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
index 92c1cff115..0de188f845 100644
--- a/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
+++ b/docs/_sources/how_to/tune_with_autoscheduler/tune_sparse_x86.rst.txt
@@ -386,26 +386,86 @@ layout transformation, parallelization, vectorization, unrolling, and operator f
@T.prim_func
def main(placeholder: T.Buffer((128, 256), "float32"), placeholder_1: T.Buffer((4916, 16, 1), "float32"), placeholder_2: T.Buffer((4916,), "int32"), placeholder_3: T.Buffer((33,), "int32"), placeholder_4: T.Buffer((128, 512), "float32"), compute: T.Buffer((128, 512), "float32")):
T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
- for i0_outer_i1_outer_fused in T.parallel(64):
- compute_1 = T.allocate([1024], "float32", "global")
- compute_2 = T.Buffer((1024,), data=compute_1)
- for nb_j_inner in range(2):
- for i_inner_init, j_init in T.grid(32, 16):
- compute_2[i_inner_init * 32 + nb_j_inner * 16 + j_init] = T.float32(0)
- for elem_idx, i_inner, j in T.grid(T.let(cse_var_1, i0_outer_i1_outer_fused % 16 * 2 + nb_j_inner, placeholder_5[cse_var_1 + 1] - placeholder_5[cse_var_1]), 32, 16):
- cse_var_1 = T.var("int32")
+ for i0_outer_i1_outer_fused in T.parallel(32):
+ compute_1 = T.allocate([2048], "float32", "global")
+ compute_2 = T.Buffer((2048,), data=compute_1)
+ for i_outer_inner in range(2):
+ for i_inner_init in range(64):
+ cse_var_1: T.int32 = i_outer_inner * 1024 + i_inner_init * 16
+ compute_2[cse_var_1] = T.float32(0)
+ compute_2[cse_var_1 + 1] = T.float32(0)
+ compute_2[cse_var_1 + 2] = T.float32(0)
+ compute_2[cse_var_1 + 3] = T.float32(0)
+ compute_2[cse_var_1 + 4] = T.float32(0)
+ compute_2[cse_var_1 + 5] = T.float32(0)
+ compute_2[cse_var_1 + 6] = T.float32(0)
+ compute_2[cse_var_1 + 7] = T.float32(0)
+ compute_2[cse_var_1 + 8] = T.float32(0)
+ compute_2[cse_var_1 + 9] = T.float32(0)
+ compute_2[cse_var_1 + 10] = T.float32(0)
+ compute_2[cse_var_1 + 11] = T.float32(0)
+ compute_2[cse_var_1 + 12] = T.float32(0)
+ compute_2[cse_var_1 + 13] = T.float32(0)
+ compute_2[cse_var_1 + 14] = T.float32(0)
+ compute_2[cse_var_1 + 15] = T.float32(0)
+ for elem_idx, i_inner in T.grid(placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused], 64):
placeholder_5 = T.Buffer((33,), "int32", data=placeholder_3.data)
- cse_var_3: T.int32 = i0_outer_i1_outer_fused % 16 * 2 + nb_j_inner
- cse_var_2: T.int32 = i_inner * 32 + nb_j_inner * 16 + j
placeholder_6 = T.Buffer((78656,), data=placeholder_1.data)
placeholder_7 = T.Buffer((32768,), data=placeholder.data)
placeholder_8 = T.Buffer((4916,), "int32", data=placeholder_2.data)
- compute_2[cse_var_2] = compute_2[cse_var_2] + placeholder_6[placeholder_5[cse_var_3] * 16 + elem_idx * 16 + j] * T.max(placeholder_7[i0_outer_i1_outer_fused // 16 * 8192 + i_inner * 256 + placeholder_8[placeholder_5[cse_var_3] + elem_idx]], T.float32(0))
- for i0_inner, i1_inner in T.grid(32, 32):
- cse_var_4: T.int32 = i0_outer_i1_outer_fused // 16 * 16384 + i0_inner * 512 + i0_outer_i1_outer_fused % 16 * 32 + i1_inner
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_2: T.int32 = i_outer_inner * 1024 + i_inner * 16
+ compute_2[cse_var_2] = compute_2[cse_var_2] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_3: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 1
+ compute_2[cse_var_3] = compute_2[cse_var_3] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 1] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_4: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 2
+ compute_2[cse_var_4] = compute_2[cse_var_4] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 2] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_5: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 3
+ compute_2[cse_var_5] = compute_2[cse_var_5] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 3] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_6: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 4
+ compute_2[cse_var_6] = compute_2[cse_var_6] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 4] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_7: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 5
+ compute_2[cse_var_7] = compute_2[cse_var_7] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 5] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_8: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 6
+ compute_2[cse_var_8] = compute_2[cse_var_8] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 6] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_9: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 7
+ compute_2[cse_var_9] = compute_2[cse_var_9] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 7] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_10: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 8
+ compute_2[cse_var_10] = compute_2[cse_var_10] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 8] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_11: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 9
+ compute_2[cse_var_11] = compute_2[cse_var_11] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 9] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_12: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 10
+ compute_2[cse_var_12] = compute_2[cse_var_12] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 10] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_13: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 11
+ compute_2[cse_var_13] = compute_2[cse_var_13] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 11] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_14: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 12
+ compute_2[cse_var_14] = compute_2[cse_var_14] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 12] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_15: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 13
+ compute_2[cse_var_15] = compute_2[cse_var_15] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 13] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_16: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 14
+ compute_2[cse_var_16] = compute_2[cse_var_16] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 14] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_17: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 15
+ compute_2[cse_var_17] = compute_2[cse_var_17] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 15] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ for i0_inner in range(128):
+ cse_var_18: T.int32 = i0_inner * 512 + i0_outer_i1_outer_fused * 16
compute_3 = T.Buffer((65536,), data=compute.data)
placeholder_5 = T.Buffer((65536,), data=placeholder_4.data)
- compute_3[cse_var_4] = T.max(compute_2[i0_inner * 32 + i1_inner] + placeholder_5[cse_var_4], T.float32(0))
+ compute_3[cse_var_18:cse_var_18 + 16] = T.max(compute_2[i0_inner * 16:i0_inner * 16 + 16] + placeholder_5[cse_var_18:cse_var_18 + 16], T.Broadcast(T.float32(0), 16))
@@ -455,7 +515,7 @@ We build the binary and check its correctness and performance.
.. code-block:: none
- Execution time of this operator: 1.670 ms
+ Execution time of this operator: 1.827 ms
diff --git a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
index 90b01402aa..73c88e05ce 100644
--- a/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
Computation times
=================
-**00:52.417** total execution time for **how_to_tune_with_autotvm** files:
+**00:27.181** total execution time for **how_to_tune_with_autotvm** files:
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``) | 00:52.385 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_conv2d_cuda.py` (``tune_conv2d_cuda.py``) | 00:27.145 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``) | 00:00.019 | 0.0 MB |
+| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_x86.py` (``tune_relay_x86.py``) | 00:00.022 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_tune_with_autotvm_tune_relay_cuda.py` (``tune_relay_cuda.py``) | 00:00.005 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
index c0d81a1f93..54e43001b2 100644
--- a/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
+++ b/docs/_sources/how_to/tune_with_autotvm/tune_conv2d_cuda.rst.txt
@@ -390,7 +390,7 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 2, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 8, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6302753
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 2, 16]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6500953
No: 2 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
@@ -513,7 +513,7 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 64, 1, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 256, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3162506
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 16, 8, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5717883
No: 3 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
@@ -636,7 +636,7 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 4, 16]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7533856
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 16, 2]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10418626
No: 4 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
@@ -759,8 +759,9 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 4, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 16]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7872598
- No: 5 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 16, 1, 32]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 64, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4282049
+ No: 5 GFLOPS: 55.23/55.23 result: MeasureResult(costs=(0.004191678666666667,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.290860414505005, timestamp=1674648479.445705) [('tile_f', [-1, 1, 64, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9109233
+ No: 6 GFLOPS: 0.00/55.23 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -882,8 +883,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 32, 1, 16]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 512]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1353389
- No: 6 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 2, 128]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 8, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,207234
+ No: 7 GFLOPS: 0.00/55.23 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1005,8 +1006,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 16, 2, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9242972
- No: 7 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 256, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 128, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2029993
+ No: 8 GFLOPS: 0.00/55.23 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1128,8 +1129,12 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 32, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 128, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8544741
- No: 8 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 16, 16]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 16, 16]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1103922
+ No: 9 GFLOPS: 27.51/55.23 result: MeasureResult(costs=(0.008414402583333333,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.1393074989318848, timestamp=1674648483.7522187) [('tile_f', [-1, 64, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 2, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9877126
+ No: 10 GFLOPS: 31.85/55.23 result: MeasureResult(costs=(0.0072692777857142855,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.873112916946411, timestamp=1674648484.5286293) [('tile_f', [-1, 32, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9309544
+ No: 11 GFLOPS: 228.53/228.53 result: MeasureResult(costs=(0.0010130230606060606,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.5445051193237305, timestamp=1674648485.2542942) [('tile_f', [-1, 4, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4844421
+ No: 12 GFLOPS: 436.89/436.89 result: MeasureResult(costs=(0.0005298873639344263,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.8932838439941406, timestamp=1674648486.267551) [('tile_f', [-1, 2, 32, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 4]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9945801
+ No: 13 GFLOPS: 0.00/436.89 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1251,26 +1256,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 8, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 32]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4982370
- No: 9 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 142, in build
- res = future.result()
- File "/usr/lib/python3.7/concurrent/futures/_base.py", line 435, in result
- return self.__get_result()
- File "/usr/lib/python3.7/concurrent/futures/_base.py", line 384, in __get_result
- raise self._exception
- File "/usr/lib/python3.7/concurrent/futures/thread.py", line 57, in run
- result = self.fn(*self.args, **self.kwargs)
- File "/workspace/python/tvm/contrib/popen_pool.py", line 432, in <lambda>
- worker = lambda *args: self._worker_run(*args)
- File "/workspace/python/tvm/contrib/popen_pool.py", line 401, in _worker_run
- return proc.recv()
- File "/workspace/python/tvm/contrib/popen_pool.py", line 309, in recv
- raise TimeoutError()
- TimeoutError
-
- [('tile_f', [-1, 4, 2, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 64, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2346190
- No: 10 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 64, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 8, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2597234
+ No: 14 GFLOPS: 0.00/436.89 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1392,8 +1379,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 16, 1, 8]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 512, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4485720
- No: 11 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 256, 1, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 64, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10377628
+ No: 15 GFLOPS: 0.00/436.89 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1515,8 +1502,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 1, 8]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 8, 16]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3423339
- No: 12 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 2, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 64, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2054152
+ No: 16 GFLOPS: 0.00/436.89 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1638,8 +1625,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 4, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 64, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8637442
- No: 13 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 16, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 8, 32]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8283346
+ No: 17 GFLOPS: 0.00/436.89 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1761,8 +1748,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 16, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 32, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3959938
- No: 14 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 64, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 256]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9091325
+ No: 18 GFLOPS: 0.00/436.89 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1884,284 +1871,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 32, 1, 8]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 128]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6762941
- No: 15 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 742, in __call__
- yield remote, remote.load_module(os.path.split(build_result.filename)[1])
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 706, in run_through_rpc
- costs = time_f(*args).results
- File "/workspace/python/tvm/runtime/module.py", line 357, in evaluator
- blob = feval(*args)
- File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
- File "tvm/_ffi/_cython/./packed_func.pxi", line 262, in tvm._ffi._cy3.core.FuncCall
- File "tvm/_ffi/_cython/./packed_func.pxi", line 251, in tvm._ffi._cy3.core.FuncCall3
- File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
- tvm._ffi.base.TVMError: Traceback (most recent call last):
- 4: TVMFuncCall
- at ../src/runtime/c_runtime_api.cc:477
- 3: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
- at ../include/tvm/runtime/packed_func.h:1217
- 2: tvm::runtime::RPCWrappedFunc::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
- at ../src/runtime/rpc/rpc_module.cc:129
- 1: tvm::runtime::RPCClientSession::CallFunc(void*, TVMValue const*, int const*, int, std::function<void (tvm::runtime::TVMArgs)> const&)
- at ../src/runtime/rpc/rpc_endpoint.cc:1012
- 0: tvm::runtime::RPCEndpoint::CallFunc(void*, TVMValue const*, int const*, int, std::function<void (tvm::runtime::TVMArgs)>)
- at ../src/runtime/rpc/rpc_endpoint.cc:804
- File "../src/runtime/rpc/rpc_endpoint.cc", line 804
- TVMError:
- ---------------------------------------------------------------
- An error occurred during the execution of TVM.
- For more information, please see: https://tvm.apache.org/docs/errors.html
- ---------------------------------------------------------------
- Check failed: (code == RPCCode::kReturn) is false: code=kShutdown
-
- During handling of the above exception, another exception occurred:
-
- Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 706, in run_through_rpc
- costs = time_f(*args).results
- File "/usr/lib/python3.7/contextlib.py", line 130, in __exit__
- self.gen.throw(type, value, traceback)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 746, in __call__
- remote.remove(build_result.filename)
- File "/workspace/python/tvm/rpc/client.py", line 144, in remove
- self._remote_funcs["remove"] = self.get_function("tvm.rpc.server.remove")
- File "/workspace/python/tvm/rpc/client.py", line 72, in get_function
- return self._sess.get_function(name)
- File "/workspace/python/tvm/runtime/module.py", line 171, in get_function
- self.handle, c_str(name), ctypes.c_int(query_imports), ctypes.byref(ret_handle)
- File "/workspace/python/tvm/_ffi/base.py", line 348, in check_call
- raise get_last_ffi_error()
- tvm._ffi.base.TVMError: Traceback (most recent call last):
- 52: 0xffffffffffffffff
- 51: _start
- 50: __libc_start_main
- 49: _Py_UnixMain
- 48: 0x0000000000650da0
- 47: 0x0000000000650afa
- 46: _PyFunction_FastCallDict
- 45: _PyEval_EvalCodeWithName
- 44: _PyEval_EvalFrameDefault
- 43: _PyFunction_FastCallKeywords
- 42: _PyEval_EvalCodeWithName
- 41: _PyEval_EvalFrameDefault
- 40: _PyMethodDef_RawFastCallKeywords
- 39: 0x0000000000546369
- 38: _PyEval_EvalCodeWithName
- 37: _PyEval_EvalFrameDefault
- 36: _PyFunction_FastCallKeywords
- 35: _PyEval_EvalCodeWithName
- 34: _PyEval_EvalFrameDefault
- 33: _PyFunction_FastCallDict
- 32: _PyEval_EvalCodeWithName
- 31: _PyEval_EvalFrameDefault
- 30: _PyObject_FastCallDict
- 29: 0x00000000004c06e1
- 28: _PyFunction_FastCallDict
- 27: _PyEval_EvalFrameDefault
- 26: _PyMethodDescr_FastCallKeywords
- 25: 0x00000000005dcb58
- 24: 0x00000000005dc83f
- 23: 0x00000000004ba127
- 22: _PyEval_EvalFrameDefault
- 21: _PyFunction_FastCallKeywords
- 20: _PyEval_EvalFrameDefault
- 19: _PyFunction_FastCallKeywords
- 18: _PyEval_EvalFrameDefault
- 17: _PyFunction_FastCallKeywords
- 16: _PyEval_EvalCodeWithName
- 15: _PyEval_EvalFrameDefault
- 14: 0x0000000000537c30
- 13: _PyObject_FastCallKeywords
- 12: 0x00007f9dc4af1fa2
- 11: _ctypes_callproc
- 10: ffi_call
- 9: ffi_call_unix64
- 8: TVMModGetFunction
- at ../src/runtime/c_runtime_api.cc:408
- 7: tvm::runtime::ModuleNode::GetFunction(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool)
- at ../src/runtime/module.cc:66
- 6: tvm::runtime::RPCModuleNode::GetFunction(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)
- at ../src/runtime/rpc/rpc_module.cc:185
- 5: tvm::runtime::RPCClientSession::GetFunction(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)
- at ../src/runtime/rpc/rpc_endpoint.cc:1007
- 4: tvm::runtime::TVMRetValue tvm::runtime::RPCEndpoint::SysCallRemote<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>(tvm::runtime::RPCCode, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)
- at ../src/runtime/rpc/rpc_endpoint.h:223
- 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>(int&&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) const
- at ../include/tvm/runtime/packed_func.h:1617
- 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
- at ../include/tvm/runtime/packed_func.h:1217
- 1: Call
- at ../include/tvm/runtime/packed_func.h:1213
- 0: operator()
- at ../src/runtime/rpc/rpc_endpoint.cc:684
- File "../src/runtime/rpc/rpc_endpoint.cc", line 684
- TVMError:
- ---------------------------------------------------------------
- An error occurred during the execution of TVM.
- For more information, please see: https://tvm.apache.org/docs/errors.html
- ---------------------------------------------------------------
- Check failed: (code == RPCCode::kReturn) is false: code=1
-
- Traceback (most recent call last):
- 52: 0xffffffffffffffff
- 51: _start
- 50: __libc_start_main
- 49: _Py_UnixMain
- 48: 0x0000000000650da0
- 47: 0x0000000000650afa
- 46: _PyFunction_FastCallDict
- 45: _PyEval_EvalCodeWithName
- 44: _PyEval_EvalFrameDefault
- 43: _PyFunction_FastCallKeywords
- 42: _PyEval_EvalCodeWithName
- 41: _PyEval_EvalFrameDefault
- 40: _PyMethodDef_RawFastCallKeywords
- 39: 0x0000000000546369
- 38: _PyEval_EvalCodeWithName
- 37: _PyEval_EvalFrameDefault
- 36: _PyFunction_FastCallKeywords
- 35: _PyEval_EvalCodeWithName
- 34: _PyEval_EvalFrameDefault
- 33: _PyFunction_FastCallDict
- 32: _PyEval_EvalCodeWithName
- 31: _PyEval_EvalFrameDefault
- 30: _PyObject_FastCallDict
- 29: 0x00000000004c06e1
- 28: _PyFunction_FastCallDict
- 27: _PyEval_EvalFrameDefault
- 26: _PyMethodDescr_FastCallKeywords
- 25: 0x00000000005dcb58
- 24: 0x00000000005dc83f
- 23: 0x00000000004ba127
- 22: _PyEval_EvalFrameDefault
- 21: _PyFunction_FastCallKeywords
- 20: _PyEval_EvalFrameDefault
- 19: _PyFunction_FastCall [('tile_f', [-1, 4, 1, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4301358
- No: 16 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
- func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
- func = build(s, args, target_host=task.target_host, runtime=runtime)
- File "/workspace/python/tvm/driver/build_module.py", line 227, in build
- input_mod = lower(inputs, args, name=name, binds=binds)
- File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
- return ffi.lower_schedule(inp, args, name, binds, simple_mode)
- File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
- File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
- File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
- tvm._ffi.base.TVMError: Traceback (most recent call last):
- 24: TVMFuncCall
- at ../src/runtime/c_runtime_api.cc:477
- 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
- at ../include/tvm/runtime/packed_func.h:1217
- 22: Call
- at ../include/tvm/runtime/packed_func.h:1213
- 21: operator()
- at ../include/tvm/runtime/packed_func.h:1730
- 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
- at ../include/tvm/runtime/packed_func.h:1670
- 19: run<>
- at ../include/tvm/runtime/packed_func.h:1630
- 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
- at ../include/tvm/runtime/packed_func.h:1630
- 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
- at ../include/tvm/runtime/packed_func.h:1630
- 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
- at ../include/tvm/runtime/packed_func.h:1630
- 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
- at ../include/tvm/runtime/packed_func.h:1630
- 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
- at ../include/tvm/runtime/packed_func.h:1645
- 13: operator()
- at ../src/driver/driver_api.cc:395
- 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:381
- 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:276
- 10: tvm::transform::Pass::operator()(tvm::IRModule) const
- at ../src/ir/transform.cc:258
- 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
- at ../src/ir/transform.cc:274
- 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
- at ../src/ir/transform.cc:451
- 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
- at ../src/ir/transform.cc:274
- 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
- at ../src/tir/ir/transform.cc:100
- 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
- at ../include/tvm/runtime/packed_func.h:1749
- 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
- at ../include/tvm/runtime/packed_func.h:1693
- 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
- at ../include/tvm/runtime/packed_func.h:1617
- 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
- at ../include/tvm/runtime/packed_func.h:1217
- 1: Call
- at ../include/tvm/runtime/packed_func.h:1213
- 0: operator()
- at ../src/runtime/c_runtime_api.cc:534
- File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
- raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
-
- Traceback (most recent call last):
- 24: TVMFuncCall
- at ../src/runtime/c_runtime_api.cc:477
- 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
- at ../include/tvm/runtime/packed_func.h:1217
- 22: Call
- at ../include/tvm/runtime/packed_func.h:1213
- 21: operator()
- at ../include/tvm/runtime/packed_func.h:1730
- 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
- at ../include/tvm/runtime/packed_func.h:1670
- 19: run<>
- at ../include/tvm/runtime/packed_func.h:1630
- 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
- at ../include/tvm/runtime/packed_func.h:1630
- 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
- at ../include/tvm/runtime/packed_func.h:1630
- 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
- at ../include/tvm/runtime/packed_func.h:1630
- 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
- at ../include/tvm/runtime/packed_func.h:1630
- 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
- at ../include/tvm/runtime/packed_func.h:1645
- 13: operator()
- at ../src/driver/driver_api.cc:395
- 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:381
- 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:276
- 10: tvm::transform::Pass::operator()(tvm::IRModule) const
- at ../src/ir/transform.cc:258
- 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
- at ../src/ir/transform.cc:274
- 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
- at ../src/ir/transform.cc:451
- 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
- at ../src/ir/transform.cc:274
- 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
- at ../src/tir/ir/transform.cc:100
- 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
- at ../include/tvm/runtime/packed_func.h:1749
- 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
- at ../include/tvm/runtime/packed_func.h:1693
- 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
- at ../include/tvm/runtime/packed_func.h:1617
- 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
- at ../include/tvm/runtime/packed_func.h:1217
- 1: Call
- at ../include/tvm/runtime/packed_func.h:1213
- 0: operator()
- at ../src/runtime/c_runtime_api.cc:534
- File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
- raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 64, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,532885
- No: 17 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 1, 256]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 8, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,1875056
+ No: 19 GFLOPS: 0.00/436.89 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -2283,10 +1994,8 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 64, 4, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 512]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9872525
- No: 18 GFLOPS: 1.76/1.76 result: MeasureResult(costs=(0.131499069,), error_no=MeasureErrorNo.NO_ERROR, all_cost=5.7559709548950195, timestamp=1674629081.3925776) [('tile_f', [-1, 1, 8, 64]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4846589
- No: 19 GFLOPS: 70.51/70.51 result: MeasureResult(costs=(0.0032830254193548384,), error_no=MeasureErrorNo.NO_ERROR, all_cost=5.035698413848877, timestamp=1674629082.1421092) [('tile_f', [-1, 4, 2, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,8980632
- No: 20 GFLOPS: 0.00/70.51 result: Traceback (most recent call last):
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 32, 2, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 16, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6111528
+ No: 20 GFLOPS: 0.00/436.89 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -2408,7 +2117,7 @@ for this template
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
- tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 1, 256]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3199896
+ tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 32, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 64]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1328620
@@ -2463,9 +2172,9 @@ and measure running time.
Finish loading 20 records
Best config:
- [('tile_f', [-1, 4, 2, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,8980632
+ [('tile_f', [-1, 2, 32, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 4]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9945801
Finish loading 20 records
- Time cost of this operator: 0.003554
+ Time cost of this operator: 0.000917
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
index e936329f86..e4e1291565 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_autotune.rst.txt
@@ -363,10 +363,10 @@ Timing the untuned program
########## Build without Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs Measurements(us)
--------- --- -------- ------- ----- ------ ------- ----------------
- tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 310.8 98.72 (1, 2, 10, 10, 3) 2 1 [310.8]
- tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.04 0.966 (1, 6, 10, 10) 1 1 [3.04]
- tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.991 0.315 (1, 1, 10, 10, 3) 1 1 [0.991]
- Total_time - 314.831 - - - - -
+ tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 309.3 98.645 (1, 2, 10, 10, 3) 2 1 [309.3]
+ tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.286 1.048 (1, 6, 10, 10) 1 1 [3.286]
+ tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.961 0.307 (1, 1, 10, 10, 3) 1 1 [0.961]
+ Total_time - 313.547 - - - - -
@@ -431,10 +431,10 @@ Timing the tuned program
########## Build with Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs Measurements(us)
--------- --- -------- ------- ----- ------ ------- ----------------
- tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 102.7 97.495 (1, 6, 10, 10, 1) 2 1 [102.7]
- tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.775 1.685 (1, 6, 10, 10) 1 1 [1.775]
- tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.863 0.82 (1, 3, 10, 10, 1) 1 1 [0.863]
- Total_time - 105.339 - - - - -
+ tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 104.6 97.511 (1, 6, 10, 10, 1) 2 1 [104.6]
+ tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.83 1.706 (1, 6, 10, 10) 1 1 [1.83]
+ tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.84 0.783 (1, 3, 10, 10, 1) 1 1 [0.84]
+ Total_time - 107.27 - - - - -
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt
index 146b5f98cd..752cfed0b8 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_pytorch.rst.txt
@@ -117,7 +117,7 @@ download a cat image and preprocess it to use as the model input.
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torch/ao/quantization/utils.py:281: UserWarning: must run observer before calling calculate_qparams. Returning default values.
"must run observer before calling calculate_qparams. " +
Downloading: "https://download.pytorch.org/models/quantized/mobilenet_v2_qnnpack_37f702c5.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2_qnnpack_37f702c5.pth
-
0%| | 0.00/3.42M [00:00<?, ?B/s]
61%|###### | 2.09M/3.42M [00:00<00:00, 18.8MB/s]
100%|##########| 3.42M/3.42M [00:00<00:00, 29.1MB/s]
+
0%| | 0.00/3.42M [00:00<?, ?B/s]
61%|###### | 2.09M/3.42M [00:00<00:00, 19.6MB/s]
100%|##########| 3.42M/3.42M [00:00<00:00, 30.5MB/s]
/workspace/python/tvm/relay/frontend/pytorch_utils.py:47: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
return LooseVersion(torch_ver) > ver
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/setuptools/_distutils/version.py:346: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
@@ -322,7 +322,7 @@ Look up prediction top 1 index in 1000 class synset.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 8.448 seconds)
+ **Total running time of the script:** ( 1 minutes 10.597 seconds)
.. _sphx_glr_download_how_to_work_with_microtvm_micro_pytorch.py:
diff --git a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
index c26ef63344..6445b1312e 100644
--- a/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/micro_train.rst.txt
@@ -218,7 +218,7 @@ take about **2 minutes** to download the Stanford Cars, while COCO 2017 validati
.. code-block:: none
- '/tmp/tmp84sny8mt/images/random'
+ '/tmp/tmpp9l12y8g/images/random'
@@ -309,7 +309,7 @@ objects to other stuff? We can display some examples from our datasets using ``m
.. image-sg:: /how_to/work_with_microtvm/images/sphx_glr_micro_train_001.png
- :alt: [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0]
+ :alt: [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0]
:srcset: /how_to/work_with_microtvm/images/sphx_glr_micro_train_001.png
:class: sphx-glr-single-img
@@ -318,8 +318,8 @@ objects to other stuff? We can display some examples from our datasets using ``m
.. code-block:: none
- /tmp/tmp84sny8mt/images/target contains 8144 images
- /tmp/tmp84sny8mt/images/random contains 5000 images
+ /tmp/tmpp9l12y8g/images/target contains 8144 images
+ /tmp/tmpp9l12y8g/images/random contains 5000 images
@@ -494,13 +494,13 @@ the time on our validation set).
.. code-block:: none
Epoch 1/3
- 328/328 - 47s - loss: 0.2188 - accuracy: 0.9217 - val_loss: 0.1127 - val_accuracy: 0.9585 - 47s/epoch - 142ms/step
+ 328/328 - 47s - loss: 0.2189 - accuracy: 0.9204 - val_loss: 0.1003 - val_accuracy: 0.9645 - 47s/epoch - 143ms/step
Epoch 2/3
- 328/328 - 43s - loss: 0.0993 - accuracy: 0.9647 - val_loss: 0.1226 - val_accuracy: 0.9585 - 43s/epoch - 132ms/step
+ 328/328 - 43s - loss: 0.0913 - accuracy: 0.9649 - val_loss: 0.1056 - val_accuracy: 0.9611 - 43s/epoch - 131ms/step
Epoch 3/3
- 328/328 - 43s - loss: 0.0665 - accuracy: 0.9744 - val_loss: 0.1547 - val_accuracy: 0.9483 - 43s/epoch - 131ms/step
+ 328/328 - 43s - loss: 0.0709 - accuracy: 0.9755 - val_loss: 0.1266 - val_accuracy: 0.9603 - 43s/epoch - 131ms/step
- <keras.callbacks.History object at 0x7f334d039090>
+ <keras.callbacks.History object at 0x7fcaab8ab250>
@@ -857,7 +857,7 @@ Arduino tutorial for how to do that `on GitHub <https://github.com/guberti/tvm-a
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 5 minutes 10.800 seconds)
+ **Total running time of the script:** ( 4 minutes 44.562 seconds)
.. _sphx_glr_download_how_to_work_with_microtvm_micro_train.py:
diff --git a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
index ae975c873a..59fc014707 100644
--- a/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_microtvm/sg_execution_times.rst.txt
@@ -5,18 +5,18 @@
Computation times
=================
-**07:23.349** total execution time for **how_to_work_with_microtvm** files:
+**06:59.895** total execution time for **how_to_work_with_microtvm** files:
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``) | 05:10.800 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_train.py` (``micro_train.py``) | 04:44.562 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_pytorch.py` (``micro_pytorch.py``) | 01:08.448 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_pytorch.py` (``micro_pytorch.py``) | 01:10.597 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``) | 00:51.362 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_autotune.py` (``micro_autotune.py``) | 00:51.796 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_aot.py` (``micro_aot.py``) | 00:08.938 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_aot.py` (``micro_aot.py``) | 00:09.134 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``) | 00:03.800 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tflite.py` (``micro_tflite.py``) | 00:03.806 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_microtvm_micro_tvmc.py` (``micro_tvmc.py``) | 00:00.000 | 0.0 MB |
+---------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
index 162facf7b9..f4557090a6 100644
--- a/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_relay/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
Computation times
=================
-**00:44.393** total execution time for **how_to_work_with_relay** files:
+**00:44.279** total execution time for **how_to_work_with_relay** files:
+----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_using_pipeline_executor.py` (``using_pipeline_executor.py``) | 00:32.379 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_using_pipeline_executor.py` (``using_pipeline_executor.py``) | 00:32.499 | 0.0 MB |
+----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``) | 00:10.447 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_using_external_lib.py` (``using_external_lib.py``) | 00:10.427 | 0.0 MB |
+----------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``) | 00:01.560 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_relay_build_gcn.py` (``build_gcn.py``) | 00:01.347 | 0.0 MB |
+----------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_relay_using_relay_viz.py` (``using_relay_viz.py``) | 00:00.006 | 0.0 MB |
+----------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
index a635f09ca3..85cf18c364 100644
--- a/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/intrin_math.rst.txt
@@ -264,7 +264,7 @@ The following example customizes CUDA lowering rule for :code:`exp`.
.. code-block:: none
- <function my_cuda_math_rule at 0x7f334dccf560>
+ <function my_cuda_math_rule at 0x7fcaab686b90>
diff --git a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
index 804c0262da..5b491d518f 100644
--- a/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/sg_execution_times.rst.txt
@@ -5,22 +5,22 @@
Computation times
=================
-**00:06.549** total execution time for **how_to_work_with_schedules** files:
+**00:07.755** total execution time for **how_to_work_with_schedules** files:
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``) | 00:04.039 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_intrin_math.py` (``intrin_math.py``) | 00:05.255 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``) | 00:01.150 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tensorize.py` (``tensorize.py``) | 00:01.140 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``) | 00:00.580 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_reduction.py` (``reduction.py``) | 00:00.576 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``) | 00:00.560 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_scan.py` (``scan.py``) | 00:00.558 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``) | 00:00.115 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_extern_op.py` (``extern_op.py``) | 00:00.119 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_schedules_schedule_primitives.py` (``schedule_primitives.py``) | 00:00.050 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_how_to_work_with_schedules_tedd.py` (``tedd.py``) | 00:00.032 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_how_to_work_with_schedules_tuple_inputs.py` (``tuple_inputs.py``) | 00:00.023 | 0.0 MB |
+| :ref:`sphx_glr_how_to_work_with_schedules_tuple_inputs.py` (``tuple_inputs.py``) | 00:00.024 | 0.0 MB |
+------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
index 7e7faad471..e2225b7c90 100644
--- a/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
+++ b/docs/_sources/how_to/work_with_schedules/tensorize.rst.txt
@@ -328,7 +328,7 @@ The importing needs to happen before the tensorized GEMV being executed.
def main(A: T.Buffer((1024, 64), "float32"), B: T.Buffer((512, 64), "float32"), C: T.Buffer((1024, 512), "float32")):
T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
i = T.var("int32")
- T.attr(T.iter_var(i, None, "DataPar", ""), "pragma_import_llvm", "; ModuleID = '/tmp/tmp7fxwoglv/input0.cc'\nsource_filename = \"/tmp/tmp7fxwoglv/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = alloca float*, align 8\n %8 = alloca float*, align 8\n %9 = alloca float*, [...]
+ T.attr(T.iter_var(i, None, "DataPar", ""), "pragma_import_llvm", "; ModuleID = '/tmp/tmpkharrg48/input0.cc'\nsource_filename = \"/tmp/tmpkharrg48/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = alloca float*, align 8\n %8 = alloca float*, align 8\n %9 = alloca float*, [...]
for i, j_outer in T.grid(1024, 32):
T.call_extern("int32", "gemv_update", T.tvm_access_ptr(T.type_annotation("float32"), C.data, i * 512 + j_outer * 16, 16, 2), T.tvm_access_ptr(T.type_annotation("float32"), A.data, i * 64, 64, 1), T.tvm_access_ptr(T.type_annotation("float32"), B.data, j_outer * 1024, 1024, 1), 16, 64, 64)
diff --git a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
index b47bf7a05c..0a9849798f 100644
--- a/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/autotvm/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**00:30.743** total execution time for **topic_vta_tutorials_autotvm** files:
+**00:30.328** total execution time for **topic_vta_tutorials_autotvm** files:
+---------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:30.737 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_relay_vta.py` (``tune_relay_vta.py``) | 00:30.321 | 0.0 MB |
+---------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_topic_vta_tutorials_autotvm_tune_alu_vta.py` (``tune_alu_vta.py``) | 00:00.007 | 0.0 MB |
+---------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
index 585c18ff41..5773a44705 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_classification.rst.txt
@@ -293,7 +293,7 @@ The compilation steps are:
DeprecationWarning,
/workspace/vta/tutorials/frontend/deploy_classification.py:213: DeprecationWarning: legacy graph executor behavior of producing json / lib / params will be removed in the next release. Please see documents of tvm.contrib.graph_executor.GraphModule for the new recommended usage.
relay_prog, target=tvm.target.Target(target, host=env.target_host), params=params
- resnet18_v1 inference graph built in 32.22s!
+ resnet18_v1 inference graph built in 32.50s!
diff --git a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
index 520620c5e1..7067edcab3 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/deploy_detection.rst.txt
@@ -337,7 +337,7 @@ The compilation steps are:
/workspace/python/tvm/relay/build_module.py:348: DeprecationWarning: Please use input parameter mod (tvm.IRModule) instead of deprecated parameter mod (tvm.relay.function.Function)
DeprecationWarning,
- yolov3-tiny inference graph built in 22.63s!
+ yolov3-tiny inference graph built in 22.08s!
diff --git a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
index 0ec6e7afac..d80c39ae5c 100644
--- a/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/frontend/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**01:38.880** total execution time for **topic_vta_tutorials_frontend** files:
+**01:38.475** total execution time for **topic_vta_tutorials_frontend** files:
+------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``) | 00:49.733 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:49.287 | 0.0 MB |
+------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_classification.py` (``deploy_classification.py``) | 00:49.147 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_frontend_deploy_detection.py` (``deploy_detection.py``) | 00:49.187 | 0.0 MB |
+------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
index 3307d2ab94..9512468059 100644
--- a/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/optimize/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**00:03.143** total execution time for **topic_vta_tutorials_optimize** files:
+**00:03.169** total execution time for **topic_vta_tutorials_optimize** files:
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``) | 00:02.676 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_optimize_convolution_opt.py` (``convolution_opt.py``) | 00:02.709 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.468 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_optimize_matrix_multiply_opt.py` (``matrix_multiply_opt.py``) | 00:00.460 | 0.0 MB |
+--------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
index d34dae7f67..4c5e4e770d 100644
--- a/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
+++ b/docs/_sources/topic/vta/tutorials/sg_execution_times.rst.txt
@@ -5,10 +5,10 @@
Computation times
=================
-**00:00.829** total execution time for **topic_vta_tutorials** files:
+**00:00.805** total execution time for **topic_vta_tutorials** files:
+---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.435 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_matrix_multiply.py` (``matrix_multiply.py``) | 00:00.423 | 0.0 MB |
+---------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.393 | 0.0 MB |
+| :ref:`sphx_glr_topic_vta_tutorials_vta_get_started.py` (``vta_get_started.py``) | 00:00.382 | 0.0 MB |
+---------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
index 4626b66197..da1a4204cc 100644
--- a/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/auto_scheduler_matmul_x86.rst.txt
@@ -315,7 +315,7 @@ We build the binary and check its correctness and performance.
.. code-block:: none
- Execution time of this operator: 94.265 ms
+ Execution time of this operator: 95.523 ms
@@ -433,7 +433,7 @@ operations.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 31.485 seconds)
+ **Total running time of the script:** ( 1 minutes 31.135 seconds)
.. _sphx_glr_download_tutorial_auto_scheduler_matmul_x86.py:
diff --git a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
index 8557369f24..336c78089f 100644
--- a/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_matmul_x86.rst.txt
@@ -454,16 +454,16 @@ reduce variance, we take 5 measurements and average them.
waiting for device...
device available
Get devices for measurement successfully!
- No: 1 GFLOPS: 0.44/0.44 result: MeasureResult(costs=(0.6083183096,), error_no=MeasureErrorNo.NO_ERROR, all_cost=9.948514699935913, timestamp=1674627524.8434086) [('tile_y', [-1, 512]), ('tile_x', [-1, 1])],None,9
- No: 2 GFLOPS: 4.06/4.06 result: MeasureResult(costs=(0.0661649896,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.3219096660614014, timestamp=1674627526.1558943) [('tile_y', [-1, 16]), ('tile_x', [-1, 16])],None,44
- No: 3 GFLOPS: 11.34/11.34 result: MeasureResult(costs=(0.0236797962,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6108138561248779, timestamp=1674627527.557018) [('tile_y', [-1, 256]), ('tile_x', [-1, 32])],None,58
- No: 4 GFLOPS: 11.68/11.68 result: MeasureResult(costs=(0.0229805812,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.608971118927002, timestamp=1674627528.9412158) [('tile_y', [-1, 32]), ('tile_x', [-1, 256])],None,85
- No: 5 GFLOPS: 11.25/11.68 result: MeasureResult(costs=(0.0238598868,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6242167949676514, timestamp=1674627529.7377863) [('tile_y', [-1, 2]), ('tile_x', [-1, 256])],None,81
- No: 6 GFLOPS: 3.21/11.68 result: MeasureResult(costs=(0.0836804682,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.5808813571929932, timestamp=1674627531.326371) [('tile_y', [-1, 2]), ('tile_x', [-1, 8])],None,31
- No: 7 GFLOPS: 10.48/11.68 result: MeasureResult(costs=(0.025615115,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6991691589355469, timestamp=1674627532.7507215) [('tile_y', [-1, 8]), ('tile_x', [-1, 64])],None,63
- No: 8 GFLOPS: 3.88/11.68 result: MeasureResult(costs=(0.0692685044,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.370384693145752, timestamp=1674627534.111382) [('tile_y', [-1, 32]), ('tile_x', [-1, 16])],None,45
- No: 9 GFLOPS: 3.26/11.68 result: MeasureResult(costs=(0.0823945236,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.5373015403747559, timestamp=1674627535.7757132) [('tile_y', [-1, 32]), ('tile_x', [-1, 8])],None,35
- No: 10 GFLOPS: 9.77/11.68 result: MeasureResult(costs=(0.0274847752,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6630353927612305, timestamp=1674627536.4660132) [('tile_y', [-1, 8]), ('tile_x', [-1, 32])],None,53
+ No: 1 GFLOPS: 13.77/13.77 result: MeasureResult(costs=(0.019501099799999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.567375898361206, timestamp=1674646945.012616) [('tile_y', [-1, 128]), ('tile_x', [-1, 64])],None,67
+ No: 2 GFLOPS: 1.54/13.77 result: MeasureResult(costs=(0.17442023099999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.0188403129577637, timestamp=1674646948.8169198) [('tile_y', [-1, 4]), ('tile_x', [-1, 1])],None,2
+ No: 3 GFLOPS: 1.75/13.77 result: MeasureResult(costs=(0.1530374294,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.683912515640259, timestamp=1674646952.2850173) [('tile_y', [-1, 1]), ('tile_x', [-1, 8])],None,30
+ No: 4 GFLOPS: 10.09/13.77 result: MeasureResult(costs=(0.026614337999999998,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6586635112762451, timestamp=1674646953.7394924) [('tile_y', [-1, 2]), ('tile_x', [-1, 32])],None,51
+ No: 5 GFLOPS: 12.49/13.77 result: MeasureResult(costs=(0.0214840332,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6121816635131836, timestamp=1674646954.467223) [('tile_y', [-1, 128]), ('tile_x', [-1, 256])],None,87
+ No: 6 GFLOPS: 4.14/13.77 result: MeasureResult(costs=(0.0647810424,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.2988002300262451, timestamp=1674646955.7609518) [('tile_y', [-1, 16]), ('tile_x', [-1, 16])],None,44
+ No: 7 GFLOPS: 11.01/13.77 result: MeasureResult(costs=(0.0243857762,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.6280999183654785, timestamp=1674646956.4059246) [('tile_y', [-1, 256]), ('tile_x', [-1, 32])],None,58
+ No: 8 GFLOPS: 0.46/13.77 result: MeasureResult(costs=(0.578640577,), error_no=MeasureErrorNo.NO_ERROR, all_cost=9.519085884094238, timestamp=1674646965.956144) [('tile_y', [-1, 512]), ('tile_x', [-1, 1])],None,9
+ No: 9 GFLOPS: 10.47/13.77 result: MeasureResult(costs=(0.0256268724,), error_no=MeasureErrorNo.NO_ERROR, all_cost=0.681133508682251, timestamp=1674646966.7515464) [('tile_y', [-1, 8]), ('tile_x', [-1, 64])],None,63
+ No: 10 GFLOPS: 3.93/13.77 result: MeasureResult(costs=(0.068315192,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.35294771194458, timestamp=1674646968.0984406) [('tile_y', [-1, 64]), ('tile_x', [-1, 16])],None,46
diff --git a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
index 8e5cceba3d..d67d5e8683 100644
--- a/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
+++ b/docs/_sources/tutorial/autotvm_relay_x86.rst.txt
@@ -311,7 +311,7 @@ standard deviation.
.. code-block:: none
- {'mean': 510.90583774000044, 'median': 511.5418410499956, 'std': 1.5800351073285022}
+ {'mean': 508.56751572999656, 'median': 508.4643390500105, 'std': 0.7452097244428618}
@@ -545,31 +545,30 @@ the tuning data to.
.. code-block:: none
-
[Task 1/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 1/25] Current/Best: 15.72/ 22.23 GFLOPS | Progress: (4/20) | 7.20 s
[Task 1/25] Current/Best: 11.41/ 22.62 GFLOPS | Progress: (8/20) | 11.07 s
[Task 1/25] Current/Best: 18.05/ 22.62 GFLOPS | Progress: (12/20) | 14.92 s
[Task 1/25] Current/Best: 8.88/ 22.62 GFLOPS | Progress: (16/20) | 18.59 s
[Task 1/25] Current/Best: 17.04/ 22.62 GFLOPS | Progress: (20/20) | 21.29 s Done.
-
[Task 2/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 2/25] Current/Best: 17.59/ 17.59 GFLOPS | Progress: (4/20) | 3.84 s
[Task 2/25] Current/Best: 17.60/ 17.60 GFLOPS | Progress: (8/20) | 5.77 s
[Task 2/25] Current/Best: 14.53/ 17.60 GFLOPS | Progress: (12/20) | 7.61 s
[Task 2/25] Current/Best: 12.73/ 18.22 GFLOPS | Progress: (16/20) | 9.06 s
[Task 2/25] Current/Best: 16.14/ 18.22 GFLOPS | Progress: (20/20) | 10.71 s Done.
-
[Task 3/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 3/25] Current/Best: 12.48/ 17.32 GFLOPS | Progress: (4/20) | 3.98 s
[Task 3/25] Current/Best: 13.52/ 22.16 GFLOPS | Progress: (8/20) | 6.10 s
[Task 3/25] Current/Best: 23.28/ 23.28 GFLOPS | Progress: (12/20) | 8.48 s
[Task 3/25] Current/Best: 12.70/ 23.28 GFLOPS | Progress: (16/20) | 11.04 s
[Task 3/25] Current/Best: 9.76/ 23.28 GFLOPS | Progress: (20/20) | 13.19 s Done.
-
[Task 4/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 4/25] Current/Best: 14.08/ 16.29 GFLOPS | Progress: (4/20) | 5.15 s
[Task 4/25] Current/Best: 13.39/ 16.29 GFLOPS | Progress: (8/20) | 10.46 s
[Task 4/25] Current/Best: 12.77/ 16.29 GFLOPS | Progress: (12/20) | 12.19 s
[Task 4/25] Current/Best: 16.27/ 16.29 GFLOPS | Progress: (16/20) | 14.02 s
[Task 4/25] Current/Best: 16.34/ 16.34 GFLOPS | Progress: (20/20) | 21.12 s Done.
-
[Task 5/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 5/25] Current/Best: 18.57/ 18.57 GFLOPS | Progress: (4/20) | 3.90 s
[Task 5/25] Current/Best: 7.41/ 18.57 GFLOPS | Progress: (8/20) | 5.85 s
[Task 5/25] Current/Best: 1.60/ 18.57 GFLOPS | Progress: (12/20) | 8.84 s
[Task 5/25] Current/Best: 13.07/ 18.57 GFLOPS | Progress: (16/20) | 12.61 s
[Task 5/25] Current/Best: 5.87/ 18.86 GFLOPS | Progress: (20/20) | 14.66 s Done.
-
[Task 6/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 6/25] Current/Best: 5.59/ 14.77 GFLOPS | Progress: (4/20) | 5.06 s
[Task 6/25] Current/Best: 14.83/ 14.83 GFLOPS | Progress: (8/20) | 7.97 s
[Task 6/25] Current/Best: 11.43/ 14.95 GFLOPS | Progress: (12/20) | 12.59 s
[Task 6/25] Current/Best: 11.20/ 20.28 GFLOPS | Progress: (16/20) | 16.20 s
[Task 6/25] Current/Best: 8.59/ 22.51 GFLOPS | Progress: (20/20) | 18.13 s Done.
-
[Task 7/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 7/25] Current/Best: 12.21/ 21.78 GFLOPS | Progress: (4/20) | 4.10 s
[Task 7/25] Current/Best: 15.97/ 21.78 GFLOPS | Progress: (8/20) | 6.31 s
[Task 7/25] Current/Best: 19.98/ 21.78 GFLOPS | Progress: (12/20) | 8.51 s
[Task 7/25] Current/Best: 16.16/ 21.78 GFLOPS | Progress: (16/20) | 11.30 s
[Task 7/25] Current/Best: 15.68/ 21.78 GFLOPS | Progress: (20/20) | 13.62 s Done.
-
[Task 8/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 8/25] Current/Best: 10.76/ 14.22 GFLOPS | Progress: (4/20) | 6.20 s
[Task 8/25] Current/Best: 19.93/ 19.93 GFLOPS | Progress: (8/20) | 8.44 s
[Task 8/25] Current/Best: 16.99/ 19.93 GFLOPS | Progress: (12/20) | 16.65 s
[Task 8/25] Current/Best: 15.04/ 19.93 GFLOPS | Progress: (16/20) | 22.32 s
[Task 8/25] Current/Best: 12.04/ 19.93 GFLOPS | Progress: (20/20) | 25.11 s Done.
-
[Task 9/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 9/25] Current/Best: 8.81/ 16.32 GFLOPS | Progress: (4/20) | 10.08 s
[Task 9/25] Current/Best: 13.99/ 23.10 GFLOPS | Progress: (8/20) | 16.08 s
[Task 9/25] Current/Best: 19.93/ 23.10 GFLOPS | Progress: (12/20) | 18.05 s
[Task 9/25] Current/Best: 7.68/ 23.10 GFLOPS | Progress: (16/20) | 24.32 s
[Task 9/25] Current/Best: 15.49/ 23.10 GFLOPS | Progress: (20/20) | 30.98 s Done.
-
[Task 10/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 10/25] Current/Best: 16.59/ 16.59 GFLOPS | Progress: (4/20) | 3.50 s
[Task 10/25] Current/Best: 14.01/ 18.64 GFLOPS | Progress: (8/20) | 6.81 s
[Task 10/25] Current/Best: 14.14/ 18.64 GFLOPS | Progress: (12/20) | 8.71 s
[Task 10/25] Current/Best: 5.40/ 18.98 GFLOPS | Progress: (16/20) | 10.90 s
[Task 10/25] Current/Best: 8.72/ 21.00 GFLOPS | Progress: (20/20) | 12.60 s Done.
-
[Task 11/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 11/25] Current/Best: 3.11/ 20.55 GFLOPS | Progress: (4/20) | 5.08 s
[Task 11/25] Current/Best: 7.05/ 21.92 GFLOPS | Progress: (8/20) | 7.39 s
[Task 11/25] Current/Best: 18.22/ 23.35 GFLOPS | Progress: (12/20) | 9.95 s
[Task 11/25] Current/Best: 18.09/ 23.35 GFLOPS | Progress: (16/20) | 12.57 s
[Task 11/25] Current/Best: 18.48/ 23.35 GFLOPS | Progress: (20/20) | 14.77 s Done.
-
[Task 12/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 12/25] Current/Best: 4.21/ 18.20 GFLOPS | Progress: (4/20) | 4.39 s
[Task 12/25] Current/Best: 8.74/ 18.20 GFLOPS | Progress: (8/20) | 9.89 s
[Task 12/25] Current/Best: 9.65/ 18.20 GFLOPS | Progress: (12/20) | 12.53 s
[Task 12/25] Current/Best: 14.19/ 18.20 GFLOPS | Progress: (16/20) | 15.13 s
[Task 12/25] Current/Best: 12.15/ 18.41 GFLOPS | Progress: (20/20) | 19.27 s Done.
-
[Task 13/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 13/25] Current/Best: 19.95/ 19.95 GFLOPS | Progress: (4/20) | 4.68 s
[Task 13/25] Current/Best: 19.24/ 19.95 GFLOPS | Progress: (8/20) | 7.51 s
[Task 13/25] Current/Best: 19.90/ 21.42 GFLOPS | Progress: (12/20) | 9.53 s
[Task 13/25] Current/Best: 11.52/ 22.12 GFLOPS | Progress: (16/20) | 12.78 s
[Task 13/25] Current/Best: 12.77/ 22.33 GFLOPS | Progress: (20/20) | 16.29 s Done.
-
[Task 14/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 14/25] Current/Best: 15.08/ 15.08 GFLOPS | Progress: (4/20) | 9.02 s
[Task 14/25] Current/Best: 13.95/ 20.95 GFLOPS | Progress: (8/20) | 13.95 s
[Task 14/25] Current/Best: 17.49/ 20.95 GFLOPS | Progress: (12/20) | 16.23 s
[Task 14/25] Current/Best: 14.22/ 20.95 GFLOPS | Progress: (16/20) | 18.20 s
[Task 14/25] Current/Best: 10.38/ 20.95 GFLOPS | Progress: (20/20) | 23.55 s Done.
-
[Task 15/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 15/25] Current/Best: 22.78/ 22.78 GFLOPS | Progress: (4/20) | 3.67 s
[Task 15/25] Current/Best: 12.33/ 22.78 GFLOPS | Progress: (8/20) | 7.06 s
[Task 15/25] Current/Best: 13.34/ 22.78 GFLOPS | Progress: (12/20) | 9.67 s
[Task 15/25] Current/Best: 15.95/ 22.78 GFLOPS | Progress: (16/20) | 11.55 s
[Task 15/25] Current/Best: 11.47/ 22.78 GFLOPS | Progress: (20/20) | 13.30 s
[Task 16/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 16/25] Current/Best: 14.67/ 14.67 GFLOPS | Progress: (4/20) | 4.18 s
[Task 16/25] Current/Best: 13.69/ 17.09 GFLOPS | Progress: (8/20) | 5.89 s
[Task 16/25] Current/Best: 9.78/ 17.09 GFLOPS | Progress: (12/20) | 8.30 s
[Task 16/25] Current/Best: 9.06/ 19.26 GFLOPS | Progress: (16/20) | 11.84 s
[Task 16/25] Current/Best: 10.22/ 19.26 GFLOPS | Progress: (20/20)
| 13.56 s Done.
-
[Task 17/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 17/25] Current/Best: 12.31/ 21.83 GFLOPS | Progress: (4/20) | 3.92 s Done.
-
[Task 17/25] Current/Best: 22.91/ 22.91 GFLOPS | Progress: (8/20) | 6.15 s
[Task 17/25] Current/Best: 20.86/ 22.91 GFLOPS | Progress: (12/20) | 8.76 s
[Task 17/25] Current/Best: 3.10/ 22.91 GFLOPS | Progress: (16/20) | 12.04 s
[Task 17/25] Current/Best: 6.20/ 22.91 GFLOPS | Progress: (20/20) | 15.68 s Done.
-
[Task 18/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 18/25] Current/Best: 11.29/ 16.01 GFLOPS | Progress: (4/20) | 5.11 s
[Task 18/25] Current/Best: 17.78/ 17.78 GFLOPS | Progress: (8/20) | 7.26 s
[Task 18/25] Current/Best: 5.08/ 18.75 GFLOPS | Progress: (12/20) | 13.64 s
[Task 18/25] Current/Best: 13.28/ 18.75 GFLOPS | Progress: (16/20) | 16.39 s
[Task 18/25] Current/Best: 18.13/ 18.75 GFLOPS | Progress: (20/20) | 18.91 s Done.
-
[Task 19/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 19/25] Current/Best: 13.73/ 20.74 GFLOPS | Progress: (4/20) | 4.58 s
[Task 19/25] Current/Best: 11.68/ 20.74 GFLOPS | Progress: (8/20) | 8.90 s
[Task 19/25] Current/Best: 13.62/ 20.74 GFLOPS | Progress: (12/20) | 12.90 s
[Task 19/25] Current/Best: 20.05/ 20.74 GFLOPS | Progress: (16/20) | 17.64 s
[Task 19/25] Current/Best: 23.14/ 23.14 GFLOPS | Progress: (20/20) | 21.41 s Done.
-
[Task 20/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 20/25] Current/Best: 2.32/ 2.70 GFLOPS | Progress: (4/20) | 5.85 s
[Task 20/25] Current/Best: 12.93/ 12.93 GFLOPS | Progress: (8/20) | 11.22 s
[Task 20/25] Current/Best: 1.57/ 12.93 GFLOPS | Progress: (12/20) | 14.64 s
[Task 20/25] Current/Best: 9.28/ 15.91 GFLOPS | Progress: (16/20) | 18.01 s
[Task 20/25] Current/Best: 13.22/ 20.43 GFLOPS | Progress: (20/20) | 20.80 s
[Task 21/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 21/25] Current/Best: 15.44/ 17.51 GFLOPS | Progress: (4/20) | 3.29 s
[Task 21/25] Current/Best: 4.56/ 17.51 GFLOPS | Progress: (8/20) | 5.83 s
[Task 21/25] Current/Best: 18.80/ 18.80 GFLOPS | Progress: (12/20) | 8.13 s
[Task 21/25] Current/Best: 16.10/ 18.80 GFLOPS | Progress: (16/20) | 10.10 s
[Task 21/25] Current/Best: 2.72/ 19.78 GFLOPS | Progress: (20/20
) | 12.06 s
[Task 22/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s Done.
+
[Task 1/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 1/25] Current/Best: 12.97/ 22.93 GFLOPS | Progress: (4/20) | 7.50 s
[Task 1/25] Current/Best: 7.09/ 22.93 GFLOPS | Progress: (8/20) | 10.90 s
[Task 1/25] Current/Best: 14.04/ 22.93 GFLOPS | Progress: (12/20) | 13.39 s
[Task 1/25] Current/Best: 22.27/ 22.93 GFLOPS | Progress: (16/20) | 16.17 s
[Task 1/25] Current/Best: 17.52/ 22.93 GFLOPS | Progress: (20/20) | 18.88 s Done.
+
[Task 2/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 2/25] Current/Best: 8.55/ 16.71 GFLOPS | Progress: (4/20) | 4.24 s
[Task 2/25] Current/Best: 16.30/ 21.28 GFLOPS | Progress: (8/20) | 5.87 s
[Task 2/25] Current/Best: 15.47/ 21.28 GFLOPS | Progress: (12/20) | 8.07 s
[Task 2/25] Current/Best: 12.55/ 21.28 GFLOPS | Progress: (16/20) | 9.84 s
[Task 2/25] Current/Best: 16.72/ 21.28 GFLOPS | Progress: (20/20) | 11.31 s Done.
+
[Task 3/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 3/25] Current/Best: 12.83/ 17.85 GFLOPS | Progress: (4/20) | 4.29 s
[Task 3/25] Current/Best: 8.79/ 21.39 GFLOPS | Progress: (8/20) | 6.68 s
[Task 3/25] Current/Best: 11.68/ 21.39 GFLOPS | Progress: (12/20) | 10.69 s
[Task 3/25] Current/Best: 9.18/ 21.39 GFLOPS | Progress: (16/20) | 13.03 s
[Task 3/25] Current/Best: 19.94/ 21.39 GFLOPS | Progress: (20/20) | 15.11 s Done.
+
[Task 4/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 4/25] Current/Best: 12.93/ 16.93 GFLOPS | Progress: (4/20) | 4.45 s
[Task 4/25] Current/Best: 13.86/ 19.61 GFLOPS | Progress: (8/20) | 6.71 s
[Task 4/25] Current/Best: 12.48/ 19.61 GFLOPS | Progress: (12/20) | 8.53 s
[Task 4/25] Current/Best: 15.76/ 19.61 GFLOPS | Progress: (16/20) | 10.36 s
[Task 4/25] Current/Best: 6.49/ 19.61 GFLOPS | Progress: (20/20) | 12.39 s Done.
+
[Task 5/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 5/25] Current/Best: 11.49/ 18.14 GFLOPS | Progress: (4/20) | 4.52 s
[Task 5/25] Current/Best: 10.03/ 18.14 GFLOPS | Progress: (8/20) | 6.26 s
[Task 5/25] Current/Best: 18.51/ 18.51 GFLOPS | Progress: (12/20) | 8.62 s
[Task 5/25] Current/Best: 3.18/ 18.51 GFLOPS | Progress: (16/20) | 11.24 s
[Task 5/25] Current/Best: 3.05/ 18.51 GFLOPS | Progress: (20/20) | 13.55 s Done.
+
[Task 6/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 6/25] Current/Best: 12.12/ 12.95 GFLOPS | Progress: (4/20) | 5.16 s
[Task 6/25] Current/Best: 14.03/ 16.08 GFLOPS | Progress: (8/20) | 7.79 s
[Task 6/25] Current/Best: 11.52/ 18.26 GFLOPS | Progress: (12/20) | 10.21 s
[Task 6/25] Current/Best: 19.43/ 19.43 GFLOPS | Progress: (16/20) | 12.50 s
[Task 6/25] Current/Best: 17.77/ 19.43 GFLOPS | Progress: (20/20) | 15.05 s Done.
+
[Task 7/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 7/25] Current/Best: 13.67/ 13.67 GFLOPS | Progress: (4/20) | 5.59 s
[Task 7/25] Current/Best: 16.63/ 18.26 GFLOPS | Progress: (8/20) | 8.33 s
[Task 7/25] Current/Best: 11.51/ 22.33 GFLOPS | Progress: (12/20) | 11.05 s
[Task 7/25] Current/Best: 19.29/ 22.33 GFLOPS | Progress: (16/20) | 13.16 s
[Task 7/25] Current/Best: 12.91/ 22.33 GFLOPS | Progress: (20/20) | 15.51 s Done.
+
[Task 8/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 8/25] Current/Best: 10.80/ 19.57 GFLOPS | Progress: (4/20) | 8.26 s
[Task 8/25] Current/Best: 11.75/ 19.57 GFLOPS | Progress: (8/20) | 20.01 s
[Task 8/25] Current/Best: 14.74/ 19.57 GFLOPS | Progress: (12/20) | 23.25 s
[Task 8/25] Current/Best: 1.58/ 20.12 GFLOPS | Progress: (16/20) | 26.59 s
[Task 8/25] Current/Best: 10.03/ 20.12 GFLOPS | Progress: (20/20) | 32.12 s
[Task 9/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 9/25] Current/Best: 17.74/ 17.74 GFLOPS | Progress: (4/20) | 12.29 s
[Task 9/25] Current/Best: 6.65/ 17.74 GFLOPS | Progress: (8/20) | 14.37 s
[Task 9/25] Current/Best: 6.26/ 17.74 GFLOPS | Progress: (12/20) | 18.19 s
[Task 9/25] Current/Best: 9.15/ 23.22 GFLOPS | Progress: (16/20) | 25.83 s
[Task 9/25] Current/Best: 14.25/ 23.22 GFLOPS | Progress: (20
/20) | 29.65 s Done.
+
[Task 10/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 10/25] Current/Best: 11.72/ 13.01 GFLOPS | Progress: (4/20) | 3.90 s
[Task 10/25] Current/Best: 16.70/ 16.70 GFLOPS | Progress: (8/20) | 6.07 s
[Task 10/25] Current/Best: 10.37/ 16.70 GFLOPS | Progress: (12/20) | 7.93 s
[Task 10/25] Current/Best: 10.20/ 18.39 GFLOPS | Progress: (16/20) | 12.07 s
[Task 10/25] Current/Best: 13.46/ 18.39 GFLOPS | Progress: (20/20) | 14.30 s Done.
+
[Task 11/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 11/25] Current/Best: 8.22/ 16.86 GFLOPS | Progress: (4/20) | 4.55 s
[Task 11/25] Current/Best: 11.89/ 18.53 GFLOPS | Progress: (8/20) | 7.63 s
[Task 11/25] Current/Best: 5.08/ 18.53 GFLOPS | Progress: (12/20) | 10.43 s
[Task 11/25] Current/Best: 15.36/ 18.53 GFLOPS | Progress: (16/20) | 13.31 s
[Task 11/25] Current/Best: 6.26/ 21.97 GFLOPS | Progress: (20/20) | 15.84 s Done.
+
[Task 12/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 12/25] Current/Best: 5.90/ 13.44 GFLOPS | Progress: (4/20) | 5.08 s
[Task 12/25] Current/Best: 11.33/ 18.53 GFLOPS | Progress: (8/20) | 10.80 s
[Task 12/25] Current/Best: 14.02/ 18.53 GFLOPS | Progress: (12/20) | 13.16 s
[Task 12/25] Current/Best: 13.54/ 18.53 GFLOPS | Progress: (16/20) | 15.83 s
[Task 12/25] Current/Best: 13.39/ 18.53 GFLOPS | Progress: (20/20) | 17.89 s Done.
+
[Task 13/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 13/25] Current/Best: 19.32/ 19.32 GFLOPS | Progress: (4/20) | 4.29 s
[Task 13/25] Current/Best: 14.01/ 19.32 GFLOPS | Progress: (8/20) | 6.87 s
[Task 13/25] Current/Best: 17.42/ 22.65 GFLOPS | Progress: (12/20) | 8.86 s
[Task 13/25] Current/Best: 11.43/ 22.65 GFLOPS | Progress: (16/20) | 12.54 s
[Task 13/25] Current/Best: 12.15/ 22.65 GFLOPS | Progress: (20/20) | 15.51 s Done.
+
[Task 14/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 14/25] Current/Best: 5.44/ 14.47 GFLOPS | Progress: (4/20) | 4.95 s
[Task 14/25] Current/Best: 2.61/ 19.54 GFLOPS | Progress: (8/20) | 7.42 s
[Task 14/25] Current/Best: 19.24/ 19.54 GFLOPS | Progress: (12/20) | 10.65 s
[Task 14/25] Current/Best: 10.18/ 19.54 GFLOPS | Progress: (16/20) | 14.76 s
[Task 14/25] Current/Best: 16.96/ 19.54 GFLOPS | Progress: (20/20) | 20.87 s
[Task 15/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 15/25] Current/Best: 11.80/ 12.25 GFLOPS | Progress: (4/20) | 5.20 s Done.
Done.
-
[Task 22/25] Current/Best: 10.25/ 10.25 GFLOPS | Progress: (4/20) | 5.43 s
[Task 22/25] Current/Best: 10.87/ 12.83 GFLOPS | Progress: (8/20) | 7.76 s
[Task 22/25] Current/Best: 5.32/ 15.92 GFLOPS | Progress: (12/20) | 11.10 s
[Task 22/25] Current/Best: 9.50/ 16.56 GFLOPS | Progress: (16/20) | 13.09 s
[Task 22/25] Current/Best: 4.16/ 16.56 GFLOPS | Progress: (20/20) | 15.62 s Done.
-
[Task 23/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 23/25] Current/Best: 12.15/ 22.10 GFLOPS | Progress: (4/20) | 3.91 s
[Task 23/25] Current/Best: 17.26/ 22.10 GFLOPS | Progress: (8/20) | 6.31 s
[Task 23/25] Current/Best: 13.96/ 22.10 GFLOPS | Progress: (12/20) | 9.62 s
[Task 23/25] Current/Best: 12.60/ 22.10 GFLOPS | Progress: (16/20) | 13.09 s
[Task 23/25] Current/Best: 10.53/ 22.10 GFLOPS | Progress: (20/20) | 16.01 s Done.
-
[Task 24/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 24/25] Current/Best: 6.66/ 6.66 GFLOPS | Progress: (4/20) | 12.70 s
[Task 24/25] Current/Best: 1.65/ 8.22 GFLOPS | Progress: (8/20) | 23.67 s
[Task 24/25] Current/Best: 3.81/ 8.41 GFLOPS | Progress: (12/20) | 34.59 s
[Task 24/25] Current/Best: 8.63/ 8.63 GFLOPS | Progress: (16/20) | 45.23 s
[Task 24/25] Current/Best: 3.73/ 9.08 GFLOPS | Progress: (20/20) | 55.20 s
[Task 25/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s Done.
-
[Task 25/25] Current/Best: 7.27/ 9.13 GFLOPS | Progress: (4/20) | 3.56 s
[Task 25/25] Current/Best: 8.11/ 9.13 GFLOPS | Progress: (8/20) | 14.49 s
[Task 25/25] Current/Best: 3.03/ 9.13 GFLOPS | Progress: (12/20) | 16.90 s
[Task 25/25] Current/Best: 7.15/ 9.13 GFLOPS | Progress: (16/20) | 27.57 s
[Task 25/25] Current/Best: 1.54/ 9.13 GFLOPS | Progress: (20/20) | 33.17 s
+
[Task 15/25] Current/Best: 16.03/ 16.71 GFLOPS | Progress: (8/20) | 11.02 s
[Task 15/25] Current/Best: 18.79/ 18.79 GFLOPS | Progress: (12/20) | 14.21 s
[Task 15/25] Current/Best: 16.25/ 18.79 GFLOPS | Progress: (16/20) | 17.77 s
[Task 15/25] Current/Best: 9.69/ 18.79 GFLOPS | Progress: (20/20) | 19.99 s
[Task 16/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 16/25] Current/Best: 12.12/ 17.12 GFLOPS | Progress: (4/20) | 3.68 s
[Task 16/25] Current/Best: 15.59/ 23.19 GFLOPS | Progress: (8/20) | 6.74 s
[Task 16/25] Current/Best: 19.35/ 23.19 GFLOPS | Progress: (12/20) | 8.69 s
[Task 16/25] Current/Best: 15.30/ 23.19 GFLOPS | Progress: (16/20) | 10.36 s
[Task 16/25] Current/Best: 12.70/ 23.19 GFLOPS | Progress: (20/20) | 13.27 s Done.
+
[Task 17/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 17/25] Current/Best: 16.14/ 16.14 GFLOPS | Progress: (4/20) | 3.98 s
[Task 17/25] Current/Best: 7.06/ 19.88 GFLOPS | Progress: (8/20) | 6.21 s
[Task 17/25] Current/Best: 9.91/ 19.88 GFLOPS | Progress: (12/20) | 9.10 s
[Task 17/25] Current/Best: 16.54/ 19.88 GFLOPS | Progress: (16/20) | 11.06 s
[Task 17/25] Current/Best: 9.65/ 19.88 GFLOPS | Progress: (20/20) | 14.13 s Done.
+
[Task 18/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 18/25] Current/Best: 16.24/ 16.24 GFLOPS | Progress: (4/20) | 4.24 s
[Task 18/25] Current/Best: 16.03/ 16.24 GFLOPS | Progress: (8/20) | 6.78 s
[Task 18/25] Current/Best: 20.50/ 20.50 GFLOPS | Progress: (12/20) | 9.20 s
[Task 18/25] Current/Best: 15.37/ 20.50 GFLOPS | Progress: (16/20) | 11.17 s
[Task 18/25] Current/Best: 13.24/ 20.50 GFLOPS | Progress: (20/20) | 13.32 s Done.
+
[Task 19/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 19/25] Current/Best: 15.92/ 19.99 GFLOPS | Progress: (4/20) | 4.20 s
[Task 19/25] Current/Best: 19.07/ 19.99 GFLOPS | Progress: (8/20) | 7.45 s
[Task 19/25] Current/Best: 10.75/ 19.99 GFLOPS | Progress: (12/20) | 11.42 s
[Task 19/25] Current/Best: 11.35/ 19.99 GFLOPS | Progress: (16/20) | 15.25 s
[Task 19/25] Current/Best: 6.70/ 19.99 GFLOPS | Progress: (20/20) | 19.54 s Done.
+
[Task 20/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 20/25] Current/Best: 8.88/ 11.82 GFLOPS | Progress: (4/20) | 6.13 s
[Task 20/25] Current/Best: 10.47/ 11.82 GFLOPS | Progress: (8/20) | 9.40 s
[Task 20/25] Current/Best: 6.51/ 16.54 GFLOPS | Progress: (12/20) | 12.10 s
[Task 20/25] Current/Best: 18.46/ 18.46 GFLOPS | Progress: (16/20) | 14.67 s
[Task 20/25] Current/Best: 10.68/ 18.46 GFLOPS | Progress: (20/20) | 19.30 s
[Task 21/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 21/25] Current/Best: 8.94/ 20.75 GFLOPS | Progress: (4/20) | 3.56 s
[Task 21/25] Current/Best: 7.61/ 20.75 GFLOPS | Progress: (8/20) | 9.05 s
[Task 21/25] Current/Best: 9.22/ 20.75 GFLOPS | Progress: (12/20) | 10.73 s Done.
+ Done.
+
[Task 21/25] Current/Best: 13.21/ 20.75 GFLOPS | Progress: (16/20) | 12.51 s
[Task 21/25] Current/Best: 18.30/ 20.75 GFLOPS | Progress: (20/20) | 14.59 s
[Task 22/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 22/25] Current/Best: 10.60/ 19.19 GFLOPS | Progress: (4/20) | 4.30 s
[Task 22/25] Current/Best: 15.57/ 19.19 GFLOPS | Progress: (8/20) | 6.43 s
[Task 22/25] Current/Best: 18.41/ 19.19 GFLOPS | Progress: (12/20) | 8.26 s
[Task 22/25] Current/Best: 6.94/ 19.19 GFLOPS | Progress: (16/20) | 11.43 s
[Task 22/25] Current/Best: 11.57/ 19.19 GFLOPS | Progress: (20/20) | 13.50 s Done.
+
[Task 23/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 23/25] Current/Best: 18.52/ 21.64 GFLOPS | Progress: (4/20) | 6.98 s
[Task 23/25] Current/Best: 20.46/ 21.64 GFLOPS | Progress: (8/20) | 9.76 s
[Task 23/25] Current/Best: 13.80/ 23.49 GFLOPS | Progress: (12/20) | 12.79 s
[Task 23/25] Current/Best: 19.68/ 23.49 GFLOPS | Progress: (16/20) | 15.98 s
[Task 23/25] Current/Best: 20.41/ 23.49 GFLOPS | Progress: (20/20) | 18.15 s Done.
+
[Task 24/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s
[Task 24/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (4/20) | 11.91 s
[Task 24/25] Current/Best: 7.94/ 7.94 GFLOPS | Progress: (8/20) | 24.15 s
[Task 24/25] Current/Best: 2.35/ 7.94 GFLOPS | Progress: (12/20) | 34.58 s
[Task 24/25] Current/Best: 3.53/ 7.94 GFLOPS | Progress: (16/20) | 40.22 s
[Task 24/25] Current/Best: 5.79/ 7.94 GFLOPS | Progress: (20/20) | 51.16 s
[Task 25/25] Current/Best: 0.00/ 0.00 GFLOPS | Progress: (0/20) | 0.00 s Done.
+
[Task 25/25] Current/Best: 7.40/ 7.65 GFLOPS | Progress: (4/20) | 12.76 s
[Task 25/25] Current/Best: 2.99/ 9.28 GFLOPS | Progress: (8/20) | 23.73 s
[Task 25/25] Current/Best: 3.02/ 9.28 GFLOPS | Progress: (12/20) | 25.83 s
[Task 25/25] Current/Best: 5.64/ 9.28 GFLOPS | Progress: (16/20) | 36.49 s
[Task 25/25] Current/Best: 9.38/ 9.45 GFLOPS | Progress: (20/20) | 38.73 s
@@ -666,7 +665,7 @@ Verify that the optimized model runs and produces the same results:
.. code-block:: none
class='n02123045 tabby, tabby cat' with probability=0.621104
- class='n02123159 tiger cat' with probability=0.356377
+ class='n02123159 tiger cat' with probability=0.356378
class='n02124075 Egyptian cat' with probability=0.019712
class='n02129604 tiger, Panthera tigris' with probability=0.001215
class='n04040759 radiator' with probability=0.000262
@@ -723,8 +722,8 @@ improvement in comparing the optimized model to the unoptimized model.
.. code-block:: none
- optimized: {'mean': 411.1219674499989, 'median': 411.0714728499943, 'std': 0.6343796983425015}
- unoptimized: {'mean': 510.90583774000044, 'median': 511.5418410499956, 'std': 1.5800351073285022}
+ optimized: {'mean': 423.75794938000126, 'median': 421.92225845001303, 'std': 3.734169125434756}
+ unoptimized: {'mean': 508.56751572999656, 'median': 508.4643390500105, 'std': 0.7452097244428618}
@@ -747,7 +746,7 @@ profiling/benchmarking.
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 11 minutes 55.221 seconds)
+ **Total running time of the script:** ( 11 minutes 48.517 seconds)
.. _sphx_glr_download_tutorial_autotvm_relay_x86.py:
diff --git a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
index 48e95e2cb7..102fc0e013 100644
--- a/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
+++ b/docs/_sources/tutorial/cross_compilation_and_rpc.rst.txt
@@ -274,7 +274,7 @@ device and returns the measured cost. Network overhead is excluded.
.. code-block:: none
- 1.241e-07 secs/op
+ 1.286e-07 secs/op
diff --git a/docs/_sources/tutorial/intro_topi.rst.txt b/docs/_sources/tutorial/intro_topi.rst.txt
index ded5c95a5a..f2ab6b1928 100644
--- a/docs/_sources/tutorial/intro_topi.rst.txt
+++ b/docs/_sources/tutorial/intro_topi.rst.txt
@@ -268,7 +268,7 @@ As you can see, scheduled stages of computation have been accumulated and we can
.. code-block:: none
- [stage(a, placeholder(a, 0x218734f0)), stage(b, placeholder(b, 0x61d45f0)), stage(T_add, compute(T_add, body=[a[ax0, ax1, ax2] + b[ax1, ax2]], axis=[T.iter_var(ax0, T.Range(0, 100), "DataPar", ""), T.iter_var(ax1, T.Range(0, 10), "DataPar", ""), T.iter_var(ax2, T.Range(0, 10), "DataPar", "")], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[a[ax0, ax1, ax2] * b[ax1, ax2]], axis=[T.iter_var(ax0, T.Range(0, 100), "DataPar", ""), T.iter_var(ax1, T. [...]
+ [stage(a, placeholder(a, 0x216ec140)), stage(b, placeholder(b, 0x11211db0)), stage(T_add, compute(T_add, body=[a[ax0, ax1, ax2] + b[ax1, ax2]], axis=[T.iter_var(ax0, T.Range(0, 100), "DataPar", ""), T.iter_var(ax1, T.Range(0, 10), "DataPar", ""), T.iter_var(ax2, T.Range(0, 10), "DataPar", "")], reduce_axis=[], tag=broadcast, attrs={})), stage(T_multiply, compute(T_multiply, body=[a[ax0, ax1, ax2] * b[ax1, ax2]], axis=[T.iter_var(ax0, T.Range(0, 100), "DataPar", ""), T.iter_var(ax1, T [...]
diff --git a/docs/_sources/tutorial/sg_execution_times.rst.txt b/docs/_sources/tutorial/sg_execution_times.rst.txt
index fdb6c6cf7f..148e19dd8d 100644
--- a/docs/_sources/tutorial/sg_execution_times.rst.txt
+++ b/docs/_sources/tutorial/sg_execution_times.rst.txt
@@ -5,24 +5,24 @@
Computation times
=================
-**15:31.535** total execution time for **tutorial** files:
+**15:22.871** total execution time for **tutorial** files:
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``) | 11:55.221 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_relay_x86.py` (``autotvm_relay_x86.py``) | 11:48.517 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 01:31.485 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_auto_scheduler_matmul_x86.py` (``auto_scheduler_matmul_x86.py``) | 01:31.135 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``) | 01:01.140 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_expr_get_started.py` (``tensor_expr_get_started.py``) | 00:57.970 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``) | 00:35.574 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_relay_quick_start.py` (``relay_quick_start.py``) | 00:35.705 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``) | 00:25.827 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_autotvm_matmul_x86.py` (``autotvm_matmul_x86.py``) | 00:27.935 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``) | 00:01.283 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``) | 00:00.822 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_intro_topi.py` (``intro_topi.py``) | 00:00.835 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_tensor_ir_blitz_course.py` (``tensor_ir_blitz_course.py``) | 00:00.623 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.170 | 0.0 MB |
+| :ref:`sphx_glr_tutorial_cross_compilation_and_rpc.py` (``cross_compilation_and_rpc.py``) | 00:00.164 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_tutorial_uma.py` (``uma.py``) | 00:00.000 | 0.0 MB |
+------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
index e8c5af3871..35eb5bead2 100644
--- a/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
+++ b/docs/_sources/tutorial/tensor_expr_get_started.rst.txt
@@ -444,7 +444,7 @@ factor to be the number of threads on your CPU.
.. code-block:: none
- vector: 0.000025
+ vector: 0.000026
@I.ir_module
class Module:
@T.prim_func
@@ -498,10 +498,10 @@ We can now compare the different schedules
.. code-block:: none
Operator Timing Performance
- numpy 7.395259999611881e-06 1.0
- naive 6.6756999999999995e-06 0.9026998375108318
- parallel 8.1278e-06 1.0990553409111463
- vector 2.46142e-05 3.328375202669251
+ numpy 7.255440000335511e-06 1.0
+ naive 6.713099999999999e-06 0.9252505705635451
+ parallel 7.894900000000001e-06 1.0881352474329495
+ vector 2.64746e-05 3.6489310088396767
@@ -922,7 +922,7 @@ matrix multiplication.
.. code-block:: none
- Numpy running time: 0.018198
+ Numpy running time: 0.018825
@@ -980,7 +980,7 @@ optimizations.
.. code-block:: none
- none: 3.427090
+ none: 3.164111
@@ -1077,7 +1077,7 @@ schedule.
.. code-block:: none
- blocking: 0.299261
+ blocking: 0.304901
@@ -1158,7 +1158,7 @@ already cache friendly from our previous optimizations.
.. code-block:: none
- vectorization: 0.333722
+ vectorization: 0.339064
@I.ir_module
class Module:
@T.prim_func
@@ -1221,7 +1221,7 @@ more cache friendly.
.. code-block:: none
- loop permutation: 0.121460
+ loop permutation: 0.115273
@I.ir_module
class Module:
@T.prim_func
@@ -1309,7 +1309,7 @@ optimized schedule.
.. code-block:: none
- array packing: 0.109399
+ array packing: 0.107194
@I.ir_module
class Module:
@T.prim_func
@@ -1389,7 +1389,7 @@ to `C` when all the block results are ready.
.. code-block:: none
- block caching: 0.110710
+ block caching: 0.110575
@I.ir_module
class Module:
@T.prim_func
@@ -1460,7 +1460,7 @@ of thread-level parallelization.
.. code-block:: none
- parallelization: 0.144952
+ parallelization: 0.145853
@I.ir_module
class Module:
@T.prim_func
@@ -1527,13 +1527,13 @@ working, we can compare the results.
.. code-block:: none
Operator Timing Performance
- none 3.4270900767 1.0
- blocking 0.2992614526 0.08732231890682128
- vectorization 0.3337221303 0.09737769443788485
- loop permutation 0.121459927 0.03544112476814607
- array packing 0.10939925399999999 0.031921907960278155
- block caching 0.11071008 0.03230439746906347
- parallelization 0.1449524867 0.042296083107210614
+ none 3.1641112787999996 1.0
+ blocking 0.3049014778 0.09636243827544363
+ vectorization 0.33906354070000005 0.10715917071936581
+ loop permutation 0.11527291720000002 0.03643137268033053
+ array packing 0.1071943396 0.033878182577906625
+ block caching 0.110574533 0.03494647414611024
+ parallelization 0.14585330879999997 0.046096137571784566
@@ -1573,11 +1573,6 @@ operations with tunable parameters that allows you to automatically optimize
the computation for specific platforms.
-.. rst-class:: sphx-glr-timing
-
- **Total running time of the script:** ( 1 minutes 1.140 seconds)
-
-
.. _sphx_glr_download_tutorial_tensor_expr_get_started.py:
.. only:: html
diff --git a/docs/commit_hash b/docs/commit_hash
index 32bf9808d2..8cfd956fbb 100644
--- a/docs/commit_hash
+++ b/docs/commit_hash
@@ -1 +1 @@
-2c109c53e00e99e6e4e198aab7514a1871827a88
+56926009616e5f28bb42dfb9d136474e2bafde15
diff --git a/docs/how_to/compile_models/from_darknet.html b/docs/how_to/compile_models/from_darknet.html
index 0654003a74..f109d49d5a 100644
--- a/docs/how_to/compile_models/from_darknet.html
+++ b/docs/how_to/compile_models/from_darknet.html
@@ -585,7 +585,7 @@ class:['truck 0.9266'] left:471 top:83 right:689 bottom:169
class:['bicycle 0.9984'] left:111 top:113 right:577 bottom:447
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 17.754 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 16.940 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-darknet-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7716f96385bd5abb6e822041e285be54/from_darknet.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_darknet.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/from_keras.html b/docs/how_to/compile_models/from_keras.html
index d4182288ba..460316ad82 100644
--- a/docs/how_to/compile_models/from_keras.html
+++ b/docs/how_to/compile_models/from_keras.html
@@ -506,7 +506,7 @@ Tensorflow is also required since it’s used as the default backend of keras.</
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Relay top-1 id: 285, class name: Egyptian cat
1/1 [==============================] - ETA: 0s
-1/1 [==============================] - 1s 925ms/step
+1/1 [==============================] - 1s 922ms/step
Keras top-1 id: 285, class name: Egyptian cat
</pre></div>
</div>
diff --git a/docs/how_to/compile_models/from_mxnet.html b/docs/how_to/compile_models/from_mxnet.html
index 55058e221c..21e458c273 100644
--- a/docs/how_to/compile_models/from_mxnet.html
+++ b/docs/how_to/compile_models/from_mxnet.html
@@ -439,7 +439,7 @@
<span class="nb">print</span><span class="p">(</span><span class="s2">"x"</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#tuple" title="builtins.tuple" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">x</span><span class="o">.</span><span class="n">shape</span></a><span class="p">)</span>
</pre></div>
</div>
-<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zipe3176ba3-0fe9-43f6-b092-3fe63054207d from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
+<img src="../../_images/sphx_glr_from_mxnet_001.png" srcset="../../_images/sphx_glr_from_mxnet_001.png" alt="from mxnet" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/resnet18_v1-a0666292.zip31278884-f624-4b28-a605-c808c85aa157 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1-a0666292.zip...
x (1, 3, 224, 224)
</pre></div>
</div>
diff --git a/docs/how_to/compile_models/from_oneflow.html b/docs/how_to/compile_models/from_oneflow.html
index c0d3ec10b8..cdf651da82 100644
--- a/docs/how_to/compile_models/from_oneflow.html
+++ b/docs/how_to/compile_models/from_oneflow.html
@@ -449,12 +449,13 @@ Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdo
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading: "https://oneflow-public.oss-cn-beijing.aliyuncs.com/model_zoo/flowvision/classification/ResNet/resnet18.zip" to /workspace/.oneflow/flowvision_cache/resnet18.zip
0%| | 0.00/41.5M [00:00<?, ?B/s]
- 19%|#9 | 7.99M/41.5M [00:00<00:00, 48.1MB/s]
- 39%|###8 | 16.0M/41.5M [00:00<00:00, 56.6MB/s]
- 58%|#####7 | 24.0M/41.5M [00:00<00:00, 54.2MB/s]
- 77%|#######7 | 32.0M/41.5M [00:00<00:00, 60.0MB/s]
- 91%|#########1| 37.9M/41.5M [00:00<00:00, 55.1MB/s]
-100%|##########| 41.5M/41.5M [00:00<00:00, 48.8MB/s]
+ 17%|#7 | 7.20M/41.5M [00:00<00:00, 75.4MB/s]
+ 35%|###4 | 14.4M/41.5M [00:00<00:00, 65.8MB/s]
+ 50%|####9 | 20.7M/41.5M [00:00<00:00, 60.0MB/s]
+ 64%|######3 | 26.5M/41.5M [00:00<00:00, 52.7MB/s]
+ 80%|#######9 | 33.0M/41.5M [00:00<00:00, 57.3MB/s]
+ 93%|#########3| 38.6M/41.5M [00:00<00:00, 44.8MB/s]
+100%|##########| 41.5M/41.5M [00:00<00:00, 51.9MB/s]
</pre></div>
</div>
</div>
diff --git a/docs/how_to/compile_models/from_pytorch.html b/docs/how_to/compile_models/from_pytorch.html
index 7a1c8b294f..a8fc53d948 100644
--- a/docs/how_to/compile_models/from_pytorch.html
+++ b/docs/how_to/compile_models/from_pytorch.html
@@ -432,11 +432,13 @@ be unstable.</p>
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /workspace/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
0%| | 0.00/44.7M [00:00<?, ?B/s]
- 21%|##1 | 9.53M/44.7M [00:00<00:00, 99.9MB/s]
- 43%|####2 | 19.1M/44.7M [00:00<00:00, 86.0MB/s]
- 72%|#######1 | 32.0M/44.7M [00:00<00:00, 97.2MB/s]
- 92%|#########2| 41.3M/44.7M [00:00<00:00, 94.7MB/s]
-100%|##########| 44.7M/44.7M [00:00<00:00, 93.2MB/s]
+ 18%|#7 | 7.99M/44.7M [00:00<00:00, 49.2MB/s]
+ 32%|###2 | 14.3M/44.7M [00:00<00:00, 35.4MB/s]
+ 40%|###9 | 17.8M/44.7M [00:00<00:00, 33.5MB/s]
+ 54%|#####3 | 24.0M/44.7M [00:00<00:00, 41.6MB/s]
+ 72%|#######1 | 32.0M/44.7M [00:00<00:00, 45.6MB/s]
+ 90%|########9 | 40.0M/44.7M [00:00<00:00, 52.3MB/s]
+100%|##########| 44.7M/44.7M [00:00<00:00, 50.1MB/s]
</pre></div>
</div>
</div>
diff --git a/docs/how_to/compile_models/from_tensorflow.html b/docs/how_to/compile_models/from_tensorflow.html
index 6eb1d27dc6..384188e3a5 100644
--- a/docs/how_to/compile_models/from_tensorflow.html
+++ b/docs/how_to/compile_models/from_tensorflow.html
@@ -649,7 +649,7 @@ banana (score = 0.00022)
desk (score = 0.00019)
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 20.985 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 20.604 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-compile-models-from-tensorflow-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7f1d3d1b878694c201c614c807cdebc8/from_tensorflow.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">from_tensorflow.py</span></code></a></p>
diff --git a/docs/how_to/compile_models/sg_execution_times.html b/docs/how_to/compile_models/sg_execution_times.html
index 46ec102960..9c8b7f3b39 100644
--- a/docs/how_to/compile_models/sg_execution_times.html
+++ b/docs/how_to/compile_models/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-compile-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>06:22.455</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
+<p><strong>06:17.784</strong> total execution time for <strong>how_to_compile_models</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 81%" />
@@ -349,43 +349,43 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="from_tensorflow.html#sphx-glr-how-to-compile-models-from-tensorflow-py"><span class="std std-ref">Compile Tensorflow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tensorflow.py</span></code>)</p></td>
-<td><p>01:20.985</p></td>
+<td><p>01:20.604</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_darknet.html#sphx-glr-how-to-compile-models-from-darknet-py"><span class="std std-ref">Compile YOLO-V2 and YOLO-V3 in DarkNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_darknet.py</span></code>)</p></td>
-<td><p>01:17.754</p></td>
+<td><p>01:16.940</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="from_paddle.html#sphx-glr-how-to-compile-models-from-paddle-py"><span class="std std-ref">Compile PaddlePaddle Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_paddle.py</span></code>)</p></td>
-<td><p>00:52.358</p></td>
+<td><p>00:51.291</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_oneflow.html#sphx-glr-how-to-compile-models-from-oneflow-py"><span class="std std-ref">Compile OneFlow Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_oneflow.py</span></code>)</p></td>
-<td><p>00:35.752</p></td>
+<td><p>00:35.095</p></td>
<td><p>0.0 MB</p></td>
</tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="from_mxnet.html#sphx-glr-how-to-compile-models-from-mxnet-py"><span class="std std-ref">Compile MXNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_mxnet.py</span></code>)</p></td>
-<td><p>00:30.291</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="from_coreml.html#sphx-glr-how-to-compile-models-from-coreml-py"><span class="std std-ref">Compile CoreML Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_coreml.py</span></code>)</p></td>
+<td><p>00:30.553</p></td>
<td><p>0.0 MB</p></td>
</tr>
-<tr class="row-even"><td><p><a class="reference internal" href="from_coreml.html#sphx-glr-how-to-compile-models-from-coreml-py"><span class="std std-ref">Compile CoreML Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_coreml.py</span></code>)</p></td>
-<td><p>00:30.194</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="from_mxnet.html#sphx-glr-how-to-compile-models-from-mxnet-py"><span class="std std-ref">Compile MXNet Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_mxnet.py</span></code>)</p></td>
+<td><p>00:29.430</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="from_tflite.html#sphx-glr-how-to-compile-models-from-tflite-py"><span class="std std-ref">Compile TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_tflite.py</span></code>)</p></td>
-<td><p>00:27.749</p></td>
+<td><p>00:26.452</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_pytorch.html#sphx-glr-how-to-compile-models-from-pytorch-py"><span class="std std-ref">Compile PyTorch Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_pytorch.py</span></code>)</p></td>
-<td><p>00:24.724</p></td>
+<td><p>00:24.396</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="from_keras.html#sphx-glr-how-to-compile-models-from-keras-py"><span class="std std-ref">Compile Keras Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_keras.py</span></code>)</p></td>
-<td><p>00:20.012</p></td>
+<td><p>00:20.355</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="from_onnx.html#sphx-glr-how-to-compile-models-from-onnx-py"><span class="std std-ref">Compile ONNX Models</span></a> (<code class="docutils literal notranslate"><span class="pre">from_onnx.py</span></code>)</p></td>
-<td><p>00:02.636</p></td>
+<td><p>00:02.670</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/deploy_models/deploy_model_on_adreno.html b/docs/how_to/deploy_models/deploy_model_on_adreno.html
index 62e47b15e7..bb8771c9ca 100644
--- a/docs/how_to/deploy_models/deploy_model_on_adreno.html
+++ b/docs/how_to/deploy_models/deploy_model_on_adreno.html
@@ -920,7 +920,7 @@ Top5 predictions:
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 2541.9046 2542.1988 2544.5307 2539.4577 1.8976
+ 2545.0897 2544.5280 2548.3965 2543.5946 1.3722
</pre></div>
</div>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-model-on-adreno-py">
diff --git a/docs/how_to/deploy_models/deploy_model_on_android.html b/docs/how_to/deploy_models/deploy_model_on_android.html
index 76d1d45ab8..b8c21917c2 100644
--- a/docs/how_to/deploy_models/deploy_model_on_android.html
+++ b/docs/how_to/deploy_models/deploy_model_on_android.html
@@ -662,7 +662,7 @@ to the remote android device.</p>
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 16.0990 16.0548 16.5096 15.7118 0.2785
+ 16.0066 16.0084 16.1286 15.8646 0.0826
</pre></div>
</div>
</div>
diff --git a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
index d2ae7769a5..50407f136f 100644
--- a/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
+++ b/docs/how_to/deploy_models/deploy_object_detection_pytorch.html
@@ -454,30 +454,21 @@ be unstable.</p>
Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /workspace/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
0%| | 0.00/170M [00:00<?, ?B/s]
- 3%|3 | 5.18M/170M [00:00<00:03, 49.3MB/s]
- 6%|5 | 9.89M/170M [00:00<00:04, 33.6MB/s]
- 9%|9 | 16.0M/170M [00:00<00:04, 33.9MB/s]
- 14%|#4 | 24.0M/170M [00:00<00:04, 30.9MB/s]
- 19%|#8 | 31.7M/170M [00:00<00:03, 36.7MB/s]
- 21%|## | 35.4M/170M [00:01<00:06, 20.5MB/s]
- 24%|##3 | 40.0M/170M [00:01<00:06, 22.6MB/s]
- 28%|##8 | 48.0M/170M [00:01<00:04, 30.3MB/s]
- 34%|###3 | 57.4M/170M [00:01<00:02, 42.2MB/s]
- 38%|###7 | 64.0M/170M [00:01<00:02, 46.6MB/s]
- 42%|####2 | 72.0M/170M [00:02<00:02, 45.7MB/s]
- 48%|####7 | 81.4M/170M [00:02<00:01, 56.7MB/s]
- 52%|#####1 | 88.0M/170M [00:02<00:01, 53.2MB/s]
- 55%|#####5 | 93.8M/170M [00:02<00:01, 48.6MB/s]
- 58%|#####8 | 99.0M/170M [00:02<00:01, 45.8MB/s]
- 61%|######1 | 104M/170M [00:02<00:01, 43.6MB/s]
- 66%|######5 | 112M/170M [00:02<00:01, 45.1MB/s]
- 71%|####### | 120M/170M [00:03<00:00, 52.7MB/s]
- 75%|#######5 | 128M/170M [00:03<00:00, 55.6MB/s]
- 80%|######## | 136M/170M [00:03<00:00, 55.8MB/s]
- 86%|########5 | 146M/170M [00:03<00:00, 66.6MB/s]
- 91%|######### | 154M/170M [00:03<00:00, 67.1MB/s]
- 95%|#########4| 161M/170M [00:03<00:00, 55.1MB/s]
-100%|##########| 170M/170M [00:03<00:00, 45.7MB/s]
+ 7%|6 | 11.7M/170M [00:00<00:01, 123MB/s]
+ 14%|#3 | 23.5M/170M [00:00<00:01, 123MB/s]
+ 21%|## | 35.3M/170M [00:00<00:01, 83.0MB/s]
+ 28%|##8 | 48.0M/170M [00:00<00:01, 89.2MB/s]
+ 37%|###7 | 63.7M/170M [00:00<00:01, 110MB/s]
+ 44%|####4 | 75.2M/170M [00:00<00:00, 102MB/s]
+ 52%|#####1 | 88.0M/170M [00:00<00:00, 96.3MB/s]
+ 59%|#####9 | 100M/170M [00:01<00:00, 105MB/s]
+ 66%|######5 | 112M/170M [00:01<00:00, 100MB/s]
+ 72%|#######2 | 123M/170M [00:01<00:00, 98.6MB/s]
+ 80%|######## | 136M/170M [00:01<00:00, 99.1MB/s]
+ 87%|########7 | 148M/170M [00:01<00:00, 106MB/s]
+ 94%|#########3| 159M/170M [00:01<00:00, 108MB/s]
+100%|#########9| 169M/170M [00:01<00:00, 98.4MB/s]
+100%|##########| 170M/170M [00:01<00:00, 101MB/s]
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torch/nn/functional.py:3897: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
for i in range(dim)
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/torchvision/models/detection/anchor_utils.py:124: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode=& [...]
@@ -575,7 +566,7 @@ torchvision rcnn models.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Get 9 valid boxes
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes 24.958 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes 27.556 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-object-detection-pytorch-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7795da4b258c8feff986668b95ef57ad/deploy_object_detection_pytorch.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_object_detection_pytorch.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized.html b/docs/how_to/deploy_models/deploy_prequantized.html
index 713cd8f698..1f23c3864f 100644
--- a/docs/how_to/deploy_models/deploy_prequantized.html
+++ b/docs/how_to/deploy_models/deploy_prequantized.html
@@ -495,8 +495,8 @@ training. Other models require a full post training calibration.</p>
Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
0%| | 0.00/13.6M [00:00<?, ?B/s]
- 59%|#####8 | 7.99M/13.6M [00:00<00:00, 61.5MB/s]
-100%|##########| 13.6M/13.6M [00:00<00:00, 70.2MB/s]
+ 59%|#####8 | 7.99M/13.6M [00:00<00:00, 64.6MB/s]
+100%|##########| 13.6M/13.6M [00:00<00:00, 93.8MB/s]
</pre></div>
</div>
</div>
@@ -587,7 +587,7 @@ output values are identical out of 1000 outputs from mobilenet v2.</p>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 90.3890 90.2207 94.9386 89.9815 0.5996
+ 90.4616 90.3165 94.6651 90.0549 0.5140
</pre></div>
</div>
<div class="admonition note">
@@ -626,7 +626,7 @@ This includes support for the VNNI 8 bit dot product instruction (CascadeLake or
<div class="section" id="deploy-a-quantized-tflite-model">
<h2>Deploy a quantized TFLite Model<a class="headerlink" href="#deploy-a-quantized-tflite-model" title="Permalink to this headline">¶</a></h2>
<p>TODO</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 13.851 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 13.799 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/fb8217c13f4351224c6cf3aacf1a87fc/deploy_prequantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_prequantized_tflite.html b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
index 698283e47d..b3f30520dd 100644
--- a/docs/how_to/deploy_models/deploy_prequantized_tflite.html
+++ b/docs/how_to/deploy_models/deploy_prequantized_tflite.html
@@ -580,7 +580,7 @@ TFLite Top-5 labels: [387 102 386 341 349]
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 118.6146 118.4709 122.3098 117.2891 0.7489
+ 119.6743 119.5340 125.3106 118.7616 0.7213
</pre></div>
</div>
<div class="admonition note">
@@ -608,7 +608,7 @@ network for ARM CPU</span></a>.</p></li>
</ul>
</div></blockquote>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 36.525 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 30.960 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-prequantized-tflite-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/56691c7a27d45da61d112276334640d3/deploy_prequantized_tflite.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_prequantized_tflite.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_quantized.html b/docs/how_to/deploy_models/deploy_quantized.html
index b8f895fb75..83083c6d2a 100644
--- a/docs/how_to/deploy_models/deploy_quantized.html
+++ b/docs/how_to/deploy_models/deploy_quantized.html
@@ -521,7 +521,7 @@ for calibration. But the accuracy might be impacted.</p>
DeprecationWarning,
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 43.203 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 40.857 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-quantized-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/7810ecf51bfc05f7d5e8a400ac3e815d/deploy_quantized.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_quantized.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
index a3efe31750..60fa01a271 100644
--- a/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
+++ b/docs/how_to/deploy_models/deploy_ssd_gluoncv.html
@@ -463,24 +463,22 @@ to your device.</p>
Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_512_resnet50_v1_voc-9c8b225a.zip...
0%| | 0/132723 [00:00<?, ?KB/s]
- 4%|4 | 5363/132723 [00:00<00:02, 53626.60KB/s]
- 9%|8 | 11773/132723 [00:00<00:02, 59781.93KB/s]
- 13%|#3 | 17752/132723 [00:00<00:02, 42451.97KB/s]
- 19%|#9 | 25526/132723 [00:00<00:01, 53620.40KB/s]
- 25%|##5 | 33283/132723 [00:00<00:01, 61077.20KB/s]
- 30%|### | 39857/132723 [00:00<00:01, 62164.31KB/s]
- 36%|###6 | 47815/132723 [00:00<00:01, 67461.80KB/s]
- 42%|####2 | 55744/132723 [00:00<00:01, 71044.07KB/s]
- 48%|####8 | 63714/132723 [00:00<00:00, 73657.84KB/s]
- 54%|#####4 | 71757/132723 [00:01<00:00, 75694.75KB/s]
- 60%|###### | 79750/132723 [00:01<00:00, 76966.32KB/s]
- 66%|######6 | 87810/132723 [00:01<00:00, 78055.76KB/s]
- 72%|#######2 | 95814/132723 [00:01<00:00, 78645.29KB/s]
- 78%|#######8 | 103809/132723 [00:01<00:00, 79029.89KB/s]
- 84%|########4 | 111789/132723 [00:01<00:00, 79257.25KB/s]
- 90%|######### | 119756/132723 [00:01<00:00, 79380.24KB/s]
- 96%|#########6| 127739/132723 [00:01<00:00, 79513.47KB/s]
-100%|##########| 132723/132723 [00:01<00:00, 71599.06KB/s]
+ 5%|5 | 7045/132723 [00:00<00:01, 70438.82KB/s]
+ 12%|#1 | 15891/132723 [00:00<00:01, 81035.00KB/s]
+ 19%|#8 | 24732/132723 [00:00<00:01, 84400.02KB/s]
+ 25%|##4 | 33173/132723 [00:00<00:01, 64674.57KB/s]
+ 32%|###1 | 41996/132723 [00:00<00:01, 71781.26KB/s]
+ 38%|###8 | 50852/132723 [00:00<00:01, 76831.11KB/s]
+ 45%|####4 | 59620/132723 [00:00<00:00, 80091.26KB/s]
+ 52%|#####1 | 68471/132723 [00:00<00:00, 82619.56KB/s]
+ 58%|#####8 | 77329/132723 [00:00<00:00, 84403.74KB/s]
+ 65%|######4 | 86215/132723 [00:01<00:00, 85740.83KB/s]
+ 72%|#######1 | 95059/132723 [00:01<00:00, 86548.74KB/s]
+ 78%|#######8 | 103969/132723 [00:01<00:00, 87310.83KB/s]
+ 85%|########5 | 112861/132723 [00:01<00:00, 87792.60KB/s]
+ 92%|#########1| 121707/132723 [00:01<00:00, 87990.67KB/s]
+ 98%|#########8| 130579/132723 [00:01<00:00, 88206.36KB/s]
+100%|##########| 132723/132723 [00:01<00:00, 82765.82KB/s]
</pre></div>
</div>
<p>Create TVM runtime and do inference
@@ -519,7 +517,7 @@ Downloading /workspace/.mxnet/models/ssd_512_resnet50_v1_voc-9c8b225a.zip from h
<span class="n">plt</span><span class="o">.</span><span class="n">show</span><span class="p">()</span>
</pre></div>
</div>
-<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes 31.458 seconds)</p>
+<img src="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" srcset="../../_images/sphx_glr_deploy_ssd_gluoncv_001.png" alt="deploy ssd gluoncv" class = "sphx-glr-single-img"/><p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes 35.155 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-deploy-models-deploy-ssd-gluoncv-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/cccb17d28e5e8b2e94ea8cd5ec59f6ed/deploy_ssd_gluoncv.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">deploy_ssd_gluoncv.py</span></code></a></p>
diff --git a/docs/how_to/deploy_models/sg_execution_times.html b/docs/how_to/deploy_models/sg_execution_times.html
index 57d32e9c97..9d237983d7 100644
--- a/docs/how_to/deploy_models/sg_execution_times.html
+++ b/docs/how_to/deploy_models/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-deploy-models-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>14:58.194</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
+<p><strong>14:56.410</strong> total execution time for <strong>how_to_deploy_models</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 86%" />
@@ -349,39 +349,39 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_ssd_gluoncv.html#sphx-glr-how-to-deploy-models-deploy-ssd-gluoncv-py"><span class="std std-ref">Deploy Single Shot Multibox Detector(SSD) model</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_ssd_gluoncv.py</span></code>)</p></td>
-<td><p>03:31.458</p></td>
+<td><p>03:35.155</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_object_detection_pytorch.html#sphx-glr-how-to-deploy-models-deploy-object-detection-pytorch-py"><span class="std std-ref">Compile PyTorch Object Detection Models</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_object_detection_pytorch.py</span></code>)</p></td>
-<td><p>03:24.958</p></td>
+<td><p>03:27.556</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized_tflite.html#sphx-glr-how-to-deploy-models-deploy-prequantized-tflite-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM - Part 3 (TFLite)</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized_tflite.py</span></code>)</p></td>
-<td><p>02:36.525</p></td>
+<td><p>02:30.960</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_quantized.html#sphx-glr-how-to-deploy-models-deploy-quantized-py"><span class="std std-ref">Deploy a Quantized Model on Cuda</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_quantized.py</span></code>)</p></td>
-<td><p>01:43.203</p></td>
+<td><p>01:40.857</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_prequantized.html#sphx-glr-how-to-deploy-models-deploy-prequantized-py"><span class="std std-ref">Deploy a Framework-prequantized Model with TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_prequantized.py</span></code>)</p></td>
-<td><p>01:13.851</p></td>
+<td><p>01:13.799</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_model_on_adreno.html#sphx-glr-how-to-deploy-models-deploy-model-on-adreno-py"><span class="std std-ref">Deploy the Pretrained Model on Adreno</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_adreno.py</span></code>)</p></td>
-<td><p>00:53.577</p></td>
+<td><p>00:53.705</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_model_on_android.html#sphx-glr-how-to-deploy-models-deploy-model-on-android-py"><span class="std std-ref">Deploy the Pretrained Model on Android</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_android.py</span></code>)</p></td>
-<td><p>00:40.151</p></td>
+<td><p>00:40.621</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_model_on_nano.html#sphx-glr-how-to-deploy-models-deploy-model-on-nano-py"><span class="std std-ref">Deploy the Pretrained Model on Jetson Nano</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_nano.py</span></code>)</p></td>
-<td><p>00:27.322</p></td>
+<td><p>00:27.093</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="deploy_model_on_rasp.html#sphx-glr-how-to-deploy-models-deploy-model-on-rasp-py"><span class="std std-ref">Deploy the Pretrained Model on Raspberry Pi</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_model_on_rasp.py</span></code>)</p></td>
-<td><p>00:27.143</p></td>
+<td><p>00:26.659</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="deploy_sparse.html#sphx-glr-how-to-deploy-models-deploy-sparse-py"><span class="std std-ref">Deploy a Hugging Face Pruned Model on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">deploy_sparse.py</span></code>)</p></td>
diff --git a/docs/how_to/extend_tvm/bring_your_own_datatypes.html b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
index 1565b2e223..08ea87dfd7 100644
--- a/docs/how_to/extend_tvm/bring_your_own_datatypes.html
+++ b/docs/how_to/extend_tvm/bring_your_own_datatypes.html
@@ -619,7 +619,7 @@ In this alpha state of the Bring Your Own Datatypes framework, we have not imple
<span class="n">module</span><span class="p">,</span> <a href="https://docs.python.org/3/library/stdtypes.html#dict" title="builtins.dict" class="sphx-glr-backref-module-builtins sphx-glr-backref-type-py-class sphx-glr-backref-instance"><span class="n">params</span></a> <span class="o">=</span> <span class="n">get_mobilenet</span><span class="p">()</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zip837d4979-2338-4b33-abd0-3108602ea2f1 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Downloading /workspace/.mxnet/models/mobilenet0.25-9f83e440.zipc284f576-75fc-4dd1-87fd-ddce03ce8b37 from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/mobilenet0.25-9f83e440.zip...
</pre></div>
</div>
<p>It’s easy to execute MobileNet with native TVM:</p>
diff --git a/docs/how_to/extend_tvm/sg_execution_times.html b/docs/how_to/extend_tvm/sg_execution_times.html
index d0d23f12fc..a5958e3edd 100644
--- a/docs/how_to/extend_tvm/sg_execution_times.html
+++ b/docs/how_to/extend_tvm/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-extend-tvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:52.320</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
+<p><strong>00:52.733</strong> total execution time for <strong>how_to_extend_tvm</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 84%" />
@@ -349,15 +349,15 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="bring_your_own_datatypes.html#sphx-glr-how-to-extend-tvm-bring-your-own-datatypes-py"><span class="std std-ref">Bring Your Own Datatypes to TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">bring_your_own_datatypes.py</span></code>)</p></td>
-<td><p>00:48.568</p></td>
+<td><p>00:49.000</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="use_pass_instrument.html#sphx-glr-how-to-extend-tvm-use-pass-instrument-py"><span class="std std-ref">How to Use TVM Pass Instrument</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_instrument.py</span></code>)</p></td>
-<td><p>00:02.702</p></td>
+<td><p>00:02.660</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="use_pass_infra.html#sphx-glr-how-to-extend-tvm-use-pass-infra-py"><span class="std std-ref">How to Use TVM Pass Infra</span></a> (<code class="docutils literal notranslate"><span class="pre">use_pass_infra.py</span></code>)</p></td>
-<td><p>00:01.043</p></td>
+<td><p>00:01.066</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="low_level_custom_pass.html#sphx-glr-how-to-extend-tvm-low-level-custom-pass-py"><span class="std std-ref">Writing a Customized Pass</span></a> (<code class="docutils literal notranslate"><span class="pre">low_level_custom_pass.py</span></code>)</p></td>
diff --git a/docs/how_to/extend_tvm/use_pass_instrument.html b/docs/how_to/extend_tvm/use_pass_instrument.html
index 8e4217ef4e..995ef2a82e 100644
--- a/docs/how_to/extend_tvm/use_pass_instrument.html
+++ b/docs/how_to/extend_tvm/use_pass_instrument.html
@@ -526,10 +526,10 @@ profile the execution time of each passes.</p>
</pre></div>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 20645us [20645us] (48.79%; 48.79%)
-FoldScaleAxis: 21669us [7us] (51.21%; 51.21%)
- FoldConstant: 21662us [1649us] (51.19%; 99.97%)
- InferType: 20013us [20013us] (47.30%; 92.39%)
+InferType: 21251us [21251us] (48.53%; 48.53%)
+FoldScaleAxis: 22538us [9us] (51.47%; 51.47%)
+ FoldConstant: 22529us [1710us] (51.45%; 99.96%)
+ InferType: 20819us [20819us] (47.54%; 92.41%)
</pre></div>
</div>
</div>
@@ -551,10 +551,10 @@ Refer to following sections and <a class="reference internal" href="../../refere
</pre></div>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Printing results of timing profile...
-InferType: 20069us [20069us] (48.20%; 48.20%)
-FoldScaleAxis: 21565us [5us] (51.80%; 51.80%)
- FoldConstant: 21560us [1675us] (51.78%; 99.98%)
- InferType: 19885us [19885us] (47.76%; 92.23%)
+InferType: 20727us [20727us] (48.14%; 48.14%)
+FoldScaleAxis: 22330us [6us] (51.86%; 51.86%)
+ FoldConstant: 22325us [1732us] (51.85%; 99.97%)
+ InferType: 20593us [20593us] (47.83%; 92.24%)
</pre></div>
</div>
<p>Register empty list to clear existing instruments.</p>
diff --git a/docs/how_to/optimize_operators/opt_conv_cuda.html b/docs/how_to/optimize_operators/opt_conv_cuda.html
index 981eb03b44..556caf14e7 100644
--- a/docs/how_to/optimize_operators/opt_conv_cuda.html
+++ b/docs/how_to/optimize_operators/opt_conv_cuda.html
@@ -575,7 +575,7 @@ latency of convolution.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Convolution: </span><span class="si">%f</span><span class="s2"> ms"</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">b</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">*</span> <span cl [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 54.177120 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Convolution: 39.232894 ms
</pre></div>
</div>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-optimize-operators-opt-conv-cuda-py">
diff --git a/docs/how_to/optimize_operators/opt_conv_tensorcore.html b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
index ef2348b23a..1ffceed553 100644
--- a/docs/how_to/optimize_operators/opt_conv_tensorcore.html
+++ b/docs/how_to/optimize_operators/opt_conv_tensorcore.html
@@ -861,7 +861,7 @@ be able to run on our build server</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"conv2d with tensor core: </span><span class="si">%f</span><span class="s2"> ms"</span> <span class="o">%</span> <span class="p">(</span><span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span> <span class="o">* [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 6.674224 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>conv2d with tensor core: 13.368719 ms
</pre></div>
</div>
</div>
diff --git a/docs/how_to/optimize_operators/opt_gemm.html b/docs/how_to/optimize_operators/opt_gemm.html
index 367d99b346..fe6ab00042 100644
--- a/docs/how_to/optimize_operators/opt_gemm.html
+++ b/docs/how_to/optimize_operators/opt_gemm.html
@@ -472,8 +472,8 @@ Then we write a baseline implementation, the simplest way to write a matrix mult
<span class="nb">print</span><span class="p">(</span><span class="s2">"Baseline: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.017826
-Baseline: 3.292331
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Numpy running time: 0.018515
+Baseline: 3.232754
</pre></div>
</div>
<p>In TVM, we can always inspect lower level IR to debug or optimize our schedule.
@@ -529,7 +529,7 @@ fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt1: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.305006
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt1: 0.304784
</pre></div>
</div>
<p>Here is the generated IR after blocking.</p>
@@ -583,7 +583,7 @@ vastly.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt2: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.339556
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt2: 0.335927
</pre></div>
</div>
<p>Here is the generated IR after vectorization.</p>
@@ -635,7 +635,7 @@ the access pattern for A matrix is more cache friendly.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt3: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.116267
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt3: 0.115428
</pre></div>
</div>
<p>Here is the generated IR after loop permutation.</p>
@@ -709,7 +709,7 @@ flattening.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt4: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.108496
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt4: 0.109501
</pre></div>
</div>
<p>Here is the generated IR after array packing.</p>
@@ -784,7 +784,7 @@ write to C when all the block results are ready.</p>
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt5: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">evaluator</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">c</span><span class="p">)</span><span class="o">.</span><span class="n">mean</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.111046
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt5: 0.111203
</pre></div>
</div>
<p>Here is the generated IR after blocking.</p>
@@ -861,7 +861,7 @@ class Module:
<span class="nb">print</span><span class="p">(</span><span class="s2">"Opt6: </span><span class="si">%f</span><span class="s2">"</span> <span class="o">%</span> <span class="n">opt6_time</span><span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.147583
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Opt6: 0.146464
</pre></div>
</div>
<p>Here is the generated IR after parallelization.</p>
diff --git a/docs/how_to/optimize_operators/sg_execution_times.html b/docs/how_to/optimize_operators/sg_execution_times.html
index 0b4fc4599f..beec46f0b6 100644
--- a/docs/how_to/optimize_operators/sg_execution_times.html
+++ b/docs/how_to/optimize_operators/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-optimize-operators-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:34.615</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
+<p><strong>00:34.352</strong> total execution time for <strong>how_to_optimize_operators</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 83%" />
@@ -349,15 +349,15 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="opt_gemm.html#sphx-glr-how-to-optimize-operators-opt-gemm-py"><span class="std std-ref">How to optimize GEMM on CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_gemm.py</span></code>)</p></td>
-<td><p>00:32.086</p></td>
+<td><p>00:31.727</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="opt_conv_tensorcore.html#sphx-glr-how-to-optimize-operators-opt-conv-tensorcore-py"><span class="std std-ref">How to optimize convolution using TensorCores</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_tensorcore.py</span></code>)</p></td>
-<td><p>00:01.443</p></td>
+<td><p>00:01.563</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="opt_conv_cuda.html#sphx-glr-how-to-optimize-operators-opt-conv-cuda-py"><span class="std std-ref">How to optimize convolution on GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">opt_conv_cuda.py</span></code>)</p></td>
-<td><p>00:01.086</p></td>
+<td><p>00:01.062</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
index 9572d37c7b..2e8fe861eb 100644
--- a/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
+++ b/docs/how_to/tune_with_autoscheduler/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-tune-with-autoscheduler-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>09:18.552</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
+<p><strong>09:28.269</strong> total execution time for <strong>how_to_tune_with_autoscheduler</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 85%" />
@@ -349,27 +349,27 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_layer_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py"><span class="std std-ref">Auto-scheduling a Convolution Layer for GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_layer_cuda.py</span></code>)</p></td>
-<td><p>05:38.300</p></td>
+<td><p>05:34.610</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_network_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-x86-py"><span class="std std-ref">Auto-scheduling a Neural Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_x86.py</span></code>)</p></td>
-<td><p>01:39.258</p></td>
+<td><p>01:39.417</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_network_cuda.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-cuda-py"><span class="std std-ref">Auto-scheduling a Neural Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_cuda.py</span></code>)</p></td>
-<td><p>01:05.499</p></td>
+<td><p>01:06.112</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_sparse_x86.html#sphx-glr-how-to-tune-with-autoscheduler-tune-sparse-x86-py"><span class="std std-ref">Auto-scheduling Sparse Matrix Multiplication on CPU with Custom Sketch Rule</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_sparse_x86.py</span></code>)</p></td>
-<td><p>00:28.685</p></td>
+<td><p>00:40.800</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_network_arm.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-arm-py"><span class="std std-ref">Auto-scheduling a Neural Network for ARM CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_arm.py</span></code>)</p></td>
-<td><p>00:13.915</p></td>
+<td><p>00:14.207</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_network_mali.html#sphx-glr-how-to-tune-with-autoscheduler-tune-network-mali-py"><span class="std std-ref">Auto-scheduling a Neural Network for mali GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_network_mali.py</span></code>)</p></td>
-<td><p>00:12.895</p></td>
+<td><p>00:13.123</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
index 93eb795d33..658fef7c52 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html
@@ -503,481 +503,356 @@ class Module:
def main(data: T.Buffer((1, 512, 7, 7), "float32"), kernel: T.Buffer((512, 512, 3, 3), "float32"), bias: T.Buffer((1, 512, 1, 1), "float32"), compute: T.Buffer((1, 512, 7, 7), "float32")):
T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
blockIdx_x = T.env_thread("blockIdx.x")
- T.launch_thread(blockIdx_x, 28)
+ T.launch_thread(blockIdx_x, 32)
conv2d_nchw = T.allocate([14], "float32", "local")
- pad_temp_shared = T.allocate([72], "float32", "shared")
- kernel_shared = T.allocate([3072], "float32", "shared")
+ pad_temp_shared = T.allocate([1008], "float32", "shared")
+ kernel_shared = T.allocate([768], "float32", "shared")
threadIdx_x = T.env_thread("threadIdx.x")
- T.launch_thread(threadIdx_x, 64)
- conv2d_nchw_1 = T.Buffer((14,), data=conv2d_nchw, scope="local", align=32)
+ T.launch_thread(threadIdx_x, 56)
+ conv2d_nchw_1 = T.Buffer((4,), data=conv2d_nchw, scope="local", align=8)
conv2d_nchw_1[0] = T.float32(0)
- conv2d_nchw_1[1] = T.float32(0)
conv2d_nchw_1[2] = T.float32(0)
- conv2d_nchw_1[3] = T.float32(0)
conv2d_nchw_1[4] = T.float32(0)
- conv2d_nchw_1[5] = T.float32(0)
conv2d_nchw_1[6] = T.float32(0)
- conv2d_nchw_1[7] = T.float32(0)
conv2d_nchw_1[8] = T.float32(0)
- conv2d_nchw_1[9] = T.float32(0)
conv2d_nchw_1[10] = T.float32(0)
- conv2d_nchw_1[11] = T.float32(0)
conv2d_nchw_1[12] = T.float32(0)
+ conv2d_nchw_1[1] = T.float32(0)
+ conv2d_nchw_1[3] = T.float32(0)
+ conv2d_nchw_1[5] = T.float32(0)
+ conv2d_nchw_1[7] = T.float32(0)
+ conv2d_nchw_1[9] = T.float32(0)
+ conv2d_nchw_1[11] = T.float32(0)
conv2d_nchw_1[13] = T.float32(0)
- for rc_outer_outer, ry_outer_outer in T.grid(64, 3):
- cse_var_2: T.int32 = rc_outer_outer * 72
- cse_var_1: T.int32 = ry_outer_outer * 3
+ for rc_outer_outer, rx_outer_outer in T.grid(32, 3):
+ cse_var_1: T.int32 = rc_outer_outer * 144
threadIdx_x_1 = T.env_thread("threadIdx.x")
- pad_temp_shared_1 = T.Buffer((72,), data=pad_temp_shared, scope="shared")
- with T.launch_thread(threadIdx_x_1, 64):
+ pad_temp_shared_1 = T.Buffer((1008,), data=pad_temp_shared, scope="shared")
+ with T.launch_thread(threadIdx_x_1, 56):
data_1 = T.Buffer((25088,), data=data.data)
if T.likely(threadIdx_x_1 < 18):
- pad_temp_shared_1[threadIdx_x_1 * 4] = T.if_then_else(1 <= ry_outer_outer + blockIdx_x % 7 and ry_outer_outer + blockIdx_x % 7 < 8 and 1 <= threadIdx_x_1 * 4 % 9 and threadIdx_x_1 * 4 % 9 < 8, data_1[rc_outer_outer * 392 + threadIdx_x_1 * 4 // 9 * 49 + ry_outer_outer * 7 + blockIdx_x % 7 * 7 + threadIdx_x_1 * 4 % 9 - 8], T.float32(0))
+ pad_temp_shared_1[threadIdx_x_1 * 56] = T.if_then_else(1 <= threadIdx_x_1 * 8 % 9 and threadIdx_x_1 * 8 % 9 < 8 and 1 <= rx_outer_outer, data_1[rc_outer_outer * 784 + threadIdx_x_1 * 8 // 9 * 49 + threadIdx_x_1 * 8 % 9 * 7 + rx_outer_outer - 8], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 1] = T.if_then_else(1 <= threadIdx_x_1 * 8 % 9 and threadIdx_x_1 * 8 % 9 < 8, data_1[rc_outer_outer * 784 + threadIdx_x_1 * 8 // 9 * 49 + threadIdx_x_1 * 8 % 9 * 7 + rx_outer_outer - 7], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 2] = T.if_then_else(1 <= threadIdx_x_1 * 8 % 9 and threadIdx_x_1 * 8 % 9 < 8, data_1[rc_outer_outer * 784 + threadIdx_x_1 * 8 // 9 * 49 + threadIdx_x_1 * 8 % 9 * 7 + rx_outer_outer - 6], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 3] = T.if_then_else(1 <= threadIdx_x_1 * 8 % 9 and threadIdx_x_1 * 8 % 9 < 8, data_1[rc_outer_outer * 784 + threadIdx_x_1 * 8 // 9 * 49 + threadIdx_x_1 * 8 % 9 * 7 + rx_outer_outer - 5], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 4] = T.if_then_else(1 <= threadIdx_x_1 * 8 % 9 and threadIdx_x_1 * 8 % 9 < 8, data_1[rc_outer_outer * 784 + threadIdx_x_1 * 8 // 9 * 49 + threadIdx_x_1 * 8 % 9 * 7 + rx_outer_outer - 4], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 5] = T.if_then_else(1 <= threadIdx_x_1 * 8 % 9 and threadIdx_x_1 * 8 % 9 < 8, data_1[rc_outer_outer * 784 + threadIdx_x_1 * 8 // 9 * 49 + threadIdx_x_1 * 8 % 9 * 7 + rx_outer_outer - 3], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 6] = T.if_then_else(1 <= threadIdx_x_1 * 8 % 9 and threadIdx_x_1 * 8 % 9 < 8 and rx_outer_outer < 2, data_1[rc_outer_outer * 784 + threadIdx_x_1 * 8 // 9 * 49 + threadIdx_x_1 * 8 % 9 * 7 + rx_outer_outer - 2], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 7] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 1) % 9 and (threadIdx_x_1 * 8 + 1) % 9 < 8 and 1 <= rx_outer_outer, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 1) // 9 * 49 + (threadIdx_x_1 * 8 + 1) % 9 * 7 + rx_outer_outer - 8], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 8] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 1) % 9 and (threadIdx_x_1 * 8 + 1) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 1) // 9 * 49 + (threadIdx_x_1 * 8 + 1) % 9 * 7 + rx_outer_outer - 7], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 9] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 1) % 9 and (threadIdx_x_1 * 8 + 1) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 1) // 9 * 49 + (threadIdx_x_1 * 8 + 1) % 9 * 7 + rx_outer_outer - 6], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 10] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 1) % 9 and (threadIdx_x_1 * 8 + 1) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 1) // 9 * 49 + (threadIdx_x_1 * 8 + 1) % 9 * 7 + rx_outer_outer - 5], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 11] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 1) % 9 and (threadIdx_x_1 * 8 + 1) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 1) // 9 * 49 + (threadIdx_x_1 * 8 + 1) % 9 * 7 + rx_outer_outer - 4], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 12] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 1) % 9 and (threadIdx_x_1 * 8 + 1) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 1) // 9 * 49 + (threadIdx_x_1 * 8 + 1) % 9 * 7 + rx_outer_outer - 3], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 13] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 1) % 9 and (threadIdx_x_1 * 8 + 1) % 9 < 8 and rx_outer_outer < 2, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 1) // 9 * 49 + (threadIdx_x_1 * 8 + 1) % 9 * 7 + rx_outer_outer - 2], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 14] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 2) % 9 and (threadIdx_x_1 * 8 + 2) % 9 < 8 and 1 <= rx_outer_outer, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 2) // 9 * 49 + (threadIdx_x_1 * 8 + 2) % 9 * 7 + rx_outer_outer - 8], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 15] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 2) % 9 and (threadIdx_x_1 * 8 + 2) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 2) // 9 * 49 + (threadIdx_x_1 * 8 + 2) % 9 * 7 + rx_outer_outer - 7], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 16] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 2) % 9 and (threadIdx_x_1 * 8 + 2) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 2) // 9 * 49 + (threadIdx_x_1 * 8 + 2) % 9 * 7 + rx_outer_outer - 6], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 17] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 2) % 9 and (threadIdx_x_1 * 8 + 2) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 2) // 9 * 49 + (threadIdx_x_1 * 8 + 2) % 9 * 7 + rx_outer_outer - 5], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 18] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 2) % 9 and (threadIdx_x_1 * 8 + 2) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 2) // 9 * 49 + (threadIdx_x_1 * 8 + 2) % 9 * 7 + rx_outer_outer - 4], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 19] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 2) % 9 and (threadIdx_x_1 * 8 + 2) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 2) // 9 * 49 + (threadIdx_x_1 * 8 + 2) % 9 * 7 + rx_outer_outer - 3], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 20] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 2) % 9 and (threadIdx_x_1 * 8 + 2) % 9 < 8 and rx_outer_outer < 2, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 2) // 9 * 49 + (threadIdx_x_1 * 8 + 2) % 9 * 7 + rx_outer_outer - 2], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 21] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 3) % 9 and (threadIdx_x_1 * 8 + 3) % 9 < 8 and 1 <= rx_outer_outer, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 3) // 9 * 49 + (threadIdx_x_1 * 8 + 3) % 9 * 7 + rx_outer_outer - 8], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 22] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 3) % 9 and (threadIdx_x_1 * 8 + 3) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 3) // 9 * 49 + (threadIdx_x_1 * 8 + 3) % 9 * 7 + rx_outer_outer - 7], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 23] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 3) % 9 and (threadIdx_x_1 * 8 + 3) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 3) // 9 * 49 + (threadIdx_x_1 * 8 + 3) % 9 * 7 + rx_outer_outer - 6], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 24] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 3) % 9 and (threadIdx_x_1 * 8 + 3) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 3) // 9 * 49 + (threadIdx_x_1 * 8 + 3) % 9 * 7 + rx_outer_outer - 5], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 25] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 3) % 9 and (threadIdx_x_1 * 8 + 3) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 3) // 9 * 49 + (threadIdx_x_1 * 8 + 3) % 9 * 7 + rx_outer_outer - 4], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 26] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 3) % 9 and (threadIdx_x_1 * 8 + 3) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 3) // 9 * 49 + (threadIdx_x_1 * 8 + 3) % 9 * 7 + rx_outer_outer - 3], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 27] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 3) % 9 and (threadIdx_x_1 * 8 + 3) % 9 < 8 and rx_outer_outer < 2, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 3) // 9 * 49 + (threadIdx_x_1 * 8 + 3) % 9 * 7 + rx_outer_outer - 2], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 28] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 4) % 9 and (threadIdx_x_1 * 8 + 4) % 9 < 8 and 1 <= rx_outer_outer, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 4) // 9 * 49 + (threadIdx_x_1 * 8 + 4) % 9 * 7 + rx_outer_outer - 8], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 29] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 4) % 9 and (threadIdx_x_1 * 8 + 4) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 4) // 9 * 49 + (threadIdx_x_1 * 8 + 4) % 9 * 7 + rx_outer_outer - 7], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 30] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 4) % 9 and (threadIdx_x_1 * 8 + 4) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 4) // 9 * 49 + (threadIdx_x_1 * 8 + 4) % 9 * 7 + rx_outer_outer - 6], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 31] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 4) % 9 and (threadIdx_x_1 * 8 + 4) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 4) // 9 * 49 + (threadIdx_x_1 * 8 + 4) % 9 * 7 + rx_outer_outer - 5], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 32] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 4) % 9 and (threadIdx_x_1 * 8 + 4) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 4) // 9 * 49 + (threadIdx_x_1 * 8 + 4) % 9 * 7 + rx_outer_outer - 4], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 33] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 4) % 9 and (threadIdx_x_1 * 8 + 4) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 4) // 9 * 49 + (threadIdx_x_1 * 8 + 4) % 9 * 7 + rx_outer_outer - 3], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 34] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 4) % 9 and (threadIdx_x_1 * 8 + 4) % 9 < 8 and rx_outer_outer < 2, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 4) // 9 * 49 + (threadIdx_x_1 * 8 + 4) % 9 * 7 + rx_outer_outer - 2], T.float32(0))
if T.likely(threadIdx_x_1 < 18):
- pad_temp_shared_1[threadIdx_x_1 * 4 + 1] = T.if_then_else(1 <= ry_outer_outer + blockIdx_x % 7 and ry_outer_outer + blockIdx_x % 7 < 8 and 1 <= (threadIdx_x_1 * 4 + 1) % 9 and (threadIdx_x_1 * 4 + 1) % 9 < 8, data_1[rc_outer_outer * 392 + (threadIdx_x_1 * 4 + 1) // 9 * 49 + ry_outer_outer * 7 + blockIdx_x % 7 * 7 + (threadIdx_x_1 * 4 + 1) % 9 - 8], T.float32(0))
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 35] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 5) % 9 and (threadIdx_x_1 * 8 + 5) % 9 < 8 and 1 <= rx_outer_outer, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 5) // 9 * 49 + (threadIdx_x_1 * 8 + 5) % 9 * 7 + rx_outer_outer - 8], T.float32(0))
if T.likely(threadIdx_x_1 < 18):
- pad_temp_shared_1[threadIdx_x_1 * 4 + 2] = T.if_then_else(1 <= ry_outer_outer + blockIdx_x % 7 and ry_outer_outer + blockIdx_x % 7 < 8 and 1 <= (threadIdx_x_1 * 4 + 2) % 9 and (threadIdx_x_1 * 4 + 2) % 9 < 8, data_1[rc_outer_outer * 392 + (threadIdx_x_1 * 4 + 2) // 9 * 49 + ry_outer_outer * 7 + blockIdx_x % 7 * 7 + (threadIdx_x_1 * 4 + 2) % 9 - 8], T.float32(0))
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 36] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 5) % 9 and (threadIdx_x_1 * 8 + 5) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 5) // 9 * 49 + (threadIdx_x_1 * 8 + 5) % 9 * 7 + rx_outer_outer - 7], T.float32(0))
if T.likely(threadIdx_x_1 < 18):
- pad_temp_shared_1[threadIdx_x_1 * 4 + 3] = T.if_then_else(1 <= ry_outer_outer + blockIdx_x % 7 and ry_outer_outer + blockIdx_x % 7 < 8 and 1 <= (threadIdx_x_1 * 4 + 3) % 9 and (threadIdx_x_1 * 4 + 3) % 9 < 8, data_1[rc_outer_outer * 392 + (threadIdx_x_1 * 4 + 3) // 9 * 49 + ry_outer_outer * 7 + blockIdx_x % 7 * 7 + (threadIdx_x_1 * 4 + 3) % 9 - 8], T.float32(0))
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 37] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 5) % 9 and (threadIdx_x_1 * 8 + 5) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 5) // 9 * 49 + (threadIdx_x_1 * 8 + 5) % 9 * 7 + rx_outer_outer - 6], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 38] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 5) % 9 and (threadIdx_x_1 * 8 + 5) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 5) // 9 * 49 + (threadIdx_x_1 * 8 + 5) % 9 * 7 + rx_outer_outer - 5], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 39] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 5) % 9 and (threadIdx_x_1 * 8 + 5) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 5) // 9 * 49 + (threadIdx_x_1 * 8 + 5) % 9 * 7 + rx_outer_outer - 4], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 40] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 5) % 9 and (threadIdx_x_1 * 8 + 5) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 5) // 9 * 49 + (threadIdx_x_1 * 8 + 5) % 9 * 7 + rx_outer_outer - 3], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 41] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 5) % 9 and (threadIdx_x_1 * 8 + 5) % 9 < 8 and rx_outer_outer < 2, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 5) // 9 * 49 + (threadIdx_x_1 * 8 + 5) % 9 * 7 + rx_outer_outer - 2], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 42] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 6) % 9 and (threadIdx_x_1 * 8 + 6) % 9 < 8 and 1 <= rx_outer_outer, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 6) // 9 * 49 + (threadIdx_x_1 * 8 + 6) % 9 * 7 + rx_outer_outer - 8], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 43] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 6) % 9 and (threadIdx_x_1 * 8 + 6) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 6) // 9 * 49 + (threadIdx_x_1 * 8 + 6) % 9 * 7 + rx_outer_outer - 7], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 44] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 6) % 9 and (threadIdx_x_1 * 8 + 6) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 6) // 9 * 49 + (threadIdx_x_1 * 8 + 6) % 9 * 7 + rx_outer_outer - 6], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 45] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 6) % 9 and (threadIdx_x_1 * 8 + 6) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 6) // 9 * 49 + (threadIdx_x_1 * 8 + 6) % 9 * 7 + rx_outer_outer - 5], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 46] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 6) % 9 and (threadIdx_x_1 * 8 + 6) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 6) // 9 * 49 + (threadIdx_x_1 * 8 + 6) % 9 * 7 + rx_outer_outer - 4], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 47] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 6) % 9 and (threadIdx_x_1 * 8 + 6) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 6) // 9 * 49 + (threadIdx_x_1 * 8 + 6) % 9 * 7 + rx_outer_outer - 3], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 48] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 6) % 9 and (threadIdx_x_1 * 8 + 6) % 9 < 8 and rx_outer_outer < 2, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 6) // 9 * 49 + (threadIdx_x_1 * 8 + 6) % 9 * 7 + rx_outer_outer - 2], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 49] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 7) % 9 and (threadIdx_x_1 * 8 + 7) % 9 < 8 and 1 <= rx_outer_outer, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 7) // 9 * 49 + (threadIdx_x_1 * 8 + 7) % 9 * 7 + rx_outer_outer - 8], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 50] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 7) % 9 and (threadIdx_x_1 * 8 + 7) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 7) // 9 * 49 + (threadIdx_x_1 * 8 + 7) % 9 * 7 + rx_outer_outer - 7], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 51] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 7) % 9 and (threadIdx_x_1 * 8 + 7) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 7) // 9 * 49 + (threadIdx_x_1 * 8 + 7) % 9 * 7 + rx_outer_outer - 6], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 52] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 7) % 9 and (threadIdx_x_1 * 8 + 7) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 7) // 9 * 49 + (threadIdx_x_1 * 8 + 7) % 9 * 7 + rx_outer_outer - 5], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 53] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 7) % 9 and (threadIdx_x_1 * 8 + 7) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 7) // 9 * 49 + (threadIdx_x_1 * 8 + 7) % 9 * 7 + rx_outer_outer - 4], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 54] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 7) % 9 and (threadIdx_x_1 * 8 + 7) % 9 < 8, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 7) // 9 * 49 + (threadIdx_x_1 * 8 + 7) % 9 * 7 + rx_outer_outer - 3], T.float32(0))
+ if T.likely(threadIdx_x_1 < 18):
+ pad_temp_shared_1[threadIdx_x_1 * 56 + 55] = T.if_then_else(1 <= (threadIdx_x_1 * 8 + 7) % 9 and (threadIdx_x_1 * 8 + 7) % 9 < 8 and rx_outer_outer < 2, data_1[rc_outer_outer * 784 + (threadIdx_x_1 * 8 + 7) // 9 * 49 + (threadIdx_x_1 * 8 + 7) % 9 * 7 + rx_outer_outer - 2], T.float32(0))
threadIdx_x_2 = T.env_thread("threadIdx.x")
- kernel_shared_1 = T.Buffer((3072,), data=kernel_shared, scope="shared")
+ kernel_shared_1 = T.Buffer((768,), data=kernel_shared, scope="shared")
kernel_1 = T.Buffer((2359296,), data=kernel.data)
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 64] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 64) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 128] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 128) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 192] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 36864]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 256] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 256) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 320] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 320) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 384] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 73728]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 448] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 448) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 512] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 512) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 576] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 110592]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 640] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 640) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 704] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 704) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 768] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 147456]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 832] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 832) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 896] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 896) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 960] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 184320]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1024] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1024) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1088] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1088) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1152] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 221184]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1216] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1216) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1280] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1280) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1344] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 258048]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1408] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1408) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1472] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1472) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1536] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 294912]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1600] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1600) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1664] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1664) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1728] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 331776]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1792] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1792) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1856] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1856) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1920] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 368640]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 1984] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 1984) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2048] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2048) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2112] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 405504]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2176] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2176) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2240] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2240) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2304] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 442368]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2368] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2368) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2432] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2432) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2496] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 479232]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2560] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2560) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2624] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2624) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2688] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 516096]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2752] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2752) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2816] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2816) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2880] = kernel_1[blockIdx_x // 7 * 589824 + threadIdx_x_2 // 24 * 4608 + cse_var_2 + threadIdx_x_2 % 24 // 3 * 9 + cse_var_1 + threadIdx_x_2 % 3 + 552960]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 2944] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 2944) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 16) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 1) % 3]
- with T.launch_thread(threadIdx_x_2, 64):
- kernel_shared_1[threadIdx_x_2 + 3008] = kernel_1[blockIdx_x // 7 * 589824 + (threadIdx_x_2 + 3008) // 24 * 4608 + cse_var_2 + (threadIdx_x_2 + 8) % 24 // 3 * 9 + cse_var_1 + (threadIdx_x_2 + 2) % 3]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[0] * kernel_shared_1[threadIdx_x * 48]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[9] * kernel_shared_1[threadIdx_x * 48 + 3]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[1] * kernel_shared_1[threadIdx_x * 48]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[10] * kernel_shared_1[threadIdx_x * 48 + 3]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 3]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 3]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 3]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 3]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 3]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[0] * kernel_shared_1[threadIdx_x * 48 + 24]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[9] * kernel_shared_1[threadIdx_x * 48 + 27]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[1] * kernel_shared_1[threadIdx_x * 48 + 24]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[10] * kernel_shared_1[threadIdx_x * 48 + 27]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48 + 24]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 27]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48 + 24]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 27]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48 + 24]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 27]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48 + 24]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 27]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48 + 24]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 27]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[1] * kernel_shared_1[threadIdx_x * 48 + 1]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[10] * kernel_shared_1[threadIdx_x * 48 + 4]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48 + 1]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 4]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48 + 1]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 4]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48 + 1]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 4]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48 + 1]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 4]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48 + 1]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 4]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[7] * kernel_shared_1[threadIdx_x * 48 + 1]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[16] * kernel_shared_1[threadIdx_x * 48 + 4]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[1] * kernel_shared_1[threadIdx_x * 48 + 25]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[10] * kernel_shared_1[threadIdx_x * 48 + 28]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48 + 25]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 28]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48 + 25]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 28]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48 + 25]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 28]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48 + 25]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 28]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48 + 25]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 28]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[7] * kernel_shared_1[threadIdx_x * 48 + 25]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[16] * kernel_shared_1[threadIdx_x * 48 + 28]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48 + 2]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 5]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48 + 2]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 5]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48 + 2]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 5]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48 + 2]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 5]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48 + 2]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 5]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[7] * kernel_shared_1[threadIdx_x * 48 + 2]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[16] * kernel_shared_1[threadIdx_x * 48 + 5]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[8] * kernel_shared_1[threadIdx_x * 48 + 2]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[17] * kernel_shared_1[threadIdx_x * 48 + 5]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[2] * kernel_shared_1[threadIdx_x * 48 + 26]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[11] * kernel_shared_1[threadIdx_x * 48 + 29]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[3] * kernel_shared_1[threadIdx_x * 48 + 26]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[12] * kernel_shared_1[threadIdx_x * 48 + 29]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[4] * kernel_shared_1[threadIdx_x * 48 + 26]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[13] * kernel_shared_1[threadIdx_x * 48 + 29]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[5] * kernel_shared_1[threadIdx_x * 48 + 26]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[14] * kernel_shared_1[threadIdx_x * 48 + 29]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[6] * kernel_shared_1[threadIdx_x * 48 + 26]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[15] * kernel_shared_1[threadIdx_x * 48 + 29]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[7] * kernel_shared_1[threadIdx_x * 48 + 26]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[16] * kernel_shared_1[threadIdx_x * 48 + 29]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[8] * kernel_shared_1[threadIdx_x * 48 + 26]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[17] * kernel_shared_1[threadIdx_x * 48 + 29]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[18] * kernel_shared_1[threadIdx_x * 48 + 6]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[27] * kernel_shared_1[threadIdx_x * 48 + 9]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[19] * kernel_shared_1[threadIdx_x * 48 + 6]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[28] * kernel_shared_1[threadIdx_x * 48 + 9]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 6]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 9]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 6]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 9]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 6]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 9]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 6]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 9]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 6]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 9]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[18] * kernel_shared_1[threadIdx_x * 48 + 30]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[27] * kernel_shared_1[threadIdx_x * 48 + 33]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[19] * kernel_shared_1[threadIdx_x * 48 + 30]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[28] * kernel_shared_1[threadIdx_x * 48 + 33]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 30]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 33]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 30]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 33]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 30]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 33]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 30]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 33]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 30]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 33]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[19] * kernel_shared_1[threadIdx_x * 48 + 7]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[28] * kernel_shared_1[threadIdx_x * 48 + 10]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 7]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 10]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 7]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 10]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 7]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 10]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 7]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 10]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 7]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 10]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[25] * kernel_shared_1[threadIdx_x * 48 + 7]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[34] * kernel_shared_1[threadIdx_x * 48 + 10]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[19] * kernel_shared_1[threadIdx_x * 48 + 31]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[28] * kernel_shared_1[threadIdx_x * 48 + 34]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 31]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 34]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 31]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 34]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 31]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 34]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 31]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 34]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 31]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 34]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[25] * kernel_shared_1[threadIdx_x * 48 + 31]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[34] * kernel_shared_1[threadIdx_x * 48 + 34]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 8]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 11]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 8]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 11]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 8]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 11]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 8]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 11]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 8]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 11]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[25] * kernel_shared_1[threadIdx_x * 48 + 8]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[34] * kernel_shared_1[threadIdx_x * 48 + 11]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[26] * kernel_shared_1[threadIdx_x * 48 + 8]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[35] * kernel_shared_1[threadIdx_x * 48 + 11]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[20] * kernel_shared_1[threadIdx_x * 48 + 32]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[29] * kernel_shared_1[threadIdx_x * 48 + 35]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[21] * kernel_shared_1[threadIdx_x * 48 + 32]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[30] * kernel_shared_1[threadIdx_x * 48 + 35]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[22] * kernel_shared_1[threadIdx_x * 48 + 32]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[31] * kernel_shared_1[threadIdx_x * 48 + 35]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[23] * kernel_shared_1[threadIdx_x * 48 + 32]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[32] * kernel_shared_1[threadIdx_x * 48 + 35]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[24] * kernel_shared_1[threadIdx_x * 48 + 32]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[33] * kernel_shared_1[threadIdx_x * 48 + 35]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[25] * kernel_shared_1[threadIdx_x * 48 + 32]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[34] * kernel_shared_1[threadIdx_x * 48 + 35]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[26] * kernel_shared_1[threadIdx_x * 48 + 32]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[35] * kernel_shared_1[threadIdx_x * 48 + 35]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[36] * kernel_shared_1[threadIdx_x * 48 + 12]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[45] * kernel_shared_1[threadIdx_x * 48 + 15]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[37] * kernel_shared_1[threadIdx_x * 48 + 12]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[46] * kernel_shared_1[threadIdx_x * 48 + 15]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 12]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 15]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 12]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 15]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 12]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 15]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 12]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 15]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 12]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 15]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[36] * kernel_shared_1[threadIdx_x * 48 + 36]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[45] * kernel_shared_1[threadIdx_x * 48 + 39]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[37] * kernel_shared_1[threadIdx_x * 48 + 36]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[46] * kernel_shared_1[threadIdx_x * 48 + 39]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 36]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 39]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 36]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 39]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 36]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 39]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 36]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 39]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 36]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 39]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[37] * kernel_shared_1[threadIdx_x * 48 + 13]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[46] * kernel_shared_1[threadIdx_x * 48 + 16]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 13]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 16]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 13]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 16]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 13]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 16]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 13]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 16]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 13]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 16]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[43] * kernel_shared_1[threadIdx_x * 48 + 13]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[52] * kernel_shared_1[threadIdx_x * 48 + 16]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[37] * kernel_shared_1[threadIdx_x * 48 + 37]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[46] * kernel_shared_1[threadIdx_x * 48 + 40]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 37]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 40]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 37]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 40]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 37]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 40]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 37]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 40]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 37]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 40]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[43] * kernel_shared_1[threadIdx_x * 48 + 37]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[52] * kernel_shared_1[threadIdx_x * 48 + 40]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 14]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 17]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 14]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 17]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 14]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 17]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 14]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 17]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 14]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 17]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[43] * kernel_shared_1[threadIdx_x * 48 + 14]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[52] * kernel_shared_1[threadIdx_x * 48 + 17]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[44] * kernel_shared_1[threadIdx_x * 48 + 14]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[53] * kernel_shared_1[threadIdx_x * 48 + 17]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[38] * kernel_shared_1[threadIdx_x * 48 + 38]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[47] * kernel_shared_1[threadIdx_x * 48 + 41]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[39] * kernel_shared_1[threadIdx_x * 48 + 38]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[48] * kernel_shared_1[threadIdx_x * 48 + 41]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[40] * kernel_shared_1[threadIdx_x * 48 + 38]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[49] * kernel_shared_1[threadIdx_x * 48 + 41]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[41] * kernel_shared_1[threadIdx_x * 48 + 38]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[50] * kernel_shared_1[threadIdx_x * 48 + 41]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[42] * kernel_shared_1[threadIdx_x * 48 + 38]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[51] * kernel_shared_1[threadIdx_x * 48 + 41]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[43] * kernel_shared_1[threadIdx_x * 48 + 38]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[52] * kernel_shared_1[threadIdx_x * 48 + 41]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[44] * kernel_shared_1[threadIdx_x * 48 + 38]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[53] * kernel_shared_1[threadIdx_x * 48 + 41]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[54] * kernel_shared_1[threadIdx_x * 48 + 18]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[63] * kernel_shared_1[threadIdx_x * 48 + 21]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[55] * kernel_shared_1[threadIdx_x * 48 + 18]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[64] * kernel_shared_1[threadIdx_x * 48 + 21]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 18]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 21]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 18]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 21]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 18]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 21]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 18]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 21]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 18]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 21]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[54] * kernel_shared_1[threadIdx_x * 48 + 42]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[63] * kernel_shared_1[threadIdx_x * 48 + 45]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[55] * kernel_shared_1[threadIdx_x * 48 + 42]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[64] * kernel_shared_1[threadIdx_x * 48 + 45]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 42]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 45]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 42]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 45]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 42]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 45]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 42]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 45]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 42]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 45]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[55] * kernel_shared_1[threadIdx_x * 48 + 19]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[64] * kernel_shared_1[threadIdx_x * 48 + 22]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 19]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 22]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 19]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 22]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 19]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 22]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 19]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 22]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 19]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 22]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[61] * kernel_shared_1[threadIdx_x * 48 + 19]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[70] * kernel_shared_1[threadIdx_x * 48 + 22]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[55] * kernel_shared_1[threadIdx_x * 48 + 43]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[64] * kernel_shared_1[threadIdx_x * 48 + 46]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 43]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 46]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 43]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 46]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 43]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 46]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 43]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 46]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 43]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 46]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[61] * kernel_shared_1[threadIdx_x * 48 + 43]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[70] * kernel_shared_1[threadIdx_x * 48 + 46]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 20]
- conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 23]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 20]
- conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 23]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 20]
- conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 23]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 20]
- conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 23]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 20]
- conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 23]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[61] * kernel_shared_1[threadIdx_x * 48 + 20]
- conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[70] * kernel_shared_1[threadIdx_x * 48 + 23]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[62] * kernel_shared_1[threadIdx_x * 48 + 20]
- conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[71] * kernel_shared_1[threadIdx_x * 48 + 23]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[56] * kernel_shared_1[threadIdx_x * 48 + 44]
- conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[65] * kernel_shared_1[threadIdx_x * 48 + 47]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[57] * kernel_shared_1[threadIdx_x * 48 + 44]
- conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[66] * kernel_shared_1[threadIdx_x * 48 + 47]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[58] * kernel_shared_1[threadIdx_x * 48 + 44]
- conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[67] * kernel_shared_1[threadIdx_x * 48 + 47]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[59] * kernel_shared_1[threadIdx_x * 48 + 44]
- conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[68] * kernel_shared_1[threadIdx_x * 48 + 47]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[60] * kernel_shared_1[threadIdx_x * 48 + 44]
- conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[69] * kernel_shared_1[threadIdx_x * 48 + 47]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[61] * kernel_shared_1[threadIdx_x * 48 + 44]
- conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[70] * kernel_shared_1[threadIdx_x * 48 + 47]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[62] * kernel_shared_1[threadIdx_x * 48 + 44]
- conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[71] * kernel_shared_1[threadIdx_x * 48 + 47]
- for i1_inner, i3_inner in T.grid(2, 7):
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2] = kernel_1[blockIdx_x * 73728 + threadIdx_x_2 // 48 * 4608 + cse_var_1 + threadIdx_x_2 % 48 * 3 + rx_outer_outer]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 56] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 56) // 48 * 4608 + cse_var_1 + (threadIdx_x_2 + 8) % 48 // 3 * 9 + (threadIdx_x_2 + 2) % 3 * 3 + rx_outer_outer]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 112] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 112) // 48 * 4608 + cse_var_1 + (threadIdx_x_2 + 16) % 48 // 3 * 9 + (threadIdx_x_2 + 1) % 3 * 3 + rx_outer_outer]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 168] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 168) // 48 * 4608 + cse_var_1 + (threadIdx_x_2 // 3 + 8) % 16 * 9 + threadIdx_x_2 % 3 * 3 + rx_outer_outer]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 224] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 224) // 48 * 4608 + cse_var_1 + (threadIdx_x_2 + 32) % 48 // 3 * 9 + (threadIdx_x_2 + 2) % 3 * 3 + rx_outer_outer]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 280] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 280) // 48 * 4608 + cse_var_1 + (threadIdx_x_2 + 40) % 48 // 3 * 9 + (threadIdx_x_2 + 1) % 3 * 3 + rx_outer_outer]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 336] = kernel_1[blockIdx_x * 73728 + threadIdx_x_2 // 48 * 4608 + cse_var_1 + threadIdx_x_2 % 48 * 3 + rx_outer_outer + 32256]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 392] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 392) // 48 * 4608 + cse_var_1 + (threadIdx_x_2 + 8) % 48 // 3 * 9 + (threadIdx_x_2 + 2) % 3 * 3 + rx_outer_outer]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 448] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 448) // 48 * 4608 + cse_var_1 + (threadIdx_x_2 + 16) % 48 // 3 * 9 + (threadIdx_x_2 + 1) % 3 * 3 + rx_outer_outer]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 504] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 504) // 48 * 4608 + cse_var_1 + (threadIdx_x_2 // 3 + 8) % 16 * 9 + threadIdx_x_2 % 3 * 3 + rx_outer_outer]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 560] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 560) // 48 * 4608 + cse_var_1 + (threadIdx_x_2 + 32) % 48 // 3 * 9 + (threadIdx_x_2 + 2) % 3 * 3 + rx_outer_outer]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 616] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 616) // 48 * 4608 + cse_var_1 + (threadIdx_x_2 + 40) % 48 // 3 * 9 + (threadIdx_x_2 + 1) % 3 * 3 + rx_outer_outer]
+ with T.launch_thread(threadIdx_x_2, 56):
+ kernel_shared_1[threadIdx_x_2 + 672] = kernel_1[blockIdx_x * 73728 + threadIdx_x_2 // 48 * 4608 + cse_var_1 + threadIdx_x_2 % 48 * 3 + rx_outer_outer + 64512]
+ with T.launch_thread(threadIdx_x_2, 56):
+ if T.likely(threadIdx_x_2 < 40):
+ kernel_shared_1[threadIdx_x_2 + 728] = kernel_1[blockIdx_x * 73728 + (threadIdx_x_2 + 728) // 48 * 4608 + cse_var_1 + (threadIdx_x_2 + 8) % 48 // 3 * 9 + (threadIdx_x_2 + 2) % 3 * 3 + rx_outer_outer]
+ for rc_outer_inner in range(4):
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 1] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 2] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 3] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 4] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 5] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 6] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12]
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 63] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 3]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 64] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 3]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 65] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 3]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 66] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 3]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 67] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 3]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 68] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 3]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 69] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 3]
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 126] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 6]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 127] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 6]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 128] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 6]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 129] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 6]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 130] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 6]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 131] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 6]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 132] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 6]
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 189] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 9]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 190] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 9]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 191] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 9]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 192] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 9]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 193] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 9]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 194] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 9]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 195] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 9]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 48]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 1] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 48]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 2] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 48]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 3] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 48]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 4] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 48]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 5] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 48]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 6] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 48]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 63] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 51]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 64] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 51]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 65] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 51]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 66] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 51]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 67] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 51]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 68] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 51]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 69] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 51]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 126] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 54]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 127] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 54]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 128] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 54]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 129] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 54]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 130] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 54]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 131] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 54]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 132] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 54]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 189] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 57]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 190] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 57]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 191] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 57]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 192] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 57]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 193] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 57]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 194] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 57]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 195] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 57]
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 7] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 1]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 8] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 1]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 9] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 1]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 10] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 1]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 11] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 1]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 12] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 1]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 13] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 1]
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 70] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 4]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 71] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 4]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 72] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 4]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 73] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 4]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 74] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 4]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 75] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 4]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 76] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 4]
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 133] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 7]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 134] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 7]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 135] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 7]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 136] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 7]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 137] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 7]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 138] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 7]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 139] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 7]
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 196] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 10]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 197] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 10]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 198] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 10]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 199] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 10]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 200] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 10]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 201] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 10]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 202] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 10]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 7] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 49]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 8] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 49]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 9] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 49]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 10] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 49]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 11] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 49]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 12] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 49]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 13] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 49]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 70] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 52]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 71] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 52]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 72] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 52]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 73] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 52]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 74] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 52]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 75] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 52]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 76] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 52]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 133] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 55]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 134] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 55]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 135] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 55]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 136] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 55]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 137] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 55]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 138] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 55]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 139] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 55]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 196] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 58]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 197] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 58]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 198] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 58]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 199] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 58]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 200] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 58]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 201] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 58]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 202] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 58]
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 14] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 2]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 15] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 2]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 16] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 2]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 17] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 2]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 18] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 2]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 19] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 2]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 20] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 2]
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 77] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 5]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 78] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 5]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 79] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 5]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 80] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 5]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 81] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 5]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 82] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 5]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 83] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 5]
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 140] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 8]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 141] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 8]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 142] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 8]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 143] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 8]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 144] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 8]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 145] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 8]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 146] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 8]
+ conv2d_nchw_1[0] = conv2d_nchw_1[0] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 203] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 11]
+ conv2d_nchw_1[2] = conv2d_nchw_1[2] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 204] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 11]
+ conv2d_nchw_1[4] = conv2d_nchw_1[4] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 205] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 11]
+ conv2d_nchw_1[6] = conv2d_nchw_1[6] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 206] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 11]
+ conv2d_nchw_1[8] = conv2d_nchw_1[8] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 207] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 11]
+ conv2d_nchw_1[10] = conv2d_nchw_1[10] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 208] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 11]
+ conv2d_nchw_1[12] = conv2d_nchw_1[12] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 209] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 11]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 14] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 50]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 15] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 50]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 16] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 50]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 17] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 50]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 18] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 50]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 19] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 50]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 20] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 50]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 77] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 53]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 78] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 53]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 79] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 53]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 80] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 53]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 81] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 53]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 82] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 53]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 83] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 53]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 140] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 56]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 141] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 56]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 142] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 56]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 143] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 56]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 144] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 56]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 145] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 56]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 146] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 56]
+ conv2d_nchw_1[1] = conv2d_nchw_1[1] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 203] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 59]
+ conv2d_nchw_1[3] = conv2d_nchw_1[3] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 204] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 59]
+ conv2d_nchw_1[5] = conv2d_nchw_1[5] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 205] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 59]
+ conv2d_nchw_1[7] = conv2d_nchw_1[7] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 206] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 59]
+ conv2d_nchw_1[9] = conv2d_nchw_1[9] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 207] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 59]
+ conv2d_nchw_1[11] = conv2d_nchw_1[11] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 208] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 59]
+ conv2d_nchw_1[13] = conv2d_nchw_1[13] + pad_temp_shared_1[rc_outer_inner * 252 + threadIdx_x % 7 * 7 + 209] * kernel_shared_1[threadIdx_x // 7 * 96 + rc_outer_inner * 12 + 59]
+ for i1_inner in range(2):
compute_1 = T.Buffer((25088,), data=compute.data)
bias_1 = T.Buffer((512,), data=bias.data)
- compute_1[blockIdx_x // 7 * 6272 + threadIdx_x * 98 + i1_inner * 49 + blockIdx_x % 7 * 7 + i3_inner] = T.max(conv2d_nchw_1[i1_inner * 7 + i3_inner] + bias_1[blockIdx_x // 7 * 128 + threadIdx_x * 2 + i1_inner], T.float32(0))
+ compute_1[blockIdx_x * 784 + threadIdx_x // 7 * 98 + i1_inner * 49 + threadIdx_x % 7 * 7] = T.max(conv2d_nchw_1[i1_inner] + bias_1[blockIdx_x * 16 + threadIdx_x // 7 * 2 + i1_inner], T.float32(0))
+ compute_1[blockIdx_x * 784 + threadIdx_x // 7 * 98 + i1_inner * 49 + threadIdx_x % 7 * 7 + 1] = T.max(conv2d_nchw_1[i1_inner + 2] + bias_1[blockIdx_x * 16 + threadIdx_x // 7 * 2 + i1_inner], T.float32(0))
+ compute_1[blockIdx_x * 784 + threadIdx_x // 7 * 98 + i1_inner * 49 + threadIdx_x % 7 * 7 + 2] = T.max(conv2d_nchw_1[i1_inner + 4] + bias_1[blockIdx_x * 16 + threadIdx_x // 7 * 2 + i1_inner], T.float32(0))
+ compute_1[blockIdx_x * 784 + threadIdx_x // 7 * 98 + i1_inner * 49 + threadIdx_x % 7 * 7 + 3] = T.max(conv2d_nchw_1[i1_inner + 6] + bias_1[blockIdx_x * 16 + threadIdx_x // 7 * 2 + i1_inner], T.float32(0))
+ compute_1[blockIdx_x * 784 + threadIdx_x // 7 * 98 + i1_inner * 49 + threadIdx_x % 7 * 7 + 4] = T.max(conv2d_nchw_1[i1_inner + 8] + bias_1[blockIdx_x * 16 + threadIdx_x // 7 * 2 + i1_inner], T.float32(0))
+ compute_1[blockIdx_x * 784 + threadIdx_x // 7 * 98 + i1_inner * 49 + threadIdx_x % 7 * 7 + 5] = T.max(conv2d_nchw_1[i1_inner + 10] + bias_1[blockIdx_x * 16 + threadIdx_x // 7 * 2 + i1_inner], T.float32(0))
+ compute_1[blockIdx_x * 784 + threadIdx_x // 7 * 98 + i1_inner * 49 + threadIdx_x % 7 * 7 + 6] = T.max(conv2d_nchw_1[i1_inner + 12] + bias_1[blockIdx_x * 16 + threadIdx_x // 7 * 2 + i1_inner], T.float32(0))
</pre></div>
</div>
</div>
@@ -1011,7 +886,7 @@ class Module:
<span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.352 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 0.383 ms
</pre></div>
</div>
</div>
@@ -1042,35 +917,35 @@ conv2d_nchw_nn_o_o_o_i, conv2d_nchw_nn_o_o_i = s[conv2d_nchw].split(conv2d_nchw_
conv2d_nchw_nn_o_o_o_o, conv2d_nchw_nn_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_nn_o_o_o_i, factor=1)
conv2d_nchw_ff_o_i, conv2d_nchw_ff_i = s[conv2d_nchw].split(conv2d_nchw_ff, factor=1)
conv2d_nchw_ff_o_o_i, conv2d_nchw_ff_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_i, factor=2)
-conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=64)
+conv2d_nchw_ff_o_o_o_i, conv2d_nchw_ff_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_i, factor=8)
conv2d_nchw_ff_o_o_o_o, conv2d_nchw_ff_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_ff_o_o_o_i, factor=1)
conv2d_nchw_yy_o_i, conv2d_nchw_yy_i = s[conv2d_nchw].split(conv2d_nchw_yy, factor=1)
conv2d_nchw_yy_o_o_i, conv2d_nchw_yy_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_i, factor=1)
-conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=1)
+conv2d_nchw_yy_o_o_o_i, conv2d_nchw_yy_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_i, factor=7)
conv2d_nchw_yy_o_o_o_o, conv2d_nchw_yy_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_yy_o_o_o_i, factor=1)
conv2d_nchw_xx_o_i, conv2d_nchw_xx_i = s[conv2d_nchw].split(conv2d_nchw_xx, factor=1)
-conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=7)
+conv2d_nchw_xx_o_o_i, conv2d_nchw_xx_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_i, factor=1)
conv2d_nchw_xx_o_o_o_i, conv2d_nchw_xx_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_i, factor=1)
-conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=1)
-conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=2)
+conv2d_nchw_xx_o_o_o_o, conv2d_nchw_xx_o_o_o_i = s[conv2d_nchw].split(conv2d_nchw_xx_o_o_o_i, factor=7)
+conv2d_nchw_rc_o_i, conv2d_nchw_rc_i = s[conv2d_nchw].split(conv2d_nchw_rc, factor=4)
conv2d_nchw_rc_o_o, conv2d_nchw_rc_o_i = s[conv2d_nchw].split(conv2d_nchw_rc_o_i, factor=4)
conv2d_nchw_ry_o_i, conv2d_nchw_ry_i = s[conv2d_nchw].split(conv2d_nchw_ry, factor=1)
-conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=1)
+conv2d_nchw_ry_o_o, conv2d_nchw_ry_o_i = s[conv2d_nchw].split(conv2d_nchw_ry_o_i, factor=3)
conv2d_nchw_rx_o_i, conv2d_nchw_rx_i = s[conv2d_nchw].split(conv2d_nchw_rx, factor=1)
-conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=3)
+conv2d_nchw_rx_o_o, conv2d_nchw_rx_o_i = s[conv2d_nchw].split(conv2d_nchw_rx_o_i, factor=1)
s[conv2d_nchw].reorder(conv2d_nchw_nn_o_o_o_o, conv2d_nchw_ff_o_o_o_o, conv2d_nchw_yy_o_o_o_o, conv2d_nchw_xx_o_o_o_o, conv2d_nchw_nn_o_o_o_i, conv2d_nchw_ff_o_o_o_i, conv2d_nchw_yy_o_o_o_i, conv2d_nchw_xx_o_o_o_i, conv2d_nchw_nn_o_o_i, conv2d_nchw_ff_o_o_i, conv2d_nchw_yy_o_o_i, conv2d_nchw_xx_o_o_i, conv2d_nchw_rc_o_o, conv2d_nchw_ry_o_o, conv2d_nchw_rx_o_o, conv2d_nchw_rc_o_i, conv2d_nchw_ry_o_i, conv2d_nchw_rx_o_i, conv2d_nchw_nn_o_i, conv2d_nchw_ff_o_i, conv2d_nchw_yy_o_i, conv2d_nc [...]
compute_i0_o_i, compute_i0_i = s[compute].split(compute_i0, factor=1)
compute_i0_o_o_i, compute_i0_o_i = s[compute].split(compute_i0_o_i, factor=1)
compute_i0_o_o_o, compute_i0_o_o_i = s[compute].split(compute_i0_o_o_i, factor=1)
compute_i1_o_i, compute_i1_i = s[compute].split(compute_i1, factor=2)
-compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=64)
+compute_i1_o_o_i, compute_i1_o_i = s[compute].split(compute_i1_o_i, factor=8)
compute_i1_o_o_o, compute_i1_o_o_i = s[compute].split(compute_i1_o_o_i, factor=1)
compute_i2_o_i, compute_i2_i = s[compute].split(compute_i2, factor=1)
-compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=1)
+compute_i2_o_o_i, compute_i2_o_i = s[compute].split(compute_i2_o_i, factor=7)
compute_i2_o_o_o, compute_i2_o_o_i = s[compute].split(compute_i2_o_o_i, factor=1)
-compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=7)
+compute_i3_o_i, compute_i3_i = s[compute].split(compute_i3, factor=1)
compute_i3_o_o_i, compute_i3_o_i = s[compute].split(compute_i3_o_i, factor=1)
-compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=1)
+compute_i3_o_o_o, compute_i3_o_o_i = s[compute].split(compute_i3_o_o_i, factor=7)
s[compute].reorder(compute_i0_o_o_o, compute_i1_o_o_o, compute_i2_o_o_o, compute_i3_o_o_o, compute_i0_o_o_i, compute_i1_o_o_i, compute_i2_o_o_i, compute_i3_o_o_i, compute_i0_o_i, compute_i1_o_i, compute_i2_o_i, compute_i3_o_i, compute_i0_i, compute_i1_i, compute_i2_i, compute_i3_i)
s[conv2d_nchw].compute_at(s[compute], compute_i3_o_i)
kernel_shared = s.cache_read(kernel, "shared", [conv2d_nchw])
@@ -1089,12 +964,12 @@ s[compute].bind(compute_i0_o_i_i1_o_i_fused_i2_o_i_fused_i3_o_i_fused, te.thread
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[kernel_shared].fuse(kernel_shared_ax0, kernel_shared_ax1, kernel_shared_ax2, kernel_shared_ax3)
kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=1)
s[kernel_shared].vectorize(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[kernel_shared].split(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=56)
s[kernel_shared].bind(kernel_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused = s[pad_temp_shared].fuse(pad_temp_shared_ax0, pad_temp_shared_ax1, pad_temp_shared_ax2, pad_temp_shared_ax3)
-pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=4)
+pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused, factor=56)
s[pad_temp_shared].vectorize(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_i)
-pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=64)
+pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_o, pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i = s[pad_temp_shared].split(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o, factor=56)
s[pad_temp_shared].bind(pad_temp_shared_ax0_ax1_fused_ax2_fused_ax3_fused_o_i, te.thread_axis("threadIdx.x"))
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "auto_unroll_max_step", 512)
s[conv2d_nchw].pragma(conv2d_nchw_nn_o_o_o_o, "unroll_explicit", True)
@@ -1114,430 +989,392 @@ CUDA source code:
#define int64_t long long
#define uint64_t unsigned long long
#endif
-extern "C" __global__ void __launch_bounds__(64) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
+extern "C" __global__ void __launch_bounds__(56) default_function_kernel0(float* __restrict__ data, float* __restrict__ kernel, float* __restrict__ compute, float* __restrict__ bias) {
float conv2d_nchw[14];
- __shared__ float pad_temp_shared[72];
- __shared__ float kernel_shared[3072];
+ __shared__ float pad_temp_shared[1008];
+ __shared__ float kernel_shared[768];
conv2d_nchw[0] = 0.000000e+00f;
- conv2d_nchw[1] = 0.000000e+00f;
conv2d_nchw[2] = 0.000000e+00f;
- conv2d_nchw[3] = 0.000000e+00f;
conv2d_nchw[4] = 0.000000e+00f;
- conv2d_nchw[5] = 0.000000e+00f;
conv2d_nchw[6] = 0.000000e+00f;
- conv2d_nchw[7] = 0.000000e+00f;
conv2d_nchw[8] = 0.000000e+00f;
- conv2d_nchw[9] = 0.000000e+00f;
conv2d_nchw[10] = 0.000000e+00f;
- conv2d_nchw[11] = 0.000000e+00f;
conv2d_nchw[12] = 0.000000e+00f;
+ conv2d_nchw[1] = 0.000000e+00f;
+ conv2d_nchw[3] = 0.000000e+00f;
+ conv2d_nchw[5] = 0.000000e+00f;
+ conv2d_nchw[7] = 0.000000e+00f;
+ conv2d_nchw[9] = 0.000000e+00f;
+ conv2d_nchw[11] = 0.000000e+00f;
conv2d_nchw[13] = 0.000000e+00f;
- for (int rc_outer_outer = 0; rc_outer_outer < 64; ++rc_outer_outer) {
- for (int ry_outer_outer = 0; ry_outer_outer < 3; ++ry_outer_outer) {
+ for (int rc_outer_outer = 0; rc_outer_outer < 32; ++rc_outer_outer) {
+ for (int rx_outer_outer = 0; rx_outer_outer < 3; ++rx_outer_outer) {
__syncthreads();
if (((int)threadIdx.x) < 18) {
- pad_temp_shared[(((int)threadIdx.x) * 4)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= ((((int)threadIdx.x) * 4) % 9))) && (((((int)threadIdx.x) * 4) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + (((((int)threadIdx.x) * 4) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + ((((int)threadIdx.x) * 4) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[(((int)threadIdx.x) * 56)] = ((((1 <= ((((int)threadIdx.x) * 8) % 9)) && (((((int)threadIdx.x) * 8) % 9) < 8)) && (1 <= rx_outer_outer)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) * 8) / 9) * 49)) + (((((int)threadIdx.x) * 8) % 9) * 7)) + rx_outer_outer) - 8)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 1)] = (((1 <= ((((int)threadIdx.x) * 8) % 9)) && (((((int)threadIdx.x) * 8) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) * 8) / 9) * 49)) + (((((int)threadIdx.x) * 8) % 9) * 7)) + rx_outer_outer) - 7)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 2)] = (((1 <= ((((int)threadIdx.x) * 8) % 9)) && (((((int)threadIdx.x) * 8) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) * 8) / 9) * 49)) + (((((int)threadIdx.x) * 8) % 9) * 7)) + rx_outer_outer) - 6)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 3)] = (((1 <= ((((int)threadIdx.x) * 8) % 9)) && (((((int)threadIdx.x) * 8) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) * 8) / 9) * 49)) + (((((int)threadIdx.x) * 8) % 9) * 7)) + rx_outer_outer) - 5)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 4)] = (((1 <= ((((int)threadIdx.x) * 8) % 9)) && (((((int)threadIdx.x) * 8) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) * 8) / 9) * 49)) + (((((int)threadIdx.x) * 8) % 9) * 7)) + rx_outer_outer) - 4)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 5)] = (((1 <= ((((int)threadIdx.x) * 8) % 9)) && (((((int)threadIdx.x) * 8) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) * 8) / 9) * 49)) + (((((int)threadIdx.x) * 8) % 9) * 7)) + rx_outer_outer) - 3)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 6)] = ((((1 <= ((((int)threadIdx.x) * 8) % 9)) && (((((int)threadIdx.x) * 8) % 9) < 8)) && (rx_outer_outer < 2)) ? data[(((((rc_outer_outer * 784) + (((((int)threadIdx.x) * 8) / 9) * 49)) + (((((int)threadIdx.x) * 8) % 9) * 7)) + rx_outer_outer) - 2)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 7)] = ((((1 <= (((((int)threadIdx.x) * 8) + 1) % 9)) && ((((((int)threadIdx.x) * 8) + 1) % 9) < 8)) && (1 <= rx_outer_outer)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 1) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 1) % 9) * 7)) + rx_outer_outer) - 8)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 8)] = (((1 <= (((((int)threadIdx.x) * 8) + 1) % 9)) && ((((((int)threadIdx.x) * 8) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 1) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 1) % 9) * 7)) + rx_outer_outer) - 7)] : 0.000000e+00f);
}
if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 1)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 1) % 9))) && ((((((int)threadIdx.x) * 4) + 1) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 1) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 1) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 9)] = (((1 <= (((((int)threadIdx.x) * 8) + 1) % 9)) && ((((((int)threadIdx.x) * 8) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 1) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 1) % 9) * 7)) + rx_outer_outer) - 6)] : 0.000000e+00f);
}
if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 2)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 2) % 9))) && ((((((int)threadIdx.x) * 4) + 2) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 2) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 2) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 10)] = (((1 <= (((((int)threadIdx.x) * 8) + 1) % 9)) && ((((((int)threadIdx.x) * 8) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 1) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 1) % 9) * 7)) + rx_outer_outer) - 5)] : 0.000000e+00f);
}
if (((int)threadIdx.x) < 18) {
- pad_temp_shared[((((int)threadIdx.x) * 4) + 3)] = (((((1 <= (ry_outer_outer + (((int)blockIdx.x) % 7))) && ((ry_outer_outer + (((int)blockIdx.x) % 7)) < 8)) && (1 <= (((((int)threadIdx.x) * 4) + 3) % 9))) && ((((((int)threadIdx.x) * 4) + 3) % 9) < 8)) ? data[((((((rc_outer_outer * 392) + ((((((int)threadIdx.x) * 4) + 3) / 9) * 49)) + (ry_outer_outer * 7)) + ((((int)blockIdx.x) % 7) * 7)) + (((((int)threadIdx.x) * 4) + 3) % 9)) - 8)] : 0.000000e+00f);
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 11)] = (((1 <= (((((int)threadIdx.x) * 8) + 1) % 9)) && ((((((int)threadIdx.x) * 8) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 1) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 1) % 9) * 7)) + rx_outer_outer) - 4)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 12)] = (((1 <= (((((int)threadIdx.x) * 8) + 1) % 9)) && ((((((int)threadIdx.x) * 8) + 1) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 1) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 1) % 9) * 7)) + rx_outer_outer) - 3)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 13)] = ((((1 <= (((((int)threadIdx.x) * 8) + 1) % 9)) && ((((((int)threadIdx.x) * 8) + 1) % 9) < 8)) && (rx_outer_outer < 2)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 1) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 1) % 9) * 7)) + rx_outer_outer) - 2)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 14)] = ((((1 <= (((((int)threadIdx.x) * 8) + 2) % 9)) && ((((((int)threadIdx.x) * 8) + 2) % 9) < 8)) && (1 <= rx_outer_outer)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 2) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 2) % 9) * 7)) + rx_outer_outer) - 8)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 15)] = (((1 <= (((((int)threadIdx.x) * 8) + 2) % 9)) && ((((((int)threadIdx.x) * 8) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 2) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 2) % 9) * 7)) + rx_outer_outer) - 7)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 16)] = (((1 <= (((((int)threadIdx.x) * 8) + 2) % 9)) && ((((((int)threadIdx.x) * 8) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 2) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 2) % 9) * 7)) + rx_outer_outer) - 6)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 17)] = (((1 <= (((((int)threadIdx.x) * 8) + 2) % 9)) && ((((((int)threadIdx.x) * 8) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 2) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 2) % 9) * 7)) + rx_outer_outer) - 5)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 18)] = (((1 <= (((((int)threadIdx.x) * 8) + 2) % 9)) && ((((((int)threadIdx.x) * 8) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 2) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 2) % 9) * 7)) + rx_outer_outer) - 4)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 19)] = (((1 <= (((((int)threadIdx.x) * 8) + 2) % 9)) && ((((((int)threadIdx.x) * 8) + 2) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 2) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 2) % 9) * 7)) + rx_outer_outer) - 3)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 20)] = ((((1 <= (((((int)threadIdx.x) * 8) + 2) % 9)) && ((((((int)threadIdx.x) * 8) + 2) % 9) < 8)) && (rx_outer_outer < 2)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 2) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 2) % 9) * 7)) + rx_outer_outer) - 2)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 21)] = ((((1 <= (((((int)threadIdx.x) * 8) + 3) % 9)) && ((((((int)threadIdx.x) * 8) + 3) % 9) < 8)) && (1 <= rx_outer_outer)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 3) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 3) % 9) * 7)) + rx_outer_outer) - 8)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 22)] = (((1 <= (((((int)threadIdx.x) * 8) + 3) % 9)) && ((((((int)threadIdx.x) * 8) + 3) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 3) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 3) % 9) * 7)) + rx_outer_outer) - 7)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 23)] = (((1 <= (((((int)threadIdx.x) * 8) + 3) % 9)) && ((((((int)threadIdx.x) * 8) + 3) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 3) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 3) % 9) * 7)) + rx_outer_outer) - 6)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 24)] = (((1 <= (((((int)threadIdx.x) * 8) + 3) % 9)) && ((((((int)threadIdx.x) * 8) + 3) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 3) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 3) % 9) * 7)) + rx_outer_outer) - 5)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 25)] = (((1 <= (((((int)threadIdx.x) * 8) + 3) % 9)) && ((((((int)threadIdx.x) * 8) + 3) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 3) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 3) % 9) * 7)) + rx_outer_outer) - 4)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 26)] = (((1 <= (((((int)threadIdx.x) * 8) + 3) % 9)) && ((((((int)threadIdx.x) * 8) + 3) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 3) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 3) % 9) * 7)) + rx_outer_outer) - 3)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 27)] = ((((1 <= (((((int)threadIdx.x) * 8) + 3) % 9)) && ((((((int)threadIdx.x) * 8) + 3) % 9) < 8)) && (rx_outer_outer < 2)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 3) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 3) % 9) * 7)) + rx_outer_outer) - 2)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 28)] = ((((1 <= (((((int)threadIdx.x) * 8) + 4) % 9)) && ((((((int)threadIdx.x) * 8) + 4) % 9) < 8)) && (1 <= rx_outer_outer)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 4) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 4) % 9) * 7)) + rx_outer_outer) - 8)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 29)] = (((1 <= (((((int)threadIdx.x) * 8) + 4) % 9)) && ((((((int)threadIdx.x) * 8) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 4) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 4) % 9) * 7)) + rx_outer_outer) - 7)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 30)] = (((1 <= (((((int)threadIdx.x) * 8) + 4) % 9)) && ((((((int)threadIdx.x) * 8) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 4) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 4) % 9) * 7)) + rx_outer_outer) - 6)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 31)] = (((1 <= (((((int)threadIdx.x) * 8) + 4) % 9)) && ((((((int)threadIdx.x) * 8) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 4) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 4) % 9) * 7)) + rx_outer_outer) - 5)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 32)] = (((1 <= (((((int)threadIdx.x) * 8) + 4) % 9)) && ((((((int)threadIdx.x) * 8) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 4) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 4) % 9) * 7)) + rx_outer_outer) - 4)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 33)] = (((1 <= (((((int)threadIdx.x) * 8) + 4) % 9)) && ((((((int)threadIdx.x) * 8) + 4) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 4) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 4) % 9) * 7)) + rx_outer_outer) - 3)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 34)] = ((((1 <= (((((int)threadIdx.x) * 8) + 4) % 9)) && ((((((int)threadIdx.x) * 8) + 4) % 9) < 8)) && (rx_outer_outer < 2)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 4) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 4) % 9) * 7)) + rx_outer_outer) - 2)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 35)] = ((((1 <= (((((int)threadIdx.x) * 8) + 5) % 9)) && ((((((int)threadIdx.x) * 8) + 5) % 9) < 8)) && (1 <= rx_outer_outer)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 5) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 5) % 9) * 7)) + rx_outer_outer) - 8)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 36)] = (((1 <= (((((int)threadIdx.x) * 8) + 5) % 9)) && ((((((int)threadIdx.x) * 8) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 5) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 5) % 9) * 7)) + rx_outer_outer) - 7)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 37)] = (((1 <= (((((int)threadIdx.x) * 8) + 5) % 9)) && ((((((int)threadIdx.x) * 8) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 5) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 5) % 9) * 7)) + rx_outer_outer) - 6)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 38)] = (((1 <= (((((int)threadIdx.x) * 8) + 5) % 9)) && ((((((int)threadIdx.x) * 8) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 5) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 5) % 9) * 7)) + rx_outer_outer) - 5)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 39)] = (((1 <= (((((int)threadIdx.x) * 8) + 5) % 9)) && ((((((int)threadIdx.x) * 8) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 5) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 5) % 9) * 7)) + rx_outer_outer) - 4)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 40)] = (((1 <= (((((int)threadIdx.x) * 8) + 5) % 9)) && ((((((int)threadIdx.x) * 8) + 5) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 5) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 5) % 9) * 7)) + rx_outer_outer) - 3)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 41)] = ((((1 <= (((((int)threadIdx.x) * 8) + 5) % 9)) && ((((((int)threadIdx.x) * 8) + 5) % 9) < 8)) && (rx_outer_outer < 2)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 5) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 5) % 9) * 7)) + rx_outer_outer) - 2)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 42)] = ((((1 <= (((((int)threadIdx.x) * 8) + 6) % 9)) && ((((((int)threadIdx.x) * 8) + 6) % 9) < 8)) && (1 <= rx_outer_outer)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 6) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 6) % 9) * 7)) + rx_outer_outer) - 8)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 43)] = (((1 <= (((((int)threadIdx.x) * 8) + 6) % 9)) && ((((((int)threadIdx.x) * 8) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 6) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 6) % 9) * 7)) + rx_outer_outer) - 7)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 44)] = (((1 <= (((((int)threadIdx.x) * 8) + 6) % 9)) && ((((((int)threadIdx.x) * 8) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 6) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 6) % 9) * 7)) + rx_outer_outer) - 6)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 45)] = (((1 <= (((((int)threadIdx.x) * 8) + 6) % 9)) && ((((((int)threadIdx.x) * 8) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 6) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 6) % 9) * 7)) + rx_outer_outer) - 5)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 46)] = (((1 <= (((((int)threadIdx.x) * 8) + 6) % 9)) && ((((((int)threadIdx.x) * 8) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 6) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 6) % 9) * 7)) + rx_outer_outer) - 4)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 47)] = (((1 <= (((((int)threadIdx.x) * 8) + 6) % 9)) && ((((((int)threadIdx.x) * 8) + 6) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 6) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 6) % 9) * 7)) + rx_outer_outer) - 3)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 48)] = ((((1 <= (((((int)threadIdx.x) * 8) + 6) % 9)) && ((((((int)threadIdx.x) * 8) + 6) % 9) < 8)) && (rx_outer_outer < 2)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 6) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 6) % 9) * 7)) + rx_outer_outer) - 2)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 49)] = ((((1 <= (((((int)threadIdx.x) * 8) + 7) % 9)) && ((((((int)threadIdx.x) * 8) + 7) % 9) < 8)) && (1 <= rx_outer_outer)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 7) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 7) % 9) * 7)) + rx_outer_outer) - 8)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 50)] = (((1 <= (((((int)threadIdx.x) * 8) + 7) % 9)) && ((((((int)threadIdx.x) * 8) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 7) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 7) % 9) * 7)) + rx_outer_outer) - 7)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 51)] = (((1 <= (((((int)threadIdx.x) * 8) + 7) % 9)) && ((((((int)threadIdx.x) * 8) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 7) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 7) % 9) * 7)) + rx_outer_outer) - 6)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 52)] = (((1 <= (((((int)threadIdx.x) * 8) + 7) % 9)) && ((((((int)threadIdx.x) * 8) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 7) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 7) % 9) * 7)) + rx_outer_outer) - 5)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 53)] = (((1 <= (((((int)threadIdx.x) * 8) + 7) % 9)) && ((((((int)threadIdx.x) * 8) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 7) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 7) % 9) * 7)) + rx_outer_outer) - 4)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 54)] = (((1 <= (((((int)threadIdx.x) * 8) + 7) % 9)) && ((((((int)threadIdx.x) * 8) + 7) % 9) < 8)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 7) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 7) % 9) * 7)) + rx_outer_outer) - 3)] : 0.000000e+00f);
+ }
+ if (((int)threadIdx.x) < 18) {
+ pad_temp_shared[((((int)threadIdx.x) * 56) + 55)] = ((((1 <= (((((int)threadIdx.x) * 8) + 7) % 9)) && ((((((int)threadIdx.x) * 8) + 7) % 9) < 8)) && (rx_outer_outer < 2)) ? data[(((((rc_outer_outer * 784) + ((((((int)threadIdx.x) * 8) + 7) / 9) * 49)) + ((((((int)threadIdx.x) * 8) + 7) % 9) * 7)) + rx_outer_outer) - 2)] : 0.000000e+00f);
+ }
+ kernel_shared[((int)threadIdx.x)] = kernel[(((((((int)blockIdx.x) * 73728) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 48) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 56)] = kernel[((((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 56) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 8) % 48) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 112)] = kernel[((((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 112) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 16) % 48) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 168)] = kernel[((((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 168) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 8) & 15) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 224)] = kernel[((((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 224) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 32) % 48) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 280)] = kernel[((((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 280) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 40) % 48) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 336)] = kernel[((((((((int)blockIdx.x) * 73728) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 48) * 3)) + rx_outer_outer) + 32256)];
+ kernel_shared[(((int)threadIdx.x) + 392)] = kernel[((((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 392) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 8) % 48) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 448)] = kernel[((((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 448) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 16) % 48) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 504)] = kernel[((((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 504) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) / 3) + 8) & 15) * 9)) + ((((int)threadIdx.x) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 560)] = kernel[((((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 560) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 32) % 48) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 616)] = kernel[((((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 616) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((((int)threadIdx.x) + 40) % 48) / 3) * 9)) + (((((int)threadIdx.x) + 1) % 3) * 3)) + rx_outer_outer)];
+ kernel_shared[(((int)threadIdx.x) + 672)] = kernel[((((((((int)blockIdx.x) * 73728) + ((((int)threadIdx.x) / 48) * 4608)) + (rc_outer_outer * 144)) + ((((int)threadIdx.x) % 48) * 3)) + rx_outer_outer) + 64512)];
+ if (((int)threadIdx.x) < 40) {
+ kernel_shared[(((int)threadIdx.x) + 728)] = kernel[((((((((int)blockIdx.x) * 73728) + (((((int)threadIdx.x) + 728) / 48) * 4608)) + (rc_outer_outer * 144)) + (((((int)threadIdx.x) + 8) / 3) * 9)) + (((((int)threadIdx.x) + 2) % 3) * 3)) + rx_outer_outer)];
}
- kernel_shared[((int)threadIdx.x)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3))];
- kernel_shared[(((int)threadIdx.x) + 64)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 64) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 128)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 128) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 192)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 36864)];
- kernel_shared[(((int)threadIdx.x) + 256)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 256) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 320)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 320) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 384)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 73728)];
- kernel_shared[(((int)threadIdx.x) + 448)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 448) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 512)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 512) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 576)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 110592)];
- kernel_shared[(((int)threadIdx.x) + 640)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 640) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 704)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 704) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 768)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 147456)];
- kernel_shared[(((int)threadIdx.x) + 832)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 832) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 896)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 896) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 960)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 184320)];
- kernel_shared[(((int)threadIdx.x) + 1024)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1024) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1088)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1088) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1152)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 221184)];
- kernel_shared[(((int)threadIdx.x) + 1216)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1216) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1280)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1280) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1344)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 258048)];
- kernel_shared[(((int)threadIdx.x) + 1408)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1408) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1472)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1472) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1536)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 294912)];
- kernel_shared[(((int)threadIdx.x) + 1600)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1600) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1664)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1664) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1728)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 331776)];
- kernel_shared[(((int)threadIdx.x) + 1792)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1792) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1856)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1856) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 1920)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 368640)];
- kernel_shared[(((int)threadIdx.x) + 1984)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 1984) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2048)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2048) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2112)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 405504)];
- kernel_shared[(((int)threadIdx.x) + 2176)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2176) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2240)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2240) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2304)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 442368)];
- kernel_shared[(((int)threadIdx.x) + 2368)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2368) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2432)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2432) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2496)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 479232)];
- kernel_shared[(((int)threadIdx.x) + 2560)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2560) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2624)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2624) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2688)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 516096)];
- kernel_shared[(((int)threadIdx.x) + 2752)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2752) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2816)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2816) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
- kernel_shared[(((int)threadIdx.x) + 2880)] = kernel[((((((((((int)blockIdx.x) / 7) * 589824) + ((((int)threadIdx.x) / 24) * 4608)) + (rc_outer_outer * 72)) + (((((int)threadIdx.x) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + (((int)threadIdx.x) % 3)) + 552960)];
- kernel_shared[(((int)threadIdx.x) + 2944)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 2944) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 16) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 1) % 3))];
- kernel_shared[(((int)threadIdx.x) + 3008)] = kernel[(((((((((int)blockIdx.x) / 7) * 589824) + (((((int)threadIdx.x) + 3008) / 24) * 4608)) + (rc_outer_outer * 72)) + ((((((int)threadIdx.x) + 8) % 24) / 3) * 9)) + (ry_outer_outer * 3)) + ((((int)threadIdx.x) + 2) % 3))];
__syncthreads();
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[0] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[1] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[2] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[3] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[4] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[5] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[6] * kernel_shared[(((int)threadIdx.x) * 48)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 3)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[0] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[9] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 24)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 27)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 1)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 4)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[1] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[10] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 25)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 28)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 2)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 5)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[2] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[11] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[3] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[12] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[4] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[13] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[5] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[14] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[6] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[15] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[7] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[16] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[8] * kernel_shared[((((int)threadIdx.x) * 48) + 26)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[17] * kernel_shared[((((int)threadIdx.x) * 48) + 29)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 6)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 9)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[18] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[27] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 30)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 33)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 7)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 10)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[19] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[28] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 31)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 34)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 8)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 11)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[20] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[29] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[21] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[30] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[22] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[31] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[23] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[32] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[24] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[33] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[25] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[34] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[26] * kernel_shared[((((int)threadIdx.x) * 48) + 32)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[35] * kernel_shared[((((int)threadIdx.x) * 48) + 35)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 12)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 15)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[36] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[45] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 36)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 39)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 13)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 16)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[37] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[46] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 37)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 40)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 14)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 17)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[38] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[47] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[39] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[48] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[40] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[49] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[41] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[50] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[42] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[51] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[43] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[52] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[44] * kernel_shared[((((int)threadIdx.x) * 48) + 38)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[53] * kernel_shared[((((int)threadIdx.x) * 48) + 41)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 18)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 21)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[54] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[63] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 42)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 45)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 19)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 22)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[55] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[64] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 43)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 46)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 20)]));
- conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 23)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[56] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[65] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[57] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[66] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[58] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[67] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[59] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[68] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[60] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[69] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[61] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[70] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[62] * kernel_shared[((((int)threadIdx.x) * 48) + 44)]));
- conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[71] * kernel_shared[((((int)threadIdx.x) * 48) + 47)]));
+ for (int rc_outer_inner = 0; rc_outer_inner < 4; ++rc_outer_inner) {
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7))] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12))]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 1)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12))]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 2)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12))]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 3)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12))]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 4)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12))]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 5)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12))]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 6)] * kernel_shared[(((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12))]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 64)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 65)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 66)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 67)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 68)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 69)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 3)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 126)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 127)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 128)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 129)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 130)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 131)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 132)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 6)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 189)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 190)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 191)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 192)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 193)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 194)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 195)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 9)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7))] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 48)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 1)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 48)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 2)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 48)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 3)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 48)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 4)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 48)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 5)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 48)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 6)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 48)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 63)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 51)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 64)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 51)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 65)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 51)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 66)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 51)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 67)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 51)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 68)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 51)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 69)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 51)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 126)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 54)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 127)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 54)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 128)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 54)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 129)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 54)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 130)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 54)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 131)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 54)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 132)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 54)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 189)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 57)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 190)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 57)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 191)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 57)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 192)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 57)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 193)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 57)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 194)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 57)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 195)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 57)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 7)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 8)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 9)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 10)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 12)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 13)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 1)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 70)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 71)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 72)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 73)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 74)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 75)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 76)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 4)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 133)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 134)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 135)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 136)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 137)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 138)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 139)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 7)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 196)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 197)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 198)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 199)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 200)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 201)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 202)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 10)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 7)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 49)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 8)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 49)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 9)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 49)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 10)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 49)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 11)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 49)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 12)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 49)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 13)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 49)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 70)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 52)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 71)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 52)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 72)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 52)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 73)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 52)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 74)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 52)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 75)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 52)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 76)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 52)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 133)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 55)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 134)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 55)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 135)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 55)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 136)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 55)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 137)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 55)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 138)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 55)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 139)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 55)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 196)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 58)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 197)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 58)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 198)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 58)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 199)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 58)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 200)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 58)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 201)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 58)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 202)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 58)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 15)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 16)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 17)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 18)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 2)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 77)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 78)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 79)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 80)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 81)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 82)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 83)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 5)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 140)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 141)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 142)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 143)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 144)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 145)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 146)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 8)]));
+ conv2d_nchw[0] = (conv2d_nchw[0] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 203)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[2] = (conv2d_nchw[2] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 204)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[4] = (conv2d_nchw[4] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 205)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[6] = (conv2d_nchw[6] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 206)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[8] = (conv2d_nchw[8] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 207)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[10] = (conv2d_nchw[10] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 208)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[12] = (conv2d_nchw[12] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 209)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 11)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 14)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 50)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 15)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 50)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 16)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 50)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 17)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 50)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 18)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 50)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 19)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 50)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 20)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 50)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 77)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 53)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 78)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 53)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 79)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 53)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 80)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 53)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 81)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 53)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 82)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 53)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 83)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 53)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 140)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 56)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 141)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 56)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 142)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 56)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 143)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 56)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 144)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 56)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 145)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 56)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 146)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 56)]));
+ conv2d_nchw[1] = (conv2d_nchw[1] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 203)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 59)]));
+ conv2d_nchw[3] = (conv2d_nchw[3] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 204)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 59)]));
+ conv2d_nchw[5] = (conv2d_nchw[5] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 205)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 59)]));
+ conv2d_nchw[7] = (conv2d_nchw[7] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 206)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 59)]));
+ conv2d_nchw[9] = (conv2d_nchw[9] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 207)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 59)]));
+ conv2d_nchw[11] = (conv2d_nchw[11] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 208)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 59)]));
+ conv2d_nchw[13] = (conv2d_nchw[13] + (pad_temp_shared[(((rc_outer_inner * 252) + ((((int)threadIdx.x) % 7) * 7)) + 209)] * kernel_shared[((((((int)threadIdx.x) / 7) * 96) + (rc_outer_inner * 12)) + 59)]));
+ }
}
}
for (int i1_inner = 0; i1_inner < 2; ++i1_inner) {
- for (int i3_inner = 0; i3_inner < 7; ++i3_inner) {
- compute[((((((((int)blockIdx.x) / 7) * 6272) + (((int)threadIdx.x) * 98)) + (i1_inner * 49)) + ((((int)blockIdx.x) % 7) * 7)) + i3_inner)] = max((conv2d_nchw[((i1_inner * 7) + i3_inner)] + bias[((((((int)blockIdx.x) / 7) * 128) + (((int)threadIdx.x) * 2)) + i1_inner)]), 0.000000e+00f);
- }
+ compute[((((((int)blockIdx.x) * 784) + ((((int)threadIdx.x) / 7) * 98)) + (i1_inner * 49)) + ((((int)threadIdx.x) % 7) * 7))] = max((conv2d_nchw[i1_inner] + bias[(((((int)blockIdx.x) * 16) + ((((int)threadIdx.x) / 7) * 2)) + i1_inner)]), 0.000000e+00f);
+ compute[(((((((int)blockIdx.x) * 784) + ((((int)threadIdx.x) / 7) * 98)) + (i1_inner * 49)) + ((((int)threadIdx.x) % 7) * 7)) + 1)] = max((conv2d_nchw[(i1_inner + 2)] + bias[(((((int)blockIdx.x) * 16) + ((((int)threadIdx.x) / 7) * 2)) + i1_inner)]), 0.000000e+00f);
+ compute[(((((((int)blockIdx.x) * 784) + ((((int)threadIdx.x) / 7) * 98)) + (i1_inner * 49)) + ((((int)threadIdx.x) % 7) * 7)) + 2)] = max((conv2d_nchw[(i1_inner + 4)] + bias[(((((int)blockIdx.x) * 16) + ((((int)threadIdx.x) / 7) * 2)) + i1_inner)]), 0.000000e+00f);
+ compute[(((((((int)blockIdx.x) * 784) + ((((int)threadIdx.x) / 7) * 98)) + (i1_inner * 49)) + ((((int)threadIdx.x) % 7) * 7)) + 3)] = max((conv2d_nchw[(i1_inner + 6)] + bias[(((((int)blockIdx.x) * 16) + ((((int)threadIdx.x) / 7) * 2)) + i1_inner)]), 0.000000e+00f);
+ compute[(((((((int)blockIdx.x) * 784) + ((((int)threadIdx.x) / 7) * 98)) + (i1_inner * 49)) + ((((int)threadIdx.x) % 7) * 7)) + 4)] = max((conv2d_nchw[(i1_inner + 8)] + bias[(((((int)blockIdx.x) * 16) + ((((int)threadIdx.x) / 7) * 2)) + i1_inner)]), 0.000000e+00f);
+ compute[(((((((int)blockIdx.x) * 784) + ((((int)threadIdx.x) / 7) * 98)) + (i1_inner * 49)) + ((((int)threadIdx.x) % 7) * 7)) + 5)] = max((conv2d_nchw[(i1_inner + 10)] + bias[(((((int)blockIdx.x) * 16) + ((((int)threadIdx.x) / 7) * 2)) + i1_inner)]), 0.000000e+00f);
+ compute[(((((((int)blockIdx.x) * 784) + ((((int)threadIdx.x) / 7) * 98)) + (i1_inner * 49)) + ((((int)threadIdx.x) % 7) * 7)) + 6)] = max((conv2d_nchw[(i1_inner + 12)] + bias[(((((int)blockIdx.x) * 16) + ((((int)threadIdx.x) / 7) * 2)) + i1_inner)]), 0.000000e+00f);
}
}
</pre></div>
@@ -1574,7 +1411,7 @@ In the example below we resume the status and do more 5 trials.</p>
Get devices for measurement successfully!
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 5 minutes 38.300 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 5 minutes 34.610 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-conv2d-layer-cuda-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/e3e540f3b477c0c52d8eb73e674e8ffd/tune_conv2d_layer_cuda.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_conv2d_layer_cuda.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
index e0c4999076..2e73a3bf40 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_cuda.html
@@ -916,7 +916,7 @@ so we can read the log file and load the best schedules.</p>
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 7.8796 7.8807 7.8808 7.8772 0.0017
+ 7.8650 7.8617 7.8742 7.8589 0.0066
</pre></div>
</div>
</div>
@@ -938,7 +938,7 @@ to learn how to use the RPC Tracker and RPC Server.
To use the RPC Tracker in auto-scheduler, replace the runner in <code class="code docutils literal notranslate"><span class="pre">TuningOptions</span></code>
with <a class="reference internal" href="../../reference/api/python/auto_scheduler.html#tvm.auto_scheduler.RPCRunner" title="tvm.auto_scheduler.RPCRunner"><code class="xref any py py-class docutils literal notranslate"><span class="pre">auto_scheduler.RPCRunner</span></code></a>.</p></li>
</ol>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 5.499 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 6.112 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-network-cuda-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/eafe360d52540634c9eea0fa89e804bd/tune_network_cuda.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_network_cuda.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
index 6cec3c7abf..8dee012ed4 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_network_x86.html
@@ -935,7 +935,7 @@ so we can read the log file and load the best schedules.</p>
Evaluate inference time cost...
Execution time summary:
mean (ms) median (ms) max (ms) min (ms) std (ms)
- 754.1366 754.1049 754.7571 753.5476 0.4943
+ 747.3662 747.1039 748.5371 746.4578 0.8689
</pre></div>
</div>
</div>
@@ -957,7 +957,7 @@ to learn how to use the RPC Tracker and RPC Server.
To use the RPC Tracker in auto-scheduler, replace the runner in <code class="code docutils literal notranslate"><span class="pre">TuningOptions</span></code>
with <a class="reference internal" href="../../reference/api/python/auto_scheduler.html#tvm.auto_scheduler.RPCRunner" title="tvm.auto_scheduler.RPCRunner"><code class="xref any py py-class docutils literal notranslate"><span class="pre">auto_scheduler.RPCRunner</span></code></a>.</p></li>
</ol>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 39.258 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 39.417 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autoscheduler-tune-network-x86-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/e416b94ca1090b0897c0f6e0df95b911/tune_network_x86.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tune_network_x86.py</span></code></a></p>
diff --git a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
index a9980e6074..9c751d55f0 100644
--- a/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
+++ b/docs/how_to/tune_with_autoscheduler/tune_sparse_x86.html
@@ -629,26 +629,86 @@ class Module:
@T.prim_func
def main(placeholder: T.Buffer((128, 256), "float32"), placeholder_1: T.Buffer((4916, 16, 1), "float32"), placeholder_2: T.Buffer((4916,), "int32"), placeholder_3: T.Buffer((33,), "int32"), placeholder_4: T.Buffer((128, 512), "float32"), compute: T.Buffer((128, 512), "float32")):
T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
- for i0_outer_i1_outer_fused in T.parallel(64):
- compute_1 = T.allocate([1024], "float32", "global")
- compute_2 = T.Buffer((1024,), data=compute_1)
- for nb_j_inner in range(2):
- for i_inner_init, j_init in T.grid(32, 16):
- compute_2[i_inner_init * 32 + nb_j_inner * 16 + j_init] = T.float32(0)
- for elem_idx, i_inner, j in T.grid(T.let(cse_var_1, i0_outer_i1_outer_fused % 16 * 2 + nb_j_inner, placeholder_5[cse_var_1 + 1] - placeholder_5[cse_var_1]), 32, 16):
- cse_var_1 = T.var("int32")
+ for i0_outer_i1_outer_fused in T.parallel(32):
+ compute_1 = T.allocate([2048], "float32", "global")
+ compute_2 = T.Buffer((2048,), data=compute_1)
+ for i_outer_inner in range(2):
+ for i_inner_init in range(64):
+ cse_var_1: T.int32 = i_outer_inner * 1024 + i_inner_init * 16
+ compute_2[cse_var_1] = T.float32(0)
+ compute_2[cse_var_1 + 1] = T.float32(0)
+ compute_2[cse_var_1 + 2] = T.float32(0)
+ compute_2[cse_var_1 + 3] = T.float32(0)
+ compute_2[cse_var_1 + 4] = T.float32(0)
+ compute_2[cse_var_1 + 5] = T.float32(0)
+ compute_2[cse_var_1 + 6] = T.float32(0)
+ compute_2[cse_var_1 + 7] = T.float32(0)
+ compute_2[cse_var_1 + 8] = T.float32(0)
+ compute_2[cse_var_1 + 9] = T.float32(0)
+ compute_2[cse_var_1 + 10] = T.float32(0)
+ compute_2[cse_var_1 + 11] = T.float32(0)
+ compute_2[cse_var_1 + 12] = T.float32(0)
+ compute_2[cse_var_1 + 13] = T.float32(0)
+ compute_2[cse_var_1 + 14] = T.float32(0)
+ compute_2[cse_var_1 + 15] = T.float32(0)
+ for elem_idx, i_inner in T.grid(placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused], 64):
placeholder_5 = T.Buffer((33,), "int32", data=placeholder_3.data)
- cse_var_3: T.int32 = i0_outer_i1_outer_fused % 16 * 2 + nb_j_inner
- cse_var_2: T.int32 = i_inner * 32 + nb_j_inner * 16 + j
placeholder_6 = T.Buffer((78656,), data=placeholder_1.data)
placeholder_7 = T.Buffer((32768,), data=placeholder.data)
placeholder_8 = T.Buffer((4916,), "int32", data=placeholder_2.data)
- compute_2[cse_var_2] = compute_2[cse_var_2] + placeholder_6[placeholder_5[cse_var_3] * 16 + elem_idx * 16 + j] * T.max(placeholder_7[i0_outer_i1_outer_fused // 16 * 8192 + i_inner * 256 + placeholder_8[placeholder_5[cse_var_3] + elem_idx]], T.float32(0))
- for i0_inner, i1_inner in T.grid(32, 32):
- cse_var_4: T.int32 = i0_outer_i1_outer_fused // 16 * 16384 + i0_inner * 512 + i0_outer_i1_outer_fused % 16 * 32 + i1_inner
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_2: T.int32 = i_outer_inner * 1024 + i_inner * 16
+ compute_2[cse_var_2] = compute_2[cse_var_2] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_3: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 1
+ compute_2[cse_var_3] = compute_2[cse_var_3] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 1] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_4: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 2
+ compute_2[cse_var_4] = compute_2[cse_var_4] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 2] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_5: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 3
+ compute_2[cse_var_5] = compute_2[cse_var_5] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 3] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_6: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 4
+ compute_2[cse_var_6] = compute_2[cse_var_6] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 4] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_7: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 5
+ compute_2[cse_var_7] = compute_2[cse_var_7] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 5] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_8: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 6
+ compute_2[cse_var_8] = compute_2[cse_var_8] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 6] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_9: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 7
+ compute_2[cse_var_9] = compute_2[cse_var_9] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 7] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_10: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 8
+ compute_2[cse_var_10] = compute_2[cse_var_10] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 8] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_11: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 9
+ compute_2[cse_var_11] = compute_2[cse_var_11] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 9] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_12: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 10
+ compute_2[cse_var_12] = compute_2[cse_var_12] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 10] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_13: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 11
+ compute_2[cse_var_13] = compute_2[cse_var_13] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 11] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_14: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 12
+ compute_2[cse_var_14] = compute_2[cse_var_14] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 12] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_15: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 13
+ compute_2[cse_var_15] = compute_2[cse_var_15] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 13] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_16: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 14
+ compute_2[cse_var_16] = compute_2[cse_var_16] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 14] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ if T.likely(elem_idx < placeholder_5[i0_outer_i1_outer_fused + 1] - placeholder_5[i0_outer_i1_outer_fused]):
+ cse_var_17: T.int32 = i_outer_inner * 1024 + i_inner * 16 + 15
+ compute_2[cse_var_17] = compute_2[cse_var_17] + placeholder_6[placeholder_5[i0_outer_i1_outer_fused] * 16 + elem_idx * 16 + 15] * T.max(placeholder_7[i_outer_inner * 16384 + i_inner * 256 + placeholder_8[placeholder_5[i0_outer_i1_outer_fused] + elem_idx]], T.float32(0))
+ for i0_inner in range(128):
+ cse_var_18: T.int32 = i0_inner * 512 + i0_outer_i1_outer_fused * 16
compute_3 = T.Buffer((65536,), data=compute.data)
placeholder_5 = T.Buffer((65536,), data=placeholder_4.data)
- compute_3[cse_var_4] = T.max(compute_2[i0_inner * 32 + i1_inner] + placeholder_5[cse_var_4], T.float32(0))
+ compute_3[cse_var_18:cse_var_18 + 16] = T.max(compute_2[i0_inner * 16:i0_inner * 16 + 16] + placeholder_5[cse_var_18:cse_var_18 + 16], T.Broadcast(T.float32(0), 16))
</pre></div>
</div>
</div>
@@ -682,7 +742,7 @@ class Module:
<span class="p">)</span>
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.670 ms
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Execution time of this operator: 1.827 ms
</pre></div>
</div>
<div class="admonition note">
diff --git a/docs/how_to/tune_with_autotvm/sg_execution_times.html b/docs/how_to/tune_with_autotvm/sg_execution_times.html
index beb02e26d4..bb6d8d3f42 100644
--- a/docs/how_to/tune_with_autotvm/sg_execution_times.html
+++ b/docs/how_to/tune_with_autotvm/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-tune-with-autotvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:52.417</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
+<p><strong>00:27.181</strong> total execution time for <strong>how_to_tune_with_autotvm</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 84%" />
@@ -349,11 +349,11 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_conv2d_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-conv2d-cuda-py"><span class="std std-ref">Tuning High Performance Convolution on NVIDIA GPUs</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_conv2d_cuda.py</span></code>)</p></td>
-<td><p>00:52.385</p></td>
+<td><p>00:27.145</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tune_relay_x86.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-x86-py"><span class="std std-ref">Auto-tuning a Convolutional Network for x86 CPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_x86.py</span></code>)</p></td>
-<td><p>00:00.019</p></td>
+<td><p>00:00.022</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="tune_relay_cuda.html#sphx-glr-how-to-tune-with-autotvm-tune-relay-cuda-py"><span class="std std-ref">Auto-tuning a Convolutional Network for NVIDIA GPU</span></a> (<code class="docutils literal notranslate"><span class="pre">tune_relay_cuda.py</span></code>)</p></td>
diff --git a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
index df73619522..88b5495929 100644
--- a/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
+++ b/docs/how_to/tune_with_autotvm/tune_conv2d_cuda.html
@@ -690,7 +690,7 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 2, 32]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 8, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6302753
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 2, 16]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 8]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6500953
No: 2 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
@@ -813,7 +813,7 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 64, 1, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 256, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3162506
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 16, 8, 2]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,5717883
No: 3 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
@@ -936,7 +936,7 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 4, 16]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 128]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7533856
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 16, 2]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10418626
No: 4 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
@@ -1059,8 +1059,9 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 4, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 4, 16]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,7872598
-No: 5 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 16, 1, 32]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 64, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4282049
+No: 5 GFLOPS: 55.23/55.23 result: MeasureResult(costs=(0.004191678666666667,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.290860414505005, timestamp=1674648479.445705) [('tile_f', [-1, 1, 64, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 4, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9109233
+No: 6 GFLOPS: 0.00/55.23 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1182,8 +1183,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 32, 1, 16]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 512]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1353389
-No: 6 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 2, 128]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 8, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,207234
+No: 7 GFLOPS: 0.00/55.23 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1305,8 +1306,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 16, 2, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9242972
-No: 7 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 256, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 128, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2029993
+No: 8 GFLOPS: 0.00/55.23 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1428,8 +1429,12 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 32, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 128, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8544741
-No: 8 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 16, 16]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 16, 16]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1103922
+No: 9 GFLOPS: 27.51/55.23 result: MeasureResult(costs=(0.008414402583333333,), error_no=MeasureErrorNo.NO_ERROR, all_cost=3.1393074989318848, timestamp=1674648483.7522187) [('tile_f', [-1, 64, 1, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 2, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9877126
+No: 10 GFLOPS: 31.85/55.23 result: MeasureResult(costs=(0.0072692777857142855,), error_no=MeasureErrorNo.NO_ERROR, all_cost=2.873112916946411, timestamp=1674648484.5286293) [('tile_f', [-1, 32, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 16, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9309544
+No: 11 GFLOPS: 228.53/228.53 result: MeasureResult(costs=(0.0010130230606060606,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.5445051193237305, timestamp=1674648485.2542942) [('tile_f', [-1, 4, 4, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4844421
+No: 12 GFLOPS: 436.89/436.89 result: MeasureResult(costs=(0.0005298873639344263,), error_no=MeasureErrorNo.NO_ERROR, all_cost=1.8932838439941406, timestamp=1674648486.267551) [('tile_f', [-1, 2, 32, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 4]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9945801
+No: 13 GFLOPS: 0.00/436.89 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1551,26 +1556,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 8, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 1, 32]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4982370
-No: 9 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 142, in build
- res = future.result()
- File "/usr/lib/python3.7/concurrent/futures/_base.py", line 435, in result
- return self.__get_result()
- File "/usr/lib/python3.7/concurrent/futures/_base.py", line 384, in __get_result
- raise self._exception
- File "/usr/lib/python3.7/concurrent/futures/thread.py", line 57, in run
- result = self.fn(*self.args, **self.kwargs)
- File "/workspace/python/tvm/contrib/popen_pool.py", line 432, in <lambda>
- worker = lambda *args: self._worker_run(*args)
- File "/workspace/python/tvm/contrib/popen_pool.py", line 401, in _worker_run
- return proc.recv()
- File "/workspace/python/tvm/contrib/popen_pool.py", line 309, in recv
- raise TimeoutError()
-TimeoutError
-
- [('tile_f', [-1, 4, 2, 4]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 64, 1]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2346190
-No: 10 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 2, 64, 4]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 8, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2597234
+No: 14 GFLOPS: 0.00/436.89 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1692,8 +1679,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 16, 1, 8]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 512, 1]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4485720
-No: 11 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 256, 1, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 64, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,10377628
+No: 15 GFLOPS: 0.00/436.89 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1815,8 +1802,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 1, 8]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 8, 16]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3423339
-No: 12 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 4, 2, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 64, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,2054152
+No: 16 GFLOPS: 0.00/436.89 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -1938,8 +1925,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 8, 4, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 64, 8]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8637442
-No: 13 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 16, 4]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 8, 32]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 1)],None,8283346
+No: 17 GFLOPS: 0.00/436.89 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -2061,8 +2048,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 16, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 32, 4]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,3959938
-No: 14 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 64, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 1, 256]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9091325
+No: 18 GFLOPS: 0.00/436.89 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -2184,284 +2171,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 32, 1, 8]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 128]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6762941
-No: 15 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 742, in __call__
- yield remote, remote.load_module(os.path.split(build_result.filename)[1])
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 706, in run_through_rpc
- costs = time_f(*args).results
- File "/workspace/python/tvm/runtime/module.py", line 357, in evaluator
- blob = feval(*args)
- File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
- File "tvm/_ffi/_cython/./packed_func.pxi", line 262, in tvm._ffi._cy3.core.FuncCall
- File "tvm/_ffi/_cython/./packed_func.pxi", line 251, in tvm._ffi._cy3.core.FuncCall3
- File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
-tvm._ffi.base.TVMError: Traceback (most recent call last):
- 4: TVMFuncCall
- at ../src/runtime/c_runtime_api.cc:477
- 3: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
- at ../include/tvm/runtime/packed_func.h:1217
- 2: tvm::runtime::RPCWrappedFunc::operator()(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
- at ../src/runtime/rpc/rpc_module.cc:129
- 1: tvm::runtime::RPCClientSession::CallFunc(void*, TVMValue const*, int const*, int, std::function<void (tvm::runtime::TVMArgs)> const&)
- at ../src/runtime/rpc/rpc_endpoint.cc:1012
- 0: tvm::runtime::RPCEndpoint::CallFunc(void*, TVMValue const*, int const*, int, std::function<void (tvm::runtime::TVMArgs)>)
- at ../src/runtime/rpc/rpc_endpoint.cc:804
- File "../src/runtime/rpc/rpc_endpoint.cc", line 804
-TVMError:
----------------------------------------------------------------
-An error occurred during the execution of TVM.
-For more information, please see: https://tvm.apache.org/docs/errors.html
----------------------------------------------------------------
- Check failed: (code == RPCCode::kReturn) is false: code=kShutdown
-
-During handling of the above exception, another exception occurred:
-
-Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 706, in run_through_rpc
- costs = time_f(*args).results
- File "/usr/lib/python3.7/contextlib.py", line 130, in __exit__
- self.gen.throw(type, value, traceback)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 746, in __call__
- remote.remove(build_result.filename)
- File "/workspace/python/tvm/rpc/client.py", line 144, in remove
- self._remote_funcs["remove"] = self.get_function("tvm.rpc.server.remove")
- File "/workspace/python/tvm/rpc/client.py", line 72, in get_function
- return self._sess.get_function(name)
- File "/workspace/python/tvm/runtime/module.py", line 171, in get_function
- self.handle, c_str(name), ctypes.c_int(query_imports), ctypes.byref(ret_handle)
- File "/workspace/python/tvm/_ffi/base.py", line 348, in check_call
- raise get_last_ffi_error()
-tvm._ffi.base.TVMError: Traceback (most recent call last):
- 52: 0xffffffffffffffff
- 51: _start
- 50: __libc_start_main
- 49: _Py_UnixMain
- 48: 0x0000000000650da0
- 47: 0x0000000000650afa
- 46: _PyFunction_FastCallDict
- 45: _PyEval_EvalCodeWithName
- 44: _PyEval_EvalFrameDefault
- 43: _PyFunction_FastCallKeywords
- 42: _PyEval_EvalCodeWithName
- 41: _PyEval_EvalFrameDefault
- 40: _PyMethodDef_RawFastCallKeywords
- 39: 0x0000000000546369
- 38: _PyEval_EvalCodeWithName
- 37: _PyEval_EvalFrameDefault
- 36: _PyFunction_FastCallKeywords
- 35: _PyEval_EvalCodeWithName
- 34: _PyEval_EvalFrameDefault
- 33: _PyFunction_FastCallDict
- 32: _PyEval_EvalCodeWithName
- 31: _PyEval_EvalFrameDefault
- 30: _PyObject_FastCallDict
- 29: 0x00000000004c06e1
- 28: _PyFunction_FastCallDict
- 27: _PyEval_EvalFrameDefault
- 26: _PyMethodDescr_FastCallKeywords
- 25: 0x00000000005dcb58
- 24: 0x00000000005dc83f
- 23: 0x00000000004ba127
- 22: _PyEval_EvalFrameDefault
- 21: _PyFunction_FastCallKeywords
- 20: _PyEval_EvalFrameDefault
- 19: _PyFunction_FastCallKeywords
- 18: _PyEval_EvalFrameDefault
- 17: _PyFunction_FastCallKeywords
- 16: _PyEval_EvalCodeWithName
- 15: _PyEval_EvalFrameDefault
- 14: 0x0000000000537c30
- 13: _PyObject_FastCallKeywords
- 12: 0x00007f9dc4af1fa2
- 11: _ctypes_callproc
- 10: ffi_call
- 9: ffi_call_unix64
- 8: TVMModGetFunction
- at ../src/runtime/c_runtime_api.cc:408
- 7: tvm::runtime::ModuleNode::GetFunction(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, bool)
- at ../src/runtime/module.cc:66
- 6: tvm::runtime::RPCModuleNode::GetFunction(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, tvm::runtime::ObjectPtr<tvm::runtime::Object> const&)
- at ../src/runtime/rpc/rpc_module.cc:185
- 5: tvm::runtime::RPCClientSession::GetFunction(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)
- at ../src/runtime/rpc/rpc_endpoint.cc:1007
- 4: tvm::runtime::TVMRetValue tvm::runtime::RPCEndpoint::SysCallRemote<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>(tvm::runtime::RPCCode, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)
- at ../src/runtime/rpc/rpc_endpoint.h:223
- 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&>(int&&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) const
- at ../include/tvm/runtime/packed_func.h:1617
- 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
- at ../include/tvm/runtime/packed_func.h:1217
- 1: Call
- at ../include/tvm/runtime/packed_func.h:1213
- 0: operator()
- at ../src/runtime/rpc/rpc_endpoint.cc:684
- File "../src/runtime/rpc/rpc_endpoint.cc", line 684
-TVMError:
----------------------------------------------------------------
-An error occurred during the execution of TVM.
-For more information, please see: https://tvm.apache.org/docs/errors.html
----------------------------------------------------------------
- Check failed: (code == RPCCode::kReturn) is false: code=1
-
-Traceback (most recent call last):
- 52: 0xffffffffffffffff
- 51: _start
- 50: __libc_start_main
- 49: _Py_UnixMain
- 48: 0x0000000000650da0
- 47: 0x0000000000650afa
- 46: _PyFunction_FastCallDict
- 45: _PyEval_EvalCodeWithName
- 44: _PyEval_EvalFrameDefault
- 43: _PyFunction_FastCallKeywords
- 42: _PyEval_EvalCodeWithName
- 41: _PyEval_EvalFrameDefault
- 40: _PyMethodDef_RawFastCallKeywords
- 39: 0x0000000000546369
- 38: _PyEval_EvalCodeWithName
- 37: _PyEval_EvalFrameDefault
- 36: _PyFunction_FastCallKeywords
- 35: _PyEval_EvalCodeWithName
- 34: _PyEval_EvalFrameDefault
- 33: _PyFunction_FastCallDict
- 32: _PyEval_EvalCodeWithName
- 31: _PyEval_EvalFrameDefault
- 30: _PyObject_FastCallDict
- 29: 0x00000000004c06e1
- 28: _PyFunction_FastCallDict
- 27: _PyEval_EvalFrameDefault
- 26: _PyMethodDescr_FastCallKeywords
- 25: 0x00000000005dcb58
- 24: 0x00000000005dc83f
- 23: 0x00000000004ba127
- 22: _PyEval_EvalFrameDefault
- 21: _PyFunction_FastCallKeywords
- 20: _PyEval_EvalFrameDefault
- 19: _PyFunction_FastCall [('tile_f', [-1, 4, 1, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 2]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4301358
-No: 16 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
- func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
- func = build(s, args, target_host=task.target_host, runtime=runtime)
- File "/workspace/python/tvm/driver/build_module.py", line 227, in build
- input_mod = lower(inputs, args, name=name, binds=binds)
- File "/workspace/python/tvm/driver/build_module.py", line 134, in lower
- return ffi.lower_schedule(inp, args, name, binds, simple_mode)
- File "tvm/_ffi/_cython/./packed_func.pxi", line 331, in tvm._ffi._cy3.core.PackedFuncBase.__call__
- File "tvm/_ffi/_cython/./packed_func.pxi", line 276, in tvm._ffi._cy3.core.FuncCall
- File "tvm/_ffi/_cython/./base.pxi", line 181, in tvm._ffi._cy3.core.CHECK_CALL
-tvm._ffi.base.TVMError: Traceback (most recent call last):
- 24: TVMFuncCall
- at ../src/runtime/c_runtime_api.cc:477
- 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
- at ../include/tvm/runtime/packed_func.h:1217
- 22: Call
- at ../include/tvm/runtime/packed_func.h:1213
- 21: operator()
- at ../include/tvm/runtime/packed_func.h:1730
- 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
- at ../include/tvm/runtime/packed_func.h:1670
- 19: run<>
- at ../include/tvm/runtime/packed_func.h:1630
- 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
- at ../include/tvm/runtime/packed_func.h:1630
- 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
- at ../include/tvm/runtime/packed_func.h:1630
- 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
- at ../include/tvm/runtime/packed_func.h:1630
- 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
- at ../include/tvm/runtime/packed_func.h:1630
- 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
- at ../include/tvm/runtime/packed_func.h:1645
- 13: operator()
- at ../src/driver/driver_api.cc:395
- 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:381
- 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:276
- 10: tvm::transform::Pass::operator()(tvm::IRModule) const
- at ../src/ir/transform.cc:258
- 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
- at ../src/ir/transform.cc:274
- 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
- at ../src/ir/transform.cc:451
- 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
- at ../src/ir/transform.cc:274
- 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
- at ../src/tir/ir/transform.cc:100
- 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
- at ../include/tvm/runtime/packed_func.h:1749
- 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
- at ../include/tvm/runtime/packed_func.h:1693
- 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
- at ../include/tvm/runtime/packed_func.h:1617
- 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
- at ../include/tvm/runtime/packed_func.h:1217
- 1: Call
- at ../include/tvm/runtime/packed_func.h:1213
- 0: operator()
- at ../src/runtime/c_runtime_api.cc:534
- File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
- raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel
-
-Traceback (most recent call last):
- 24: TVMFuncCall
- at ../src/runtime/c_runtime_api.cc:477
- 23: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
- at ../include/tvm/runtime/packed_func.h:1217
- 22: Call
- at ../include/tvm/runtime/packed_func.h:1213
- 21: operator()
- at ../include/tvm/runtime/packed_func.h:1730
- 20: unpack_call<tvm::IRModule, 5, tvm::<lambda(tvm::te::Schedule, const tvm::runtime::Array<tvm::runtime::ObjectRef>&, const tvm::runtime::String&, const tvm::runtime::Map<tvm::te::Tensor, tvm::tir::Buffer>&, bool)> >
- at ../include/tvm/runtime/packed_func.h:1670
- 19: run<>
- at ../include/tvm/runtime/packed_func.h:1630
- 18: run<tvm::runtime::TVMMovableArgValueWithContext_>
- at ../include/tvm/runtime/packed_func.h:1630
- 17: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
- at ../include/tvm/runtime/packed_func.h:1630
- 16: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
- at ../include/tvm/runtime/packed_func.h:1630
- 15: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
- at ../include/tvm/runtime/packed_func.h:1630
- 14: run<tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_, tvm::runtime::TVMMovableArgValueWithContext_>
- at ../include/tvm/runtime/packed_func.h:1645
- 13: operator()
- at ../src/driver/driver_api.cc:395
- 12: tvm::LowerSchedule(tvm::te::Schedule, tvm::runtime::Array<tvm::runtime::ObjectRef, void> const&, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, std::unordered_map<tvm::te::Tensor, tvm::tir::Buffer, std::hash<tvm::te::Tensor>, std::equal_to<tvm::te::Tensor>, std::allocator<std::pair<tvm::te::Tensor const, tvm::tir::Buffer> > > const&, tvm::GlobalVarSupply, bool)
- at ../src/driver/driver_api.cc:381
- 11: tvm::LowerWithPassList(tvm::IRModule, tvm::runtime::Array<tvm::transform::Pass, void>)
- at ../src/driver/driver_api.cc:276
- 10: tvm::transform::Pass::operator()(tvm::IRModule) const
- at ../src/ir/transform.cc:258
- 9: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
- at ../src/ir/transform.cc:274
- 8: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
- at ../src/ir/transform.cc:451
- 7: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
- at ../src/ir/transform.cc:274
- 6: tvm::tir::transform::PrimFuncPassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
- at ../src/tir/ir/transform.cc:100
- 5: tvm::runtime::TypedPackedFunc<tvm::tir::PrimFunc (tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext)>::operator()(tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext) const
- at ../include/tvm/runtime/packed_func.h:1749
- 4: tvm::tir::PrimFunc tvm::runtime::detail::typed_packed_call_dispatcher<tvm::tir::PrimFunc>::run<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::runtime::PackedFunc const&, tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&)
- at ../include/tvm/runtime/packed_func.h:1693
- 3: tvm::runtime::TVMRetValue tvm::runtime::PackedFunc::operator()<tvm::tir::PrimFunc, tvm::IRModule, tvm::transform::PassContext>(tvm::tir::PrimFunc&&, tvm::IRModule&&, tvm::transform::PassContext&&) const
- at ../include/tvm/runtime/packed_func.h:1617
- 2: tvm::runtime::PackedFuncObj::CallPacked(tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*) const
- at ../include/tvm/runtime/packed_func.h:1217
- 1: Call
- at ../include/tvm/runtime/packed_func.h:1213
- 0: operator()
- at ../src/runtime/c_runtime_api.cc:534
- File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
- File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
- raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 64, 1]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 2, 32]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,532885
-No: 17 GFLOPS: 0.00/0.00 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 1, 256]), ('tile_y', [-1, 1, 7, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 8, 16]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,1875056
+No: 19 GFLOPS: 0.00/436.89 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -2583,10 +2294,8 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 64, 4, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 1, 512]), ('tile_ry', [-1, 1, 3]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9872525
-No: 18 GFLOPS: 1.76/1.76 result: MeasureResult(costs=(0.131499069,), error_no=MeasureErrorNo.NO_ERROR, all_cost=5.7559709548950195, timestamp=1674629081.3925776) [('tile_f', [-1, 1, 8, 64]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 1, 1, 7]), ('tile_rc', [-1, 2, 1]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 0)],None,4846589
-No: 19 GFLOPS: 70.51/70.51 result: MeasureResult(costs=(0.0032830254193548384,), error_no=MeasureErrorNo.NO_ERROR, all_cost=5.035698413848877, timestamp=1674629082.1421092) [('tile_f', [-1, 4, 2, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,8980632
-No: 20 GFLOPS: 0.00/70.51 result: Traceback (most recent call last):
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 32, 2, 8]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 16, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 3, 1]), ('auto_unroll_max_step', 0), ('unroll_explicit', 1)],None,6111528
+No: 20 GFLOPS: 0.00/436.89 result: Traceback (most recent call last):
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 592, in __call__
func, arg_info = _build_func_common(measure_input, self.runtime, **kwargs)
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 544, in _build_func_common
@@ -2708,7 +2417,7 @@ Traceback (most recent call last):
File "tvm/_ffi/_cython/./packed_func.pxi", line 56, in tvm._ffi._cy3.core.tvm_callback
File "/workspace/python/tvm/autotvm/measure/measure_methods.py", line 875, in verify_pass
raise InstantiationError("Skipped because of invalid gpu kernel")
-tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 1, 256]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 1, 1]), ('tile_rc', [-1, 4, 8]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 512), ('unroll_explicit', 0)],None,3199896
+tvm.autotvm.task.space.InstantiationError: Skipped because of invalid gpu kernel [('tile_f', [-1, 1, 32, 1]), ('tile_y', [-1, 1, 1, 7]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 64]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 0), ('unroll_explicit', 0)],None,1328620
</pre></div>
</div>
<p>Finally we can inspect the best config from log file, check correctness,
@@ -2747,9 +2456,9 @@ and measure running time.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Finish loading 20 records
Best config:
-[('tile_f', [-1, 4, 2, 1]), ('tile_y', [-1, 7, 1, 1]), ('tile_x', [-1, 7, 1, 1]), ('tile_rc', [-1, 4, 4]), ('tile_ry', [-1, 3, 1]), ('tile_rx', [-1, 1, 1]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,8980632
+[('tile_f', [-1, 2, 32, 1]), ('tile_y', [-1, 1, 1, 1]), ('tile_x', [-1, 1, 7, 1]), ('tile_rc', [-1, 2, 4]), ('tile_ry', [-1, 1, 1]), ('tile_rx', [-1, 1, 3]), ('auto_unroll_max_step', 1500), ('unroll_explicit', 1)],None,9945801
Finish loading 20 records
-Time cost of this operator: 0.003554
+Time cost of this operator: 0.000917
</pre></div>
</div>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-tune-with-autotvm-tune-conv2d-cuda-py">
diff --git a/docs/how_to/work_with_microtvm/micro_autotune.html b/docs/how_to/work_with_microtvm/micro_autotune.html
index 75d381c296..a843c6f433 100644
--- a/docs/how_to/work_with_microtvm/micro_autotune.html
+++ b/docs/how_to/work_with_microtvm/micro_autotune.html
@@ -647,10 +647,10 @@ the tuned operator.</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>########## Build without Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs Measurements(us)
--------- --- -------- ------- ----- ------ ------- ----------------
-tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 310.8 98.72 (1, 2, 10, 10, 3) 2 1 [310.8]
-tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.04 0.966 (1, 6, 10, 10) 1 1 [3.04]
-tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.991 0.315 (1, 1, 10, 10, 3) 1 1 [0.991]
-Total_time - 314.831 - - - - -
+tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 309.3 98.645 (1, 2, 10, 10, 3) 2 1 [309.3]
+tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 3.286 1.048 (1, 6, 10, 10) 1 1 [3.286]
+tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.961 0.307 (1, 1, 10, 10, 3) 1 1 [0.961]
+Total_time - 313.547 - - - - -
</pre></div>
</div>
</div>
@@ -702,10 +702,10 @@ Total_time -
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>########## Build with Autotuning ##########
Node Name Ops Time(us) Time(%) Shape Inputs Outputs Measurements(us)
--------- --- -------- ------- ----- ------ ------- ----------------
-tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 102.7 97.495 (1, 6, 10, 10, 1) 2 1 [102.7]
-tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.775 1.685 (1, 6, 10, 10) 1 1 [1.775]
-tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.863 0.82 (1, 3, 10, 10, 1) 1 1 [0.863]
-Total_time - 105.339 - - - - -
+tvmgen_default_fused_nn_contrib_conv2d_NCHWc tvmgen_default_fused_nn_contrib_conv2d_NCHWc 104.6 97.511 (1, 6, 10, 10, 1) 2 1 [104.6]
+tvmgen_default_fused_layout_transform_1 tvmgen_default_fused_layout_transform_1 1.83 1.706 (1, 6, 10, 10) 1 1 [1.83]
+tvmgen_default_fused_layout_transform tvmgen_default_fused_layout_transform 0.84 0.783 (1, 3, 10, 10, 1) 1 1 [0.84]
+Total_time - 107.27 - - - - -
</pre></div>
</div>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-autotune-py">
diff --git a/docs/how_to/work_with_microtvm/micro_pytorch.html b/docs/how_to/work_with_microtvm/micro_pytorch.html
index 5f5eaf65e5..435b6dfe04 100644
--- a/docs/how_to/work_with_microtvm/micro_pytorch.html
+++ b/docs/how_to/work_with_microtvm/micro_pytorch.html
@@ -454,8 +454,8 @@ download a cat image and preprocess it to use as the model input.</p>
Downloading: "https://download.pytorch.org/models/quantized/mobilenet_v2_qnnpack_37f702c5.pth" to /workspace/.cache/torch/hub/checkpoints/mobilenet_v2_qnnpack_37f702c5.pth
0%| | 0.00/3.42M [00:00<?, ?B/s]
- 61%|###### | 2.09M/3.42M [00:00<00:00, 18.8MB/s]
-100%|##########| 3.42M/3.42M [00:00<00:00, 29.1MB/s]
+ 61%|###### | 2.09M/3.42M [00:00<00:00, 19.6MB/s]
+100%|##########| 3.42M/3.42M [00:00<00:00, 30.5MB/s]
/workspace/python/tvm/relay/frontend/pytorch_utils.py:47: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
return LooseVersion(torch_ver) > ver
/venv/apache-tvm-py3.7/lib/python3.7/site-packages/setuptools/_distutils/version.py:346: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.
@@ -579,7 +579,7 @@ via the host <cite>main.cc`</cite> or if a Zephyr emulated board is selected as
Torch top-1 id: 282, class name: tiger cat
</pre></div>
</div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 8.448 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 10.597 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-pytorch-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/12b9ecc04c41abaa12022061771821d1/micro_pytorch.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">micro_pytorch.py</span></code></a></p>
diff --git a/docs/how_to/work_with_microtvm/micro_train.html b/docs/how_to/work_with_microtvm/micro_train.html
index dd7a9ea234..749aad40cc 100644
--- a/docs/how_to/work_with_microtvm/micro_train.html
+++ b/docs/how_to/work_with_microtvm/micro_train.html
@@ -524,7 +524,7 @@ take about <strong>2 minutes</strong> to download the Stanford Cars, while COCO
<a href="https://docs.python.org/3/library/shutil.html#shutil.move" title="shutil.move" class="sphx-glr-backref-module-shutil sphx-glr-backref-type-py-function"><span class="n">shutil</span><span class="o">.</span><span class="n">move</span></a><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><a href="https://docs.python.org/3/library/stdtypes.html#str" title="builtins.str" class="sphx-glr-backref-module-builtins sphx-glr-backref-typ [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>'/tmp/tmp84sny8mt/images/random'
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>'/tmp/tmpp9l12y8g/images/random'
</pre></div>
</div>
</div>
@@ -584,8 +584,8 @@ objects to other stuff? We can display some examples from our datasets using <co
<span class="n">plt</span><span class="o">.</span><span class="n">axis</span><span class="p">(</span><span class="s2">"off"</span><span class="p">)</span>
</pre></div>
</div>
-<img src="../../_images/sphx_glr_micro_train_001.png" srcset="../../_images/sphx_glr_micro_train_001.png" alt="[1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0]" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/tmp/tmp84sny8mt/images/target contains 8144 images
-/tmp/tmp84sny8mt/images/random contains 5000 images
+<img src="../../_images/sphx_glr_micro_train_001.png" srcset="../../_images/sphx_glr_micro_train_001.png" alt="[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0]" class = "sphx-glr-single-img"/><div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>/tmp/tmpp9l12y8g/images/target contains 8144 images
+/tmp/tmpp9l12y8g/images/random contains 5000 images
</pre></div>
</div>
</div>
@@ -697,13 +697,13 @@ the time on our validation set).</p>
</pre></div>
</div>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>Epoch 1/3
-328/328 - 47s - loss: 0.2188 - accuracy: 0.9217 - val_loss: 0.1127 - val_accuracy: 0.9585 - 47s/epoch - 142ms/step
+328/328 - 47s - loss: 0.2189 - accuracy: 0.9204 - val_loss: 0.1003 - val_accuracy: 0.9645 - 47s/epoch - 143ms/step
Epoch 2/3
-328/328 - 43s - loss: 0.0993 - accuracy: 0.9647 - val_loss: 0.1226 - val_accuracy: 0.9585 - 43s/epoch - 132ms/step
+328/328 - 43s - loss: 0.0913 - accuracy: 0.9649 - val_loss: 0.1056 - val_accuracy: 0.9611 - 43s/epoch - 131ms/step
Epoch 3/3
-328/328 - 43s - loss: 0.0665 - accuracy: 0.9744 - val_loss: 0.1547 - val_accuracy: 0.9483 - 43s/epoch - 131ms/step
+328/328 - 43s - loss: 0.0709 - accuracy: 0.9755 - val_loss: 0.1266 - val_accuracy: 0.9603 - 43s/epoch - 131ms/step
-<keras.callbacks.History object at 0x7f334d039090>
+<keras.callbacks.History object at 0x7fcaab8ab250>
</pre></div>
</div>
</div>
@@ -963,7 +963,7 @@ as intended.</p>
<p>From here, we could modify the model to read live images from the camera - we have another
Arduino tutorial for how to do that <a class="reference external" href="https://github.com/guberti/tvm-arduino-demos/tree/master/examples/person_detection">on GitHub</a>. Alternatively, we could also
<a class="reference external" href="https://tvm.apache.org/docs/how_to/work_with_microtvm/micro_autotune.html">use TVM’s autotuning capabilities</a> to dramatically improve the model’s performance.</p>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 5 minutes 10.800 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 4 minutes 44.562 seconds)</p>
<div class="sphx-glr-footer sphx-glr-footer-example docutils container" id="sphx-glr-download-how-to-work-with-microtvm-micro-train-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/b52cec46baf4f78d6bcd94cbe269c8a6/micro_train.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">micro_train.py</span></code></a></p>
diff --git a/docs/how_to/work_with_microtvm/sg_execution_times.html b/docs/how_to/work_with_microtvm/sg_execution_times.html
index 3f745cbe04..e53c9f7fdb 100644
--- a/docs/how_to/work_with_microtvm/sg_execution_times.html
+++ b/docs/how_to/work_with_microtvm/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-work-with-microtvm-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>07:23.349</strong> total execution time for <strong>how_to_work_with_microtvm</strong> files:</p>
+<p><strong>06:59.895</strong> total execution time for <strong>how_to_work_with_microtvm</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 83%" />
@@ -349,23 +349,23 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="micro_train.html#sphx-glr-how-to-work-with-microtvm-micro-train-py"><span class="std std-ref">Training Vision Models for microTVM on Arduino</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_train.py</span></code>)</p></td>
-<td><p>05:10.800</p></td>
+<td><p>04:44.562</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="micro_pytorch.html#sphx-glr-how-to-work-with-microtvm-micro-pytorch-py"><span class="std std-ref">microTVM PyTorch Tutorial</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_pytorch.py</span></code>)</p></td>
-<td><p>01:08.448</p></td>
+<td><p>01:10.597</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="micro_autotune.html#sphx-glr-how-to-work-with-microtvm-micro-autotune-py"><span class="std std-ref">Autotuning with microTVM</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_autotune.py</span></code>)</p></td>
-<td><p>00:51.362</p></td>
+<td><p>00:51.796</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="micro_aot.html#sphx-glr-how-to-work-with-microtvm-micro-aot-py"><span class="std std-ref">microTVM Host-Driven AoT</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_aot.py</span></code>)</p></td>
-<td><p>00:08.938</p></td>
+<td><p>00:09.134</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="micro_tflite.html#sphx-glr-how-to-work-with-microtvm-micro-tflite-py"><span class="std std-ref">microTVM with TFLite Models</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_tflite.py</span></code>)</p></td>
-<td><p>00:03.800</p></td>
+<td><p>00:03.806</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="micro_tvmc.html#sphx-glr-how-to-work-with-microtvm-micro-tvmc-py"><span class="std std-ref">Executing a Tiny Model with TVMC Micro</span></a> (<code class="docutils literal notranslate"><span class="pre">micro_tvmc.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_relay/sg_execution_times.html b/docs/how_to/work_with_relay/sg_execution_times.html
index c35775a880..978ce3c4ec 100644
--- a/docs/how_to/work_with_relay/sg_execution_times.html
+++ b/docs/how_to/work_with_relay/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-work-with-relay-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:44.393</strong> total execution time for <strong>how_to_work_with_relay</strong> files:</p>
+<p><strong>00:44.279</strong> total execution time for <strong>how_to_work_with_relay</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 84%" />
@@ -349,15 +349,15 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="using_pipeline_executor.html#sphx-glr-how-to-work-with-relay-using-pipeline-executor-py"><span class="std std-ref">Using Pipeline Executor in Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_pipeline_executor.py</span></code>)</p></td>
-<td><p>00:32.379</p></td>
+<td><p>00:32.499</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="using_external_lib.html#sphx-glr-how-to-work-with-relay-using-external-lib-py"><span class="std std-ref">Using External Libraries in Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_external_lib.py</span></code>)</p></td>
-<td><p>00:10.447</p></td>
+<td><p>00:10.427</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="build_gcn.html#sphx-glr-how-to-work-with-relay-build-gcn-py"><span class="std std-ref">Building a Graph Convolutional Network</span></a> (<code class="docutils literal notranslate"><span class="pre">build_gcn.py</span></code>)</p></td>
-<td><p>00:01.560</p></td>
+<td><p>00:01.347</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="using_relay_viz.html#sphx-glr-how-to-work-with-relay-using-relay-viz-py"><span class="std std-ref">Use Relay Visualizer to Visualize Relay</span></a> (<code class="docutils literal notranslate"><span class="pre">using_relay_viz.py</span></code>)</p></td>
diff --git a/docs/how_to/work_with_schedules/intrin_math.html b/docs/how_to/work_with_schedules/intrin_math.html
index 3cc72cb308..767da8458f 100644
--- a/docs/how_to/work_with_schedules/intrin_math.html
+++ b/docs/how_to/work_with_schedules/intrin_math.html
@@ -535,7 +535,7 @@ The following example customizes CUDA lowering rule for <code class="code docuti
<a href="../../reference/api/python/ir.html#tvm.ir.register_intrin_lowering" title="tvm.ir.register_intrin_lowering" class="sphx-glr-backref-module-tvm-ir sphx-glr-backref-type-py-function"><span class="n">register_intrin_lowering</span></a><span class="p">(</span><span class="s2">"tir.exp"</span><span class="p">,</span> <span class="n">target</span><span class="o">=</span><span class="s2">"cuda"</span><span class="p">,</span> <span class="n">f</span><span class="o">= [...]
</pre></div>
</div>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span><function my_cuda_math_rule at 0x7f334dccf560>
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span><function my_cuda_math_rule at 0x7fcaab686b90>
</pre></div>
</div>
<p>Register the rule to TVM with override option to override existing rule.
diff --git a/docs/how_to/work_with_schedules/sg_execution_times.html b/docs/how_to/work_with_schedules/sg_execution_times.html
index 5ad011f4cd..bc1188ab9b 100644
--- a/docs/how_to/work_with_schedules/sg_execution_times.html
+++ b/docs/how_to/work_with_schedules/sg_execution_times.html
@@ -340,7 +340,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-how-to-work-with-schedules-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:06.549</strong> total execution time for <strong>how_to_work_with_schedules</strong> files:</p>
+<p><strong>00:07.755</strong> total execution time for <strong>how_to_work_with_schedules</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 83%" />
@@ -349,23 +349,23 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="intrin_math.html#sphx-glr-how-to-work-with-schedules-intrin-math-py"><span class="std std-ref">Intrinsics and Math Functions</span></a> (<code class="docutils literal notranslate"><span class="pre">intrin_math.py</span></code>)</p></td>
-<td><p>00:04.039</p></td>
+<td><p>00:05.255</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tensorize.html#sphx-glr-how-to-work-with-schedules-tensorize-py"><span class="std std-ref">Use Tensorize to Leverage Hardware Intrinsics</span></a> (<code class="docutils literal notranslate"><span class="pre">tensorize.py</span></code>)</p></td>
-<td><p>00:01.150</p></td>
+<td><p>00:01.140</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="reduction.html#sphx-glr-how-to-work-with-schedules-reduction-py"><span class="std std-ref">Reduction</span></a> (<code class="docutils literal notranslate"><span class="pre">reduction.py</span></code>)</p></td>
-<td><p>00:00.580</p></td>
+<td><p>00:00.576</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="scan.html#sphx-glr-how-to-work-with-schedules-scan-py"><span class="std std-ref">Scan and Recurrent Kernel</span></a> (<code class="docutils literal notranslate"><span class="pre">scan.py</span></code>)</p></td>
-<td><p>00:00.560</p></td>
+<td><p>00:00.558</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="extern_op.html#sphx-glr-how-to-work-with-schedules-extern-op-py"><span class="std std-ref">External Tensor Functions</span></a> (<code class="docutils literal notranslate"><span class="pre">extern_op.py</span></code>)</p></td>
-<td><p>00:00.115</p></td>
+<td><p>00:00.119</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="schedule_primitives.html#sphx-glr-how-to-work-with-schedules-schedule-primitives-py"><span class="std std-ref">Schedule Primitives in TVM</span></a> (<code class="docutils literal notranslate"><span class="pre">schedule_primitives.py</span></code>)</p></td>
@@ -377,7 +377,7 @@
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="tuple_inputs.html#sphx-glr-how-to-work-with-schedules-tuple-inputs-py"><span class="std std-ref">Compute and Reduce with Tuple Inputs</span></a> (<code class="docutils literal notranslate"><span class="pre">tuple_inputs.py</span></code>)</p></td>
-<td><p>00:00.023</p></td>
+<td><p>00:00.024</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>
diff --git a/docs/how_to/work_with_schedules/tensorize.html b/docs/how_to/work_with_schedules/tensorize.html
index 2b6c9da556..312f400bf1 100644
--- a/docs/how_to/work_with_schedules/tensorize.html
+++ b/docs/how_to/work_with_schedules/tensorize.html
@@ -574,7 +574,7 @@ class Module:
def main(A: T.Buffer((1024, 64), "float32"), B: T.Buffer((512, 64), "float32"), C: T.Buffer((1024, 512), "float32")):
T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
i = T.var("int32")
- T.attr(T.iter_var(i, None, "DataPar", ""), "pragma_import_llvm", "; ModuleID = '/tmp/tmp7fxwoglv/input0.cc'\nsource_filename = \"/tmp/tmp7fxwoglv/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = alloca [...]
+ T.attr(T.iter_var(i, None, "DataPar", ""), "pragma_import_llvm", "; ModuleID = '/tmp/tmpkharrg48/input0.cc'\nsource_filename = \"/tmp/tmpkharrg48/input0.cc\"\ntarget datalayout = \"e-m:e-i64:64-f80:128-n8:16:32:64-S128\"\ntarget triple = \"x86_64-pc-linux-gnu\"\n\n; Function Attrs: noinline nounwind optnone uwtable\ndefine dso_local i32 @gemv_update(float*, float*, float*, i32, i32, i32) #0 {\n %7 = alloca [...]
for i, j_outer in T.grid(1024, 32):
T.call_extern("int32", "gemv_update", T.tvm_access_ptr(T.type_annotation("float32"), C.data, i * 512 + j_outer * 16, 16, 2), T.tvm_access_ptr(T.type_annotation("float32"), A.data, i * 64, 64, 1), T.tvm_access_ptr(T.type_annotation("float32"), B.data, j_outer * 1024, 1024, 1), 16, 64, 64)
</pre></div>
diff --git a/docs/install/nnpack.html b/docs/install/nnpack.html
index 23d2181e9d..1ef28de467 100644
--- a/docs/install/nnpack.html
+++ b/docs/install/nnpack.html
@@ -229,17 +229,7 @@
<p class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
<ul class="current">
<li class="toctree-l1 current"><a class="reference internal" href="index.html">Installing TVM</a><ul class="current">
-<li class="toctree-l2 current"><a class="reference internal" href="from_source.html">Install from Source</a><ul class="current">
-<li class="toctree-l3"><a class="reference internal" href="from_source.html#developers-get-source-from-github">Developers: Get Source from Github</a></li>
-<li class="toctree-l3"><a class="reference internal" href="from_source.html#build-the-shared-library">Build the Shared Library</a></li>
-<li class="toctree-l3"><a class="reference internal" href="from_source.html#python-package-installation">Python Package Installation</a></li>
-<li class="toctree-l3 current"><a class="reference internal" href="from_source.html#install-contrib-libraries">Install Contrib Libraries</a><ul class="current">
-<li class="toctree-l4 current"><a class="current reference internal" href="#">NNPACK Contrib Installation</a></li>
-</ul>
-</li>
-<li class="toctree-l3"><a class="reference internal" href="from_source.html#enable-c-tests">Enable C++ Tests</a></li>
-</ul>
-</li>
+<li class="toctree-l2"><a class="reference internal" href="from_source.html">Install from Source</a></li>
<li class="toctree-l2"><a class="reference internal" href="docker.html">Docker Images</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">NNPACK Contrib Installation</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#conditions">Conditions</a></li>
diff --git a/docs/reference/api/doxygen/affine__type_8h.html b/docs/reference/api/doxygen/affine__type_8h.html
index c578eb7dc3..b42693a07f 100644
--- a/docs/reference/api/doxygen/affine__type_8h.html
+++ b/docs/reference/api/doxygen/affine__type_8h.html
@@ -77,7 +77,7 @@ $(function() {
</div><div class="textblock"><div class="dynheader">
Include dependency graph for affine_type.h:</div>
<div class="dyncontent">
-<div class="center"><iframe scrolling="no" frameborder="0" src="affine__type_8h__incl.svg" width="4323" height="1231"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
+<div class="center"><iframe scrolling="no" frameborder="0" src="affine__type_8h__incl.svg" width="4248" height="1231"><p><b>This browser is not able to show SVG: try Firefox, Chrome, Safari, or Opera instead.</b></p></iframe>
</div>
</div>
</div>
diff --git a/docs/reference/api/doxygen/affine__type_8h__incl.svg b/docs/reference/api/doxygen/affine__type_8h__incl.svg
index e8e777ba4a..079a09032d 100644
--- a/docs/reference/api/doxygen/affine__type_8h__incl.svg
+++ b/docs/reference/api/doxygen/affine__type_8h__incl.svg
@@ -4,1409 +4,1417 @@
<!-- Generated by graphviz version 2.40.1 (20161225.0304)
-->
<!-- Title: include/tvm/ir/affine_type.h Pages: 1 -->
-<svg width="3242pt" height="923pt"
- viewBox="0.00 0.00 3242.00 923.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<svg width="3186pt" height="923pt"
+ viewBox="0.00 0.00 3186.00 923.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 919)">
<title>include/tvm/ir/affine_type.h</title>
-<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-919 3238,-919 3238,4 -4,4"/>
+<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-919 3182,-919 3182,4 -4,4"/>
<!-- Node0 -->
<g id="node1" class="node">
<title>Node0</title>
-<polygon fill="#bfbfbf" stroke="#000000" points="1266.5,-884.5 1266.5,-914.5 1381.5,-914.5 1381.5,-884.5 1266.5,-884.5"/>
-<text text-anchor="start" x="1274.5" y="-902.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/ir/affine</text>
-<text text-anchor="middle" x="1324" y="-891.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_type.h</text>
+<polygon fill="#bfbfbf" stroke="#000000" points="1664.5,-884.5 1664.5,-914.5 1779.5,-914.5 1779.5,-884.5 1664.5,-884.5"/>
+<text text-anchor="start" x="1672.5" y="-902.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">include/tvm/ir/affine</text>
+<text text-anchor="middle" x="1722" y="-891.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_type.h</text>
</g>
<!-- Node1 -->
<g id="node2" class="node">
<title>Node1</title>
<g id="a_node2"><a xlink:href="ir_2expr_8h.html" target="_top" xlink:title="Base expr nodes in TVM. ">
-<polygon fill="#ffffff" stroke="#000000" points="1250.5,-828.5 1250.5,-847.5 1329.5,-847.5 1329.5,-828.5 1250.5,-828.5"/>
-<text text-anchor="middle" x="1290" y="-835.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/ir/expr.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1648.5,-828.5 1648.5,-847.5 1727.5,-847.5 1727.5,-828.5 1648.5,-828.5"/>
+<text text-anchor="middle" x="1688" y="-835.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/ir/expr.h</text>
</a>
</g>
</g>
<!-- Node0->Node1 -->
<g id="edge1" class="edge">
<title>Node0->Node1</title>
-<path fill="none" stroke="#191970" d="M1315.5955,-884.2977C1310.9743,-875.9388 1305.2031,-865.4997 1300.3111,-856.6509"/>
-<polygon fill="#191970" stroke="#191970" points="1303.3452,-854.9051 1295.4438,-847.8469 1297.2191,-858.292 1303.3452,-854.9051"/>
+<path fill="none" stroke="#191970" d="M1713.5955,-884.2977C1708.9743,-875.9388 1703.2031,-865.4997 1698.3111,-856.6509"/>
+<polygon fill="#191970" stroke="#191970" points="1701.3452,-854.9051 1693.4438,-847.8469 1695.2191,-858.292 1701.3452,-854.9051"/>
</g>
-<!-- Node50 -->
+<!-- Node52 -->
<g id="node50" class="node">
-<title>Node50</title>
+<title>Node52</title>
<g id="a_node50"><a xlink:href="ir_2type_8h.html" target="_top" xlink:title="IR/AST nodes for the unified type system in TVM. ">
-<polygon fill="#ffffff" stroke="#000000" points="1318,-772.5 1318,-791.5 1398,-791.5 1398,-772.5 1318,-772.5"/>
-<text text-anchor="middle" x="1358" y="-779.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/ir/type.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1682,-772.5 1682,-791.5 1762,-791.5 1762,-772.5 1682,-772.5"/>
+<text text-anchor="middle" x="1722" y="-779.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/ir/type.h</text>
</a>
</g>
</g>
-<!-- Node0->Node50 -->
-<g id="edge169" class="edge">
-<title>Node0->Node50</title>
-<path fill="none" stroke="#191970" d="M1328.4757,-884.18C1331.4641,-873.9396 1335.4773,-860.1652 1339,-848 1343.5084,-832.4308 1348.5862,-814.7865 1352.3574,-801.6608"/>
-<polygon fill="#191970" stroke="#191970" points="1355.7807,-802.4204 1355.1769,-791.8428 1349.0526,-800.4882 1355.7807,-802.4204"/>
+<!-- Node0->Node52 -->
+<g id="edge171" class="edge">
+<title>Node0->Node52</title>
+<path fill="none" stroke="#191970" d="M1728.0692,-884.2277C1733.0702,-869.7986 1738.9043,-847.5174 1736,-828 1734.6673,-819.0439 1731.9506,-809.419 1729.2619,-801.3"/>
+<polygon fill="#191970" stroke="#191970" points="1732.4936,-799.9402 1725.8624,-791.6772 1725.8933,-802.272 1732.4936,-799.9402"/>
</g>
<!-- Node2 -->
<g id="node3" class="node">
<title>Node2</title>
-<g id="a_node3"><a xlink:href="ir_2span_8h.html" target="_top" xlink:title="Span information for debugging purposes. ">
-<polygon fill="#ffffff" stroke="#000000" points="1283.5,-716.5 1283.5,-735.5 1364.5,-735.5 1364.5,-716.5 1283.5,-716.5"/>
-<text text-anchor="middle" x="1324" y="-723.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/ir/span.h</text>
+<g id="a_node3"><a xlink:href="source__map_8h.html" target="_top" xlink:title="A map from source names to source code. ">
+<polygon fill="#ffffff" stroke="#000000" points="1528.5,-716.5 1528.5,-735.5 1645.5,-735.5 1645.5,-716.5 1528.5,-716.5"/>
+<text text-anchor="middle" x="1587" y="-723.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/ir/source_map.h</text>
</a>
</g>
</g>
<!-- Node1->Node2 -->
<g id="edge2" class="edge">
<title>Node1->Node2</title>
-<path fill="none" stroke="#191970" d="M1292.7259,-828.1282C1296.3231,-815.1967 1302.9132,-791.8343 1309,-772 1311.6731,-763.2896 1314.7738,-753.7022 1317.46,-745.5448"/>
-<polygon fill="#191970" stroke="#191970" points="1320.8531,-746.4324 1320.6815,-735.8389 1314.2095,-744.2272 1320.8531,-746.4324"/>
+<path fill="none" stroke="#191970" d="M1679.3888,-828.4509C1662.3196,-809.5227 1623.9859,-767.014 1602.4457,-743.1279"/>
+<polygon fill="#191970" stroke="#191970" points="1604.8856,-740.6073 1595.5894,-735.5249 1599.6872,-745.2952 1604.8856,-740.6073"/>
</g>
<!-- Node3 -->
<g id="node4" class="node">
<title>Node3</title>
<g id="a_node4"><a xlink:href="node_8h.html" target="_top" xlink:title="Definitions and helper macros for IR/AST nodes. ">
-<polygon fill="#ffffff" stroke="#000000" points="1343.5,-660.5 1343.5,-679.5 1442.5,-679.5 1442.5,-660.5 1343.5,-660.5"/>
-<text text-anchor="middle" x="1393" y="-667.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/node.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1720.5,-660.5 1720.5,-679.5 1819.5,-679.5 1819.5,-660.5 1720.5,-660.5"/>
+<text text-anchor="middle" x="1770" y="-667.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/node.h</text>
</a>
</g>
</g>
<!-- Node1->Node3 -->
-<g id="edge162" class="edge">
+<g id="edge164" class="edge">
<title>Node1->Node3</title>
-<path fill="none" stroke="#191970" d="M1284.8689,-828.4296C1273.8472,-806.4257 1250.7967,-751.4126 1275,-716 1283.5552,-703.4827 1317.1376,-691.1021 1346.2996,-682.3437"/>
-<polygon fill="#191970" stroke="#191970" points="1347.3594,-685.6804 1355.9772,-679.5173 1345.397,-678.9611 1347.3594,-685.6804"/>
+<path fill="none" stroke="#191970" d="M1721.8965,-828.4632C1739.8701,-821.4893 1760.3802,-810.0275 1771,-792 1789.7302,-760.205 1782.5039,-715.1829 1775.9754,-689.6624"/>
+<polygon fill="#191970" stroke="#191970" points="1779.2616,-688.4158 1773.2093,-679.7198 1772.5177,-690.2921 1779.2616,-688.4158"/>
</g>
<!-- Node8 -->
<g id="node9" class="node">
<title>Node8</title>
<g id="a_node9"><a xlink:href="object_8h.html" target="_top" xlink:title="A managed object in the TVM runtime. ">
-<polygon fill="#ffffff" stroke="#000000" points="2004.5,-123.5 2004.5,-142.5 2123.5,-142.5 2123.5,-123.5 2004.5,-123.5"/>
-<text text-anchor="middle" x="2064" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/object.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1870.5,-123.5 1870.5,-142.5 1989.5,-142.5 1989.5,-123.5 1870.5,-123.5"/>
+<text text-anchor="middle" x="1930" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/object.h</text>
</a>
</g>
</g>
<!-- Node1->Node8 -->
-<g id="edge164" class="edge">
+<g id="edge166" class="edge">
<title>Node1->Node8</title>
-<path fill="none" stroke="#191970" d="M1329.924,-837.7096C1597.5434,-835.5086 3120,-818.9046 3120,-726 3120,-726 3120,-726 3120,-558 3120,-376.7397 3020.7587,-337.1158 2871,-235 2808.4061,-192.3192 2784.2062,-194.2675 2710,-179 2601.4429,-156.6651 2276.9788,-141.4501 2133.5231,-135.6346"/>
-<polygon fill="#191970" stroke="#191970" points="2133.6451,-132.1368 2123.5125,-135.2322 2133.3638,-139.1311 2133.6451,-132.1368"/>
+<path fill="none" stroke="#191970" d="M1727.6971,-836.9012C1953.3066,-830.4021 3064,-794.9077 3064,-726 3064,-726 3064,-726 3064,-670 3064,-262.2465 2690.0801,-252.4688 2289,-179 2189.2943,-160.7362 2072.6947,-147.2243 2000.0834,-139.7278"/>
+<polygon fill="#191970" stroke="#191970" points="2000.0009,-136.2012 1989.6967,-138.6655 1999.2886,-143.1649 2000.0009,-136.2012"/>
</g>
<!-- Node14 -->
<g id="node15" class="node">
<title>Node14</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="320,-62 320,-81 364,-81 364,-62 320,-62"/>
-<text text-anchor="middle" x="342" y="-69" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">string</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="2813,-62 2813,-81 2857,-81 2857,-62 2813,-62"/>
+<text text-anchor="middle" x="2835" y="-69" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">string</text>
</g>
<!-- Node1->Node14 -->
-<g id="edge167" class="edge">
+<g id="edge169" class="edge">
<title>Node1->Node14</title>
-<path fill="none" stroke="#191970" d="M1250.2332,-837.1734C1047.8782,-832.8802 139.8992,-812.5346 84,-792 39.4333,-775.6284 0,-773.4787 0,-726 0,-726 0,-726 0,-189 0,-143.2481 37.6975,-142.6809 79,-123 156.2495,-86.1902 258.2474,-75.6896 309.5622,-72.6947"/>
-<polygon fill="#191970" stroke="#191970" points="309.9126,-76.1812 319.7157,-72.1627 309.5463,-69.1908 309.9126,-76.1812"/>
+<path fill="none" stroke="#191970" d="M1727.7743,-837.0329C1899.8276,-832.7439 2580.9196,-814.6477 2794,-792 2966.1992,-773.6974 3178,-899.1692 3178,-726 3178,-726 3178,-726 3178,-189 3178,-147.8295 3149.3628,-142.307 3113,-123 3070.6858,-100.5331 2931.2605,-82.3849 2867.4274,-75.0325"/>
+<polygon fill="#191970" stroke="#191970" points="2867.472,-71.5152 2857.1413,-73.8661 2866.6832,-78.4706 2867.472,-71.5152"/>
</g>
<!-- Node15 -->
<g id="node16" class="node">
<title>Node15</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="2408.5,-62 2408.5,-81 2477.5,-81 2477.5,-62 2408.5,-62"/>
-<text text-anchor="middle" x="2443" y="-69" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">type_traits</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="844.5,-62 844.5,-81 913.5,-81 913.5,-62 844.5,-62"/>
+<text text-anchor="middle" x="879" y="-69" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">type_traits</text>
</g>
<!-- Node1->Node15 -->
-<g id="edge168" class="edge">
+<g id="edge170" class="edge">
<title>Node1->Node15</title>
-<path fill="none" stroke="#191970" d="M1329.5491,-837.805C1543.7009,-836.6052 2566.3199,-828.9999 2880,-792 3022.4872,-775.193 3196,-869.475 3196,-726 3196,-726 3196,-726 3196,-373.5 3196,-208.1719 3075.9552,-180.6392 2921,-123 2842.1535,-93.6712 2591.1087,-78.5992 2487.9332,-73.5164"/>
-<polygon fill="#191970" stroke="#191970" points="2487.9967,-70.0155 2477.8392,-73.0279 2487.6583,-77.0073 2487.9967,-70.0155"/>
+<path fill="none" stroke="#191970" d="M1648.2355,-837.7643C1456.6482,-836.4521 627.5644,-828.7164 372,-792 254.8438,-775.1684 114,-844.3592 114,-726 114,-726 114,-726 114,-373.5 114,-159.9583 323.1322,-195.4718 524,-123 580.3186,-102.6806 752.3005,-83.832 834.4305,-75.7074"/>
+<polygon fill="#191970" stroke="#191970" points="834.8485,-79.1833 844.4594,-74.7245 834.1657,-72.2166 834.8485,-79.1833"/>
</g>
<!-- Node24 -->
<g id="node25" class="node">
<title>Node24</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="937,-179.5 937,-198.5 1001,-198.5 1001,-179.5 937,-179.5"/>
-<text text-anchor="middle" x="969" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">algorithm</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="668,-179.5 668,-198.5 732,-198.5 732,-179.5 668,-179.5"/>
+<text text-anchor="middle" x="700" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">algorithm</text>
</g>
<!-- Node1->Node24 -->
-<g id="edge165" class="edge">
+<g id="edge167" class="edge">
<title>Node1->Node24</title>
-<path fill="none" stroke="#191970" d="M1250.4008,-836.9538C1096.7801,-832.7522 540.1083,-816.16 365,-792 234.485,-773.9926 76,-857.7514 76,-726 76,-726 76,-726 76,-373.5 76,-285.4628 750.2366,-211.1184 926.9927,-193.1411"/>
-<polygon fill="#191970" stroke="#191970" points="927.3571,-196.6222 936.9544,-192.1342 926.6531,-189.6577 927.3571,-196.6222"/>
+<path fill="none" stroke="#191970" d="M1648.318,-837.1673C1411.0283,-831.9342 190,-801.3454 190,-726 190,-726 190,-726 190,-502 190,-283.2021 535.1872,-212.5647 657.5896,-194.3948"/>
+<polygon fill="#191970" stroke="#191970" points="658.3113,-197.827 667.7103,-192.9374 657.3135,-190.8985 658.3113,-197.827"/>
</g>
<!-- Node26 -->
<g id="node27" class="node">
<title>Node26</title>
<g id="a_node27"><a xlink:href="string_8h.html" target="_top" xlink:title="Runtime String container types. ">
-<polygon fill="#ffffff" stroke="#000000" points="934,-291.5 934,-321.5 1060,-321.5 1060,-291.5 934,-291.5"/>
-<text text-anchor="start" x="942" y="-309.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/container</text>
-<text text-anchor="middle" x="997" y="-298.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/string.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1738,-291.5 1738,-321.5 1864,-321.5 1864,-291.5 1738,-291.5"/>
+<text text-anchor="start" x="1746" y="-309.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/container</text>
+<text text-anchor="middle" x="1801" y="-298.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/string.h</text>
</a>
</g>
</g>
<!-- Node1->Node26 -->
-<g id="edge163" class="edge">
+<g id="edge165" class="edge">
<title>Node1->Node26</title>
-<path fill="none" stroke="#191970" d="M1250.497,-836.2286C1049.7342,-826.9718 152,-782.4242 152,-726 152,-726 152,-726 152,-558 152,-214.907 546.1174,-360.8822 887,-322 898.9012,-320.6425 911.5024,-319.0314 923.7634,-317.3695"/>
-<polygon fill="#191970" stroke="#191970" points="924.5304,-320.797 933.9598,-315.9661 923.5759,-313.8624 924.5304,-320.797"/>
+<path fill="none" stroke="#191970" d="M1648.2596,-834.5024C1474.1274,-818.7841 784.4162,-752.2897 723,-680 717.2448,-673.2258 721.346,-668.7336 723,-660 727.9758,-633.7264 724.2458,-620.862 745,-604 1050.0948,-356.1226 1258.9081,-618.1611 1617,-456 1634.9947,-447.8511 1729.4726,-367.8532 1775.6055,-328.3475"/>
+<polygon fill="#191970" stroke="#191970" points="1778.0521,-330.8602 1783.3652,-321.6938 1773.4956,-325.5462 1778.0521,-330.8602"/>
</g>
<!-- Node45 -->
-<g id="node45" class="node">
+<g id="node43" class="node">
<title>Node45</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="2696,-364 2696,-383 2740,-383 2740,-364 2696,-364"/>
-<text text-anchor="middle" x="2718" y="-371" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">limits</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="2650,-364 2650,-383 2694,-383 2694,-364 2650,-364"/>
+<text text-anchor="middle" x="2672" y="-371" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">limits</text>
</g>
<!-- Node1->Node45 -->
-<g id="edge166" class="edge">
+<g id="edge168" class="edge">
<title>Node1->Node45</title>
-<path fill="none" stroke="#191970" d="M1329.8914,-837.5457C1587.5412,-834.3573 3006,-812.8595 3006,-726 3006,-726 3006,-726 3006,-558 3006,-437.4856 2824.9876,-392.0952 2749.9723,-378.459"/>
-<polygon fill="#191970" stroke="#191970" points="2750.5733,-375.011 2740.1207,-376.7411 2749.3707,-381.907 2750.5733,-375.011"/>
+<path fill="none" stroke="#191970" d="M1727.5334,-832.6289C1892.3986,-809.7808 2524.77,-717.5522 2701,-624 2769.9835,-587.3799 2850.4651,-562.568 2817,-492 2793.3798,-442.1919 2737.4384,-406.3784 2702.3529,-387.8716"/>
+<polygon fill="#191970" stroke="#191970" points="2703.5508,-384.553 2693.0542,-383.1135 2700.3621,-390.7846 2703.5508,-384.553"/>
</g>
-<!-- Node1->Node50 -->
-<g id="edge155" class="edge">
-<title>Node1->Node50</title>
-<path fill="none" stroke="#191970" d="M1301.8447,-828.2455C1311.888,-819.9746 1326.4995,-807.9416 1338.3004,-798.2232"/>
-<polygon fill="#191970" stroke="#191970" points="1340.7967,-800.7015 1346.291,-791.6427 1336.3467,-795.298 1340.7967,-800.7015"/>
+<!-- Node1->Node52 -->
+<g id="edge157" class="edge">
+<title>Node1->Node52</title>
+<path fill="none" stroke="#191970" d="M1693.9223,-828.2455C1698.6027,-820.5367 1705.2673,-809.5598 1710.9289,-800.2348"/>
+<polygon fill="#191970" stroke="#191970" points="1713.9475,-802.007 1716.1455,-791.6427 1707.9639,-798.3741 1713.9475,-802.007"/>
</g>
<!-- Node2->Node3 -->
<g id="edge3" class="edge">
<title>Node2->Node3</title>
-<path fill="none" stroke="#191970" d="M1336.0189,-716.2455C1346.2098,-707.9746 1361.0362,-695.9416 1373.0107,-686.2232"/>
-<polygon fill="#191970" stroke="#191970" points="1375.5598,-688.662 1381.1188,-679.6427 1371.1486,-683.2268 1375.5598,-688.662"/>
+<path fill="none" stroke="#191970" d="M1618.4588,-716.3733C1649.0207,-707.021 1695.5826,-692.7725 1729.0261,-682.5385"/>
+<polygon fill="#191970" stroke="#191970" points="1730.4136,-685.7742 1738.9517,-679.5011 1728.3652,-679.0806 1730.4136,-685.7742"/>
</g>
<!-- Node2->Node8 -->
-<g id="edge153" class="edge">
+<g id="edge145" class="edge">
<title>Node2->Node8</title>
-<path fill="none" stroke="#191970" d="M1364.5023,-725.355C1613.8509,-721.1256 2930,-694.9667 2930,-614 2930,-614 2930,-614 2930,-558 2930,-409.7806 2957.0766,-334.2579 2847,-235 2763.7253,-159.9097 2711.7924,-196.2708 2601,-179 2434.5218,-153.0486 2236.5602,-140.8464 2133.7695,-135.8908"/>
-<polygon fill="#191970" stroke="#191970" points="2133.6752,-132.3825 2123.5209,-135.4056 2133.3441,-139.3747 2133.6752,-132.3825"/>
+<path fill="none" stroke="#191970" d="M1645.5664,-724.3038C1906.4531,-716.3701 2950,-680.4803 2950,-614 2950,-614 2950,-614 2950,-502 2950,-326.4346 2400.3203,-217.3739 2229,-179 2151.3625,-161.61 2060.91,-148.6935 1999.9945,-141.0454"/>
+<polygon fill="#191970" stroke="#191970" points="2000.0821,-137.5295 1989.7271,-139.7711 1999.2199,-144.4762 2000.0821,-137.5295"/>
</g>
<!-- Node2->Node14 -->
<g id="edge154" class="edge">
<title>Node2->Node14</title>
-<path fill="none" stroke="#191970" d="M1283.3951,-725.324C1072.7355,-720.7492 114,-686.4202 114,-440.5 114,-440.5 114,-440.5 114,-189 114,-102.0732 246.5702,-79.3944 309.5337,-73.5253"/>
-<polygon fill="#191970" stroke="#191970" points="310.0527,-76.994 319.7233,-72.6663 309.4646,-70.0188 310.0527,-76.994"/>
+<path fill="none" stroke="#191970" d="M1645.7509,-725.3941C1843.0144,-723.0686 2480.6052,-713.1139 2683,-680 2771.1437,-665.5788 2790.2438,-648.9632 2876,-624 2976.4698,-594.7537 3102,-662.64 3102,-558 3102,-558 3102,-558 3102,-189 3102,-152.7294 3084.2095,-143.0734 3054,-123 3023.5021,-102.735 2920.1981,-84.5283 2867.0979,-76.2358"/>
+<polygon fill="#191970" stroke="#191970" points="2867.5994,-72.7719 2857.1837,-74.7118 2866.5358,-79.6906 2867.5994,-72.7719"/>
+</g>
+<!-- Node16 -->
+<g id="node17" class="node">
+<title>Node16</title>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="628.5,-62 628.5,-81 673.5,-81 673.5,-62 628.5,-62"/>
+<text text-anchor="middle" x="651" y="-69" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">utility</text>
+</g>
+<!-- Node2->Node16 -->
+<g id="edge155" class="edge">
+<title>Node2->Node16</title>
+<path fill="none" stroke="#191970" d="M1528.1753,-724.7102C1330.2204,-720.1667 695.6569,-703.8794 607,-680 454.0708,-638.8091 0,-464.8793 0,-306.5 0,-306.5 0,-306.5 0,-189 0,-150.8166 21.1514,-140.6705 55,-123 105.1084,-96.8412 501.0121,-77.9293 618.3414,-72.8565"/>
+<polygon fill="#191970" stroke="#191970" points="618.6552,-76.3464 628.4965,-72.4221 618.356,-69.3528 618.6552,-76.3464"/>
+</g>
+<!-- Node18 -->
+<g id="node19" class="node">
+<title>Node18</title>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="646.5,-235.5 646.5,-254.5 693.5,-254.5 693.5,-235.5 646.5,-235.5"/>
+<text text-anchor="middle" x="670" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">vector</text>
+</g>
+<!-- Node2->Node18 -->
+<g id="edge156" class="edge">
+<title>Node2->Node18</title>
+<path fill="none" stroke="#191970" d="M1528.4918,-725.2143C1322.9075,-721.6897 646,-703.3041 646,-614 646,-614 646,-614 646,-373.5 646,-334.1591 657.0044,-289.1772 664.1102,-264.2323"/>
+<polygon fill="#191970" stroke="#191970" points="667.4917,-265.1392 666.9604,-254.5577 660.777,-263.161 667.4917,-265.1392"/>
+</g>
+<!-- Node41 -->
+<g id="node38" class="node">
+<title>Node41</title>
+<g id="a_node38"><a xlink:href="packed__func_8h.html" target="_top" xlink:title="Type-erased function used across TVM API. ">
+<polygon fill="#ffffff" stroke="#000000" points="1492,-425.5 1492,-455.5 1608,-455.5 1608,-425.5 1492,-425.5"/>
+<text text-anchor="start" x="1500" y="-443.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/packed</text>
+<text text-anchor="middle" x="1550" y="-432.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_func.h</text>
+</a>
+</g>
+</g>
+<!-- Node2->Node41 -->
+<g id="edge146" class="edge">
+<title>Node2->Node41</title>
+<path fill="none" stroke="#191970" d="M1528.3831,-723.4185C1351.6366,-714.8618 836,-683.9079 836,-614 836,-614 836,-614 836,-558 836,-492.9559 1303.3019,-456.2515 1481.8092,-444.6034"/>
+<polygon fill="#191970" stroke="#191970" points="1482.1196,-448.0908 1491.8731,-443.9533 1481.6683,-441.1053 1482.1196,-448.0908"/>
+</g>
+<!-- Node50 -->
+<g id="node48" class="node">
+<title>Node50</title>
+<g id="a_node48"><a xlink:href="registry_8h.html" target="_top" xlink:title="This file defines the TVM global function registry. ">
+<polygon fill="#ffffff" stroke="#000000" points="492.5,-492.5 492.5,-511.5 617.5,-511.5 617.5,-492.5 492.5,-492.5"/>
+<text text-anchor="middle" x="555" y="-499.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/registry.h</text>
+</a>
+</g>
+</g>
+<!-- Node2->Node50 -->
+<g id="edge147" class="edge">
+<title>Node2->Node50</title>
+<path fill="none" stroke="#191970" d="M1528.446,-724.8782C1326.4285,-720.788 668.9395,-705.5615 632,-680 578.5542,-643.0164 561.9287,-560.6027 556.9834,-522.2974"/>
+<polygon fill="#191970" stroke="#191970" points="560.4107,-521.4618 555.7959,-511.9248 553.4561,-522.258 560.4107,-521.4618"/>
+</g>
+<!-- Node51 -->
+<g id="node49" class="node">
+<title>Node51</title>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="1559.5,-660.5 1559.5,-679.5 1614.5,-679.5 1614.5,-660.5 1559.5,-660.5"/>
+<text text-anchor="middle" x="1587" y="-667.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">fstream</text>
+</g>
+<!-- Node2->Node51 -->
+<g id="edge153" class="edge">
+<title>Node2->Node51</title>
+<path fill="none" stroke="#191970" d="M1587,-716.2455C1587,-708.9382 1587,-698.6944 1587,-689.7046"/>
+<polygon fill="#191970" stroke="#191970" points="1590.5001,-689.6426 1587,-679.6427 1583.5001,-689.6427 1590.5001,-689.6426"/>
</g>
<!-- Node4 -->
<g id="node5" class="node">
<title>Node4</title>
<g id="a_node5"><a xlink:href="reflection_8h.html" target="_top" xlink:title="Reflection and serialization of compiler IR/AST nodes. ">
-<polygon fill="#ffffff" stroke="#000000" points="2021.5,-492.5 2021.5,-511.5 2142.5,-511.5 2142.5,-492.5 2021.5,-492.5"/>
-<text text-anchor="middle" x="2082" y="-499.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/reflection.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1937.5,-492.5 1937.5,-511.5 2058.5,-511.5 2058.5,-492.5 1937.5,-492.5"/>
+<text text-anchor="middle" x="1998" y="-499.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/reflection.h</text>
</a>
</g>
</g>
<!-- Node3->Node4 -->
<g id="edge4" class="edge">
<title>Node3->Node4</title>
-<path fill="none" stroke="#191970" d="M1442.7346,-666.7172C1510.6581,-661.4866 1635.8924,-649.1709 1740,-624 1857.8272,-595.512 1991.6086,-540.9857 2050.8945,-515.6122"/>
-<polygon fill="#191970" stroke="#191970" points="2052.2862,-518.8237 2060.0877,-511.6552 2049.5186,-512.3941 2052.2862,-518.8237"/>
+<path fill="none" stroke="#191970" d="M1782.9115,-660.4862C1820.5238,-632.772 1930.2738,-551.9036 1976.78,-517.6358"/>
+<polygon fill="#191970" stroke="#191970" points="1978.9731,-520.3674 1984.9475,-511.6177 1974.8207,-514.732 1978.9731,-520.3674"/>
</g>
<!-- Node5 -->
<g id="node6" class="node">
<title>Node5</title>
<g id="a_node6"><a xlink:href="structural__equal_8h.html" target="_top" xlink:title="Structural equality comparison. ">
-<polygon fill="#ffffff" stroke="#000000" points="1092.5,-425.5 1092.5,-455.5 1205.5,-455.5 1205.5,-425.5 1092.5,-425.5"/>
-<text text-anchor="start" x="1100.5" y="-443.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/structural</text>
-<text text-anchor="middle" x="1149" y="-432.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_equal.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2120.5,-425.5 2120.5,-455.5 2233.5,-455.5 2233.5,-425.5 2120.5,-425.5"/>
+<text text-anchor="start" x="2128.5" y="-443.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/structural</text>
+<text text-anchor="middle" x="2177" y="-432.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_equal.h</text>
</a>
</g>
</g>
<!-- Node3->Node5 -->
-<g id="edge144" class="edge">
+<g id="edge136" class="edge">
<title>Node3->Node5</title>
-<path fill="none" stroke="#191970" d="M1343.4983,-660.567C1317.7202,-653.7423 1286.8931,-642.3901 1264,-624 1209.918,-580.5557 1174.055,-504.4118 1158.2428,-465.2732"/>
-<polygon fill="#191970" stroke="#191970" points="1161.402,-463.7427 1154.4855,-455.717 1154.8874,-466.3042 1161.402,-463.7427"/>
+<path fill="none" stroke="#191970" d="M1789.0294,-660.4367C1807.2914,-651.2206 1835.6175,-636.8251 1860,-624 1952.5775,-575.3047 1977.0754,-565.4358 2067,-512 2094.3634,-495.7399 2124.6311,-475.9258 2146.4783,-461.2833"/>
+<polygon fill="#191970" stroke="#191970" points="2148.6575,-464.0355 2154.9993,-455.5483 2144.749,-458.2283 2148.6575,-464.0355"/>
</g>
<!-- Node3->Node8 -->
-<g id="edge148" class="edge">
+<g id="edge140" class="edge">
<title>Node3->Node8</title>
-<path fill="none" stroke="#191970" d="M1442.6128,-669.2965C1677.3759,-664.7338 2671.5726,-631.1599 2822,-389 2991.544,-116.066 2422.7887,-179.9498 2418,-179 2320.2242,-159.6064 2205.573,-146.4242 2133.847,-139.3037"/>
-<polygon fill="#191970" stroke="#191970" points="2133.8769,-135.7899 2123.583,-138.2971 2133.1936,-142.7565 2133.8769,-135.7899"/>
+<path fill="none" stroke="#191970" d="M1797.6393,-660.4732C1822.7551,-651.6255 1860.7063,-637.7777 1893,-624 2104.4712,-533.7781 2257.5407,-545.124 2313,-322 2333.2217,-240.6438 2268.6372,-208.0508 2190,-179 2172.3769,-172.4895 2070.702,-155.4534 1999.8527,-144.0423"/>
+<polygon fill="#191970" stroke="#191970" points="2000.0653,-140.5317 1989.6368,-142.4015 1998.9552,-147.4431 2000.0653,-140.5317"/>
</g>
<!-- Node9 -->
<g id="node10" class="node">
<title>Node9</title>
<g id="a_node10"><a xlink:href="c__runtime__api_8h.html" target="_top" xlink:title="tvm/runtime/c_runtime\l_api.h">
-<polygon fill="#ffffff" stroke="#000000" points="2903.5,-56.5 2903.5,-86.5 3032.5,-86.5 3032.5,-56.5 2903.5,-56.5"/>
-<text text-anchor="start" x="2911.5" y="-74.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/c_runtime</text>
-<text text-anchor="middle" x="2968" y="-63.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_api.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2371.5,-56.5 2371.5,-86.5 2500.5,-86.5 2500.5,-56.5 2371.5,-56.5"/>
+<text text-anchor="start" x="2379.5" y="-74.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/c_runtime</text>
+<text text-anchor="middle" x="2436" y="-63.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_api.h</text>
</a>
</g>
</g>
<!-- Node3->Node9 -->
-<g id="edge146" class="edge">
+<g id="edge138" class="edge">
<title>Node3->Node9</title>
-<path fill="none" stroke="#191970" d="M1442.7597,-669.434C1592.399,-667.3797 2051.3218,-658.5036 2430,-624 2610.9629,-607.5114 2655.8909,-598.6494 2835,-568 2979.4225,-543.2862 3158,-648.5218 3158,-502 3158,-502 3158,-502 3158,-440.5 3158,-347.2765 3234,-338.2235 3234,-245 3234,-245 3234,-245 3234,-189 3234,-146.9397 3121.3129,-109.854 3042.6286,-89.0855"/>
-<polygon fill="#191970" stroke="#191970" points="3043.2898,-85.6411 3032.7308,-86.5112 3041.5277,-92.4157 3043.2898,-85.6411"/>
+<path fill="none" stroke="#191970" d="M1819.788,-669.5798C2001.1403,-667.7709 2620.0177,-659.0639 2701,-624 2766.7646,-595.5251 2788.9955,-577.9663 2817,-512 2899.532,-317.5909 2593.7362,-147.1712 2478.3638,-91.0196"/>
+<polygon fill="#191970" stroke="#191970" points="2479.6262,-87.7429 2469.0979,-86.5569 2476.5887,-94.0495 2479.6262,-87.7429"/>
</g>
<!-- Node3->Node14 -->
-<g id="edge149" class="edge">
+<g id="edge141" class="edge">
<title>Node3->Node14</title>
-<path fill="none" stroke="#191970" d="M1343.3863,-667.9868C1200.5462,-662.0179 790.0718,-643.6069 657,-624 549.0662,-608.0969 521.9288,-601.1877 418,-568 278.4854,-523.4487 152,-519.9552 152,-373.5 152,-373.5 152,-373.5 152,-189 152,-115.7342 255.4585,-86.6153 309.6884,-76.3846"/>
-<polygon fill="#191970" stroke="#191970" points="310.5989,-79.7778 319.8327,-74.5828 309.3747,-72.8856 310.5989,-79.7778"/>
+<path fill="none" stroke="#191970" d="M1819.7735,-669.6496C1991.4659,-668.1258 2558.1452,-660.3936 2736,-624 2888.3778,-592.8196 3064,-657.5352 3064,-502 3064,-502 3064,-502 3064,-189 3064,-101.7718 2930.8484,-79.2512 2867.6087,-73.4743"/>
+<polygon fill="#191970" stroke="#191970" points="2867.6282,-69.9641 2857.3744,-72.6305 2867.053,-76.9404 2867.6282,-69.9641"/>
</g>
<!-- Node3->Node15 -->
-<g id="edge150" class="edge">
+<g id="edge142" class="edge">
<title>Node3->Node15</title>
-<path fill="none" stroke="#191970" d="M1442.8138,-668.6216C1696.4657,-661.123 2836.7516,-621.4719 2945,-512 3049.7024,-406.1141 3064.8599,-302.7282 2982,-179 2926.9171,-96.7489 2607.0496,-77.209 2487.7973,-72.757"/>
-<polygon fill="#191970" stroke="#191970" points="2487.6787,-69.2507 2477.5608,-72.3938 2487.4304,-76.2463 2487.6787,-69.2507"/>
-</g>
-<!-- Node16 -->
-<g id="node17" class="node">
-<title>Node16</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="871.5,-62 871.5,-81 916.5,-81 916.5,-62 871.5,-62"/>
-<text text-anchor="middle" x="894" y="-69" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">utility</text>
+<path fill="none" stroke="#191970" d="M1720.349,-666.2882C1691.8697,-664.2429 1655.4382,-661.768 1623,-660 1428.0144,-649.3723 935.4624,-667.0863 745,-624 621.4833,-596.0581 585.4782,-586.3999 483,-512 380.7511,-437.7666 339.778,-333.4343 419,-235 472.9678,-167.9443 496.0986,-152.3929 577,-123 664.7154,-91.1314 774.0643,-78.7549 834.1836,-74.1218"/>
+<polygon fill="#191970" stroke="#191970" points="834.7473,-77.5899 844.4648,-73.3684 834.2357,-70.6087 834.7473,-77.5899"/>
</g>
<!-- Node3->Node16 -->
-<g id="edge151" class="edge">
+<g id="edge143" class="edge">
<title>Node3->Node16</title>
-<path fill="none" stroke="#191970" d="M1343.2198,-668.6173C1116.1938,-660.994 190,-615.0889 190,-373.5 190,-373.5 190,-373.5 190,-306.5 190,-163.0246 722.7535,-91.3353 861.36,-75.1118"/>
-<polygon fill="#191970" stroke="#191970" points="861.8411,-78.5796 871.374,-73.9564 861.0387,-71.6257 861.8411,-78.5796"/>
-</g>
-<!-- Node18 -->
-<g id="node19" class="node">
-<title>Node18</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="2004.5,-235.5 2004.5,-254.5 2051.5,-254.5 2051.5,-235.5 2004.5,-235.5"/>
-<text text-anchor="middle" x="2028" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">vector</text>
+<path fill="none" stroke="#191970" d="M1720.3514,-666.2436C1691.8728,-664.1839 1655.4414,-661.7094 1623,-660 1409.1472,-648.7315 869.3618,-669.0293 660,-624 466.2693,-582.3326 428.5595,-535.397 247,-456 214.8472,-441.9394 131.5695,-410.7125 104,-389 67.1105,-359.9475 38,-353.4562 38,-306.5 38,-306.5 38,-306.5 38,-189 38,-151.9295 56.5058,-140.8423 89,-123 135.4876,-97.4741 505.3483,-78.2872 618.3339,-72.9758"/>
+<polygon fill="#191970" stroke="#191970" points="618.6201,-76.4663 628.4467,-72.5053 618.2948,-69.4739 618.6201,-76.4663"/>
</g>
<!-- Node3->Node18 -->
-<g id="edge152" class="edge">
+<g id="edge144" class="edge">
<title>Node3->Node18</title>
-<path fill="none" stroke="#191970" d="M1442.8535,-663.7847C1535.7493,-650.8029 1726,-617.0125 1726,-558 1726,-558 1726,-558 1726,-440.5 1726,-357.6159 1802.9974,-370.7896 1870,-322 1887.0688,-309.5709 1887.8437,-301.7785 1906,-291 1934.0564,-274.3443 1969.1955,-261.8614 1994.547,-254.1326"/>
-<polygon fill="#191970" stroke="#191970" points="1995.828,-257.4037 2004.424,-251.2103 1993.8419,-250.6913 1995.828,-257.4037"/>
+<path fill="none" stroke="#191970" d="M1720.3447,-666.3644C1691.864,-664.3438 1655.4326,-661.8681 1623,-660 1453.5367,-650.2392 1024.608,-665.4403 860,-624 794.0703,-607.4021 722,-625.9869 722,-558 722,-558 722,-558 722,-373.5 722,-331.6451 698.001,-287.3534 682.6106,-263.2527"/>
+<polygon fill="#191970" stroke="#191970" points="685.4439,-261.1917 677.013,-254.7753 679.6024,-265.0488 685.4439,-261.1917"/>
</g>
<!-- Node22 -->
<g id="node23" class="node">
<title>Node22</title>
<g id="a_node23"><a xlink:href="runtime_2memory_8h.html" target="_top" xlink:title="Runtime memory management. ">
-<polygon fill="#ffffff" stroke="#000000" points="1425.5,-179.5 1425.5,-198.5 1554.5,-198.5 1554.5,-179.5 1425.5,-179.5"/>
-<text text-anchor="middle" x="1490" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/memory.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1071.5,-179.5 1071.5,-198.5 1200.5,-198.5 1200.5,-179.5 1071.5,-179.5"/>
+<text text-anchor="middle" x="1136" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/memory.h</text>
</a>
</g>
</g>
<!-- Node3->Node22 -->
-<g id="edge147" class="edge">
+<g id="edge139" class="edge">
<title>Node3->Node22</title>
-<path fill="none" stroke="#191970" d="M1370.5538,-660.4443C1337.0867,-644.5115 1278,-609.1402 1278,-558 1278,-558 1278,-558 1278,-373.5 1278,-328.6788 1297.5273,-313.9955 1336,-291 1393.6345,-256.5513 1591.4395,-305.2275 1636,-255 1662.5969,-225.0207 1613.0139,-207.8811 1564.5735,-198.6246"/>
-<polygon fill="#191970" stroke="#191970" points="1565.0665,-195.1571 1554.6033,-196.8221 1563.8211,-202.0454 1565.0665,-195.1571"/>
+<path fill="none" stroke="#191970" d="M1720.4621,-663.6755C1522.3874,-637.7307 798,-536.298 798,-440.5 798,-440.5 798,-440.5 798,-306.5 798,-251.1675 961.6523,-215.948 1061.2088,-199.6418"/>
+<polygon fill="#191970" stroke="#191970" points="1061.9135,-203.0735 1071.2301,-198.0287 1060.8009,-196.1625 1061.9135,-203.0735"/>
</g>
<!-- Node34 -->
<g id="node35" class="node">
<title>Node34</title>
<g id="a_node35"><a xlink:href="structural__hash_8h.html" target="_top" xlink:title="tvm/node/structural\l_hash.h">
-<polygon fill="#ffffff" stroke="#000000" points="2241.5,-425.5 2241.5,-455.5 2354.5,-455.5 2354.5,-425.5 2241.5,-425.5"/>
-<text text-anchor="start" x="2249.5" y="-443.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/structural</text>
-<text text-anchor="middle" x="2298" y="-432.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_hash.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1941.5,-425.5 1941.5,-455.5 2054.5,-455.5 2054.5,-425.5 1941.5,-425.5"/>
+<text text-anchor="start" x="1949.5" y="-443.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/structural</text>
+<text text-anchor="middle" x="1998" y="-432.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_hash.h</text>
</a>
</g>
</g>
<!-- Node3->Node34 -->
-<g id="edge145" class="edge">
+<g id="edge137" class="edge">
<title>Node3->Node34</title>
-<path fill="none" stroke="#191970" d="M1442.7315,-667.1302C1518.0772,-662.1272 1665.3733,-649.7803 1788,-624 1953.6418,-589.1765 1994.6207,-574.3028 2152,-512 2190.3606,-496.8139 2232.5166,-475.6577 2261.6259,-460.3009"/>
-<polygon fill="#191970" stroke="#191970" points="2263.4089,-463.317 2270.5972,-455.5336 2260.1241,-457.1355 2263.4089,-463.317"/>
+<path fill="none" stroke="#191970" d="M1777.2616,-660.4887C1798.6422,-632.7959 1863.8953,-550.4833 1928,-492 1939.954,-481.0943 1954.1779,-470.2873 1966.6717,-461.4105"/>
+<polygon fill="#191970" stroke="#191970" points="1968.8021,-464.1918 1974.9957,-455.5959 1964.7934,-458.4532 1968.8021,-464.1918"/>
</g>
<!-- Node47 -->
-<g id="node47" class="node">
+<g id="node45" class="node">
<title>Node47</title>
-<g id="a_node47"><a xlink:href="repr__printer_8h.html" target="_top" xlink:title="Printer class to print repr string of each AST/IR nodes. ">
-<polygon fill="#ffffff" stroke="#000000" points="1327.5,-604.5 1327.5,-623.5 1458.5,-623.5 1458.5,-604.5 1327.5,-604.5"/>
-<text text-anchor="middle" x="1393" y="-611.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/repr_printer.h</text>
+<g id="a_node45"><a xlink:href="repr__printer_8h.html" target="_top" xlink:title="Printer class to print repr string of each AST/IR nodes. ">
+<polygon fill="#ffffff" stroke="#000000" points="2561.5,-604.5 2561.5,-623.5 2692.5,-623.5 2692.5,-604.5 2561.5,-604.5"/>
+<text text-anchor="middle" x="2627" y="-611.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/repr_printer.h</text>
</a>
</g>
</g>
<!-- Node3->Node47 -->
-<g id="edge133" class="edge">
+<g id="edge125" class="edge">
<title>Node3->Node47</title>
-<path fill="none" stroke="#191970" d="M1393,-660.2455C1393,-652.9382 1393,-642.6944 1393,-633.7046"/>
-<polygon fill="#191970" stroke="#191970" points="1396.5001,-633.6426 1393,-623.6427 1389.5001,-633.6427 1396.5001,-633.6426"/>
+<path fill="none" stroke="#191970" d="M1819.9146,-667.7342C1941.6041,-662.0466 2263.2155,-646.03 2531,-624 2537.6202,-623.4554 2544.4919,-622.8396 2551.3786,-622.1879"/>
+<polygon fill="#191970" stroke="#191970" points="2551.8361,-625.66 2561.4516,-621.2113 2551.1605,-618.6927 2551.8361,-625.66"/>
</g>
<!-- Node4->Node5 -->
<g id="edge5" class="edge">
<title>Node4->Node5</title>
-<path fill="none" stroke="#191970" d="M2021.1432,-497.9885C1854.1309,-486.9797 1389.5462,-456.3559 1215.7509,-444.9"/>
-<polygon fill="#191970" stroke="#191970" points="1215.9241,-441.4039 1205.7156,-444.2385 1215.4637,-448.3887 1215.9241,-441.4039"/>
+<path fill="none" stroke="#191970" d="M2025.9688,-492.3906C2051.9887,-483.4508 2091.457,-469.8905 2123.5214,-458.8739"/>
+<polygon fill="#191970" stroke="#191970" points="2124.8916,-462.1041 2133.2117,-455.5446 2122.617,-455.4839 2124.8916,-462.1041"/>
</g>
<!-- Node4->Node8 -->
-<g id="edge97" class="edge">
+<g id="edge88" class="edge">
<title>Node4->Node8</title>
-<path fill="none" stroke="#191970" d="M2142.5851,-497.213C2200.5008,-491.4546 2289.7404,-479.471 2364,-456 2500.6349,-412.8141 2723.4511,-408.2004 2641,-291 2582.3343,-207.6095 2274.265,-159.1949 2133.8544,-141.1574"/>
-<polygon fill="#191970" stroke="#191970" points="2133.9957,-137.6473 2123.6346,-139.8602 2133.1142,-144.5916 2133.9957,-137.6473"/>
+<path fill="none" stroke="#191970" d="M2016.1527,-492.5C2030.3505,-484.3456 2049.8275,-471.4752 2063,-456 2094.4363,-419.0682 2073.7422,-389.0493 2111,-358 2156.3982,-320.1668 2203.3152,-369.1056 2239,-322 2247.3196,-311.0177 2244.2151,-303.7526 2239,-291 2206.0058,-210.3191 2158.4984,-212.4369 2078,-179 2045.214,-165.3816 2006.9092,-153.5816 1977.4837,-145.3433"/>
+<polygon fill="#191970" stroke="#191970" points="1978.0402,-141.8661 1967.4694,-142.5807 1976.1786,-148.6141 1978.0402,-141.8661"/>
</g>
<!-- Node4->Node9 -->
-<g id="edge93" class="edge">
+<g id="edge84" class="edge">
<title>Node4->Node9</title>
-<path fill="none" stroke="#191970" d="M2142.7413,-500.2559C2285.1146,-495.7764 2636.9971,-482.3266 2752,-456 2829.9563,-438.1541 2855.1497,-437.1538 2919,-389 3015.314,-316.3632 2983.1268,-243.2079 2973,-123 2972.2869,-114.5354 2971.4184,-105.2973 2970.6066,-97.012"/>
-<polygon fill="#191970" stroke="#191970" points="2974.0648,-96.4178 2969.5899,-86.8144 2967.0993,-97.1123 2974.0648,-96.4178"/>
+<path fill="none" stroke="#191970" d="M2058.7148,-499.3879C2130.585,-495.1094 2253.3451,-484.0523 2355,-456 2426.9521,-436.1443 2443.6765,-425.1137 2509,-389 2555.1751,-363.4723 2582.3761,-368.6633 2607,-322 2654.7825,-231.4503 2531.3361,-133.9428 2469.2004,-92.2824"/>
+<polygon fill="#191970" stroke="#191970" points="2470.7804,-89.1319 2460.5054,-86.5485 2466.9267,-94.9756 2470.7804,-89.1319"/>
</g>
<!-- Node4->Node14 -->
-<g id="edge130" class="edge">
+<g id="edge122" class="edge">
<title>Node4->Node14</title>
-<path fill="none" stroke="#191970" d="M2021.3505,-500.291C1720.5441,-491.5366 400.7256,-449.7558 340,-389 258.1674,-307.1267 311.9373,-145.927 333.903,-90.6394"/>
-<polygon fill="#191970" stroke="#191970" points="337.2791,-91.6298 337.8224,-81.0489 330.7994,-88.9816 337.2791,-91.6298"/>
+<path fill="none" stroke="#191970" d="M2058.6613,-501.0832C2196.4342,-498.4443 2529.2419,-488.7758 2636,-456 2664.4577,-447.2632 2849.4073,-345.2484 2868,-322 2890.3884,-294.0054 2892,-280.846 2892,-245 2892,-245 2892,-245 2892,-189 2892,-150.341 2866.7138,-111.0543 2849.825,-89.1259"/>
+<polygon fill="#191970" stroke="#191970" points="2852.3871,-86.7255 2843.4105,-81.0977 2846.9183,-91.095 2852.3871,-86.7255"/>
</g>
<!-- Node4->Node15 -->
-<g id="edge131" class="edge">
+<g id="edge123" class="edge">
<title>Node4->Node15</title>
-<path fill="none" stroke="#191970" d="M2142.6395,-499.4324C2287.2929,-493.025 2644.3408,-475.442 2695,-456 2838.168,-401.0549 2968,-398.3494 2968,-245 2968,-245 2968,-245 2968,-189 2968,-139.9995 2615.2234,-92.3785 2488.0325,-76.8015"/>
-<polygon fill="#191970" stroke="#191970" points="2488.2025,-73.2965 2477.8533,-75.5644 2487.358,-80.2454 2488.2025,-73.2965"/>
+<path fill="none" stroke="#191970" d="M1937.2444,-499.2673C1820.0938,-493.5552 1567.1109,-478.9471 1483,-456 1416.0646,-437.7387 1386.3387,-442.3658 1342,-389 1313.2027,-354.3397 1350.3795,-320.1319 1316,-291 1251.2862,-236.164 1001.8469,-308.4701 936,-255 910.2073,-234.0554 890.0423,-134.6013 882.3638,-91.499"/>
+<polygon fill="#191970" stroke="#191970" points="885.7678,-90.6443 880.6093,-81.3901 878.8709,-91.8414 885.7678,-90.6443"/>
</g>
<!-- Node4->Node18 -->
-<g id="edge132" class="edge">
+<g id="edge124" class="edge">
<title>Node4->Node18</title>
-<path fill="none" stroke="#191970" d="M2021.4702,-494.7819C1959.2007,-483.7738 1865.2496,-456.3767 1821,-389 1813.4367,-377.4838 1813.3909,-369.486 1821,-358 1843.6577,-323.7978 1870.8633,-343.1796 1906,-322 1924.6669,-310.748 1925.9211,-303.1744 1944,-291 1961.5236,-279.1995 1982.3454,-267.7809 1998.8745,-259.2722"/>
-<polygon fill="#191970" stroke="#191970" points="2000.7173,-262.2621 2008.0531,-254.6177 1997.5513,-256.0189 2000.7173,-262.2621"/>
+<path fill="none" stroke="#191970" d="M1937.277,-499.2267C1739.5914,-489.6335 1116.3031,-454.7854 927,-389 833.1869,-356.3987 733.8213,-290.4003 691.3061,-260.4296"/>
+<polygon fill="#191970" stroke="#191970" points="693.2966,-257.5504 683.1191,-254.6061 689.2391,-263.2546 693.2966,-257.5504"/>
</g>
<!-- Node4->Node22 -->
-<g id="edge95" class="edge">
+<g id="edge86" class="edge">
<title>Node4->Node22</title>
-<path fill="none" stroke="#191970" d="M2021.4144,-497.6028C1925.1715,-486.8049 1741.9083,-449.6266 1680,-322 1673.9869,-309.6037 1672.3434,-302.4544 1680,-291 1703.2548,-256.2103 1743.7452,-289.7897 1767,-255 1771.9397,-247.61 1772.7475,-241.7807 1767,-235 1753.9718,-219.6299 1642.2428,-204.9269 1564.7305,-196.4398"/>
-<polygon fill="#191970" stroke="#191970" points="1564.8273,-192.9299 1554.509,-195.3347 1564.0748,-199.8894 1564.8273,-192.9299"/>
+<path fill="none" stroke="#191970" d="M1979.0906,-492.411C1898.5739,-451.5947 1587.9064,-294.2558 1577,-291 1444.0922,-251.3235 1399.8661,-294.816 1267,-255 1227.0233,-243.0202 1184.5472,-219.3327 1159.153,-203.8355"/>
+<polygon fill="#191970" stroke="#191970" points="1160.9492,-200.831 1150.6064,-198.5335 1157.259,-206.7794 1160.9492,-200.831"/>
</g>
<!-- Node33 -->
<g id="node34" class="node">
<title>Node33</title>
<g id="a_node34"><a xlink:href="data__type_8h.html" target="_top" xlink:title="tvm/runtime/data_type.h">
-<polygon fill="#ffffff" stroke="#000000" points="2688,-297 2688,-316 2826,-316 2826,-297 2688,-297"/>
-<text text-anchor="middle" x="2757" y="-304" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/data_type.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2460,-297 2460,-316 2598,-316 2598,-297 2460,-297"/>
+<text text-anchor="middle" x="2529" y="-304" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/data_type.h</text>
</a>
</g>
</g>
<!-- Node4->Node33 -->
-<g id="edge94" class="edge">
+<g id="edge85" class="edge">
<title>Node4->Node33</title>
-<path fill="none" stroke="#191970" d="M2142.7535,-500.7123C2268.353,-497.4517 2552.4989,-486.8122 2644,-456 2696.4632,-438.3335 2720.0769,-436.2011 2749,-389 2760.5881,-370.0887 2761.1225,-344.0442 2759.7128,-326.3109"/>
-<polygon fill="#191970" stroke="#191970" points="2763.1802,-325.8186 2758.6339,-316.2488 2756.22,-326.565 2763.1802,-325.8186"/>
+<path fill="none" stroke="#191970" d="M2058.5272,-494.2008C2172.876,-478.4578 2415.106,-439.9915 2481,-389 2502.022,-372.7323 2515.5454,-344.4594 2522.7136,-325.6134"/>
+<polygon fill="#191970" stroke="#191970" points="2526.0761,-326.6035 2526.1291,-316.0088 2519.4807,-324.2581 2526.0761,-326.6035"/>
</g>
<!-- Node4->Node34 -->
<g id="edge69" class="edge">
<title>Node4->Node34</title>
-<path fill="none" stroke="#191970" d="M2115.75,-492.3906C2147.6897,-483.2967 2196.422,-469.4215 2235.4604,-458.3064"/>
-<polygon fill="#191970" stroke="#191970" points="2236.5012,-461.6493 2245.1605,-455.5446 2234.5842,-454.9168 2236.5012,-461.6493"/>
+<path fill="none" stroke="#191970" d="M1998,-492.3906C1998,-485.2234 1998,-475.0862 1998,-465.671"/>
+<polygon fill="#191970" stroke="#191970" points="2001.5001,-465.5446 1998,-455.5446 1994.5001,-465.5446 2001.5001,-465.5446"/>
</g>
<!-- Node35 -->
<g id="node36" class="node">
<title>Node35</title>
<g id="a_node36"><a xlink:href="ndarray_8h.html" target="_top" xlink:title="A device-independent managed NDArray abstraction. ">
-<polygon fill="#ffffff" stroke="#000000" points="2235.5,-364 2235.5,-383 2360.5,-383 2360.5,-364 2235.5,-364"/>
-<text text-anchor="middle" x="2298" y="-371" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/ndarray.h</text>
+<polygon fill="#ffffff" stroke="#ff0000" points="1824.5,-364 1824.5,-383 1949.5,-383 1949.5,-364 1824.5,-364"/>
+<text text-anchor="middle" x="1887" y="-371" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/ndarray.h</text>
</a>
</g>
</g>
<!-- Node4->Node35 -->
-<g id="edge96" class="edge">
+<g id="edge87" class="edge">
<title>Node4->Node35</title>
-<path fill="none" stroke="#191970" d="M2098.2351,-492.3416C2135.6981,-470.0546 2228.3898,-414.9116 2273.0967,-388.3152"/>
-<polygon fill="#191970" stroke="#191970" points="2275.0664,-391.2159 2281.8711,-383.0952 2271.4875,-385.2 2275.0664,-391.2159"/>
-</g>
-<!-- Node41 -->
-<g id="node42" class="node">
-<title>Node41</title>
-<g id="a_node42"><a xlink:href="packed__func_8h.html" target="_top" xlink:title="Type-erased function used across TVM API. ">
-<polygon fill="#ffffff" stroke="#000000" points="2024,-425.5 2024,-455.5 2140,-455.5 2140,-425.5 2024,-425.5"/>
-<text text-anchor="start" x="2032" y="-443.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/packed</text>
-<text text-anchor="middle" x="2082" y="-432.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">_func.h</text>
-</a>
-</g>
+<path fill="none" stroke="#191970" d="M1979.7462,-492.3989C1965.4307,-484.1873 1945.7027,-471.2894 1932,-456 1914.8875,-436.9061 1901.8266,-410.1642 1894.2919,-392.3601"/>
+<polygon fill="#191970" stroke="#191970" points="1897.5107,-390.9839 1890.5053,-383.0356 1891.025,-393.6177 1897.5107,-390.9839"/>
</g>
<!-- Node4->Node41 -->
-<g id="edge98" class="edge">
+<g id="edge89" class="edge">
<title>Node4->Node41</title>
-<path fill="none" stroke="#191970" d="M2082,-492.3906C2082,-485.2234 2082,-475.0862 2082,-465.671"/>
-<polygon fill="#191970" stroke="#191970" points="2085.5001,-465.5446 2082,-455.5446 2078.5001,-465.5446 2085.5001,-465.5446"/>
+<path fill="none" stroke="#191970" d="M1937.1145,-493.6418C1853.9227,-482.2215 1705.1708,-461.8013 1618.2842,-449.8738"/>
+<polygon fill="#191970" stroke="#191970" points="1618.6457,-446.3907 1608.2626,-448.4981 1617.6936,-453.3257 1618.6457,-446.3907"/>
</g>
<!-- Node6 -->
<g id="node7" class="node">
<title>Node6</title>
<g id="a_node7"><a xlink:href="functor_8h.html" target="_top" xlink:title="Defines the Functor data structures. ">
-<polygon fill="#ffffff" stroke="#000000" points="1344.5,-297 1344.5,-316 1453.5,-316 1453.5,-297 1344.5,-297"/>
-<text text-anchor="middle" x="1399" y="-304" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/functor.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1198.5,-297 1198.5,-316 1307.5,-316 1307.5,-297 1198.5,-297"/>
+<text text-anchor="middle" x="1253" y="-304" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/functor.h</text>
</a>
</g>
</g>
<!-- Node5->Node6 -->
<g id="edge6" class="edge">
<title>Node5->Node6</title>
-<path fill="none" stroke="#191970" d="M1176.8899,-425.3917C1207.5359,-408.8067 1258.229,-381.4203 1302,-358 1325.5491,-345.3997 1352.3694,-331.1693 1371.8303,-320.8652"/>
-<polygon fill="#191970" stroke="#191970" points="1373.5447,-323.9179 1380.746,-316.1467 1370.2703,-317.7309 1373.5447,-323.9179"/>
+<path fill="none" stroke="#191970" d="M2120.2702,-431.9484C2102.0181,-429.4219 2081.6847,-426.8426 2063,-425 1933.3828,-412.2176 1600.1506,-434.2022 1478,-389 1456.5753,-381.0717 1457.4685,-368.1449 1437,-358 1399.1013,-339.2161 1353.307,-326.3674 1316.8851,-318.2017"/>
+<polygon fill="#191970" stroke="#191970" points="1317.391,-314.7299 1306.8752,-316.0217 1315.9013,-321.5696 1317.391,-314.7299"/>
</g>
<!-- Node5->Node14 -->
<g id="edge68" class="edge">
<title>Node5->Node14</title>
-<path fill="none" stroke="#191970" d="M1092.2329,-438.5406C916.9381,-432.2585 394.2625,-411.7005 366,-389 336.2278,-365.0869 342,-344.6867 342,-306.5 342,-306.5 342,-306.5 342,-189 342,-154.6399 342,-114.628 342,-91.2764"/>
-<polygon fill="#191970" stroke="#191970" points="345.5001,-91.2489 342,-81.2489 338.5001,-91.249 345.5001,-91.2489"/>
+<path fill="none" stroke="#191970" d="M2233.6011,-431.7138C2370.7034,-410.0336 2712.41,-353.4706 2754,-322 2828.403,-265.7001 2835.6517,-139.9073 2835.5348,-91.2473"/>
+<polygon fill="#191970" stroke="#191970" points="2839.0327,-91.0579 2835.3955,-81.107 2832.0333,-91.1541 2839.0327,-91.0579"/>
</g>
<!-- Node19 -->
<g id="node20" class="node">
<title>Node19</title>
<g id="a_node20"><a xlink:href="object__path_8h.html" target="_top" xlink:title="tvm/node/object_path.h">
-<polygon fill="#ffffff" stroke="#000000" points="484,-364 484,-383 616,-383 616,-364 484,-364"/>
-<text text-anchor="middle" x="550" y="-371" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/object_path.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="2120,-364 2120,-383 2252,-383 2252,-364 2120,-364"/>
+<text text-anchor="middle" x="2186" y="-371" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/node/object_path.h</text>
</a>
</g>
</g>
<!-- Node5->Node19 -->
<g id="edge21" class="edge">
<title>Node5->Node19</title>
-<path fill="none" stroke="#191970" d="M1092.344,-436.6022C996.7121,-429.6377 797.3179,-413.4569 630,-389 621.9389,-387.8217 613.4564,-386.3892 605.1499,-384.8765"/>
-<polygon fill="#191970" stroke="#191970" points="605.6187,-381.4035 595.1453,-383.0032 604.3304,-388.284 605.6187,-381.4035"/>
+<path fill="none" stroke="#191970" d="M2179.0422,-425.2967C2180.322,-415.7699 2181.9842,-403.3954 2183.363,-393.1306"/>
+<polygon fill="#191970" stroke="#191970" points="2186.8539,-393.4319 2184.7165,-383.055 2179.9162,-392.5 2186.8539,-393.4319"/>
</g>
<!-- Node32 -->
<g id="node33" class="node">
<title>Node32</title>
<g id="a_node33"><a xlink:href="array_8h.html" target="_top" xlink:title="Runtime Array container types. ">
-<polygon fill="#ffffff" stroke="#000000" points="1086,-358.5 1086,-388.5 1212,-388.5 1212,-358.5 1086,-358.5"/>
-<text text-anchor="start" x="1094" y="-376.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/container</text>
-<text text-anchor="middle" x="1149" y="-365.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/array.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="442,-358.5 442,-388.5 568,-388.5 568,-358.5 442,-358.5"/>
+<text text-anchor="start" x="450" y="-376.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/container</text>
+<text text-anchor="middle" x="505" y="-365.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/array.h</text>
</a>
</g>
</g>
<!-- Node5->Node32 -->
<g id="edge55" class="edge">
<title>Node5->Node32</title>
-<path fill="none" stroke="#191970" d="M1149,-425.2967C1149,-417.5013 1149,-407.7991 1149,-398.9064"/>
-<polygon fill="#191970" stroke="#191970" points="1152.5001,-398.6431 1149,-388.6432 1145.5001,-398.6432 1152.5001,-398.6431"/>
+<path fill="none" stroke="#191970" d="M2120.2995,-431.6275C2102.049,-429.0839 2081.7091,-426.5756 2063,-425 1578.2305,-384.1757 1455.2182,-405.121 969,-389 830.931,-384.4222 669.7542,-379.0264 578.2998,-375.9599"/>
+<polygon fill="#191970" stroke="#191970" points="578.1537,-372.4531 568.042,-375.6159 577.9191,-379.4492 578.1537,-372.4531"/>
</g>
<!-- Node5->Node33 -->
<g id="edge63" class="edge">
<title>Node5->Node33</title>
-<path fill="none" stroke="#191970" d="M1205.5587,-439.2152C1430.5516,-433.9541 2255.4349,-413.185 2370,-389 2406.9043,-381.2094 2412.647,-368.0535 2449,-358 2491.3192,-346.2965 2603.3905,-328.7857 2680.0041,-317.5018"/>
-<polygon fill="#191970" stroke="#191970" points="2680.7135,-320.9352 2690.0995,-316.0206 2679.6973,-314.0094 2680.7135,-320.9352"/>
+<path fill="none" stroke="#191970" d="M2233.7907,-433.5934C2301.4709,-424.6802 2409.9431,-408.0368 2446,-389 2475.6248,-373.3591 2501.5202,-343.6413 2516.2004,-324.4688"/>
+<polygon fill="#191970" stroke="#191970" points="2519.1507,-326.3664 2522.3016,-316.2509 2513.5304,-322.1937 2519.1507,-326.3664"/>
</g>
<!-- Node7 -->
<g id="node8" class="node">
<title>Node7</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="1057.5,-179.5 1057.5,-198.5 1146.5,-198.5 1146.5,-179.5 1057.5,-179.5"/>
-<text text-anchor="middle" x="1102" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">dmlc/logging.h</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="1549.5,-179.5 1549.5,-198.5 1638.5,-198.5 1638.5,-179.5 1549.5,-179.5"/>
+<text text-anchor="middle" x="1594" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">dmlc/logging.h</text>
</g>
<!-- Node6->Node7 -->
<g id="edge7" class="edge">
<title>Node6->Node7</title>
-<path fill="none" stroke="#191970" d="M1365.0383,-296.9261C1332.5271,-287.4281 1282.3111,-271.8834 1240,-255 1229.1598,-250.6744 1168.1354,-221.1682 1130.9597,-203.1032"/>
-<polygon fill="#191970" stroke="#191970" points="1132.1413,-199.786 1121.6176,-198.5605 1129.0802,-206.0812 1132.1413,-199.786"/>
+<path fill="none" stroke="#191970" d="M1307.5147,-300.7956C1378.1728,-292.6816 1496.8528,-276.3874 1535,-255 1555.2054,-243.6718 1572.0697,-222.8054 1582.5235,-207.5586"/>
+<polygon fill="#191970" stroke="#191970" points="1585.6256,-209.2102 1588.1766,-198.9271 1579.7698,-205.375 1585.6256,-209.2102"/>
</g>
<!-- Node6->Node8 -->
<g id="edge8" class="edge">
<title>Node6->Node8</title>
-<path fill="none" stroke="#191970" d="M1453.5698,-304.7436C1531.7312,-301.0565 1679.4325,-289.71 1800,-255 1892.4079,-228.3968 1993.4375,-173.8532 2039.1152,-147.6663"/>
-<polygon fill="#191970" stroke="#191970" points="2041.0945,-150.5649 2047.9992,-142.529 2037.5902,-144.5051 2041.0945,-150.5649"/>
+<path fill="none" stroke="#191970" d="M1279.9966,-296.9722C1307.2751,-287.1476 1350.4918,-271.0405 1387,-255 1456.5135,-224.458 1467.2263,-200.6546 1540,-179 1598.2413,-161.6696 1764.4644,-146.171 1860.054,-138.3557"/>
+<polygon fill="#191970" stroke="#191970" points="1860.5286,-141.8288 1870.2131,-137.5322 1859.963,-134.8516 1860.5286,-141.8288"/>
</g>
<!-- Node6->Node15 -->
<g id="edge18" class="edge">
<title>Node6->Node15</title>
-<path fill="none" stroke="#191970" d="M1453.8187,-301.5667C1541.8249,-293.1886 1710.478,-275.1182 1767,-255 1783.1571,-249.2491 1785.2202,-243.6996 1800,-235 1843.2319,-209.5531 1948.3332,-138.6338 1996,-123 2070.0346,-98.718 2300.1043,-81.0079 2398.0833,-74.3716"/>
-<polygon fill="#191970" stroke="#191970" points="2398.5097,-77.851 2408.2535,-73.6905 2398.0419,-70.8666 2398.5097,-77.851"/>
+<path fill="none" stroke="#191970" d="M1198.4866,-298.1476C1183.1316,-295.8135 1166.414,-293.2899 1151,-291 1095.9158,-282.8167 942.0992,-294.6546 903,-255 859.4563,-210.8379 867.9336,-128.9442 874.7384,-91.3291"/>
+<polygon fill="#191970" stroke="#191970" points="878.2412,-91.6553 876.7371,-81.1677 871.3728,-90.3042 878.2412,-91.6553"/>
</g>
<!-- Node6->Node16 -->
<g id="edge19" class="edge">
<title>Node6->Node16</title>
-<path fill="none" stroke="#191970" d="M1355.4277,-296.9819C1314.0408,-287.5808 1250.2556,-272.1535 1196,-255 1187.6673,-252.3655 1056.5571,-203.389 1049,-199 993.5684,-166.8063 937.7602,-115.0389 910.7429,-88.4558"/>
-<polygon fill="#191970" stroke="#191970" points="912.8721,-85.6369 903.3149,-81.064 907.9344,-90.5987 912.8721,-85.6369"/>
+<path fill="none" stroke="#191970" d="M1198.2359,-298.0102C1182.9538,-295.6867 1166.3362,-293.2029 1151,-291 1032.912,-274.0379 996.8257,-296.5633 885,-255 879.6558,-253.0137 717.2983,-146.7458 713,-143 694.1548,-126.5774 675.8111,-104.4214 664.0288,-89.1472"/>
+<polygon fill="#191970" stroke="#191970" points="666.7216,-86.9055 657.8955,-81.0445 661.1402,-91.1302 666.7216,-86.9055"/>
</g>
<!-- Node6->Node18 -->
<g id="edge20" class="edge">
<title>Node6->Node18</title>
-<path fill="none" stroke="#191970" d="M1453.7193,-301.1499C1580.4473,-288.7591 1891.1119,-258.3841 1994.1647,-248.3082"/>
-<polygon fill="#191970" stroke="#191970" points="1994.6963,-251.773 2004.3082,-247.3164 1994.0151,-244.8062 1994.6963,-251.773"/>
+<path fill="none" stroke="#191970" d="M1198.2888,-297.6146C1183.007,-295.2884 1166.3764,-292.9021 1151,-291 983.6241,-270.2955 782.4862,-253.7406 703.8422,-247.587"/>
+<polygon fill="#191970" stroke="#191970" points="703.8158,-244.0745 693.5744,-246.7884 703.2729,-251.0534 703.8158,-244.0745"/>
</g>
<!-- Node8->Node9 -->
<g id="edge9" class="edge">
<title>Node8->Node9</title>
-<path fill="none" stroke="#191970" d="M2123.6636,-128.941C2282.9391,-118.1054 2717.9233,-88.513 2893.2656,-76.5843"/>
-<polygon fill="#191970" stroke="#191970" points="2893.6915,-80.0635 2903.4309,-75.8927 2893.2163,-73.0796 2893.6915,-80.0635"/>
+<path fill="none" stroke="#191970" d="M1989.5326,-125.7643C2081.9732,-114.5289 2259.8876,-92.905 2360.9799,-80.6181"/>
+<polygon fill="#191970" stroke="#191970" points="2361.6124,-84.067 2371.117,-79.386 2360.7678,-77.1182 2361.6124,-84.067"/>
</g>
<!-- Node13 -->
<g id="node14" class="node">
<title>Node13</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="1761.5,-62 1761.5,-81 1886.5,-81 1886.5,-62 1761.5,-62"/>
-<text text-anchor="middle" x="1824" y="-69" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/logging.h</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="1420.5,-62 1420.5,-81 1545.5,-81 1545.5,-62 1420.5,-62"/>
+<text text-anchor="middle" x="1483" y="-69" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/logging.h</text>
</g>
<!-- Node8->Node13 -->
<g id="edge13" class="edge">
<title>Node8->Node13</title>
-<path fill="none" stroke="#191970" d="M2026.7633,-123.4581C1984.9006,-112.7308 1916.8372,-95.2895 1871.2585,-83.61"/>
-<polygon fill="#191970" stroke="#191970" points="1871.8799,-80.1562 1861.324,-81.0643 1870.1422,-86.9371 1871.8799,-80.1562"/>
+<path fill="none" stroke="#191970" d="M1870.1794,-124.7697C1788.8279,-113.577 1643.4318,-93.5728 1555.8299,-81.5202"/>
+<polygon fill="#191970" stroke="#191970" points="1556.0875,-78.0228 1545.7038,-80.127 1555.1333,-84.9574 1556.0875,-78.0228"/>
</g>
<!-- Node8->Node14 -->
<g id="edge14" class="edge">
<title>Node8->Node14</title>
-<path fill="none" stroke="#191970" d="M2004.3227,-129.5267C1967.3717,-127.4503 1918.9404,-124.8627 1876,-123 1568.5067,-109.6616 566.858,-78.4603 374.2144,-72.4953"/>
-<polygon fill="#191970" stroke="#191970" points="374.2695,-68.9954 364.1659,-72.1843 374.0529,-75.9921 374.2695,-68.9954"/>
+<path fill="none" stroke="#191970" d="M1989.7296,-128.941C2164.2942,-117.0784 2669.6731,-82.7349 2802.6336,-73.6995"/>
+<polygon fill="#191970" stroke="#191970" points="2803.2089,-77.1686 2812.9485,-72.9985 2802.7342,-70.1847 2803.2089,-77.1686"/>
</g>
<!-- Node8->Node15 -->
<g id="edge15" class="edge">
<title>Node8->Node15</title>
-<path fill="none" stroke="#191970" d="M2122.8029,-123.4581C2198.9428,-111.103 2329.9677,-89.8416 2398.4157,-78.7346"/>
-<polygon fill="#191970" stroke="#191970" points="2399.1362,-82.1636 2408.4464,-77.107 2398.0149,-75.254 2399.1362,-82.1636"/>
+<path fill="none" stroke="#191970" d="M1870.4824,-129.5173C1680.5575,-118.4037 1091.6073,-83.9409 923.9497,-74.1303"/>
+<polygon fill="#191970" stroke="#191970" points="923.9396,-70.6238 913.7522,-73.5335 923.5306,-77.6118 923.9396,-70.6238"/>
</g>
<!-- Node8->Node16 -->
<g id="edge16" class="edge">
<title>Node8->Node16</title>
-<path fill="none" stroke="#191970" d="M2004.3089,-129.8136C1967.3528,-127.8436 1918.921,-125.267 1876,-123 1503.2771,-103.3136 1050.9623,-79.6911 926.8435,-73.2135"/>
-<polygon fill="#191970" stroke="#191970" points="927.0063,-69.7174 916.8374,-72.6914 926.6414,-76.7079 927.0063,-69.7174"/>
+<path fill="none" stroke="#191970" d="M1870.3087,-131.3749C1705.4165,-126.695 1229.7426,-111.9067 835,-87 782.0025,-83.6561 720.5512,-78.1499 683.7761,-74.6798"/>
+<polygon fill="#191970" stroke="#191970" points="683.9253,-71.1782 673.6388,-73.7157 683.2625,-78.1468 683.9253,-71.1782"/>
</g>
<!-- Node17 -->
<g id="node18" class="node">
<title>Node17</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="2044,-62 2044,-81 2094,-81 2094,-62 2044,-62"/>
-<text text-anchor="middle" x="2069" y="-69" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">atomic</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="1949,-62 1949,-81 1999,-81 1999,-62 1949,-62"/>
+<text text-anchor="middle" x="1974" y="-69" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">atomic</text>
</g>
<!-- Node8->Node17 -->
<g id="edge17" class="edge">
<title>Node8->Node17</title>
-<path fill="none" stroke="#191970" d="M2064.7813,-123.3906C2065.4743,-114.8657 2066.509,-102.1392 2067.3802,-91.4235"/>
-<polygon fill="#191970" stroke="#191970" points="2070.8858,-91.4956 2068.2077,-81.2449 2063.9088,-90.9283 2070.8858,-91.4956"/>
+<path fill="none" stroke="#191970" d="M1936.875,-123.3906C1943.3593,-114.3273 1953.2414,-100.5149 1961.1801,-89.4188"/>
+<polygon fill="#191970" stroke="#191970" points="1964.0559,-91.4143 1967.0281,-81.2449 1958.3629,-87.3412 1964.0559,-91.4143"/>
</g>
<!-- Node10 -->
<g id="node11" class="node">
<title>Node10</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="2828.5,-.5 2828.5,-19.5 2921.5,-19.5 2921.5,-.5 2828.5,-.5"/>
-<text text-anchor="middle" x="2875" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">dlpack/dlpack.h</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="2296.5,-.5 2296.5,-19.5 2389.5,-19.5 2389.5,-.5 2296.5,-.5"/>
+<text text-anchor="middle" x="2343" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">dlpack/dlpack.h</text>
</g>
<!-- Node9->Node10 -->
<g id="edge10" class="edge">
<title>Node9->Node10</title>
-<path fill="none" stroke="#191970" d="M2945.0112,-56.2977C2930.8034,-46.9022 2912.6215,-34.8787 2898.3475,-25.4395"/>
-<polygon fill="#191970" stroke="#191970" points="2899.8845,-22.2598 2889.6128,-19.6633 2896.0233,-28.0986 2899.8845,-22.2598"/>
+<path fill="none" stroke="#191970" d="M2413.0112,-56.2977C2398.8034,-46.9022 2380.6215,-34.8787 2366.3475,-25.4395"/>
+<polygon fill="#191970" stroke="#191970" points="2367.8845,-22.2598 2357.6128,-19.6633 2364.0233,-28.0986 2367.8845,-22.2598"/>
</g>
<!-- Node11 -->
<g id="node12" class="node">
<title>Node11</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="2940,-.5 2940,-19.5 2996,-19.5 2996,-.5 2940,-.5"/>
-<text text-anchor="middle" x="2968" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">stddef.h</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="2408,-.5 2408,-19.5 2464,-19.5 2464,-.5 2408,-.5"/>
+<text text-anchor="middle" x="2436" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">stddef.h</text>
</g>
<!-- Node9->Node11 -->
<g id="edge11" class="edge">
<title>Node9->Node11</title>
-<path fill="none" stroke="#191970" d="M2968,-56.2977C2968,-48.3834 2968,-38.6043 2968,-30.0759"/>
-<polygon fill="#191970" stroke="#191970" points="2971.5001,-29.8469 2968,-19.8469 2964.5001,-29.847 2971.5001,-29.8469"/>
+<path fill="none" stroke="#191970" d="M2436,-56.2977C2436,-48.3834 2436,-38.6043 2436,-30.0759"/>
+<polygon fill="#191970" stroke="#191970" points="2439.5001,-29.8469 2436,-19.8469 2432.5001,-29.847 2439.5001,-29.8469"/>
</g>
<!-- Node12 -->
<g id="node13" class="node">
<title>Node12</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="3014.5,-.5 3014.5,-19.5 3067.5,-19.5 3067.5,-.5 3014.5,-.5"/>
-<text text-anchor="middle" x="3041" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">stdint.h</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="2482.5,-.5 2482.5,-19.5 2535.5,-19.5 2535.5,-.5 2482.5,-.5"/>
+<text text-anchor="middle" x="2509" y="-7.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">stdint.h</text>
</g>
<!-- Node9->Node12 -->
<g id="edge12" class="edge">
<title>Node9->Node12</title>
-<path fill="none" stroke="#191970" d="M2986.0449,-56.2977C2996.8114,-47.2274 3010.4851,-35.7077 3021.4995,-26.4285"/>
-<polygon fill="#191970" stroke="#191970" points="3023.9191,-28.9667 3029.3118,-19.8469 3019.4089,-23.6132 3023.9191,-28.9667"/>
+<path fill="none" stroke="#191970" d="M2454.0449,-56.2977C2464.8114,-47.2274 2478.4851,-35.7077 2489.4995,-26.4285"/>
+<polygon fill="#191970" stroke="#191970" points="2491.9191,-28.9667 2497.3118,-19.8469 2487.4089,-23.6132 2491.9191,-28.9667"/>
</g>
<!-- Node19->Node8 -->
<g id="edge53" class="edge">
<title>Node19->Node8</title>
-<path fill="none" stroke="#191970" d="M616.2613,-369.5091C736.309,-361.8503 985.1114,-343.8935 1069,-322 1102.3163,-313.305 1106.7414,-299.9134 1140,-291 1278.2408,-253.951 1322.0401,-293.0818 1460,-255 1480.7066,-249.2843 1483.5001,-241.4177 1504,-235 1550.578,-220.4182 1858.7621,-167.5983 1997.049,-144.2412"/>
-<polygon fill="#191970" stroke="#191970" points="1997.8269,-147.6595 2007.105,-142.544 1996.6619,-140.7571 1997.8269,-147.6595"/>
+<path fill="none" stroke="#191970" d="M2231.5665,-363.9182C2254.4856,-356.5473 2280.254,-343.7836 2294,-322 2316.1545,-286.8911 2290.6327,-263.0195 2260,-235 2218.3469,-196.9003 2199.3447,-197.4638 2146,-179 2098.3691,-162.5138 2042.6973,-150.8293 1999.8033,-143.3624"/>
+<polygon fill="#191970" stroke="#191970" points="2000.1901,-139.8779 1989.7441,-141.6476 1999.0137,-146.7784 2000.1901,-139.8779"/>
</g>
<!-- Node19->Node14 -->
<g id="edge54" class="edge">
<title>Node19->Node14</title>
-<path fill="none" stroke="#191970" d="M531.0063,-363.978C515.2024,-355.1351 493.2246,-340.5508 480,-322 459.1919,-292.8116 456,-280.846 456,-245 456,-245 456,-245 456,-189 456,-157.7875 452.1056,-146.8744 432,-123 416.5246,-104.6236 392.7259,-91.4111 373.4874,-82.9725"/>
-<polygon fill="#191970" stroke="#191970" points="374.7661,-79.7137 364.1889,-79.1023 372.0762,-86.1763 374.7661,-79.7137"/>
+<path fill="none" stroke="#191970" d="M2211.5724,-363.871C2237.4152,-353.9677 2278.3661,-337.8007 2313,-322 2490.4002,-241.0661 2526.1701,-202.9855 2704,-123 2737.398,-107.978 2776.5841,-92.8959 2803.3377,-82.9715"/>
+<polygon fill="#191970" stroke="#191970" points="2804.7716,-86.1732 2812.9445,-79.4313 2802.3511,-79.605 2804.7716,-86.1732"/>
</g>
<!-- Node20 -->
<g id="node21" class="node">
<title>Node20</title>
<g id="a_node21"><a xlink:href="optional_8h.html" target="_top" xlink:title="Runtime Optional container types. ">
-<polygon fill="#ffffff" stroke="#000000" points="752,-291.5 752,-321.5 878,-321.5 878,-291.5 752,-291.5"/>
-<text text-anchor="start" x="760" y="-309.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/container</text>
-<text text-anchor="middle" x="815" y="-298.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/optional.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1016,-291.5 1016,-321.5 1142,-321.5 1142,-291.5 1016,-291.5"/>
+<text text-anchor="start" x="1024" y="-309.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/container</text>
+<text text-anchor="middle" x="1079" y="-298.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/optional.h</text>
</a>
</g>
</g>
<!-- Node19->Node20 -->
<g id="edge22" class="edge">
<title>Node19->Node20</title>
-<path fill="none" stroke="#191970" d="M587.6866,-363.9717C628.6794,-353.6075 695.098,-336.8148 745.5809,-324.0512"/>
-<polygon fill="#191970" stroke="#191970" points="746.6157,-327.3998 755.4527,-321.5553 744.8999,-320.6134 746.6157,-327.3998"/>
+<path fill="none" stroke="#191970" d="M2119.6771,-368.6204C2074.1958,-365.3653 2012.4631,-361.1257 1958,-358 1616.41,-338.3955 1529.6613,-353.905 1189,-322 1177.1489,-320.8901 1164.6184,-319.4144 1152.4251,-317.8139"/>
+<polygon fill="#191970" stroke="#191970" points="1152.6626,-314.3143 1142.2843,-316.4453 1151.7264,-321.2514 1152.6626,-314.3143"/>
</g>
<!-- Node19->Node26 -->
<g id="edge36" class="edge">
<title>Node19->Node26</title>
-<path fill="none" stroke="#191970" d="M613.5695,-363.9717C695.4135,-351.7042 837.3437,-330.4306 923.6651,-317.492"/>
-<polygon fill="#191970" stroke="#191970" points="924.2791,-320.9392 933.6498,-315.9954 923.2414,-314.0165 924.2791,-320.9392"/>
+<path fill="none" stroke="#191970" d="M2131.2478,-363.9717C2063.9334,-352.2572 1949.4276,-332.3303 1874.5006,-319.291"/>
+<polygon fill="#191970" stroke="#191970" points="1874.6518,-315.7648 1864.1997,-317.4984 1873.4516,-322.6611 1874.6518,-315.7648"/>
</g>
<!-- Node20->Node16 -->
<g id="edge23" class="edge">
<title>Node20->Node16</title>
-<path fill="none" stroke="#191970" d="M755.3215,-291.4109C736.951,-283.6621 718.7509,-272.1059 708,-255 676.7821,-205.3289 672.4798,-169.6915 708,-123 726.522,-98.6527 813.2539,-82.8365 861.4655,-75.7686"/>
-<polygon fill="#191970" stroke="#191970" points="862.0061,-79.227 871.4138,-74.3541 861.0207,-72.2967 862.0061,-79.227"/>
+<path fill="none" stroke="#191970" d="M1015.8374,-297.9037C951.0222,-288.3673 855.1213,-272.0612 822,-255 778.1148,-232.3941 780.9914,-207.9417 741,-179 713.9316,-159.4106 696.4652,-168.6064 675,-143 662.6808,-128.3041 656.6021,-107.0223 653.6556,-91.4997"/>
+<polygon fill="#191970" stroke="#191970" points="657.0723,-90.7033 652.0192,-81.3911 650.1623,-91.822 657.0723,-90.7033"/>
</g>
<!-- Node21 -->
<g id="node22" class="node">
<title>Node21</title>
<g id="a_node22"><a xlink:href="runtime_2container_2base_8h.html" target="_top" xlink:title="Base utilities for common POD(plain old data) container types. ">
-<polygon fill="#ffffff" stroke="#000000" points="1248.5,-235.5 1248.5,-254.5 1303.5,-254.5 1303.5,-235.5 1248.5,-235.5"/>
-<text text-anchor="middle" x="1276" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">./base.h</text>
+<polygon fill="#ffffff" stroke="#000000" points="1323.5,-235.5 1323.5,-254.5 1378.5,-254.5 1378.5,-235.5 1323.5,-235.5"/>
+<text text-anchor="middle" x="1351" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">./base.h</text>
</a>
</g>
</g>
<!-- Node20->Node21 -->
<g id="edge24" class="edge">
<title>Node20->Node21</title>
-<path fill="none" stroke="#191970" d="M878.0988,-297.502C893.4427,-295.3434 909.8075,-293.0658 925,-291 1038.9313,-275.5078 1173.7989,-258.0918 1238.155,-249.8378"/>
-<polygon fill="#191970" stroke="#191970" points="1238.7579,-253.2892 1248.2318,-248.5464 1237.868,-246.346 1238.7579,-253.2892"/>
+<path fill="none" stroke="#191970" d="M1142.0859,-292.2361C1195.1164,-280.2457 1269.1464,-263.5073 1313.3586,-253.5108"/>
+<polygon fill="#191970" stroke="#191970" points="1314.4089,-256.8618 1323.3908,-251.2425 1312.8651,-250.0341 1314.4089,-256.8618"/>
</g>
<!-- Node21->Node7 -->
<g id="edge25" class="edge">
<title>Node21->Node7</title>
-<path fill="none" stroke="#191970" d="M1248.4292,-236.1266C1219.5973,-226.8474 1174.0852,-212.1998 1141.4535,-201.6977"/>
-<polygon fill="#191970" stroke="#191970" points="1142.3641,-198.314 1131.7726,-198.582 1140.2195,-204.9774 1142.3641,-198.314"/>
+<path fill="none" stroke="#191970" d="M1378.8752,-238.5761C1418.7625,-229.384 1492.8079,-212.32 1542.7428,-200.8124"/>
+<polygon fill="#191970" stroke="#191970" points="1543.7424,-204.1738 1552.701,-198.5175 1542.1704,-197.3526 1543.7424,-204.1738"/>
</g>
<!-- Node21->Node8 -->
<g id="edge32" class="edge">
<title>Node21->Node8</title>
-<path fill="none" stroke="#191970" d="M1291.3539,-235.4895C1316.5223,-220.447 1368.607,-191.583 1417,-179 1524.4959,-151.0492 1850.4991,-138.8607 1994.4313,-134.7392"/>
-<polygon fill="#191970" stroke="#191970" points="1994.5768,-138.2366 2004.4742,-134.4563 1994.3796,-131.2393 1994.5768,-138.2366"/>
+<path fill="none" stroke="#191970" d="M1341.2685,-235.3242C1328.3536,-221.3329 1308.8508,-195.3427 1324,-179 1342.0004,-159.5814 1704.4652,-142.3294 1860.0474,-135.7885"/>
+<polygon fill="#191970" stroke="#191970" points="1860.5922,-139.2689 1870.4375,-135.3547 1860.3002,-132.275 1860.5922,-139.2689"/>
</g>
<!-- Node21->Node13 -->
<g id="edge26" class="edge">
<title>Node21->Node13</title>
-<path fill="none" stroke="#191970" d="M1280.6307,-235.2551C1292.5734,-211.3389 1326.9523,-149.7512 1376,-123 1438.9851,-88.6472 1641.4098,-77.0862 1751.3324,-73.29"/>
-<polygon fill="#191970" stroke="#191970" points="1751.4717,-76.7874 1761.3498,-72.9567 1751.2389,-69.7913 1751.4717,-76.7874"/>
+<path fill="none" stroke="#191970" d="M1338.9293,-235.1552C1324.646,-222.2528 1304.0964,-198.9219 1315,-179 1341.8772,-129.8931 1401.8093,-99.9372 1442.3272,-84.5286"/>
+<polygon fill="#191970" stroke="#191970" points="1443.7394,-87.7387 1451.9242,-81.0112 1441.3305,-81.1662 1443.7394,-87.7387"/>
</g>
<!-- Node21->Node16 -->
<g id="edge35" class="edge">
<title>Node21->Node16</title>
-<path fill="none" stroke="#191970" d="M1275.968,-235.031C1275.3699,-220.6746 1272.1185,-194.234 1257,-179 1209.7003,-131.3388 1006.6588,-91.2771 926.6177,-77.046"/>
-<polygon fill="#191970" stroke="#191970" points="927.0606,-73.5703 916.6058,-75.287 925.8493,-80.4647 927.0606,-73.5703"/>
+<path fill="none" stroke="#191970" d="M1340.5628,-235.1907C1314.2413,-211.1274 1242.0328,-149.2307 1169,-123 1146.0809,-114.7683 794.4511,-83.8818 683.9614,-74.3316"/>
+<polygon fill="#191970" stroke="#191970" points="684.0107,-70.823 673.7468,-73.4502 683.4088,-77.7971 684.0107,-70.823"/>
</g>
<!-- Node21->Node22 -->
<g id="edge27" class="edge">
<title>Node21->Node22</title>
-<path fill="none" stroke="#191970" d="M1303.5387,-237.7936C1338.967,-228.5227 1401.0216,-212.2841 1443.7489,-201.1031"/>
-<polygon fill="#191970" stroke="#191970" points="1444.8405,-204.4354 1453.6287,-198.5177 1443.0684,-197.6634 1444.8405,-204.4354"/>
+<path fill="none" stroke="#191970" d="M1323.3327,-237.7936C1287.7388,-228.5227 1225.3942,-212.2841 1182.4673,-201.1031"/>
+<polygon fill="#191970" stroke="#191970" points="1183.1006,-197.6514 1172.5413,-198.5177 1181.3362,-204.4253 1183.1006,-197.6514"/>
</g>
<!-- Node21->Node24 -->
<g id="edge33" class="edge">
<title>Node21->Node24</title>
-<path fill="none" stroke="#191970" d="M1248.4856,-239.9811C1194.672,-230.1649 1075.18,-208.3683 1011.2634,-196.7093"/>
-<polygon fill="#191970" stroke="#191970" points="1011.5921,-193.2116 1001.1263,-194.8602 1010.3359,-200.0979 1011.5921,-193.2116"/>
+<path fill="none" stroke="#191970" d="M1323.4432,-242.6295C1221.5038,-233.8605 865.3259,-203.2216 742.2307,-192.6328"/>
+<polygon fill="#191970" stroke="#191970" points="742.3369,-189.129 732.0737,-191.759 741.7369,-196.1033 742.3369,-189.129"/>
</g>
<!-- Node25 -->
<g id="node26" class="node">
<title>Node25</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="1164.5,-179.5 1164.5,-198.5 1247.5,-198.5 1247.5,-179.5 1164.5,-179.5"/>
-<text text-anchor="middle" x="1206" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">initializer_list</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="1333.5,-179.5 1333.5,-198.5 1416.5,-198.5 1416.5,-179.5 1333.5,-179.5"/>
+<text text-anchor="middle" x="1375" y="-186.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">initializer_list</text>
</g>
<!-- Node21->Node25 -->
<g id="edge34" class="edge">
<title>Node21->Node25</title>
-<path fill="none" stroke="#191970" d="M1263.8069,-235.2455C1253.3679,-226.8943 1238.1345,-214.7076 1225.926,-204.9408"/>
-<polygon fill="#191970" stroke="#191970" points="1228.0485,-202.1566 1218.0533,-198.6427 1223.6756,-207.6227 1228.0485,-202.1566"/>
+<path fill="none" stroke="#191970" d="M1355.1805,-235.2455C1358.381,-227.7776 1362.896,-217.2427 1366.8084,-208.1137"/>
+<polygon fill="#191970" stroke="#191970" points="1370.1452,-209.2129 1370.8674,-198.6427 1363.7112,-206.4554 1370.1452,-209.2129"/>
</g>
<!-- Node22->Node8 -->
<g id="edge28" class="edge">
<title>Node22->Node8</title>
-<path fill="none" stroke="#191970" d="M1554.7293,-182.6849C1663.4812,-172.075 1881.9439,-150.7616 1994.08,-139.8215"/>
-<polygon fill="#191970" stroke="#191970" points="1994.6518,-143.2824 2004.2647,-138.8278 1993.9721,-136.3155 1994.6518,-143.2824"/>
+<path fill="none" stroke="#191970" d="M1200.6397,-184.441C1348.0312,-174.0457 1707.9712,-148.6595 1860.1938,-137.9234"/>
+<polygon fill="#191970" stroke="#191970" points="1860.6236,-141.4018 1870.3525,-137.2069 1860.131,-134.4192 1860.6236,-141.4018"/>
</g>
<!-- Node22->Node15 -->
<g id="edge30" class="edge">
<title>Node22->Node15</title>
-<path fill="none" stroke="#191970" d="M1546.3952,-179.4648C1629.0608,-165.7061 1788.0736,-140.0676 1924,-123 2098.9995,-101.0261 2308.1648,-82.6756 2398.1867,-75.1563"/>
-<polygon fill="#191970" stroke="#191970" points="2398.6343,-78.6312 2408.3099,-74.3145 2398.0542,-71.6553 2398.6343,-78.6312"/>
+<path fill="none" stroke="#191970" d="M1127.2119,-179.4726C1113.5315,-165.1886 1085.7176,-138.2737 1057,-123 1014.5522,-100.4237 961.0017,-86.6554 923.7783,-79.1052"/>
+<polygon fill="#191970" stroke="#191970" points="924.2214,-75.6255 913.7351,-77.1384 922.876,-82.4951 924.2214,-75.6255"/>
</g>
<!-- Node22->Node16 -->
<g id="edge31" class="edge">
<title>Node22->Node16</title>
-<path fill="none" stroke="#191970" d="M1491.3317,-179.2058C1492.7091,-164.5732 1492.8072,-137.2092 1477,-123 1435.918,-86.071 1043.9506,-74.72 926.8663,-72.1393"/>
-<polygon fill="#191970" stroke="#191970" points="926.7959,-68.6371 916.7235,-71.9227 926.6464,-75.6355 926.7959,-68.6371"/>
+<path fill="none" stroke="#191970" d="M1096.7212,-179.484C1003.3226,-156.8565 769.3727,-100.1779 683.3741,-79.3432"/>
+<polygon fill="#191970" stroke="#191970" points="684.1822,-75.9378 673.6392,-76.9848 682.534,-82.741 684.1822,-75.9378"/>
</g>
<!-- Node23 -->
<g id="node24" class="node">
<title>Node23</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="1418.5,-123.5 1418.5,-142.5 1467.5,-142.5 1467.5,-123.5 1418.5,-123.5"/>
-<text text-anchor="middle" x="1443" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">cstdlib</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="1111.5,-123.5 1111.5,-142.5 1160.5,-142.5 1160.5,-123.5 1111.5,-123.5"/>
+<text text-anchor="middle" x="1136" y="-130.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">cstdlib</text>
</g>
<!-- Node22->Node23 -->
<g id="edge29" class="edge">
<title>Node22->Node23</title>
-<path fill="none" stroke="#191970" d="M1481.8132,-179.2455C1475.1411,-171.2958 1465.5521,-159.8706 1457.5747,-150.3656"/>
-<polygon fill="#191970" stroke="#191970" points="1460.2026,-148.0524 1451.093,-142.6427 1454.8408,-152.5525 1460.2026,-148.0524"/>
+<path fill="none" stroke="#191970" d="M1136,-179.2455C1136,-171.9382 1136,-161.6944 1136,-152.7046"/>
+<polygon fill="#191970" stroke="#191970" points="1139.5001,-152.6426 1136,-142.6427 1132.5001,-152.6427 1139.5001,-152.6426"/>
</g>
<!-- Node26->Node7 -->
<g id="edge37" class="edge">
<title>Node26->Node7</title>
-<path fill="none" stroke="#191970" d="M937.6845,-291.4422C905.1267,-281.8483 869.4745,-268.6834 860,-255 854.9399,-247.692 854.3318,-241.8472 860,-235 871.8673,-220.6645 979.4773,-204.5569 1047.1451,-195.7147"/>
-<polygon fill="#191970" stroke="#191970" points="1047.763,-199.164 1057.2321,-194.4116 1046.8661,-192.2217 1047.763,-199.164"/>
+<path fill="none" stroke="#191970" d="M1821.5792,-291.4801C1839.6946,-276.4464 1861.0303,-253.0223 1846,-235 1821.4296,-205.5386 1715.5292,-194.8947 1649.0754,-191.0858"/>
+<polygon fill="#191970" stroke="#191970" points="1648.9408,-187.5736 1638.7672,-190.5313 1648.5648,-194.5635 1648.9408,-187.5736"/>
</g>
<!-- Node26->Node8 -->
<g id="edge41" class="edge">
<title>Node26->Node8</title>
-<path fill="none" stroke="#191970" d="M1060.2416,-297.7638C1077.0557,-295.4992 1095.2091,-293.1063 1112,-291 1249.6251,-273.7356 1287.7886,-290.0144 1422,-255 1444.3626,-249.1658 1447.8045,-241.4407 1470,-235 1655.3927,-181.2025 1882.0724,-151.9704 1994.2455,-139.837"/>
-<polygon fill="#191970" stroke="#191970" points="1994.8402,-143.2935 2004.4116,-138.7504 1994.0962,-136.3331 1994.8402,-143.2935"/>
+<path fill="none" stroke="#191970" d="M1737.7555,-292.8452C1687.2367,-281.299 1623.6509,-265.0717 1616,-255 1610.6231,-247.9218 1610.775,-242.1911 1616,-235 1645.7558,-194.0477 1783.9335,-161.049 1866.7514,-144.5242"/>
+<polygon fill="#191970" stroke="#191970" points="1867.6779,-147.9091 1876.8134,-142.5431 1866.3256,-141.0409 1867.6779,-147.9091"/>
</g>
<!-- Node26->Node13 -->
<g id="edge39" class="edge">
<title>Node26->Node13</title>
-<path fill="none" stroke="#191970" d="M1060.0829,-297.4662C1149.1728,-284.5282 1302.2889,-261.5569 1313,-255 1375.093,-216.9895 1347.1998,-159.8304 1410,-123 1466.2313,-90.022 1648.0734,-77.9584 1751.0786,-73.7022"/>
-<polygon fill="#191970" stroke="#191970" points="1751.2464,-77.1984 1761.0989,-73.3025 1750.9673,-70.2039 1751.2464,-77.1984"/>
+<path fill="none" stroke="#191970" d="M1737.5458,-292.4895C1690.7334,-281.5929 1631.4398,-266.4528 1609,-255 1573.8213,-237.0456 1564.0502,-230.3287 1540,-199 1513.9167,-165.0229 1496.6886,-117.2229 1488.5152,-90.858"/>
+<polygon fill="#191970" stroke="#191970" points="1491.8681,-89.8538 1485.653,-81.2734 1485.1608,-91.8568 1491.8681,-89.8538"/>
</g>
<!-- Node26->Node14 -->
<g id="edge47" class="edge">
<title>Node26->Node14</title>
-<path fill="none" stroke="#191970" d="M947.3236,-291.4963C915.5788,-281.6661 873.6419,-268.2293 837,-255 688.8104,-201.4972 656.4705,-175.7183 508,-123 461.8728,-106.6213 407.5496,-90.3209 373.939,-80.5762"/>
-<polygon fill="#191970" stroke="#191970" points="374.7834,-77.1771 364.2051,-77.7696 372.844,-83.9031 374.7834,-77.1771"/>
+<path fill="none" stroke="#191970" d="M1860.9595,-291.4396C1990.6586,-259.0447 2307.1233,-180.9841 2574,-123 2656.2037,-105.1397 2753.5515,-86.6248 2802.8839,-77.4275"/>
+<polygon fill="#191970" stroke="#191970" points="2803.751,-80.8263 2812.9427,-75.5573 2802.4714,-73.9443 2803.751,-80.8263"/>
</g>
<!-- Node26->Node15 -->
<g id="edge49" class="edge">
<title>Node26->Node15</title>
-<path fill="none" stroke="#191970" d="M1060.2659,-297.9508C1077.0797,-295.6848 1095.227,-293.2444 1112,-291 1232.4291,-274.8852 1267.031,-291.2498 1383,-255 1401.5925,-249.1883 1403.7764,-241.8817 1422,-235 1482.9095,-211.999 1502.0837,-219.1342 1564,-199 1586.7609,-191.5985 1590.8778,-185.1808 1614,-179 1723.2995,-149.7833 1754.7976,-163.8331 1866,-143 1904.5712,-135.7739 1913.3005,-129.5037 1952,-123 2115.2594,-95.5631 2311.6663,-80.2114 2398.2667,-74.3271"/>
-<polygon fill="#191970" stroke="#191970" points="2398.5303,-77.8174 2408.2738,-73.6562 2398.0621,-70.833 2398.5303,-77.8174"/>
+<path fill="none" stroke="#191970" d="M1737.6536,-293.1253C1733.0437,-292.3436 1728.4566,-291.6234 1724,-291 1523.9791,-263.022 1464.9967,-311.1819 1271,-255 1252.668,-249.691 1250.9543,-241.4723 1233,-235 1159.9364,-208.6615 1133.2906,-229.8165 1062,-199 995.4401,-170.2284 928.3346,-115.3246 897.098,-87.916"/>
+<polygon fill="#191970" stroke="#191970" points="899.4104,-85.2886 889.6092,-81.2652 894.7621,-90.5225 899.4104,-85.2886"/>
</g>
<!-- Node26->Node16 -->
<g id="edge51" class="edge">
<title>Node26->Node16</title>
-<path fill="none" stroke="#191970" d="M933.8397,-291.9339C899.1621,-282.5195 861.1695,-269.3708 851,-255 814.3905,-203.2661 857.7946,-124.7713 881.0918,-89.6259"/>
-<polygon fill="#191970" stroke="#191970" points="884.0338,-91.524 886.7843,-81.2924 878.2536,-87.5756 884.0338,-91.524"/>
+<path fill="none" stroke="#191970" d="M1737.6579,-293.0943C1733.0469,-292.3198 1728.4584,-291.6097 1724,-291 1507.2099,-261.3519 1448.8518,-290.846 1233,-255 1208.7536,-250.9735 1042.2591,-206.9443 1019,-199 997.517,-191.6624 993.2624,-186.954 972,-179 923.4245,-160.8285 910.2157,-159.3583 861,-143 797.8618,-122.0141 724.306,-96.7914 683.1455,-82.6069"/>
+<polygon fill="#191970" stroke="#191970" points="684.2435,-79.2834 673.6488,-79.332 681.9614,-85.9009 684.2435,-79.2834"/>
</g>
<!-- Node26->Node18 -->
<g id="edge52" class="edge">
<title>Node26->Node18</title>
-<path fill="none" stroke="#191970" d="M1060.3281,-302.7224C1255.8715,-291.0581 1845.8516,-255.8653 1993.9772,-247.0295"/>
-<polygon fill="#191970" stroke="#191970" points="1994.4807,-250.5058 2004.2545,-246.4164 1994.0638,-243.5182 1994.4807,-250.5058"/>
+<path fill="none" stroke="#191970" d="M1737.7938,-299.3233C1710.2439,-296.3954 1677.5542,-293.1905 1648,-291 1458.4551,-276.9516 853.7416,-252.3442 703.8585,-246.3464"/>
+<polygon fill="#191970" stroke="#191970" points="703.9967,-242.8492 693.8649,-245.9471 703.7172,-249.8436 703.9967,-242.8492"/>
</g>
<!-- Node26->Node21 -->
<g id="edge38" class="edge">
<title>Node26->Node21</title>
-<path fill="none" stroke="#191970" d="M1060.3094,-292.5447C1115.1989,-280.4454 1192.7902,-263.3419 1238.402,-253.2877"/>
-<polygon fill="#191970" stroke="#191970" points="1239.4407,-256.6429 1248.4528,-251.0722 1237.9338,-249.807 1239.4407,-256.6429"/>
+<path fill="none" stroke="#191970" d="M1737.6208,-293.3428C1733.0186,-292.5101 1728.4422,-291.7189 1724,-291 1594.4837,-270.0391 1561.2053,-271.1368 1431,-255 1417.2992,-253.302 1402.2803,-251.428 1388.9928,-249.7657"/>
+<polygon fill="#191970" stroke="#191970" points="1389.1232,-246.2548 1378.7659,-248.4853 1388.2536,-253.2005 1389.1232,-246.2548"/>
</g>
<!-- Node26->Node22 -->
<g id="edge40" class="edge">
<title>Node26->Node22</title>
-<path fill="none" stroke="#191970" d="M1060.2965,-297.6313C1157.16,-283.9919 1332.4573,-259.0138 1346,-255 1357.6674,-251.542 1422.0391,-221.3149 1460.6574,-202.9885"/>
-<polygon fill="#191970" stroke="#191970" points="1462.3432,-206.0626 1469.873,-198.6092 1459.3387,-199.7401 1462.3432,-206.0626"/>
+<path fill="none" stroke="#191970" d="M1737.6476,-293.1673C1733.0391,-292.3757 1728.4539,-291.6418 1724,-291 1542.9467,-264.9107 1490.7061,-302.2853 1314,-255 1293.6392,-249.5516 1290.5589,-242.8547 1271,-235 1239.9293,-222.5222 1204.0156,-210.3917 1177.0445,-201.7231"/>
+<polygon fill="#191970" stroke="#191970" points="1177.7649,-198.2792 1167.1742,-198.5768 1175.6389,-204.9486 1177.7649,-198.2792"/>
</g>
<!-- Node26->Node24 -->
<g id="edge42" class="edge">
<title>Node26->Node24</title>
-<path fill="none" stroke="#191970" d="M1060.3528,-294.6148C1102.7975,-283.703 1147.1777,-264.6614 1125,-235 1117.9999,-225.6378 1054.778,-209.1393 1011.0007,-198.6547"/>
-<polygon fill="#191970" stroke="#191970" points="1011.6982,-195.2231 1001.1603,-196.3203 1010.0824,-202.0341 1011.6982,-195.2231"/>
+<path fill="none" stroke="#191970" d="M1737.7537,-299.8078C1710.1968,-296.9656 1677.5126,-293.6938 1648,-291 1451.2798,-273.0441 1401.4392,-275.8058 1205,-255 1159.3296,-250.1628 855.1668,-209.7057 742.1662,-194.6314"/>
+<polygon fill="#191970" stroke="#191970" points="742.5308,-191.1491 732.1557,-193.2956 741.6049,-198.0876 742.5308,-191.1491"/>
</g>
<!-- Node26->Node25 -->
<g id="edge45" class="edge">
<title>Node26->Node25</title>
-<path fill="none" stroke="#191970" d="M944.3609,-291.3787C909.3024,-278.7048 873.1841,-258.8716 894,-235 930.8138,-192.7819 1082.8182,-207.2111 1154.347,-198.934"/>
-<polygon fill="#191970" stroke="#191970" points="1154.8765,-202.3941 1164.2932,-197.5386 1153.9039,-195.462 1154.8765,-202.3941"/>
+<path fill="none" stroke="#191970" d="M1737.6057,-293.4338C1733.007,-292.5796 1728.4355,-291.7589 1724,-291 1611.1821,-271.6983 1575.4528,-297.0504 1469,-255 1439.6744,-243.4159 1410.713,-220.9617 1392.8543,-205.4854"/>
+<polygon fill="#191970" stroke="#191970" points="1394.8255,-202.5547 1385.0263,-198.5267 1390.1748,-207.7865 1394.8255,-202.5547"/>
</g>
<!-- Node27 -->
<g id="node28" class="node">
<title>Node27</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="902.5,-235.5 902.5,-254.5 955.5,-254.5 955.5,-235.5 902.5,-235.5"/>
-<text text-anchor="middle" x="929" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">cstddef</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="1784.5,-235.5 1784.5,-254.5 1837.5,-254.5 1837.5,-235.5 1784.5,-235.5"/>
+<text text-anchor="middle" x="1811" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">cstddef</text>
</g>
<!-- Node26->Node27 -->
<g id="edge43" class="edge">
<title>Node26->Node27</title>
-<path fill="none" stroke="#191970" d="M980.191,-291.2977C970.2603,-282.3163 957.6743,-270.9334 947.4673,-261.702"/>
-<polygon fill="#191970" stroke="#191970" points="949.652,-258.9588 939.8877,-254.8469 944.9566,-264.1505 949.652,-258.9588"/>
+<path fill="none" stroke="#191970" d="M1803.4719,-291.2977C1804.7733,-283.2945 1806.3847,-273.3843 1807.7823,-264.7889"/>
+<polygon fill="#191970" stroke="#191970" points="1811.2485,-265.2791 1809.3989,-254.8469 1804.3392,-264.1555 1811.2485,-265.2791"/>
</g>
<!-- Node28 -->
<g id="node29" class="node">
<title>Node28</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="974,-235.5 974,-254.5 1024,-254.5 1024,-235.5 974,-235.5"/>
-<text text-anchor="middle" x="999" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">cstring</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="1625,-235.5 1625,-254.5 1675,-254.5 1675,-235.5 1625,-235.5"/>
+<text text-anchor="middle" x="1650" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">cstring</text>
</g>
<!-- Node26->Node28 -->
<g id="edge44" class="edge">
<title>Node26->Node28</title>
-<path fill="none" stroke="#191970" d="M997.4944,-291.2977C997.7518,-283.3834 998.0698,-273.6043 998.3471,-265.0759"/>
-<polygon fill="#191970" stroke="#191970" points="1001.8528,-264.9554 998.6798,-254.8469 994.8565,-264.7279 1001.8528,-264.9554"/>
+<path fill="none" stroke="#191970" d="M1764.0612,-291.4554C1739.198,-281.329 1706.7082,-268.0964 1682.7664,-258.3453"/>
+<polygon fill="#191970" stroke="#191970" points="1683.9812,-255.0609 1673.3996,-254.5303 1681.3407,-261.5438 1683.9812,-255.0609"/>
</g>
<!-- Node29 -->
<g id="node30" class="node">
<title>Node29</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="1700,-235.5 1700,-254.5 1758,-254.5 1758,-235.5 1700,-235.5"/>
-<text text-anchor="middle" x="1729" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">memory</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="555,-235.5 555,-254.5 613,-254.5 613,-235.5 555,-235.5"/>
+<text text-anchor="middle" x="584" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">memory</text>
</g>
<!-- Node26->Node29 -->
<g id="edge46" class="edge">
<title>Node26->Node29</title>
-<path fill="none" stroke="#191970" d="M1060.1734,-301.3269C1172.0296,-292.1469 1412.256,-272.3399 1615,-255 1639.9499,-252.8661 1667.9656,-250.4095 1689.9804,-248.4651"/>
-<polygon fill="#191970" stroke="#191970" points="1690.2997,-251.9506 1699.9524,-247.5831 1689.6829,-244.9778 1690.2997,-251.9506"/>
+<path fill="none" stroke="#191970" d="M1737.79,-301.6044C1692.9373,-298.2583 1631.3014,-293.9179 1577,-291 1159.5182,-268.5669 1052.4466,-301.8867 637,-255 632.4814,-254.49 627.77,-253.8137 623.1014,-253.0525"/>
+<polygon fill="#191970" stroke="#191970" points="623.4965,-249.5683 613.0412,-251.282 622.2832,-256.4623 623.4965,-249.5683"/>
</g>
<!-- Node30 -->
<g id="node31" class="node">
<title>Node30</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="1042.5,-235.5 1042.5,-254.5 1115.5,-254.5 1115.5,-235.5 1042.5,-235.5"/>
-<text text-anchor="middle" x="1079" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">string_view</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="1693.5,-235.5 1693.5,-254.5 1766.5,-254.5 1766.5,-235.5 1693.5,-235.5"/>
+<text text-anchor="middle" x="1730" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">string_view</text>
</g>
<!-- Node26->Node30 -->
<g id="edge48" class="edge">
<title>Node26->Node30</title>
-<path fill="none" stroke="#191970" d="M1017.2697,-291.2977C1029.6006,-282.0495 1045.3266,-270.2551 1057.8197,-260.8853"/>
-<polygon fill="#191970" stroke="#191970" points="1059.9708,-263.647 1065.8708,-254.8469 1055.7707,-258.047 1059.9708,-263.647"/>
+<path fill="none" stroke="#191970" d="M1783.4494,-291.2977C1772.9779,-282.2274 1759.6789,-270.7077 1748.9662,-261.4285"/>
+<polygon fill="#191970" stroke="#191970" points="1751.2182,-258.7487 1741.368,-254.8469 1746.6351,-264.0398 1751.2182,-258.7487"/>
</g>
<!-- Node31 -->
<g id="node32" class="node">
<title>Node31</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="1512.5,-235.5 1512.5,-254.5 1605.5,-254.5 1605.5,-235.5 1512.5,-235.5"/>
-<text text-anchor="middle" x="1559" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">unordered_map</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="945.5,-235.5 945.5,-254.5 1038.5,-254.5 1038.5,-235.5 945.5,-235.5"/>
+<text text-anchor="middle" x="992" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">unordered_map</text>
</g>
<!-- Node26->Node31 -->
<g id="edge50" class="edge">
<title>Node26->Node31</title>
-<path fill="none" stroke="#191970" d="M1060.1041,-299.5945C1170.4132,-287.5233 1396.5996,-262.7716 1502.2631,-251.2088"/>
-<polygon fill="#191970" stroke="#191970" points="1502.8492,-254.6656 1512.4091,-250.0985 1502.0877,-247.7071 1502.8492,-254.6656"/>
+<path fill="none" stroke="#191970" d="M1737.7686,-299.6397C1710.2143,-296.7678 1677.528,-293.5192 1648,-291 1417.616,-271.345 1359.6374,-271.4164 1129,-255 1102.6699,-253.1259 1073.4969,-251.0029 1048.8619,-249.1967"/>
+<polygon fill="#191970" stroke="#191970" points="1048.8327,-245.6852 1038.6033,-248.4436 1048.3202,-252.6664 1048.8327,-245.6852"/>
</g>
<!-- Node32->Node15 -->
<g id="edge58" class="edge">
<title>Node32->Node15</title>
-<path fill="none" stroke="#191970" d="M1212.1912,-363.3893C1275.6303,-353.1792 1376.1472,-336.8395 1463,-322 1630.7501,-293.3386 1674.5221,-294.7322 1840,-255 2057.6448,-202.7422 2312.4549,-116.8437 2406.0908,-84.4254"/>
-<polygon fill="#191970" stroke="#191970" points="2407.3753,-87.6845 2415.6748,-81.0989 2405.08,-81.0715 2407.3753,-87.6845"/>
+<path fill="none" stroke="#191970" d="M501.2499,-358.2886C498.9341,-348.094 496.186,-334.3299 495,-322 491.1706,-282.1887 489.555,-263.1156 518,-235 605.4761,-148.5371 668.5769,-197.1722 779,-143 809.7111,-127.9335 841.1658,-103.7053 860.3942,-87.6937"/>
+<polygon fill="#191970" stroke="#191970" points="862.7153,-90.3145 868.0854,-81.1815 858.1919,-84.9723 862.7153,-90.3145"/>
</g>
<!-- Node32->Node16 -->
<g id="edge59" class="edge">
<title>Node32->Node16</title>
-<path fill="none" stroke="#191970" d="M1085.5794,-366.776C1039.248,-360.0294 975.8946,-346.9083 925,-322 905.4229,-312.4188 904.2964,-304.2626 887,-291 865.27,-274.3377 850.3448,-278.9112 837,-255 808.3103,-203.594 820.0152,-174.7931 848,-123 855.0345,-109.9809 865.98,-97.5433 875.4295,-88.1533"/>
-<polygon fill="#191970" stroke="#191970" points="878.0723,-90.4696 882.8964,-81.0367 873.2428,-85.4023 878.0723,-90.4696"/>
+<path fill="none" stroke="#191970" d="M441.8929,-366.8023C371.0842,-355.3849 266,-324.9695 266,-245 266,-245 266,-245 266,-189 266,-157.7875 265.7666,-142.6714 290,-123 339.9958,-82.416 538.4957,-73.8115 617.9096,-71.989"/>
+<polygon fill="#191970" stroke="#191970" points="618.3715,-75.4803 628.297,-71.7743 618.2268,-68.4818 618.3715,-75.4803"/>
</g>
<!-- Node32->Node18 -->
<g id="edge60" class="edge">
<title>Node32->Node18</title>
-<path fill="none" stroke="#191970" d="M1212.1125,-370.72C1370.7069,-363.4567 1777.3921,-343.0144 1835,-322 1857.1331,-313.9262 1856.9007,-301.4825 1878,-291 1898.5138,-280.8084 1956.8171,-264.1534 1994.4015,-253.9266"/>
-<polygon fill="#191970" stroke="#191970" points="1995.6637,-257.2112 2004.4042,-251.2234 1993.8374,-250.4536 1995.6637,-257.2112"/>
+<path fill="none" stroke="#191970" d="M519.6036,-358.1983C536.5895,-340.8336 565.9029,-312.1783 594,-291 609.5031,-279.3145 628.1348,-267.9992 643.0704,-259.5152"/>
+<polygon fill="#191970" stroke="#191970" points="644.8083,-262.5534 651.8312,-254.6206 641.3942,-256.4425 644.8083,-262.5534"/>
</g>
<!-- Node32->Node20 -->
<g id="edge62" class="edge">
<title>Node32->Node20</title>
-<path fill="none" stroke="#191970" d="M1085.8907,-360.8696C1033.864,-350.4533 958.1405,-335.2839 892,-322 890.7216,-321.7432 889.4312,-321.484 888.1316,-321.2229"/>
-<polygon fill="#191970" stroke="#191970" points="888.5248,-317.732 878.0312,-319.1928 887.1454,-324.5948 888.5248,-317.732"/>
+<path fill="none" stroke="#191970" d="M568.07,-366.1382C675.2124,-353.632 892.1797,-328.3066 1005.8242,-315.0414"/>
+<polygon fill="#191970" stroke="#191970" points="1006.299,-318.5099 1015.8257,-313.874 1005.4873,-311.5571 1006.299,-318.5099"/>
</g>
<!-- Node32->Node21 -->
<g id="edge61" class="edge">
<title>Node32->Node21</title>
-<path fill="none" stroke="#191970" d="M1162.5755,-358.2716C1177.7034,-341.4649 1203.0595,-313.7765 1226,-291 1236.0994,-280.9727 1247.7857,-270.1792 1257.3195,-261.5691"/>
-<polygon fill="#191970" stroke="#191970" points="1259.9402,-263.9202 1265.0497,-254.6388 1255.2675,-258.7081 1259.9402,-263.9202"/>
+<path fill="none" stroke="#191970" d="M568.3008,-362.4722C663.4069,-346.0453 848.8829,-314.5591 1007,-291 1117.9834,-274.4637 1249.5355,-257.6437 1313.0008,-249.7034"/>
+<polygon fill="#191970" stroke="#191970" points="1313.8251,-253.1278 1323.3147,-248.4163 1312.9582,-246.1817 1313.8251,-253.1278"/>
</g>
<!-- Node32->Node24 -->
<g id="edge56" class="edge">
<title>Node32->Node24</title>
-<path fill="none" stroke="#191970" d="M1164.5325,-358.1927C1185.2609,-338.8193 1224.0204,-306.1515 1264,-291 1411.7205,-235.017 1723.1004,-350.1945 1615,-235 1571.8784,-189.0485 1111.7381,-204.9108 1049,-199 1036.6978,-197.841 1023.3667,-196.2921 1011.1794,-194.7601"/>
-<polygon fill="#191970" stroke="#191970" points="1011.6231,-191.2884 1001.2587,-193.4861 1010.7315,-198.2314 1011.6231,-191.2884"/>
+<path fill="none" stroke="#191970" d="M501.5962,-358.4295C498.463,-341.523 495.3514,-313.5196 503,-291 513.0917,-261.2872 520.1565,-252.7989 546,-235 579.4891,-211.9354 624.7541,-200.2238 657.536,-194.4137"/>
+<polygon fill="#191970" stroke="#191970" points="658.4168,-197.8156 667.7101,-192.7278 657.2724,-190.9097 658.4168,-197.8156"/>
</g>
<!-- Node32->Node29 -->
<g id="edge57" class="edge">
<title>Node32->Node29</title>
-<path fill="none" stroke="#191970" d="M1180.137,-358.3954C1224.8065,-336.9644 1305.0948,-299.3932 1336,-291 1470.7073,-254.4161 1509.3981,-271.5517 1648,-255 1661.5439,-253.3826 1676.3555,-251.5676 1689.5663,-249.9322"/>
-<polygon fill="#191970" stroke="#191970" points="1690.2704,-253.3718 1699.763,-248.6665 1689.408,-246.4251 1690.2704,-253.3718"/>
+<path fill="none" stroke="#191970" d="M503.9713,-358.2764C503.5169,-340.726 505.0772,-311.649 518,-291 526.3472,-277.6622 540.1291,-267.1552 552.946,-259.5705"/>
+<polygon fill="#191970" stroke="#191970" points="554.952,-262.4611 562.0166,-254.5654 551.5701,-256.3322 554.952,-262.4611"/>
</g>
<!-- Node33->Node9 -->
<g id="edge64" class="edge">
<title>Node33->Node9</title>
-<path fill="none" stroke="#191970" d="M2767.8799,-296.8787C2788.3256,-278.6037 2833.5263,-237.27 2868,-199 2899.2947,-164.2591 2932.1066,-120.8837 2951.2735,-94.7305"/>
-<polygon fill="#191970" stroke="#191970" points="2954.1677,-96.7018 2957.2286,-86.5588 2948.5105,-92.5792 2954.1677,-96.7018"/>
+<path fill="none" stroke="#191970" d="M2522.405,-296.9731C2515.6329,-286.8884 2505.1066,-270.3398 2498,-255 2472.764,-200.5271 2452.5729,-132.7865 2442.6637,-96.7766"/>
+<polygon fill="#191970" stroke="#191970" points="2445.9185,-95.4057 2439.9243,-86.6696 2439.1623,-97.237 2445.9185,-95.4057"/>
</g>
<!-- Node33->Node13 -->
<g id="edge65" class="edge">
<title>Node33->Node13</title>
-<path fill="none" stroke="#191970" d="M2745.7022,-296.8466C2716.5338,-272.5092 2635.4164,-208.4895 2556,-179 2435.6703,-134.3182 2057.0657,-93.8813 1896.8038,-78.2952"/>
-<polygon fill="#191970" stroke="#191970" points="1896.8591,-74.7843 1886.5685,-77.3047 1896.1848,-81.7517 1896.8591,-74.7843"/>
+<path fill="none" stroke="#191970" d="M2520.5598,-296.8993C2489.6754,-262.0335 2382.2141,-143.0991 2336,-123 2300.5002,-107.5607 1755.737,-83.058 1555.8334,-74.5382"/>
+<polygon fill="#191970" stroke="#191970" points="1555.8632,-71.0364 1545.7235,-74.1085 1555.5658,-78.0301 1555.8632,-71.0364"/>
</g>
<!-- Node33->Node14 -->
<g id="edge66" class="edge">
<title>Node33->Node14</title>
-<path fill="none" stroke="#191970" d="M2726.7362,-296.9864C2676.8612,-281.6619 2574.0466,-251.5586 2485,-235 2041.84,-152.5923 1926.3864,-158.1235 1477,-123 1040.9351,-88.9177 508.7257,-75.2392 374.1045,-72.1888"/>
-<polygon fill="#191970" stroke="#191970" points="374.1277,-68.6886 364.0519,-71.9642 373.9712,-75.6868 374.1277,-68.6886"/>
+<path fill="none" stroke="#191970" d="M2560.7637,-296.9297C2584.4839,-288.6309 2616.6914,-274.8126 2640,-255 2693.3502,-209.6517 2673.5642,-169.4027 2726,-123 2748.3912,-103.1851 2779.7359,-89.4476 2803.0683,-81.1752"/>
+<polygon fill="#191970" stroke="#191970" points="2804.499,-84.3859 2812.8452,-77.8597 2802.2509,-77.7566 2804.499,-84.3859"/>
</g>
<!-- Node33->Node15 -->
<g id="edge67" class="edge">
<title>Node33->Node15</title>
-<path fill="none" stroke="#191970" d="M2753.263,-296.7422C2740.9923,-266.0457 2698.8512,-170.6666 2632,-123 2588.9865,-92.3303 2528.5859,-79.9187 2487.7354,-74.9"/>
-<polygon fill="#191970" stroke="#191970" points="2488.1231,-71.4216 2477.7934,-73.7767 2487.3372,-78.3774 2488.1231,-71.4216"/>
+<path fill="none" stroke="#191970" d="M2512.2685,-296.8697C2470.8382,-273.4065 2359.6129,-212.7411 2260,-179 2147.2196,-140.7987 2116.0962,-138.2332 1998,-123 1784.777,-95.4964 1105.194,-77.0862 923.8425,-72.5772"/>
+<polygon fill="#191970" stroke="#191970" points="923.6889,-69.0724 913.6055,-72.3243 923.516,-76.0703 923.6889,-69.0724"/>
</g>
<!-- Node34->Node6 -->
<g id="edge70" class="edge">
<title>Node34->Node6</title>
-<path fill="none" stroke="#191970" d="M2257.9658,-425.4793C2205.3403,-406.4158 2109.6377,-374.0826 2025,-358 1918.7863,-337.8177 1601.5445,-318.003 1463.9057,-310.0948"/>
-<polygon fill="#191970" stroke="#191970" points="1464.0996,-306.6003 1453.9161,-309.5236 1463.6999,-313.5889 1464.0996,-306.6003"/>
+<path fill="none" stroke="#191970" d="M1941.3497,-438.9147C1819.242,-434.9023 1535.0141,-422.1963 1445,-389 1422.5559,-380.7229 1422.2808,-368.9258 1401,-358 1368.275,-341.1986 1328.8309,-327.7783 1298.8951,-318.8546"/>
+<polygon fill="#191970" stroke="#191970" points="1299.7112,-315.4468 1289.131,-316.0023 1297.7484,-322.166 1299.7112,-315.4468"/>
</g>
<!-- Node34->Node14 -->
-<g id="edge92" class="edge">
+<g id="edge83" class="edge">
<title>Node34->Node14</title>
-<path fill="none" stroke="#191970" d="M2241.4885,-433.5167C2213.6789,-430.3623 2179.629,-426.9192 2149,-425 2056.1606,-419.1827 556.9068,-433.094 475,-389 435.7581,-367.8744 418,-351.0671 418,-306.5 418,-306.5 418,-306.5 418,-189 418,-147.4563 384.0901,-108.986 361.5719,-88.0193"/>
-<polygon fill="#191970" stroke="#191970" points="363.6556,-85.1865 353.875,-81.1133 358.9808,-90.3968 363.6556,-85.1865"/>
+<path fill="none" stroke="#191970" d="M2054.6064,-432.6555C2072.6282,-430.1794 2092.6483,-427.4511 2111,-425 2234.0537,-408.5644 2266.7477,-415.65 2388,-389 2433.0318,-379.1025 2442.1043,-368.4976 2487,-358 2587.3218,-334.5426 2631.2829,-380.6322 2716,-322 2796.8177,-266.0666 2824.4131,-140.0558 2832.2775,-91.2924"/>
+<polygon fill="#191970" stroke="#191970" points="2835.7796,-91.5391 2833.8057,-81.1298 2828.8575,-90.4981 2835.7796,-91.5391"/>
</g>
<!-- Node34->Node33 -->
<g id="edge71" class="edge">
<title>Node34->Node33</title>
-<path fill="none" stroke="#191970" d="M2349.7609,-425.389C2441.3434,-398.6525 2630.2231,-343.5111 2714.6308,-318.8692"/>
-<polygon fill="#191970" stroke="#191970" points="2715.7745,-322.1815 2724.3929,-316.0193 2713.8128,-315.462 2715.7745,-322.1815"/>
+<path fill="none" stroke="#191970" d="M2054.796,-432.5123C2143.0875,-419.9792 2305.8825,-396.3631 2332,-389 2394.5729,-371.3594 2463.8425,-339.1838 2501.2843,-320.6578"/>
+<polygon fill="#191970" stroke="#191970" points="2503.15,-323.6383 2510.5319,-316.0384 2500.0219,-317.3761 2503.15,-323.6383"/>
</g>
<!-- Node34->Node35 -->
<g id="edge72" class="edge">
<title>Node34->Node35</title>
-<path fill="none" stroke="#191970" d="M2298,-425.2967C2298,-415.7699 2298,-403.3954 2298,-393.1306"/>
-<polygon fill="#191970" stroke="#191970" points="2301.5001,-393.0549 2298,-383.055 2294.5001,-393.055 2301.5001,-393.0549"/>
+<path fill="none" stroke="#191970" d="M1973.0894,-425.4639C1954.7167,-414.3741 1929.84,-399.3584 1911.5526,-388.32"/>
+<polygon fill="#191970" stroke="#191970" points="1913.281,-385.2751 1902.911,-383.1039 1909.6636,-391.268 1913.281,-385.2751"/>
</g>
<!-- Node40 -->
-<g id="node41" class="node">
+<g id="node37" class="node">
<title>Node40</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="2566.5,-297 2566.5,-316 2631.5,-316 2631.5,-297 2566.5,-297"/>
-<text text-anchor="middle" x="2599" y="-304" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">functional</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="2148.5,-297 2148.5,-316 2213.5,-316 2213.5,-297 2148.5,-297"/>
+<text text-anchor="middle" x="2181" y="-304" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">functional</text>
</g>
<!-- Node34->Node40 -->
-<g id="edge91" class="edge">
+<g id="edge82" class="edge">
<title>Node34->Node40</title>
-<path fill="none" stroke="#191970" d="M2331.9434,-425.389C2390.9516,-399.1196 2511.5548,-345.4291 2568.2523,-320.1883"/>
-<polygon fill="#191970" stroke="#191970" points="2569.7143,-323.3687 2577.4265,-316.1042 2566.8674,-316.9738 2569.7143,-323.3687"/>
+<path fill="none" stroke="#191970" d="M2010.0141,-425.2951C2024.6476,-407.4966 2050.7995,-377.9247 2078,-358 2099.3028,-342.3954 2126.042,-329.1838 2146.937,-320.0498"/>
+<polygon fill="#191970" stroke="#191970" points="2148.554,-323.1653 2156.38,-316.0236 2145.8085,-316.7261 2148.554,-323.1653"/>
</g>
<!-- Node35->Node8 -->
-<g id="edge81" class="edge">
+<g id="edge77" class="edge">
<title>Node35->Node8</title>
-<path fill="none" stroke="#191970" d="M2298.0185,-363.984C2297.7461,-348.243 2295.8951,-316.0365 2286,-291 2263.7039,-234.5864 2256.8054,-215.0228 2208,-179 2185.9101,-162.6957 2157.8274,-152.0351 2132.3385,-145.1284"/>
-<polygon fill="#191970" stroke="#191970" points="2132.9007,-141.6596 2122.3452,-142.572 2131.1658,-148.4412 2132.9007,-141.6596"/>
+<path fill="none" stroke="#191970" d="M1902.448,-363.8628C1915.6725,-354.7526 1934.0986,-339.8419 1944,-322 1974.948,-266.2328 1972.2253,-240.6806 1956,-179 1953.3888,-169.0734 1948.0933,-159.0366 1942.9588,-150.8822"/>
+<polygon fill="#191970" stroke="#191970" points="1945.864,-148.9303 1937.3819,-142.5817 1940.0537,-152.8342 1945.864,-148.9303"/>
</g>
<!-- Node35->Node9 -->
<g id="edge73" class="edge">
<title>Node35->Node9</title>
-<path fill="none" stroke="#191970" d="M2338.3424,-363.9771C2393.5991,-350.84 2487.748,-328.0863 2503,-322 2529.0615,-311.6001 2533.4018,-304.5013 2558,-291 2680.3618,-223.8386 2711.9733,-208.9354 2835,-143 2867.4506,-125.6083 2904.3243,-105.78 2931.0017,-91.4228"/>
-<polygon fill="#191970" stroke="#191970" points="2932.842,-94.4071 2939.9886,-86.5856 2929.5242,-88.2433 2932.842,-94.4071"/>
+<path fill="none" stroke="#191970" d="M1906.8901,-363.9072C1927.7415,-353.7688 1961.388,-337.1769 1990,-322 2144.9866,-239.7895 2326.3627,-135.2885 2401.106,-91.8553"/>
+<polygon fill="#191970" stroke="#191970" points="2403.216,-94.6771 2410.1007,-86.6241 2399.6967,-88.6261 2403.216,-94.6771"/>
</g>
<!-- Node35->Node16 -->
-<g id="edge89" class="edge">
+<g id="edge80" class="edge">
<title>Node35->Node16</title>
-<path fill="none" stroke="#191970" d="M2285.4975,-363.8349C2251.5596,-338.1453 2154.0048,-267.89 2061,-235 1842.2099,-157.6276 1780.5584,-157.0375 1551,-123 1313.1616,-87.7348 1023.0681,-75.6633 926.7866,-72.4705"/>
-<polygon fill="#191970" stroke="#191970" points="926.6399,-68.9641 916.5324,-72.1401 926.4144,-75.9605 926.6399,-68.9641"/>
+<path fill="none" stroke="#191970" d="M1856.2607,-363.9856C1824.3084,-353.9779 1772.9222,-337.5519 1729,-322 1692.6641,-309.1342 1685.6289,-299.3603 1648,-291 1430.0731,-242.5817 1366.1931,-297.3184 1147,-255 1066.8177,-239.5197 1049.7146,-224.0854 972,-199 895.4929,-174.3044 874.6268,-172.9007 800,-143 756.4494,-125.5506 707.7442,-101.2216 678.237,-85.9205"/>
+<polygon fill="#191970" stroke="#191970" points="679.4861,-82.6245 669.0014,-81.1006 676.2473,-88.8302 679.4861,-82.6245"/>
</g>
<!-- Node35->Node17 -->
-<g id="edge87" class="edge">
+<g id="edge78" class="edge">
<title>Node35->Node17</title>
-<path fill="none" stroke="#191970" d="M2336.0136,-363.997C2388.2532,-350.7665 2476.3547,-327.7674 2481,-322 2505.2803,-291.8547 2506.797,-266.9869 2485,-235 2439.8202,-168.6991 2196.2124,-102.8317 2104.209,-79.9549"/>
-<polygon fill="#191970" stroke="#191970" points="2104.8144,-76.4994 2094.2673,-77.5039 2103.1388,-83.2959 2104.8144,-76.4994"/>
+<path fill="none" stroke="#191970" d="M1910.5975,-363.9694C1928.4738,-355.5437 1952.2331,-341.5409 1966,-322 2017.5929,-248.7684 2017.3784,-210.4595 1998,-123 1995.466,-111.5636 1990.1188,-99.6558 1985.1134,-90.1747"/>
+<polygon fill="#191970" stroke="#191970" points="1988.0854,-88.318 1980.1534,-81.2943 1981.9741,-91.7315 1988.0854,-88.318"/>
</g>
<!-- Node35->Node18 -->
-<g id="edge90" class="edge">
+<g id="edge81" class="edge">
<title>Node35->Node18</title>
-<path fill="none" stroke="#191970" d="M2277.7061,-363.8416C2230.3875,-341.3215 2112.5812,-285.2544 2057.3953,-258.99"/>
-<polygon fill="#191970" stroke="#191970" points="2058.6947,-255.7323 2048.1611,-254.5952 2055.6865,-262.053 2058.6947,-255.7323"/>
+<path fill="none" stroke="#191970" d="M1851.4771,-363.9424C1771.5417,-342.4491 1583.9207,-292.0841 1577,-291 1489.6277,-277.3132 858.718,-252.2837 704.125,-246.3067"/>
+<polygon fill="#191970" stroke="#191970" points="703.959,-242.7978 693.8315,-245.9097 703.6891,-249.7926 703.959,-242.7978"/>
</g>
<!-- Node35->Node20 -->
<g id="edge74" class="edge">
<title>Node35->Node20</title>
-<path fill="none" stroke="#191970" d="M2235.4043,-369.518C2180.0848,-366.1011 2097.2041,-361.2308 2025,-358 1536.3383,-336.1346 1412.3281,-364.1864 925,-322 913.1413,-320.9734 900.607,-319.5409 888.4122,-317.9579"/>
-<polygon fill="#191970" stroke="#191970" points="888.647,-314.4582 878.2706,-316.5984 887.7168,-321.3961 888.647,-314.4582"/>
+<path fill="none" stroke="#191970" d="M1824.3151,-369.411C1676.8717,-359.7062 1311.3276,-335.1167 1189,-322 1177.0899,-320.7229 1164.4842,-319.1532 1152.2213,-317.5079"/>
+<polygon fill="#191970" stroke="#191970" points="1152.4061,-314.0007 1142.024,-316.1129 1151.4573,-320.9361 1152.4061,-314.0007"/>
</g>
<!-- Node35->Node26 -->
-<g id="edge79" class="edge">
+<g id="edge75" class="edge">
<title>Node35->Node26</title>
-<path fill="none" stroke="#191970" d="M2235.3891,-369.8264C2180.0599,-366.6084 2097.1734,-361.8567 2025,-358 1666.9554,-338.8674 1238.6438,-318.0887 1070.1728,-309.9981"/>
-<polygon fill="#191970" stroke="#191970" points="1070.3324,-306.5018 1060.1761,-309.5183 1069.9967,-313.4938 1070.3324,-306.5018"/>
+<path fill="none" stroke="#191970" d="M1874.6783,-363.9005C1862.6868,-354.5583 1844.0536,-340.0417 1828.5264,-327.945"/>
+<polygon fill="#191970" stroke="#191970" points="1830.3213,-324.9065 1820.2817,-321.5218 1826.0192,-330.4286 1830.3213,-324.9065"/>
</g>
<!-- Node35->Node33 -->
-<g id="edge80" class="edge">
+<g id="edge76" class="edge">
<title>Node35->Node33</title>
-<path fill="none" stroke="#191970" d="M2360.8596,-364.3244C2444.4317,-352.1254 2591.9671,-330.5898 2681.4828,-317.5232"/>
-<polygon fill="#191970" stroke="#191970" points="2682.1774,-320.959 2691.567,-316.0512 2681.1663,-314.0324 2682.1774,-320.959"/>
-</g>
-<!-- Node36 -->
-<g id="node37" class="node">
-<title>Node36</title>
-<g id="a_node37"><a xlink:href="shape__tuple_8h.html" target="_top" xlink:title="Runtime ShapeTuple container types. ">
-<polygon fill="#ffffff" stroke="#000000" points="1700,-291.5 1700,-321.5 1826,-321.5 1826,-291.5 1700,-291.5"/>
-<text text-anchor="start" x="1708" y="-309.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/container</text>
-<text text-anchor="middle" x="1763" y="-298.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/shape_tuple.h</text>
-</a>
-</g>
-</g>
-<!-- Node35->Node36 -->
-<g id="edge75" class="edge">
-<title>Node35->Node36</title>
-<path fill="none" stroke="#191970" d="M2235.3188,-365.6502C2135.7474,-353.1805 1941.9092,-328.9054 1836.1451,-315.6602"/>
-<polygon fill="#191970" stroke="#191970" points="1836.5484,-312.1835 1826.191,-314.4136 1835.6785,-319.1292 1836.5484,-312.1835"/>
-</g>
-<!-- Node37 -->
-<g id="node38" class="node">
-<title>Node37</title>
-<g id="a_node38"><a xlink:href="serializer_8h.html" target="_top" xlink:title="Serializer extension to support TVM data types Include this file to enable serialization of DLDataTyp...">
-<polygon fill="#ffffff" stroke="#000000" points="2338,-297 2338,-316 2472,-316 2472,-297 2338,-297"/>
-<text text-anchor="middle" x="2405" y="-304" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/serializer.h</text>
-</a>
-</g>
-</g>
-<!-- Node35->Node37 -->
-<g id="edge82" class="edge">
-<title>Node35->Node37</title>
-<path fill="none" stroke="#191970" d="M2308.4299,-363.9005C2324.1392,-352.7062 2353.6201,-334.0823 2376.0064,-321.0947"/>
-<polygon fill="#191970" stroke="#191970" points="2377.8464,-324.0746 2384.8008,-316.0817 2374.3798,-317.9932 2377.8464,-324.0746"/>
+<path fill="none" stroke="#191970" d="M1949.7266,-366.9538C2067.2291,-354.691 2320.2718,-328.2832 2449.5113,-314.7955"/>
+<polygon fill="#191970" stroke="#191970" points="2450.151,-318.2479 2459.7337,-313.7287 2449.4244,-311.2857 2450.151,-318.2479"/>
</g>
<!-- Node35->Node40 -->
-<g id="edge88" class="edge">
+<g id="edge79" class="edge">
<title>Node35->Node40</title>
-<path fill="none" stroke="#191970" d="M2346.6923,-363.9508C2414.8062,-350.5433 2532.4294,-327.1915 2552,-322 2555.3401,-321.114 2558.7907,-320.1238 2562.2347,-319.084"/>
-<polygon fill="#191970" stroke="#191970" points="2563.3365,-322.4066 2571.826,-316.0678 2561.2365,-315.729 2563.3365,-322.4066"/>
-</g>
-<!-- Node36->Node16 -->
-<g id="edge76" class="edge">
-<title>Node36->Node16</title>
-<path fill="none" stroke="#191970" d="M1819.4605,-291.3919C1851.0323,-281.6785 1885.8408,-268.4051 1895,-255 1900.0147,-247.6607 1900.3693,-242.084 1895,-235 1881.0928,-216.6515 1518.6535,-127.1105 1496,-123 1281.2783,-84.0381 1017.9732,-74.3903 926.8574,-72.1429"/>
-<polygon fill="#191970" stroke="#191970" points="926.6931,-68.6382 916.6142,-71.904 926.5298,-75.6363 926.6931,-68.6382"/>
-</g>
-<!-- Node36->Node18 -->
-<g id="edge77" class="edge">
-<title>Node36->Node18</title>
-<path fill="none" stroke="#191970" d="M1826.25,-292.7588C1871.8627,-282.7011 1934.8524,-268.5078 1990,-255 1991.4494,-254.645 1992.9286,-254.2776 1994.4219,-253.9026"/>
-<polygon fill="#191970" stroke="#191970" points="1995.4974,-257.2402 2004.3152,-251.3667 1993.7593,-250.4594 1995.4974,-257.2402"/>
-</g>
-<!-- Node36->Node21 -->
-<g id="edge78" class="edge">
-<title>Node36->Node21</title>
-<path fill="none" stroke="#191970" d="M1699.8325,-298.523C1597.037,-285.5416 1396.6921,-260.2414 1313.5552,-249.7426"/>
-<polygon fill="#191970" stroke="#191970" points="1313.9077,-246.2594 1303.5479,-248.4788 1313.0306,-253.2042 1313.9077,-246.2594"/>
-</g>
-<!-- Node37->Node9 -->
-<g id="edge85" class="edge">
-<title>Node37->Node9</title>
-<path fill="none" stroke="#191970" d="M2427.7676,-296.9967C2512.1226,-261.7863 2807.8686,-138.3399 2922.3038,-90.5739"/>
-<polygon fill="#191970" stroke="#191970" points="2923.9577,-93.6763 2931.8379,-86.5943 2921.2613,-87.2164 2923.9577,-93.6763"/>
-</g>
-<!-- Node37->Node35 -->
-<g id="edge86" class="edge">
-<title>Node37->Node35</title>
-<path fill="none" stroke="#191970" d="M2394.5951,-316.0817C2378.9004,-327.2681 2349.4231,-345.891 2327.0292,-358.8846"/>
-<polygon fill="#191970" stroke="#191970" points="2325.185,-355.9071 2318.2311,-363.9005 2328.6519,-361.9883 2325.185,-355.9071"/>
-</g>
-<!-- Node38 -->
-<g id="node39" class="node">
-<title>Node38</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="2298,-235.5 2298,-254.5 2360,-254.5 2360,-235.5 2298,-235.5"/>
-<text text-anchor="middle" x="2329" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">dmlc/io.h</text>
-</g>
-<!-- Node37->Node38 -->
-<g id="edge83" class="edge">
-<title>Node37->Node38</title>
-<path fill="none" stroke="#191970" d="M2393.125,-296.8906C2381.3703,-287.3786 2363.1514,-272.6357 2349.1328,-261.2917"/>
-<polygon fill="#191970" stroke="#191970" points="2351.0178,-258.3146 2341.0424,-254.7449 2346.6144,-263.7562 2351.0178,-258.3146"/>
-</g>
-<!-- Node39 -->
-<g id="node40" class="node">
-<title>Node39</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="2378.5,-235.5 2378.5,-254.5 2475.5,-254.5 2475.5,-235.5 2378.5,-235.5"/>
-<text text-anchor="middle" x="2427" y="-242.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">dmlc/serializer.h</text>
-</g>
-<!-- Node37->Node39 -->
-<g id="edge84" class="edge">
-<title>Node37->Node39</title>
-<path fill="none" stroke="#191970" d="M2408.4375,-296.8906C2411.5513,-288.1862 2416.232,-275.1015 2420.114,-264.2494"/>
-<polygon fill="#191970" stroke="#191970" points="2423.4412,-265.3394 2423.514,-254.7449 2416.8503,-262.9817 2423.4412,-265.3394"/>
+<path fill="none" stroke="#191970" d="M1934.0727,-363.9843C2000.2647,-350.5579 2114.9347,-327.1138 2134,-322 2137.3376,-321.1048 2140.7866,-320.1083 2144.2295,-319.0644"/>
+<polygon fill="#191970" stroke="#191970" points="2145.3337,-322.3862 2153.8191,-316.0419 2143.2294,-315.7099 2145.3337,-322.3862"/>
</g>
<!-- Node41->Node8 -->
-<g id="edge121" class="edge">
+<g id="edge113" class="edge">
<title>Node41->Node8</title>
-<path fill="none" stroke="#191970" d="M2101.8722,-425.2968C2112.8295,-415.9374 2125.8295,-403.0624 2134,-389 2169.3118,-328.2241 2197.5218,-299.6776 2170,-235 2153.6304,-196.5306 2114.8529,-165.5523 2089.0184,-148.2147"/>
-<polygon fill="#191970" stroke="#191970" points="2090.7696,-145.1786 2080.4781,-142.6612 2086.9535,-151.0469 2090.7696,-145.1786"/>
+<path fill="none" stroke="#191970" d="M1608.0645,-436.5264C1715.0371,-428.7975 1939.142,-410.591 2014,-389 2044.3677,-380.2412 2048.2892,-368.7788 2078,-358 2140.0147,-335.5016 2181.4014,-373.9977 2222,-322 2230.479,-311.1403 2229.053,-302.8356 2222,-291 2192.8327,-242.0542 2034.5663,-174.4758 1963.995,-146.2349"/>
+<polygon fill="#191970" stroke="#191970" points="1965.2273,-142.9584 1954.6417,-142.5171 1962.6416,-149.4634 1965.2273,-142.9584"/>
</g>
<!-- Node41->Node9 -->
-<g id="edge99" class="edge">
+<g id="edge90" class="edge">
<title>Node41->Node9</title>
-<path fill="none" stroke="#191970" d="M2140.3598,-434.5097C2272.3225,-420.9635 2582.325,-389.1341 2583,-389 2630.3079,-379.604 2640.1976,-369.6559 2687,-358 2765.7433,-338.3893 2803.4812,-373.7348 2866,-322 2936.7774,-263.4311 2958.9473,-148.0768 2965.4669,-96.9397"/>
-<polygon fill="#191970" stroke="#191970" points="2968.9711,-97.1083 2966.6623,-86.7681 2962.0189,-96.2912 2968.9711,-97.1083"/>
+<path fill="none" stroke="#191970" d="M1608.1391,-437.5793C1772.8139,-429.136 2232.3762,-404.3245 2261,-389 2294.2312,-371.2088 2293.0342,-353.972 2313,-322 2362.148,-243.2973 2406.5988,-142.1546 2425.874,-96.1942"/>
+<polygon fill="#191970" stroke="#191970" points="2429.1949,-97.3239 2429.8065,-86.7467 2422.7324,-94.6338 2429.1949,-97.3239"/>
</g>
<!-- Node41->Node13 -->
-<g id="edge108" class="edge">
+<g id="edge99" class="edge">
<title>Node41->Node13</title>
-<path fill="none" stroke="#191970" d="M2084.5901,-425.4473C2089.9569,-389.4896 2098.8427,-296.9833 2061,-235 2013.4282,-157.0814 1913.1109,-107.1247 1859.5182,-84.8902"/>
-<polygon fill="#191970" stroke="#191970" points="1860.6605,-81.5767 1850.0782,-81.0607 1858.029,-88.0632 1860.6605,-81.5767"/>
+<path fill="none" stroke="#191970" d="M1583.7333,-425.3642C1598.3002,-416.9147 1613.8477,-404.8543 1622,-389 1628.3004,-376.7472 1625.787,-371.2471 1622,-358 1591.5569,-251.5102 1530.2673,-248.3215 1496,-143 1490.5173,-126.1489 1487.1376,-106.2661 1485.2036,-91.7505"/>
+<polygon fill="#191970" stroke="#191970" points="1488.6293,-90.926 1483.95,-81.4205 1481.6803,-91.7694 1488.6293,-90.926"/>
</g>
<!-- Node41->Node14 -->
-<g id="edge125" class="edge">
+<g id="edge117" class="edge">
<title>Node41->Node14</title>
-<path fill="none" stroke="#191970" d="M2023.5375,-439.6421C1730.7669,-435.2309 436.687,-414.3262 404,-389 373.8139,-365.6115 380,-344.6867 380,-306.5 380,-306.5 380,-306.5 380,-189 380,-152.7454 363.4195,-113.1181 352.163,-90.3929"/>
-<polygon fill="#191970" stroke="#191970" points="355.1292,-88.5073 347.4445,-81.2137 348.9035,-91.7076 355.1292,-88.5073"/>
+<path fill="none" stroke="#191970" d="M1608.244,-438.2381C1777.8194,-431.4326 2263.5517,-410.2992 2332,-389 2357.6913,-381.0056 2358.7141,-367.1963 2384,-358 2478.3482,-323.6863 2518.7144,-369.7982 2607,-322 2713.4563,-264.3641 2796.1338,-137.442 2824.4376,-89.9328"/>
+<polygon fill="#191970" stroke="#191970" points="2827.6046,-91.4511 2829.641,-81.0538 2821.5652,-87.9118 2827.6046,-91.4511"/>
</g>
<!-- Node41->Node15 -->
-<g id="edge127" class="edge">
+<g id="edge119" class="edge">
<title>Node41->Node15</title>
-<path fill="none" stroke="#191970" d="M2140.1333,-434.3545C2219.2664,-425.7131 2365.2199,-408.7941 2489,-389 2491.4167,-388.6135 2833.4426,-323.8879 2835,-322 2913.4103,-226.9496 3028.7373,-324.315 2657,-123 2603.0688,-93.7934 2532.6464,-80.9661 2487.6093,-75.4602"/>
-<polygon fill="#191970" stroke="#191970" points="2487.9018,-71.9708 2477.5667,-74.3025 2487.1001,-78.9247 2487.9018,-71.9708"/>
+<path fill="none" stroke="#191970" d="M1491.7505,-440.018C1267.3954,-437.8811 469.8489,-427.5869 433,-389 292.9133,-242.3057 696.3309,-119.5457 834.4411,-82.759"/>
+<polygon fill="#191970" stroke="#191970" points="835.5215,-86.0938 844.2986,-80.1597 833.7367,-79.3251 835.5215,-86.0938"/>
</g>
<!-- Node41->Node16 -->
-<g id="edge128" class="edge">
+<g id="edge120" class="edge">
<title>Node41->Node16</title>
-<path fill="none" stroke="#191970" d="M2023.9309,-439.4707C1793.9278,-435.2064 953.6966,-417.7875 838,-389 771.0604,-372.3442 742.8205,-374.4351 698,-322 640.081,-254.2411 616.3436,-191.8181 673,-123 696.6413,-94.2839 805.6477,-79.9107 861.2229,-74.3451"/>
-<polygon fill="#191970" stroke="#191970" points="861.6471,-77.8205 871.2652,-73.3773 860.9756,-70.8528 861.6471,-77.8205"/>
+<path fill="none" stroke="#191970" d="M1491.9597,-439.3359C1274.3377,-434.7717 513.9022,-416.9616 409,-389 346.193,-372.2589 329.1418,-363.3617 279,-322 195.3825,-253.0244 65.2817,-203.384 138,-123 170.2569,-87.3427 509.7676,-75.2613 618.0278,-72.2997"/>
+<polygon fill="#191970" stroke="#191970" points="618.4381,-75.7901 628.3418,-72.0262 618.2525,-68.7926 618.4381,-75.7901"/>
</g>
<!-- Node41->Node18 -->
-<g id="edge129" class="edge">
+<g id="edge121" class="edge">
<title>Node41->Node18</title>
-<path fill="none" stroke="#191970" d="M2077.8524,-425.4841C2068.1905,-390.5046 2044.1877,-303.6055 2033.4008,-264.553"/>
-<polygon fill="#191970" stroke="#191970" points="2036.7679,-263.597 2030.7317,-254.8898 2030.0206,-265.4608 2036.7679,-263.597"/>
+<path fill="none" stroke="#191970" d="M1491.8293,-436.2769C1349.3895,-425.8103 992.9548,-398.8094 969,-389 961.0744,-385.7545 857.6374,-294.8754 850,-291 824.5161,-278.0691 748.0714,-261.0001 703.4232,-251.717"/>
+<polygon fill="#191970" stroke="#191970" points="704.108,-248.2846 693.6075,-249.695 702.6957,-255.1407 704.108,-248.2846"/>
</g>
<!-- Node41->Node29 -->
-<g id="edge124" class="edge">
+<g id="edge116" class="edge">
<title>Node41->Node29</title>
-<path fill="none" stroke="#191970" d="M2023.8243,-433.7995C1915.641,-419.9562 1690.5985,-384.1792 1648,-322 1640.2131,-310.6338 1641.0687,-302.9073 1648,-291 1657.3173,-274.9937 1674.5087,-264.1025 1690.6645,-256.9533"/>
-<polygon fill="#191970" stroke="#191970" points="1692.0417,-260.1716 1699.9897,-253.1659 1689.4076,-253.6861 1692.0417,-260.1716"/>
+<path fill="none" stroke="#191970" d="M1491.8449,-437.638C1358.4755,-430.7598 1033.8656,-412.1419 927,-389 840.7153,-370.3149 821.2717,-356.4831 740,-322 693.8372,-302.4134 641.9349,-275.8445 611.1553,-259.5888"/>
+<polygon fill="#191970" stroke="#191970" points="612.4118,-256.2933 601.9382,-254.6953 609.1293,-262.476 612.4118,-256.2933"/>
</g>
<!-- Node41->Node32 -->
-<g id="edge100" class="edge">
+<g id="edge91" class="edge">
<title>Node41->Node32</title>
-<path fill="none" stroke="#191970" d="M2023.9892,-436.3342C1861.5259,-424.6675 1402.1986,-391.6825 1222.4302,-378.7731"/>
-<polygon fill="#191970" stroke="#191970" points="1222.5061,-375.2697 1212.2811,-378.0443 1222.0047,-382.2517 1222.5061,-375.2697"/>
+<path fill="none" stroke="#191970" d="M1491.9529,-436.7783C1313.9476,-425.3655 775.9131,-390.8696 578.5819,-378.2177"/>
+<polygon fill="#191970" stroke="#191970" points="578.5242,-374.7069 568.3208,-377.5598 578.0763,-381.6926 578.5242,-374.7069"/>
</g>
<!-- Node41->Node33 -->
-<g id="edge107" class="edge">
+<g id="edge98" class="edge">
<title>Node41->Node33</title>
-<path fill="none" stroke="#191970" d="M2140.2636,-433.5835C2236.3897,-421.9843 2420.4728,-398.9096 2449,-389 2474.778,-380.0454 2476.5015,-367.722 2502,-358 2535.3985,-345.2659 2624.9791,-328.591 2688.5005,-317.7164"/>
-<polygon fill="#191970" stroke="#191970" points="2689.1876,-321.1499 2698.459,-316.0224 2688.0136,-314.249 2689.1876,-321.1499"/>
+<path fill="none" stroke="#191970" d="M1608.0617,-438.1516C1771.8137,-431.2943 2229.6184,-410.4265 2294,-389 2318.0962,-380.9806 2318.8228,-368.3806 2342,-358 2382.4514,-339.8826 2430.7413,-326.7811 2468.1154,-318.3271"/>
+<polygon fill="#191970" stroke="#191970" points="2469.3423,-321.6405 2478.3517,-316.0656 2467.8321,-314.8054 2469.3423,-321.6405"/>
</g>
<!-- Node41->Node35 -->
-<g id="edge120" class="edge">
+<g id="edge112" class="edge">
<title>Node41->Node35</title>
-<path fill="none" stroke="#191970" d="M2130.4746,-425.4639C2168.7894,-413.5792 2221.6399,-397.1858 2257.6668,-386.0108"/>
-<polygon fill="#191970" stroke="#191970" points="2258.8419,-389.3108 2267.3561,-383.0053 2256.7681,-382.6251 2258.8419,-389.3108"/>
+<path fill="none" stroke="#191970" d="M1608.3162,-428.906C1670.239,-416.5949 1767.5151,-397.2552 1829.2293,-384.9856"/>
+<polygon fill="#191970" stroke="#191970" points="1830.0678,-388.3875 1839.1933,-383.0046 1828.7028,-381.5218 1830.0678,-388.3875"/>
</g>
<!-- Node41->Node40 -->
-<g id="edge122" class="edge">
+<g id="edge114" class="edge">
<title>Node41->Node40</title>
-<path fill="none" stroke="#191970" d="M2140.0041,-433.0645C2198.6183,-424.8673 2291.4911,-410.0001 2370,-389 2407.7585,-378.9001 2510.5217,-340.3188 2564.6286,-319.6914"/>
-<polygon fill="#191970" stroke="#191970" points="2566.0454,-322.8969 2574.1387,-316.0596 2563.548,-316.3575 2566.0454,-322.8969"/>
+<path fill="none" stroke="#191970" d="M1608.2037,-437.4043C1688.0645,-432.1987 1835.6442,-419.1378 1958,-389 1996.2351,-379.5822 2004.0771,-371.6863 2041,-358 2076.5158,-344.8353 2117.146,-329.9034 2145.4545,-319.5205"/>
+<polygon fill="#191970" stroke="#191970" points="2146.7413,-322.7767 2154.9252,-316.048 2144.3315,-316.2045 2146.7413,-322.7767"/>
</g>
<!-- Node42 -->
-<g id="node43" class="node">
+<g id="node39" class="node">
<title>Node42</title>
-<g id="a_node43"><a xlink:href="map_8h.html" target="_top" xlink:title="Runtime Map container types. ">
-<polygon fill="#ffffff" stroke="#000000" points="847,-358.5 847,-388.5 973,-388.5 973,-358.5 847,-358.5"/>
-<text text-anchor="start" x="855" y="-376.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/container</text>
-<text text-anchor="middle" x="910" y="-365.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/map.h</text>
+<g id="a_node39"><a xlink:href="map_8h.html" target="_top" xlink:title="Runtime Map container types. ">
+<polygon fill="#ffffff" stroke="#000000" points="978,-358.5 978,-388.5 1104,-388.5 1104,-358.5 978,-358.5"/>
+<text text-anchor="start" x="986" y="-376.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/container</text>
+<text text-anchor="middle" x="1041" y="-365.5" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">/map.h</text>
</a>
</g>
</g>
<!-- Node41->Node42 -->
-<g id="edge101" class="edge">
+<g id="edge92" class="edge">
<title>Node41->Node42</title>
-<path fill="none" stroke="#191970" d="M2023.8874,-438.3635C1869.5551,-432.5072 1436.623,-414.8922 1077,-389 1046.3443,-386.7928 1012.577,-383.7707 983.4758,-380.9808"/>
-<polygon fill="#191970" stroke="#191970" points="983.6802,-377.4843 973.3897,-380.0054 983.0064,-384.4518 983.6802,-377.4843"/>
+<path fill="none" stroke="#191970" d="M1491.6112,-432.8142C1398.4541,-420.5519 1216.3471,-396.581 1114.4732,-383.1713"/>
+<polygon fill="#191970" stroke="#191970" points="1114.6391,-379.6631 1104.2678,-381.828 1113.7255,-386.6032 1114.6391,-379.6631"/>
</g>
<!-- Node43 -->
-<g id="node44" class="node">
+<g id="node40" class="node">
<title>Node43</title>
-<g id="a_node44"><a xlink:href="runtime_2module_8h.html" target="_top" xlink:title="Runtime container of the functions generated by TVM, This is used to support dynamically link...">
-<polygon fill="#ffffff" stroke="#ff0000" points="1830,-364 1830,-383 1956,-383 1956,-364 1830,-364"/>
-<text text-anchor="middle" x="1893" y="-371" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/module.h</text>
+<g id="a_node40"><a xlink:href="runtime_2module_8h.html" target="_top" xlink:title="Runtime container of the functions generated by TVM, This is used to support dynamically link...">
+<polygon fill="#ffffff" stroke="#000000" points="1487,-364 1487,-383 1613,-383 1613,-364 1487,-364"/>
+<text text-anchor="middle" x="1550" y="-371" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tvm/runtime/module.h</text>
</a>
</g>
</g>
<!-- Node41->Node43 -->
-<g id="edge109" class="edge">
+<g id="edge100" class="edge">
<title>Node41->Node43</title>
-<path fill="none" stroke="#191970" d="M2033.8442,-425.4639C1999.4648,-413.7824 1954.3108,-397.7449 1924.4383,-386.5884"/>
-<polygon fill="#191970" stroke="#191970" points="1925.523,-383.2569 1914.9312,-383.0053 1923.0543,-389.8071 1925.523,-383.2569"/>
+<path fill="none" stroke="#191970" d="M1544.2392,-425.2967C1543.0912,-415.7699 1542.9424,-403.3954 1543.7929,-393.1306"/>
+<polygon fill="#191970" stroke="#191970" points="1547.2843,-393.4239 1545.108,-383.055 1540.3431,-392.5178 1547.2843,-393.4239"/>
</g>
<!-- Node41->Node45 -->
-<g id="edge123" class="edge">
+<g id="edge115" class="edge">
<title>Node41->Node45</title>
-<path fill="none" stroke="#191970" d="M2140.0515,-434.2356C2168.1157,-431.2853 2202.295,-427.8065 2233,-425 2432.3649,-406.7776 2487.6995,-437.226 2682,-389 2684.4849,-388.3832 2687.017,-387.613 2689.5287,-386.7476"/>
-<polygon fill="#191970" stroke="#191970" points="2690.9289,-389.9584 2698.9725,-383.0626 2688.3842,-383.4372 2690.9289,-389.9584"/>
+<path fill="none" stroke="#191970" d="M1608.3518,-437.0155C1811.5176,-424.8835 2483.9551,-384.7291 2639.6285,-375.4331"/>
+<polygon fill="#191970" stroke="#191970" points="2640.1568,-378.9078 2649.9304,-374.8179 2639.7395,-371.9203 2640.1568,-378.9078"/>
</g>
<!-- Node46 -->
-<g id="node46" class="node">
+<g id="node44" class="node">
<title>Node46</title>
-<polygon fill="#ffffff" stroke="#bfbfbf" points="1974.5,-364 1974.5,-383 2015.5,-383 2015.5,-364 1974.5,-364"/>
-<text text-anchor="middle" x="1995" y="-371" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tuple</text>
+<polygon fill="#ffffff" stroke="#bfbfbf" points="1351.5,-364 1351.5,-383 1392.5,-383 1392.5,-364 1351.5,-364"/>
+<text text-anchor="middle" x="1372" y="-371" font-family="Helvetica,sans-Serif" font-size="10.00" fill="#000000">tuple</text>
</g>
<!-- Node41->Node46 -->
-<g id="edge126" class="edge">
+<g id="edge118" class="edge">
<title>Node41->Node46</title>
-<path fill="none" stroke="#191970" d="M2062.2584,-425.2967C2048.3085,-414.5537 2029.6565,-400.1895 2015.5332,-389.3129"/>
-<polygon fill="#191970" stroke="#191970" points="2017.4656,-386.3835 2007.4072,-383.055 2013.1945,-391.9295 2017.4656,-386.3835"/>
+<path fill="none" stroke="#191970" d="M1505.5663,-425.4134C1477.1806,-415.5505 1439.6973,-402.1086 1407,-389 1405.3885,-388.3539 1403.7414,-387.6783 1402.0828,-386.9859"/>
+<polygon fill="#191970" stroke="#191970" points="1403.1942,-383.6542 1392.6245,-382.9254 1400.4327,-390.0865 1403.1942,-383.6542"/>
</g>
<!-- Node42->Node16 -->
-<g id="edge104" class="edge">
+<g id="edge95" class="edge">
<title>Node42->Node16</title>
-<path fill="none" stroke="#191970" d="M846.7587,-366.5567C810.4938,-359.9445 765.806,-347.0634 733,-322 685.415,-285.6456 685.3167,-258.4343 678,-199 673.813,-164.9884 667.0164,-147.4767 691,-123 714.5197,-98.9967 809.9164,-82.806 861.1257,-75.6605"/>
-<polygon fill="#191970" stroke="#191970" points="861.9026,-79.0873 871.3408,-74.2735 860.9607,-72.1509 861.9026,-79.0873"/>
+<path fill="none" stroke="#191970" d="M977.6573,-368.386C855.697,-357.0288 595.8708,-324.9846 546,-255 504.2393,-196.3964 588.0259,-119.9302 629.5549,-87.3327"/>
+<polygon fill="#191970" stroke="#191970" points="631.9165,-89.9321 637.715,-81.0648 627.6524,-84.3808 631.9165,-89.9321"/>
</g>
<!-- Node42->Node20 -->
-<g id="edge106" class="edge">
+<g id="edge97" class="edge">
<title>Node42->Node20</title>
-<path fill="none" stroke="#191970" d="M888.443,-358.2967C875.5679,-349.2163 859.0245,-337.5488 844.8995,-327.587"/>
-<polygon fill="#191970" stroke="#191970" points="846.661,-324.5464 836.4717,-321.6432 842.6265,-330.2669 846.661,-324.5464"/>
+<path fill="none" stroke="#191970" d="M1049.6228,-358.2967C1054.287,-350.0729 1060.1547,-339.7272 1065.4174,-330.4483"/>
+<polygon fill="#191970" stroke="#191970" points="1068.5223,-332.0683 1070.4113,-321.6432 1062.4335,-328.6148 1068.5223,-332.0683"/>
</g>
<!-- Node42->Node21 -->
-<g id="edge105" class="edge">
+<g id="edge96" class="edge">
<title>Node42->Node21</title>
-<path fill="none" stroke="#191970" d="M958.1883,-358.4835C989.8085,-348.4846 1032.0017,-334.8665 1069,-322 1130.2267,-300.7077 1200.8626,-273.9569 1241.7781,-258.243"/>
-<polygon fill="#191970" stroke="#191970" points="1243.1947,-261.4481 1251.2707,-254.5904 1240.6809,-254.9151 1243.1947,-261.4481"/>
+<path fill="none" stroke="#191970" d="M1079.1161,-358.4125C1100.8622,-349.1363 1128.2367,-336.3058 1151,-322 1169.4542,-310.4023 1169.672,-301.0742 1189,-291 1228.9429,-270.1808 1279.4817,-257.76 1313.3969,-251.1693"/>
+<polygon fill="#191970" stroke="#191970" points="1314.1714,-254.5852 1323.3585,-249.3082 1312.8858,-247.7043 1314.1714,-254.5852"/>
</g>
<!-- Node42->Node24 -->
-<g id="edge102" class="edge">
+<g id="edge93" class="edge">
<title>Node42->Node24</title>
-<path fill="none" stroke="#191970" d="M846.9023,-366.2283C788.9796,-356.6494 715.9546,-335.4884 743,-291 782.0739,-226.7251 873.1006,-202.6725 926.9524,-193.8788"/>
-<polygon fill="#191970" stroke="#191970" points="927.521,-197.3327 936.8774,-192.3621 926.4634,-190.413 927.521,-197.3327"/>
+<path fill="none" stroke="#191970" d="M1017.5023,-358.4484C988.1979,-340.247 936.0859,-309.7287 888,-291 827.3315,-267.3706 799.5439,-292.0414 746,-255 729.0149,-243.2498 716.3037,-223.0524 708.5961,-208.0873"/>
+<polygon fill="#191970" stroke="#191970" points="711.575,-206.2078 704.0686,-198.7309 705.2739,-209.257 711.575,-206.2078"/>
</g>
<!-- Node42->Node31 -->
-<g id="edge103" class="edge">
+<g id="edge94" class="edge">
<title>Node42->Node31</title>
-<path fill="none" stroke="#191970" d="M973.0168,-361.6144C1019.5442,-352.3021 1084.2079,-338.2321 1140,-322 1179.012,-310.6499 1186.5231,-300.6095 1226,-291 1344.4833,-262.1586 1377.3727,-272.8658 1498,-255 1499.4597,-254.7838 1500.9385,-254.562 1502.4303,-254.3358"/>
-<polygon fill="#191970" stroke="#191970" points="1503.0501,-257.7816 1512.3965,-252.792 1501.9786,-250.8641 1503.0501,-257.7816"/>
+<path fill="none" stroke="#191970" d="M1028.5622,-358.1709C1021.2222,-348.3743 1012.3499,-335.0911 1007,-322 999.4546,-303.5363 995.6332,-280.9977 993.7444,-265.0893"/>
+<polygon fill="#191970" stroke="#191970" points="997.1938,-264.418 992.685,-254.8304 990.2308,-265.1371 997.1938,-264.418"/>
</g>
<!-- Node43->Node8 -->
-<g id="edge114" class="edge">
+<g id="edge105" class="edge">
<title>Node43->Node8</title>
-<path fill="none" stroke="#191970" d="M1920.0379,-363.9532C1959.6817,-348.2846 2032.0126,-312.9342 2061,-255 2077.4426,-222.1379 2072.9828,-177.8061 2068.3959,-152.6294"/>
-<polygon fill="#191970" stroke="#191970" points="2071.7645,-151.6288 2066.3541,-142.5196 2064.9031,-153.0146 2071.7645,-151.6288"/>
+<path fill="none" stroke="#191970" d="M1613.1836,-371.2231C1696.0227,-366.9565 1835.2591,-354.9679 1873,-322 1898.8813,-299.3918 1919.1925,-196.2212 1926.7779,-152.5826"/>
+<polygon fill="#191970" stroke="#191970" points="1930.244,-153.0767 1928.4648,-142.6323 1923.3425,-151.9066 1930.244,-153.0767"/>
</g>
<!-- Node43->Node9 -->
-<g id="edge111" class="edge">
+<g id="edge102" class="edge">
<title>Node43->Node9</title>
-<path fill="none" stroke="#191970" d="M1921.2663,-363.9249C1949.5745,-354.1265 1994.2401,-338.0959 2032,-322 2061.0743,-309.6065 2066.4961,-302.3328 2096,-291 2381.2707,-181.4241 2740.1316,-111.0824 2893.2423,-84.0426"/>
-<polygon fill="#191970" stroke="#191970" points="2894.2231,-87.424 2903.4672,-82.2474 2893.0125,-80.5295 2894.2231,-87.424"/>
+<path fill="none" stroke="#191970" d="M1613.3035,-369.8099C1683.8343,-364.3958 1800.6255,-351.5203 1897,-322 2092.584,-262.0911 2125.2566,-206.4811 2312,-123 2337.2504,-111.7121 2365.732,-99.8246 2389.0252,-90.3182"/>
+<polygon fill="#191970" stroke="#191970" points="2390.3906,-93.5413 2398.3356,-86.5322 2387.7538,-87.0569 2390.3906,-93.5413"/>
</g>
<!-- Node43->Node14 -->
-<g id="edge116" class="edge">
+<g id="edge108" class="edge">
... 533255 lines suppressed ...