Posted to commits@tvm.apache.org by GitBox <gi...@apache.org> on 2022/11/10 18:26:42 UTC

[GitHub] [tvm] nverke opened a new pull request, #13352: [Hexagon] Add test to show scheduling of resnet50 with async dma pipe…

nverke opened a new pull request, #13352:
URL: https://github.com/apache/tvm/pull/13352

   …lines using metaschedule
   
   This test uses a schedule function to run each conv2d in resnet50 in a pipeline of async DMA copies to VTCM overlapped with parallel HVX compute.
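   
   A minimal sketch of how such a pipeline is expressed in TIR scheduling (the annotation names and stage/order values are the ones used by the schedule in this PR; the helper function and its name are only illustrative):
   
       from tvm.tir.schedule import LoopRV, Schedule
   
   
       def annotate_async_dma_pipeline(sch: Schedule, pipeline_loop: LoopRV) -> None:
           """Illustrative helper: mark a loop as a 5-stage software pipeline.
   
           The five stage indices correspond, in order, to the blocks nested under
           the loop (roughly: two VTCM cache reads, the conv2d compute, the VTCM
           cache write, and the fused post op); stages 0 and 2 are executed
           asynchronously over the DMA queues.
           """
           sch.annotate(pipeline_loop, "software_pipeline_stage", [0, 0, 1, 2, 3])
           sch.annotate(pipeline_loop, "software_pipeline_order", [0, 1, 2, 3, 4])
           sch.annotate(pipeline_loop, "software_pipeline_async_stages", [0, 2])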
    


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] masahi commented on a diff in pull request #13352: [Hexagon] Add test to show scheduling of resnet50 with async dma pipe…

Posted by GitBox <gi...@apache.org>.
masahi commented on code in PR #13352:
URL: https://github.com/apache/tvm/pull/13352#discussion_r1019727075


##########
tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py:
##########
@@ -359,11 +371,223 @@ def test_packed_8x8x32_resnet50(hexagon_launcher):
             params=params,
         )
 
-    with hexagon_launcher.start_session() as session:
+    with hexagon_launcher.create_session() as session:
+        graph_mod = session.get_executor_from_factory(hexagon_lowered)
+        graph_mod.set_input(input_name, inp.copy())
+        graph_mod.run()
+        hexagon_output = graph_mod.get_output(0).numpy()
+
+        llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
+        llvm_graph_mod.set_input(input_name, inp.copy())
+        llvm_graph_mod.run()
+        ref_result = llvm_graph_mod.get_output(0).numpy()
+
+
+def _schedule_async_dma_conv2d():
+    """Manually schedule a conv2d block, created from TE compute op via CreatePrimFunc,
+    using 8x8x32 packed layout.
+    """
+
+    def schedule_fn(sch, conv2d_block: Optional[BlockRV] = None) -> bool:
+        if conv2d_block is None:
+            try:
+                conv2d_block = sch.get_block("conv2d_NCHWc_int8")
+            except:
+                return False
+
+        assert "conv2d_NCHWc_int8" in sch.get(conv2d_block).annotations["schedule_rule"]
+
+        # Apply scheduling
+
+        post_blocks = sch.get_consumers(conv2d_block)
+        if len(post_blocks) > 0:
+            # Fuse all intermediate post ops into the last op.
+            # This is equivalent to the traverse_inline function used in TE schedules.
+            while True:
+                next_post_blocks = []
+                for post_block in post_blocks:
+                    next_consumers = sch.get_consumers(post_block)
+                    if len(next_consumers) > 0:
+                        sch.compute_inline(post_block)
+                    next_post_blocks += next_consumers
+                if len(next_post_blocks) == 0:
+                    assert len(post_blocks) == 1
+                    outer_block = post_blocks[0]
+                    break
+                post_blocks = next_post_blocks
+        else:
+            outer_block = conv2d_block
+
+        # Move the conv2d mma into the injective post mma compute block
+        if outer_block != conv2d_block:
+            loops = sch.get_loops(outer_block)
+            # Compute at the second loop for pipelining.
+            sch.compute_at(conv2d_block, loops[1])
+
+        # Add cache for input and output for copying data to vtcm.
+        input_a_cache = sch.cache_read(conv2d_block, 0, "global.vtcm")
+        sch.compute_at(input_a_cache, sch.get_loops(conv2d_block)[1])
+        sch.fuse(*sch.get_loops(input_a_cache)[2:])
+
+        input_b_cache = sch.cache_read(conv2d_block, 1, "global.vtcm")
+        sch.compute_at(input_b_cache, sch.get_loops(conv2d_block)[1])
+        sch.fuse(*sch.get_loops(input_b_cache)[2:])
+
+        output_cache_write = sch.cache_write(conv2d_block, 0, "global.vtcm")
+        sch.fuse(*sch.get_loops(output_cache_write)[2:])
+
+        conv2d_loops = sch.get_loops(block=conv2d_block)
+        if len(conv2d_loops) == 8:
+            # Handle case where kernel is not 1x1
+            oc, x0, x1, ic = conv2d_loops[-4:]
+            ic_o, ic_i = sch.split(loop=ic, factors=[None, 4], preserve_unit_iters=True)
+            oc_o, oc_i = sch.split(loop=oc, factors=[None, 32], preserve_unit_iters=True)
+            sch.reorder(oc_o, x0, x1, ic_o, oc_i, ic_i)
+            new_loops = sch.get_loops(block=conv2d_block)
+            sch.parallel(new_loops[2])
+            sch.unroll(new_loops[-4])
+            # TODO(nverke): Add compute optimizations here.
+        else:
+            # Handle case where kernel is 1x1

Review Comment:
   I think if you set `preserve_unit_loops = True` in `compute_at`, you can get rid of this branch.
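   
   A sketch of the suggested change, reusing the names from the test's `schedule_fn` (this is what the later revision of the diff adopts):
   
       from tvm.tir.schedule import BlockRV, Schedule
   
   
       def compute_conv2d_at_pipeline_loop(sch: Schedule, conv2d_block: BlockRV, outer_block: BlockRV) -> None:
           # Illustrative only. With preserve_unit_loops=True the regenerated outer
           # loops are kept even when their extent is 1, so 1x1 and non-1x1 kernels
           # expose the same loop-nest shape and the len(conv2d_loops) branch below
           # becomes unnecessary.
           loops = sch.get_loops(outer_block)
           sch.compute_at(conv2d_block, loops[1], preserve_unit_loops=True)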





[GitHub] [tvm] nverke commented on a diff in pull request #13352: [Hexagon] Add test to show scheduling of resnet50 with async dma pipe…

Posted by GitBox <gi...@apache.org>.
nverke commented on code in PR #13352:
URL: https://github.com/apache/tvm/pull/13352#discussion_r1023284963


##########
tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py:
##########
@@ -208,7 +215,7 @@ def schedule_fn(sch, conv2d_block: Optional[BlockRV] = None) -> bool:
         if conv2d_block is None:
             try:
                 conv2d_block = sch.get_block("conv2d_NCHWc_int8")
-            except ValueError:
+            except:
                 return False

Review Comment:
   Cool new feature! Added ✅





[GitHub] [tvm] tvm-bot commented on pull request #13352: [Hexagon] Add test to show scheduling of resnet50 with async dma pipe…

Posted by GitBox <gi...@apache.org>.
tvm-bot commented on PR #13352:
URL: https://github.com/apache/tvm/pull/13352#issuecomment-1310725420

   Thanks for contributing to TVM! Please refer to the contributing guidelines https://tvm.apache.org/docs/contribute/ for useful information and tips. Please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @-ing them in a comment.
   
    * cc @mehrdadh See [#10317](https://github.com/apache/tvm/issues/10317) for details
   
   Generated by [tvm-bot](https://github.com/apache/tvm/blob/main/ci/README.md#github-actions)




[GitHub] [tvm] masahi merged pull request #13352: [Hexagon] Add test to show scheduling of resnet50 with async dma pipe…

Posted by GitBox <gi...@apache.org>.
masahi merged PR #13352:
URL: https://github.com/apache/tvm/pull/13352




[GitHub] [tvm] masahi commented on a diff in pull request #13352: [Hexagon] Add test to show scheduling of resnet50 with async dma pipe…

Posted by GitBox <gi...@apache.org>.
masahi commented on code in PR #13352:
URL: https://github.com/apache/tvm/pull/13352#discussion_r1019557035


##########
tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py:
##########
@@ -372,6 +596,11 @@ def test_packed_8x8x32_resnet50(hexagon_launcher):
 
         np.testing.assert_allclose(ref_result, hexagon_output, atol=1e-4, rtol=1e-5)
 
+        debug_ex = session.get_graph_debug_executor(
+            hexagon_lowered.get_graph_json(), hexagon_lowered.lib
+        )
+        print(debug_ex.profile(input_name=inp.copy()))

Review Comment:
   Do we want `benchmark` at L589 and profiling here, given that this test is not really tuning anything?





[GitHub] [tvm] nverke commented on pull request #13352: [Hexagon] Add test to show scheduling of resnet50 with async dma pipe…

Posted by GitBox <gi...@apache.org>.
nverke commented on PR #13352:
URL: https://github.com/apache/tvm/pull/13352#issuecomment-1315901484

   The changes to TIR broke the tensorization here, so those issues need to be fixed. For the time being, I rebased on top of an earlier commit and made the updates that were requested.




[GitHub] [tvm] masahi commented on a diff in pull request #13352: [Hexagon] Add test to show scheduling of resnet50 with async dma pipe…

Posted by GitBox <gi...@apache.org>.
masahi commented on code in PR #13352:
URL: https://github.com/apache/tvm/pull/13352#discussion_r1019551995


##########
python/tvm/contrib/hexagon/meta_schedule.py:
##########
@@ -121,14 +129,23 @@ def _worker_func(hexagon_launcher, evaluator_config, alloc_repeat, artifact_path
     return costs
 
 
-def get_hexagon_local_builder():
+def get_hexagon_local_builder(pass_context: tvm.transform.PassContext = None):
     """Return Hexagon-compatible Builder for meta schedule."""
 
     def export_func(mod):
         binary_path = export_module(mod, tempfile.mkdtemp())
         return str(binary_path)
 
-    return LocalBuilder(f_export=export_func)
+    def build_func(mod: IRModule, target: Target, _params: Optional[Dict[str, NDArray]]) -> Module:
+        if pass_context is not None:
+            with pass_context:
+                mod = RemoveWeightLayoutRewriteBlock(skip_ndarray_rewrite=True)(mod)
+                return tvm_build(mod, target=target)
+        else:
+            mod = RemoveWeightLayoutRewriteBlock(skip_ndarray_rewrite=True)(mod)
+            return tvm_build(mod, target=target)

Review Comment:
   Use `meta_schedule.builder.default_build` here.
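   
   A sketch of what that could look like; the import path for `default_build` and the assumption that it already covers the `RemoveWeightLayoutRewriteBlock` + `tvm.build` sequence are not verified here:
   
       from typing import Dict, Optional
   
       import tvm
       from tvm.ir import IRModule
       # Assumed import path for the default meta schedule build routine.
       from tvm.meta_schedule.builder.local_builder import default_build
       from tvm.runtime import Module, NDArray
       from tvm.target import Target
   
   
       def make_build_func(pass_context: Optional[tvm.transform.PassContext] = None):
           """Illustrative f_build for LocalBuilder that delegates to default_build."""
   
           def build_func(mod: IRModule, target: Target, _params: Optional[Dict[str, NDArray]]) -> Module:
               if pass_context is not None:
                   with pass_context:
                       return default_build(mod, target, _params)
               return default_build(mod, target, _params)
   
           return build_func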





[GitHub] [tvm] nverke commented on a diff in pull request #13352: [Hexagon] Add test to show scheduling of resnet50 with async dma pipe…

Posted by GitBox <gi...@apache.org>.
nverke commented on code in PR #13352:
URL: https://github.com/apache/tvm/pull/13352#discussion_r1025811521


##########
tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py:
##########
@@ -359,18 +429,115 @@ def test_packed_8x8x32_resnet50(hexagon_launcher):
             params=params,
         )
 
-    with hexagon_launcher.start_session() as session:
-        graph_mod = session.get_executor_from_factory(hexagon_lowered)
-        graph_mod.set_input(input_name, inp.copy())
-        graph_mod.run()
-        hexagon_output = graph_mod.get_output(0).numpy()
+    evaluate_mod(hexagon_launcher, hexagon_lowered, llvm_lowered, input_name, inp)
 
-        llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
-        llvm_graph_mod.set_input(input_name, inp.copy())
-        llvm_graph_mod.run()
-        ref_result = llvm_graph_mod.get_output(0).numpy()
 
-        np.testing.assert_allclose(ref_result, hexagon_output, atol=1e-4, rtol=1e-5)
+def _schedule_async_dma_conv2d():
+    """Manually schedule a conv2d block, created from TE compute op via CreatePrimFunc,
+    using 8x8x32 packed layout.
+    """
+
+    def schedule_fn(sch, conv2d_block: Optional[BlockRV] = None) -> bool:
+        if conv2d_block is None:
+            if has_block(sch, "conv2d_NCHWc_int8"):
+                conv2d_block = sch.get_block("conv2d_NCHWc_int8")
+            else:
+                return False
+
+        assert "conv2d_NCHWc_int8" in sch.get(conv2d_block).annotations["schedule_rule"]
+
+        # Apply scheduling
+
+        post_blocks = sch.get_consumers(conv2d_block)
+        if len(post_blocks) > 0:
+            # Fuse all intermediate post ops into the last op.
+            # This is equivalent to the traverse_inline function used in TE schedules.
+            while True:
+                next_post_blocks = []
+                for post_block in post_blocks:
+                    next_consumers = sch.get_consumers(post_block)
+                    if len(next_consumers) > 0:
+                        sch.compute_inline(post_block)
+                    next_post_blocks += next_consumers
+                if len(next_post_blocks) == 0:
+                    assert len(post_blocks) == 1
+                    outer_block = post_blocks[0]
+                    break
+                post_blocks = next_post_blocks
+        else:
+            outer_block = conv2d_block
+
+        # Move the conv2d mma into the injective post mma compute block
+        if outer_block != conv2d_block:
+            loops = sch.get_loops(outer_block)
+            # Compute at the second loop for pipelining.
+            sch.compute_at(conv2d_block, loops[1], preserve_unit_loops=True)
+
+        # Add cache for input and output for copying data to vtcm.
+        input_a_cache = sch.cache_read(conv2d_block, 0, "global.vtcm")
+        sch.compute_at(input_a_cache, sch.get_loops(conv2d_block)[1])
+        sch.fuse(*sch.get_loops(input_a_cache)[2:])
+
+        input_b_cache = sch.cache_read(conv2d_block, 1, "global.vtcm")
+        sch.compute_at(input_b_cache, sch.get_loops(conv2d_block)[1])
+        sch.fuse(*sch.get_loops(input_b_cache)[2:])
+
+        output_cache_write = sch.cache_write(conv2d_block, 0, "global.vtcm")
+        sch.fuse(*sch.get_loops(output_cache_write)[2:])
+
+        conv2d_loops = sch.get_loops(block=conv2d_block)
+        o_c, k_h, k_w, x_0, x_1, i_c = conv2d_loops[-6:]
+        ic_o, ic_i = sch.split(loop=i_c, factors=[None, 4], preserve_unit_iters=True)
+        oc_o, oc_i = sch.split(loop=o_c, factors=[None, 32], preserve_unit_iters=True)
+        sch.reorder(oc_o, k_h, k_w, x_0, x_1, ic_o, oc_i, ic_i)
+        new_loops = sch.get_loops(block=conv2d_block)
+        sch.parallel(new_loops[4])
+        sch.unroll(new_loops[5])
+        # TODO(nverke): Add compute optimizations here.
+        sch.blockize(loop=oc_i)

Review Comment:
   For some reason tensorization breaks when removing this...





[GitHub] [tvm] masahi commented on pull request #13352: [Hexagon] Add test to show scheduling of resnet50 with async dma pipe…

Posted by GitBox <gi...@apache.org>.
masahi commented on PR #13352:
URL: https://github.com/apache/tvm/pull/13352#issuecomment-1316636438

   Actually there was also a `vrmpy` tensorization bug in the current `main`, fixed by https://github.com/apache/tvm/pull/13404. I forgot the fact that we don't need an explicit initialization block in a tensorize description. 




[GitHub] [tvm] nverke commented on a diff in pull request #13352: [Hexagon] Add test to show scheduling of resnet50 with async dma pipe…

Posted by GitBox <gi...@apache.org>.
nverke commented on code in PR #13352:
URL: https://github.com/apache/tvm/pull/13352#discussion_r1023285390


##########
tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py:
##########
@@ -359,11 +371,223 @@ def test_packed_8x8x32_resnet50(hexagon_launcher):
             params=params,
         )
 
-    with hexagon_launcher.start_session() as session:
+    with hexagon_launcher.create_session() as session:
+        graph_mod = session.get_executor_from_factory(hexagon_lowered)
+        graph_mod.set_input(input_name, inp.copy())
+        graph_mod.run()
+        hexagon_output = graph_mod.get_output(0).numpy()
+
+        llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
+        llvm_graph_mod.set_input(input_name, inp.copy())
+        llvm_graph_mod.run()
+        ref_result = llvm_graph_mod.get_output(0).numpy()
+
+
+def _schedule_async_dma_conv2d():
+    """Manually schedule a conv2d block, created from TE compute op via CreatePrimFunc,
+    using 8x8x32 packed layout.
+    """
+
+    def schedule_fn(sch, conv2d_block: Optional[BlockRV] = None) -> bool:
+        if conv2d_block is None:
+            try:
+                conv2d_block = sch.get_block("conv2d_NCHWc_int8")
+            except:
+                return False
+
+        assert "conv2d_NCHWc_int8" in sch.get(conv2d_block).annotations["schedule_rule"]
+
+        # Apply scheduling
+
+        post_blocks = sch.get_consumers(conv2d_block)
+        if len(post_blocks) > 0:
+            # Fuse all intermediate post ops into the last op.
+            # This is equivalent to the traverse_inline function used in TE schedules.
+            while True:
+                next_post_blocks = []
+                for post_block in post_blocks:
+                    next_consumers = sch.get_consumers(post_block)
+                    if len(next_consumers) > 0:
+                        sch.compute_inline(post_block)
+                    next_post_blocks += next_consumers
+                if len(next_post_blocks) == 0:
+                    assert len(post_blocks) == 1
+                    outer_block = post_blocks[0]
+                    break
+                post_blocks = next_post_blocks
+        else:
+            outer_block = conv2d_block
+
+        # Move the conv2d mma into the injective post mma compute block
+        if outer_block != conv2d_block:
+            loops = sch.get_loops(outer_block)
+            # Compute at the second loop for pipelining.
+            sch.compute_at(conv2d_block, loops[1])
+
+        # Add cache for input and output for copying data to vtcm.
+        input_a_cache = sch.cache_read(conv2d_block, 0, "global.vtcm")
+        sch.compute_at(input_a_cache, sch.get_loops(conv2d_block)[1])
+        sch.fuse(*sch.get_loops(input_a_cache)[2:])
+
+        input_b_cache = sch.cache_read(conv2d_block, 1, "global.vtcm")
+        sch.compute_at(input_b_cache, sch.get_loops(conv2d_block)[1])
+        sch.fuse(*sch.get_loops(input_b_cache)[2:])
+
+        output_cache_write = sch.cache_write(conv2d_block, 0, "global.vtcm")
+        sch.fuse(*sch.get_loops(output_cache_write)[2:])
+
+        conv2d_loops = sch.get_loops(block=conv2d_block)
+        if len(conv2d_loops) == 8:
+            # Handle case where kernel is not 1x1
+            oc, x0, x1, ic = conv2d_loops[-4:]
+            ic_o, ic_i = sch.split(loop=ic, factors=[None, 4], preserve_unit_iters=True)
+            oc_o, oc_i = sch.split(loop=oc, factors=[None, 32], preserve_unit_iters=True)
+            sch.reorder(oc_o, x0, x1, ic_o, oc_i, ic_i)
+            new_loops = sch.get_loops(block=conv2d_block)
+            sch.parallel(new_loops[2])
+            sch.unroll(new_loops[-4])
+            # TODO(nverke): Add compute optimizations here.
+        else:
+            # Handle case where kernel is 1x1
+            oc, kh, kw, x0, x1, ic = conv2d_loops[-6:]
+            ic_o, ic_i = sch.split(loop=ic, factors=[None, 4], preserve_unit_iters=True)
+            oc_o, oc_i = sch.split(loop=oc, factors=[None, 32], preserve_unit_iters=True)
+            sch.reorder(oc_o, kh, kw, x0, x1, ic_o, oc_i, ic_i)
+            new_loops = sch.get_loops(block=conv2d_block)
+            sch.parallel(new_loops[2])
+            sch.unroll(new_loops[-4])
+            # TODO(nverke): Add compute optimizations here.
+        sch.blockize(loop=oc_i)
+
+        sch.tensorize(oc_i, VRMPY_u8i8i32_VTCM_INTRIN)
+
+        pipeline_loop = conv2d_loops[1]
+        sch.annotate(pipeline_loop, "software_pipeline_stage", [0, 0, 1, 2, 3])
+        sch.annotate(pipeline_loop, "software_pipeline_order", [0, 1, 2, 3, 4])
+        sch.annotate(pipeline_loop, "software_pipeline_async_stages", [0, 2])
+
+        return True
+
+    return schedule_fn
+
+
+def tune_async_dma_template(mod, params, hexagon_launcher):
+    """Generate async dma template."""
+
+    def schedule_rule_conv2d_async_dma(sch: Schedule, conv2d_block: BlockRV):
+        _schedule_async_dma_conv2d()(sch, conv2d_block)
+        return [sch]
+
+    register_func(
+        "meta_schedule.conv2d_NCHWc_int8.async_dma.hexagon", schedule_rule_conv2d_async_dma
+    )
+
+    def schedule_conv2d_for_tune(sch: Schedule):
+        _schedule_async_dma_conv2d()(sch)
+
+    # This line is necessary for link-params to take effect during
+    # task extraction and relay.build(...).
+    mod = mod.with_attr("executor", EXECUTOR)
+
+    with tempfile.TemporaryDirectory() as work_dir:
+        database = ms.relay_integration.tune_relay(
+            mod=mod,
+            target=TARGET_HEXAGON,
+            params=params,
+            work_dir=work_dir,
+            max_trials_global=20000,
+            max_trials_per_task=1,
+            num_trials_per_iter=1,
+            strategy="replay-trace",
+            builder=get_hexagon_local_builder(
+                tvm.transform.PassContext(
+                    opt_level=3,
+                    config={"tir.use_async_copy": 1, "tir.merge_async_commit_queue_scope": 0},
+                )
+            ),
+            runner=get_hexagon_rpc_runner(hexagon_launcher, number=20),
+            # Constrain search space to only be the single
+            # schedule provided for all blocks. No auto
+            # scheduling will be possible.
+            space=ms.space_generator.ScheduleFn(
+                schedule_conv2d_for_tune,
+                sch_rules=[],
+                postprocs=[],
+                mutator_probs={},
+            ),
+            # Without this, the same workloads with different constant weights
+            # are treated as distinct tuning tasks.
+            module_equality="ignore-ndarray",
+        )
+        return ms.relay_integration.compile_relay(
+            database=database,
+            mod=mod,
+            target=TARGET_HEXAGON,
+            params=params,
+            pass_config={
+                "tir.use_async_copy": 1,
+                "tir.merge_async_commit_queue_scope": False,
+            },
+        )
+
+
+@tvm.testing.requires_hexagon
+def test_async_dma_resnet50(hexagon_launcher):

Review Comment:
   Was able to unify a good amount of the code ✅





[GitHub] [tvm] nverke commented on a diff in pull request #13352: [Hexagon] Add test to show scheduling of resnet50 with async dma pipe…

Posted by GitBox <gi...@apache.org>.
nverke commented on code in PR #13352:
URL: https://github.com/apache/tvm/pull/13352#discussion_r1023286600


##########
tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py:
##########
@@ -372,6 +596,11 @@ def test_packed_8x8x32_resnet50(hexagon_launcher):
 
         np.testing.assert_allclose(ref_result, hexagon_output, atol=1e-4, rtol=1e-5)
 
+        debug_ex = session.get_graph_debug_executor(
+            hexagon_lowered.get_graph_json(), hexagon_lowered.lib
+        )
+        print(debug_ex.profile(input_name=inp.copy()))

Review Comment:
   I wanted to run these to see the end-to-end performance of the model after applying all of these schedules. Is there a better way to get that?



##########
tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py:
##########
@@ -359,11 +371,223 @@ def test_packed_8x8x32_resnet50(hexagon_launcher):
             params=params,
         )
 
-    with hexagon_launcher.start_session() as session:
+    with hexagon_launcher.create_session() as session:
+        graph_mod = session.get_executor_from_factory(hexagon_lowered)
+        graph_mod.set_input(input_name, inp.copy())
+        graph_mod.run()
+        hexagon_output = graph_mod.get_output(0).numpy()
+
+        llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
+        llvm_graph_mod.set_input(input_name, inp.copy())
+        llvm_graph_mod.run()
+        ref_result = llvm_graph_mod.get_output(0).numpy()
+
+
+def _schedule_async_dma_conv2d():
+    """Manually schedule a conv2d block, created from TE compute op via CreatePrimFunc,
+    using 8x8x32 packed layout.
+    """
+
+    def schedule_fn(sch, conv2d_block: Optional[BlockRV] = None) -> bool:
+        if conv2d_block is None:
+            try:
+                conv2d_block = sch.get_block("conv2d_NCHWc_int8")
+            except:
+                return False
+
+        assert "conv2d_NCHWc_int8" in sch.get(conv2d_block).annotations["schedule_rule"]
+
+        # Apply scheduling
+
+        post_blocks = sch.get_consumers(conv2d_block)
+        if len(post_blocks) > 0:
+            # Fuse all intermediate post ops into the last op.
+            # This is equivalent to the traverse_inline function used in TE schedules.
+            while True:
+                next_post_blocks = []
+                for post_block in post_blocks:
+                    next_consumers = sch.get_consumers(post_block)
+                    if len(next_consumers) > 0:
+                        sch.compute_inline(post_block)
+                    next_post_blocks += next_consumers
+                if len(next_post_blocks) == 0:
+                    assert len(post_blocks) == 1
+                    outer_block = post_blocks[0]
+                    break
+                post_blocks = next_post_blocks
+        else:
+            outer_block = conv2d_block
+
+        # Move the conv2d mma into the injective post mma compute block
+        if outer_block != conv2d_block:
+            loops = sch.get_loops(outer_block)
+            # Compute at the second loop for pipelining.
+            sch.compute_at(conv2d_block, loops[1])
+
+        # Add cache for input and output for copying data to vtcm.
+        input_a_cache = sch.cache_read(conv2d_block, 0, "global.vtcm")
+        sch.compute_at(input_a_cache, sch.get_loops(conv2d_block)[1])
+        sch.fuse(*sch.get_loops(input_a_cache)[2:])
+
+        input_b_cache = sch.cache_read(conv2d_block, 1, "global.vtcm")
+        sch.compute_at(input_b_cache, sch.get_loops(conv2d_block)[1])
+        sch.fuse(*sch.get_loops(input_b_cache)[2:])
+
+        output_cache_write = sch.cache_write(conv2d_block, 0, "global.vtcm")
+        sch.fuse(*sch.get_loops(output_cache_write)[2:])
+
+        conv2d_loops = sch.get_loops(block=conv2d_block)
+        if len(conv2d_loops) == 8:
+            # Handle case where kernel is not 1x1
+            oc, x0, x1, ic = conv2d_loops[-4:]
+            ic_o, ic_i = sch.split(loop=ic, factors=[None, 4], preserve_unit_iters=True)
+            oc_o, oc_i = sch.split(loop=oc, factors=[None, 32], preserve_unit_iters=True)
+            sch.reorder(oc_o, x0, x1, ic_o, oc_i, ic_i)
+            new_loops = sch.get_loops(block=conv2d_block)
+            sch.parallel(new_loops[2])
+            sch.unroll(new_loops[-4])
+            # TODO(nverke): Add compute optimizations here.
+        else:
+            # Handle case where kernel is 1x1

Review Comment:
   This worked! ✅





[GitHub] [tvm] masahi commented on a diff in pull request #13352: [Hexagon] Add test to show scheduling of resnet50 with async dma pipe…

Posted by GitBox <gi...@apache.org>.
masahi commented on code in PR #13352:
URL: https://github.com/apache/tvm/pull/13352#discussion_r1019552299


##########
python/tvm/contrib/hexagon/meta_schedule.py:
##########
@@ -121,14 +129,23 @@ def _worker_func(hexagon_launcher, evaluator_config, alloc_repeat, artifact_path
     return costs
 
 
-def get_hexagon_local_builder():
+def get_hexagon_local_builder(pass_context: tvm.transform.PassContext = None):
     """Return Hexagon-compatible Builder for meta schedule."""
 
     def export_func(mod):
         binary_path = export_module(mod, tempfile.mkdtemp())
         return str(binary_path)
 
-    return LocalBuilder(f_export=export_func)
+    def build_func(mod: IRModule, target: Target, _params: Optional[Dict[str, NDArray]]) -> Module:
+        if pass_context is not None:
+            with pass_context:
+                mod = RemoveWeightLayoutRewriteBlock(skip_ndarray_rewrite=True)(mod)
+                return tvm_build(mod, target=target)
+        else:
+            mod = RemoveWeightLayoutRewriteBlock(skip_ndarray_rewrite=True)(mod)
+            return tvm_build(mod, target=target)

Review Comment:
   Use meta_schedule.builder.default_build here.



##########
python/tvm/tir/tensor_intrin/hexagon.py:
##########
@@ -64,6 +64,50 @@ def dot_product_32x4_u8u8i32_vrmpy(
         )
 
 
+@T.prim_func
+def dot_product_32x4_u8i8i32_vtcm_desc(
+    A: T.Buffer((4,), "uint8", offset_factor=1, scope="global.vtcm"),
+    B: T.Buffer((32, 4), "int8", offset_factor=1, scope="global.vtcm"),
+    C: T.Buffer((32,), "int32", offset_factor=1, scope="global.vtcm"),

Review Comment:
   Is it possible to parametrize `scope` to remove duplication? 
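   
   One way to do it, sketched under the assumption that TVMScript can capture the `scope` argument from the enclosing Python function (only the desc side is shown; the impl would be parametrized the same way before registering both with `TensorIntrin.register`):
   
       from tvm.script import tir as T
   
   
       def generate_dot_product_32x4_u8i8i32_desc(scope: str = "global"):
           """Illustrative generator: one desc body with a configurable buffer scope."""
   
           @T.prim_func
           def desc(a: T.handle, b: T.handle, c: T.handle) -> None:
               A = T.match_buffer(a, (4,), "uint8", offset_factor=1, scope=scope)
               B = T.match_buffer(b, (32, 4), "int8", offset_factor=1, scope=scope)
               C = T.match_buffer(c, (32,), "int32", offset_factor=1, scope=scope)
               with T.block("root"):
                   T.reads(C[0:32], A[0:4], B[0:32, 0:4])
                   T.writes(C[0:32])
                   for i in T.serial(0, 32):
                       for k in T.serial(0, 4):
                           with T.block("update"):
                               vi, vk = T.axis.remap("SR", [i, k])
                               C[vi] = C[vi] + T.cast(A[vk], "int32") * T.cast(B[vi, vk], "int32")
   
           return desc
   
   
       # e.g. global and VTCM variants built from the same body:
       # desc_global = generate_dot_product_32x4_u8i8i32_desc("global")
       # desc_vtcm = generate_dot_product_32x4_u8i8i32_desc("global.vtcm")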



##########
tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py:
##########
@@ -208,7 +215,7 @@ def schedule_fn(sch, conv2d_block: Optional[BlockRV] = None) -> bool:
         if conv2d_block is None:
             try:
                 conv2d_block = sch.get_block("conv2d_NCHWc_int8")
-            except ValueError:
+            except:
                 return False

Review Comment:
   I added `has_block` recently here https://github.com/apache/tvm/blob/f42826eec49998452cff30a2b6510e7d3c31e3ec/python/tvm/tir/schedule/analysis.py#L127. It can be used to remove this try / except.



##########
tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py:
##########
@@ -359,11 +371,223 @@ def test_packed_8x8x32_resnet50(hexagon_launcher):
             params=params,
         )
 
-    with hexagon_launcher.start_session() as session:
+    with hexagon_launcher.create_session() as session:
+        graph_mod = session.get_executor_from_factory(hexagon_lowered)
+        graph_mod.set_input(input_name, inp.copy())
+        graph_mod.run()
+        hexagon_output = graph_mod.get_output(0).numpy()
+
+        llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
+        llvm_graph_mod.set_input(input_name, inp.copy())
+        llvm_graph_mod.run()
+        ref_result = llvm_graph_mod.get_output(0).numpy()
+
+
+def _schedule_async_dma_conv2d():
+    """Manually schedule a conv2d block, created from TE compute op via CreatePrimFunc,
+    using 8x8x32 packed layout.
+    """
+
+    def schedule_fn(sch, conv2d_block: Optional[BlockRV] = None) -> bool:
+        if conv2d_block is None:
+            try:
+                conv2d_block = sch.get_block("conv2d_NCHWc_int8")
+            except:
+                return False
+
+        assert "conv2d_NCHWc_int8" in sch.get(conv2d_block).annotations["schedule_rule"]
+
+        # Apply scheduling
+
+        post_blocks = sch.get_consumers(conv2d_block)
+        if len(post_blocks) > 0:
+            # Fuse all intermediate post ops into the last op.
+            # This is equivalent to the traverse_inline function used in TE schedules.
+            while True:
+                next_post_blocks = []
+                for post_block in post_blocks:
+                    next_consumers = sch.get_consumers(post_block)
+                    if len(next_consumers) > 0:
+                        sch.compute_inline(post_block)
+                    next_post_blocks += next_consumers
+                if len(next_post_blocks) == 0:
+                    assert len(post_blocks) == 1
+                    outer_block = post_blocks[0]
+                    break
+                post_blocks = next_post_blocks
+        else:
+            outer_block = conv2d_block
+
+        # Move the conv2d mma into the injective post mma compute block
+        if outer_block != conv2d_block:
+            loops = sch.get_loops(outer_block)
+            # Compute at the second loop for pipelining.
+            sch.compute_at(conv2d_block, loops[1])
+
+        # Add cache for input and output for copying data to vtcm.
+        input_a_cache = sch.cache_read(conv2d_block, 0, "global.vtcm")
+        sch.compute_at(input_a_cache, sch.get_loops(conv2d_block)[1])
+        sch.fuse(*sch.get_loops(input_a_cache)[2:])
+
+        input_b_cache = sch.cache_read(conv2d_block, 1, "global.vtcm")
+        sch.compute_at(input_b_cache, sch.get_loops(conv2d_block)[1])
+        sch.fuse(*sch.get_loops(input_b_cache)[2:])
+
+        output_cache_write = sch.cache_write(conv2d_block, 0, "global.vtcm")
+        sch.fuse(*sch.get_loops(output_cache_write)[2:])
+
+        conv2d_loops = sch.get_loops(block=conv2d_block)
+        if len(conv2d_loops) == 8:
+            # Handle case where kernel is not 1x1
+            oc, x0, x1, ic = conv2d_loops[-4:]
+            ic_o, ic_i = sch.split(loop=ic, factors=[None, 4], preserve_unit_iters=True)
+            oc_o, oc_i = sch.split(loop=oc, factors=[None, 32], preserve_unit_iters=True)
+            sch.reorder(oc_o, x0, x1, ic_o, oc_i, ic_i)
+            new_loops = sch.get_loops(block=conv2d_block)
+            sch.parallel(new_loops[2])
+            sch.unroll(new_loops[-4])
+            # TODO(nverke): Add compute optimizations here.
+        else:
+            # Handle case where kernel is 1x1
+            oc, kh, kw, x0, x1, ic = conv2d_loops[-6:]
+            ic_o, ic_i = sch.split(loop=ic, factors=[None, 4], preserve_unit_iters=True)
+            oc_o, oc_i = sch.split(loop=oc, factors=[None, 32], preserve_unit_iters=True)
+            sch.reorder(oc_o, kh, kw, x0, x1, ic_o, oc_i, ic_i)
+            new_loops = sch.get_loops(block=conv2d_block)
+            sch.parallel(new_loops[2])
+            sch.unroll(new_loops[-4])
+            # TODO(nverke): Add compute optimizations here.
+        sch.blockize(loop=oc_i)
+
+        sch.tensorize(oc_i, VRMPY_u8i8i32_VTCM_INTRIN)
+
+        pipeline_loop = conv2d_loops[1]
+        sch.annotate(pipeline_loop, "software_pipeline_stage", [0, 0, 1, 2, 3])
+        sch.annotate(pipeline_loop, "software_pipeline_order", [0, 1, 2, 3, 4])
+        sch.annotate(pipeline_loop, "software_pipeline_async_stages", [0, 2])
+
+        return True
+
+    return schedule_fn
+
+
+def tune_async_dma_template(mod, params, hexagon_launcher):
+    """Generate async dma template."""
+
+    def schedule_rule_conv2d_async_dma(sch: Schedule, conv2d_block: BlockRV):
+        _schedule_async_dma_conv2d()(sch, conv2d_block)
+        return [sch]
+
+    register_func(
+        "meta_schedule.conv2d_NCHWc_int8.async_dma.hexagon", schedule_rule_conv2d_async_dma
+    )
+
+    def schedule_conv2d_for_tune(sch: Schedule):
+        _schedule_async_dma_conv2d()(sch)
+
+    # This line is necessary for link-params to take effect during
+    # task extraction and relay.build(...).
+    mod = mod.with_attr("executor", EXECUTOR)
+
+    with tempfile.TemporaryDirectory() as work_dir:
+        database = ms.relay_integration.tune_relay(
+            mod=mod,
+            target=TARGET_HEXAGON,
+            params=params,
+            work_dir=work_dir,
+            max_trials_global=20000,
+            max_trials_per_task=1,
+            num_trials_per_iter=1,
+            strategy="replay-trace",
+            builder=get_hexagon_local_builder(
+                tvm.transform.PassContext(
+                    opt_level=3,
+                    config={"tir.use_async_copy": 1, "tir.merge_async_commit_queue_scope": 0},
+                )
+            ),
+            runner=get_hexagon_rpc_runner(hexagon_launcher, number=20),
+            # Constrain search space to only be the single
+            # schedule provided for all blocks. No auto
+            # scheduling will be possible.
+            space=ms.space_generator.ScheduleFn(
+                schedule_conv2d_for_tune,
+                sch_rules=[],
+                postprocs=[],
+                mutator_probs={},
+            ),
+            # Without this, the same workloads with different constant weights
+            # are treated as distinct tuning tasks.
+            module_equality="ignore-ndarray",
+        )
+        return ms.relay_integration.compile_relay(
+            database=database,
+            mod=mod,
+            target=TARGET_HEXAGON,
+            params=params,
+            pass_config={
+                "tir.use_async_copy": 1,
+                "tir.merge_async_commit_queue_scope": False,
+            },
+        )
+
+
+@tvm.testing.requires_hexagon
+def test_async_dma_resnet50(hexagon_launcher):

Review Comment:
   I think it should be possible to remove dups with `test_packed_8x8x32_resnet50`





[GitHub] [tvm] nverke commented on pull request #13352: [Hexagon] Add test to show scheduling of resnet50 with async dma pipe…

Posted by GitBox <gi...@apache.org>.
nverke commented on PR #13352:
URL: https://github.com/apache/tvm/pull/13352#issuecomment-1319311603

   I was able to pull in the fix for the vrmpy tensorization and rebase this onto mainline, so it should be OK to push now.




[GitHub] [tvm] nverke commented on a diff in pull request #13352: [Hexagon] Add test to show scheduling of resnet50 with async dma pipe…

Posted by GitBox <gi...@apache.org>.
nverke commented on code in PR #13352:
URL: https://github.com/apache/tvm/pull/13352#discussion_r1023284499


##########
python/tvm/contrib/hexagon/meta_schedule.py:
##########
@@ -121,14 +129,23 @@ def _worker_func(hexagon_launcher, evaluator_config, alloc_repeat, artifact_path
     return costs
 
 
-def get_hexagon_local_builder():
+def get_hexagon_local_builder(pass_context: tvm.transform.PassContext = None):
     """Return Hexagon-compatible Builder for meta schedule."""
 
     def export_func(mod):
         binary_path = export_module(mod, tempfile.mkdtemp())
         return str(binary_path)
 
-    return LocalBuilder(f_export=export_func)
+    def build_func(mod: IRModule, target: Target, _params: Optional[Dict[str, NDArray]]) -> Module:
+        if pass_context is not None:
+            with pass_context:
+                mod = RemoveWeightLayoutRewriteBlock(skip_ndarray_rewrite=True)(mod)
+                return tvm_build(mod, target=target)
+        else:
+            mod = RemoveWeightLayoutRewriteBlock(skip_ndarray_rewrite=True)(mod)
+            return tvm_build(mod, target=target)

Review Comment:
   ✅ Changed this to use the old strategy if pass context is not present. 



##########
python/tvm/tir/tensor_intrin/hexagon.py:
##########
@@ -64,6 +64,50 @@ def dot_product_32x4_u8u8i32_vrmpy(
         )
 
 
+@T.prim_func
+def dot_product_32x4_u8i8i32_vtcm_desc(
+    A: T.Buffer((4,), "uint8", offset_factor=1, scope="global.vtcm"),
+    B: T.Buffer((32, 4), "int8", offset_factor=1, scope="global.vtcm"),
+    C: T.Buffer((32,), "int32", offset_factor=1, scope="global.vtcm"),

Review Comment:
   Yes! ✅





[GitHub] [tvm] masahi commented on a diff in pull request #13352: [Hexagon] Add test to show scheduling of resnet50 with async dma pipe…

Posted by GitBox <gi...@apache.org>.
masahi commented on code in PR #13352:
URL: https://github.com/apache/tvm/pull/13352#discussion_r1023776556


##########
tests/python/contrib/test_hexagon/metaschedule_e2e/test_resnet50_int8.py:
##########
@@ -359,18 +429,115 @@ def test_packed_8x8x32_resnet50(hexagon_launcher):
             params=params,
         )
 
-    with hexagon_launcher.start_session() as session:
-        graph_mod = session.get_executor_from_factory(hexagon_lowered)
-        graph_mod.set_input(input_name, inp.copy())
-        graph_mod.run()
-        hexagon_output = graph_mod.get_output(0).numpy()
+    evaluate_mod(hexagon_launcher, hexagon_lowered, llvm_lowered, input_name, inp)
 
-        llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
-        llvm_graph_mod.set_input(input_name, inp.copy())
-        llvm_graph_mod.run()
-        ref_result = llvm_graph_mod.get_output(0).numpy()
 
-        np.testing.assert_allclose(ref_result, hexagon_output, atol=1e-4, rtol=1e-5)
+def _schedule_async_dma_conv2d():
+    """Manually schedule a conv2d block, created from TE compute op via CreatePrimFunc,
+    using 8x8x32 packed layout.
+    """
+
+    def schedule_fn(sch, conv2d_block: Optional[BlockRV] = None) -> bool:
+        if conv2d_block is None:
+            if has_block(sch, "conv2d_NCHWc_int8"):
+                conv2d_block = sch.get_block("conv2d_NCHWc_int8")
+            else:
+                return False
+
+        assert "conv2d_NCHWc_int8" in sch.get(conv2d_block).annotations["schedule_rule"]
+
+        # Apply scheduling
+
+        post_blocks = sch.get_consumers(conv2d_block)
+        if len(post_blocks) > 0:
+            # Fuse all intermediate post ops into the last op.
+            # This is equivalent to the traverse_inline function used in TE schedules.
+            while True:
+                next_post_blocks = []
+                for post_block in post_blocks:
+                    next_consumers = sch.get_consumers(post_block)
+                    if len(next_consumers) > 0:
+                        sch.compute_inline(post_block)
+                    next_post_blocks += next_consumers
+                if len(next_post_blocks) == 0:
+                    assert len(post_blocks) == 1
+                    outer_block = post_blocks[0]
+                    break
+                post_blocks = next_post_blocks
+        else:
+            outer_block = conv2d_block
+
+        # Move the conv2d mma into the injective post mma compute block
+        if outer_block != conv2d_block:
+            loops = sch.get_loops(outer_block)
+            # Compute at the second loop for pipelining.
+            sch.compute_at(conv2d_block, loops[1], preserve_unit_loops=True)
+
+        # Add cache for input and output for copying data to vtcm.
+        input_a_cache = sch.cache_read(conv2d_block, 0, "global.vtcm")
+        sch.compute_at(input_a_cache, sch.get_loops(conv2d_block)[1])
+        sch.fuse(*sch.get_loops(input_a_cache)[2:])
+
+        input_b_cache = sch.cache_read(conv2d_block, 1, "global.vtcm")
+        sch.compute_at(input_b_cache, sch.get_loops(conv2d_block)[1])
+        sch.fuse(*sch.get_loops(input_b_cache)[2:])
+
+        output_cache_write = sch.cache_write(conv2d_block, 0, "global.vtcm")
+        sch.fuse(*sch.get_loops(output_cache_write)[2:])
+
+        conv2d_loops = sch.get_loops(block=conv2d_block)
+        o_c, k_h, k_w, x_0, x_1, i_c = conv2d_loops[-6:]
+        ic_o, ic_i = sch.split(loop=i_c, factors=[None, 4], preserve_unit_iters=True)
+        oc_o, oc_i = sch.split(loop=o_c, factors=[None, 32], preserve_unit_iters=True)
+        sch.reorder(oc_o, k_h, k_w, x_0, x_1, ic_o, oc_i, ic_i)
+        new_loops = sch.get_loops(block=conv2d_block)
+        sch.parallel(new_loops[4])
+        sch.unroll(new_loops[5])
+        # TODO(nverke): Add compute optimizations here.
+        sch.blockize(loop=oc_i)

Review Comment:
   Probably doesn't need this `blockize` (`tensorize` does `blockize` anyway)


