You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@tvm.apache.org by GitBox <gi...@apache.org> on 2022/08/25 01:54:38 UTC

[GitHub] [tvm] masahi opened a new pull request, #12587: [Hexagon] Initial support for meta scheduler tuning

masahi opened a new pull request, #12587:
URL: https://github.com/apache/tvm/pull/12587

   Output log from tuning `vrmpy` dense (included in the test)
   
   ```
    ID | Name |      FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Terminated
   --------------------------------------------------------------------------------------------------------------
     0 | main | 150994944 |      1 |       380.3399 |     397.0000 |              397.0000 |     32 |
   --------------------------------------------------------------------------------------------------------------
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [tvm] csullivan commented on a diff in pull request #12587: [Hexagon] Initial support for meta schedule tuning

Posted by GitBox <gi...@apache.org>.

csullivan commented on code in PR #12587:
URL: https://github.com/apache/tvm/pull/12587#discussion_r955391842


##########
tests/python/contrib/test_hexagon/test_meta_schedule.py:
##########
@@ -0,0 +1,211 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Test rpc based launcher for hexagon """
+import pytest
+import numpy as np
+import tempfile
+
+import tvm.testing
+from tvm import te
+from tvm import meta_schedule as ms
+from tvm.meta_schedule.arg_info import TensorInfo
+from tvm.meta_schedule.builder import BuilderInput
+from tvm.script import tir as T
+from tvm.tir import FloatImm
+from tvm.tir.tensor_intrin.hexagon import VRMPY_u8u8i32_INTRIN
+from tvm.meta_schedule.runner import RunnerInput
+from tvm.contrib.hexagon.meta_schedule import get_hexagon_local_builder, get_hexagon_rpc_runner
+
+MATMUL_N = 16
+MATMUL_M = 32
+
+
+@tvm.script.ir_module
+class MatmulModule:
+    @T.prim_func
+    def main(a: T.handle, b: T.handle, c: T.handle) -> None:  # pylint: disable=no-self-argument
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, (16, 16), "float32")
+        B = T.match_buffer(b, (16, 16), "float32")
+        C = T.match_buffer(c, (16, 16), "float32")
+        for i, j, k in T.grid(16, 16, 16):
+            with T.block("matmul"):
+                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
+                with T.init():
+                    C[vi, vj] = 0.0
+                C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
+
+
+@tvm.testing.requires_hexagon
+def test_builder_runner(hexagon_launcher):
+    if hexagon_launcher._serial_number == "simulator":
+        pytest.skip(msg="Tuning on simulator not supported.")
+
+    target_hexagon = tvm.target.hexagon("v68", link_params=True)
+    target = tvm.target.Target(target_hexagon, host=target_hexagon)
+    mod = MatmulModule
+
+    builder = get_hexagon_local_builder()
+    runner = get_hexagon_rpc_runner(hexagon_launcher, number=1, repeat=1, min_repeat_ms=0)
+
+    (builder_result,) = builder.build([BuilderInput(mod, target)])
+    assert builder_result.artifact_path is not None
+    assert builder_result.error_msg is None
+
+    runner_input = RunnerInput(
+        builder_result.artifact_path,
+        "llvm",
+        [
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+        ],
+    )
+
+    # Run the module
+    (runner_future,) = runner.run([runner_input])
+    runner_result = runner_future.result()
+
+    assert runner_result.error_msg is None
+    for result in runner_result.run_secs:
+        if isinstance(result, FloatImm):
+            result = result.value
+        assert isinstance(result, float)
+        assert result >= 0.0
+
+
+def dense(m, n, k):
+    X = te.placeholder((m, k), name="X", dtype="uint8")
+    packedW = te.placeholder((n // 32, k // 4, 32, 4), name="packedW", dtype="uint8")
+
+    ak = te.reduce_axis((0, k), name="k")
+    out = te.compute(
+        (m, n),
+        lambda i, j: te.sum(
+            X[i, ak].astype("int32")
+            * packedW[tvm.tir.indexdiv(j, 32), tvm.tir.indexdiv(ak, 4), j % 32, ak % 4].astype(
+                "int32"
+            ),
+            axis=ak,
+        ),
+        name="compute",
+    )
+    return [X, packedW, out]
+
+
+def schedule_dense(sch, block, M, do_tune):
+    a_y, a_x, _ = sch.get_loops(block)[-3:]
+
+    if do_tune:
+        y_factors = sch.sample_perfect_tile(a_y, n=2, max_innermost_factor=128)
+        a_yo, a_yi = sch.split(a_y, factors=y_factors)
+    else:
+        a_yo, a_yi = sch.split(a_y, factors=[None, min(M, 32)])
+
+    a_xo, a_xi = sch.split(a_x, factors=[None, 32])
+    sch.reorder(a_yo, a_xo, a_yi, a_xi)
+
+    a_xi, a_k = sch.get_loops(block)[-2:]
+    a_ko, a_ki = sch.split(a_k, factors=[None, 4])
+    sch.reorder(a_ko, a_xi, a_ki)
+
+    fused = sch.fuse(a_yo, a_xo)
+
+    sch.parallel(fused)
+
+    dec = sch.decompose_reduction(block, a_ko)
+
+    init_loop = sch.get_loops(dec)[-1]
+    sch.vectorize(init_loop)
+
+    sch.tensorize(a_xi, VRMPY_u8u8i32_INTRIN)
+
+
+def verify_dense(sch, target, M, N, K, hexagon_session):
+    f = tvm.build(sch.mod["main"], target=target, name="dense")
+    mod = hexagon_session.load_module(f)
+    dev = hexagon_session.device
+
+    a_np = np.random.uniform(1, 10, size=(M, K)).astype("uint8")
+    b_np = np.random.uniform(1, 10, size=(N, K)).astype("uint8")
+    c_np = np.dot(a_np.astype("int32"), b_np.transpose().astype("int32"))
+
+    packW = np.random.uniform(1, 10, size=(N // 32, (K // 4), 32, 4)).astype("uint8")
+
+    for r_idx in range(N // 32):
+        for ko in range(K // 4):
+            for s_idx in range(32):
+                for t_idx in range(4):
+                    packW[r_idx][ko][s_idx][t_idx] = b_np[r_idx * 32 + s_idx][ko * 4 + t_idx]
+
+    a = tvm.nd.array(a_np, dev)
+    b = tvm.nd.array(packW, dev)
+    c = tvm.nd.array(np.zeros((M, N), dtype="int32"), dev)
+
+    mod(a, b, c)
+    np.testing.assert_equal(c.numpy(), c_np)
+
+    evaluator = mod.time_evaluator(mod.entry_name, dev, number=10)
+    gflops = (N * M * K) * 2 / 1e9
+    time_ms = evaluator(a, b, c).mean * 1e3
+    print("%f ms, %f GOPS" % (time_ms, gflops / (time_ms / 1e3)))
+
+
+@pytest.mark.skip(reason="xgboost not installed on CI")
+@tvm.testing.requires_hexagon
+def test_vrmpy_dense(hexagon_launcher):
+    if hexagon_launcher._serial_number == "simulator":

Review Comment:
   It would be really great for this to be tested pre-commit on the simulator so future changes don't regress on the ability to tune. @kparzysz-quic do you think we could remove the use of a local x86 rpc server for targeting the hexagon simulator? Goal would be to make HexagonLauncherSimulator pickleable.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [tvm] csullivan commented on a diff in pull request #12587: [Hexagon] Initial support for meta schedule tuning

Posted by GitBox <gi...@apache.org>.

csullivan commented on code in PR #12587:
URL: https://github.com/apache/tvm/pull/12587#discussion_r956192435


##########
tests/python/contrib/test_hexagon/test_meta_schedule.py:
##########
@@ -0,0 +1,211 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Test rpc based launcher for hexagon """
+import pytest
+import numpy as np
+import tempfile
+
+import tvm.testing
+from tvm import te
+from tvm import meta_schedule as ms
+from tvm.meta_schedule.arg_info import TensorInfo
+from tvm.meta_schedule.builder import BuilderInput
+from tvm.script import tir as T
+from tvm.tir import FloatImm
+from tvm.tir.tensor_intrin.hexagon import VRMPY_u8u8i32_INTRIN
+from tvm.meta_schedule.runner import RunnerInput
+from tvm.contrib.hexagon.meta_schedule import get_hexagon_local_builder, get_hexagon_rpc_runner
+
+MATMUL_N = 16
+MATMUL_M = 32
+
+
+@tvm.script.ir_module
+class MatmulModule:
+    @T.prim_func
+    def main(a: T.handle, b: T.handle, c: T.handle) -> None:  # pylint: disable=no-self-argument
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, (16, 16), "float32")
+        B = T.match_buffer(b, (16, 16), "float32")
+        C = T.match_buffer(c, (16, 16), "float32")
+        for i, j, k in T.grid(16, 16, 16):
+            with T.block("matmul"):
+                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
+                with T.init():
+                    C[vi, vj] = 0.0
+                C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
+
+
+@tvm.testing.requires_hexagon
+def test_builder_runner(hexagon_launcher):
+    if hexagon_launcher._serial_number == "simulator":
+        pytest.skip(msg="Tuning on simulator not supported.")
+
+    target_hexagon = tvm.target.hexagon("v68", link_params=True)
+    target = tvm.target.Target(target_hexagon, host=target_hexagon)
+    mod = MatmulModule
+
+    builder = get_hexagon_local_builder()
+    runner = get_hexagon_rpc_runner(hexagon_launcher, number=1, repeat=1, min_repeat_ms=0)
+
+    (builder_result,) = builder.build([BuilderInput(mod, target)])
+    assert builder_result.artifact_path is not None
+    assert builder_result.error_msg is None
+
+    runner_input = RunnerInput(
+        builder_result.artifact_path,
+        "llvm",
+        [
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+        ],
+    )
+
+    # Run the module
+    (runner_future,) = runner.run([runner_input])
+    runner_result = runner_future.result()
+
+    assert runner_result.error_msg is None
+    for result in runner_result.run_secs:
+        if isinstance(result, FloatImm):
+            result = result.value
+        assert isinstance(result, float)
+        assert result >= 0.0
+
+
+def dense(m, n, k):
+    X = te.placeholder((m, k), name="X", dtype="uint8")
+    packedW = te.placeholder((n // 32, k // 4, 32, 4), name="packedW", dtype="uint8")
+
+    ak = te.reduce_axis((0, k), name="k")
+    out = te.compute(
+        (m, n),
+        lambda i, j: te.sum(
+            X[i, ak].astype("int32")
+            * packedW[tvm.tir.indexdiv(j, 32), tvm.tir.indexdiv(ak, 4), j % 32, ak % 4].astype(
+                "int32"
+            ),
+            axis=ak,
+        ),
+        name="compute",
+    )
+    return [X, packedW, out]
+
+
+def schedule_dense(sch, block, M, do_tune):
+    a_y, a_x, _ = sch.get_loops(block)[-3:]
+
+    if do_tune:
+        y_factors = sch.sample_perfect_tile(a_y, n=2, max_innermost_factor=128)
+        a_yo, a_yi = sch.split(a_y, factors=y_factors)
+    else:
+        a_yo, a_yi = sch.split(a_y, factors=[None, min(M, 32)])
+
+    a_xo, a_xi = sch.split(a_x, factors=[None, 32])
+    sch.reorder(a_yo, a_xo, a_yi, a_xi)
+
+    a_xi, a_k = sch.get_loops(block)[-2:]
+    a_ko, a_ki = sch.split(a_k, factors=[None, 4])
+    sch.reorder(a_ko, a_xi, a_ki)
+
+    fused = sch.fuse(a_yo, a_xo)
+
+    sch.parallel(fused)
+
+    dec = sch.decompose_reduction(block, a_ko)
+
+    init_loop = sch.get_loops(dec)[-1]
+    sch.vectorize(init_loop)
+
+    sch.tensorize(a_xi, VRMPY_u8u8i32_INTRIN)
+
+
+def verify_dense(sch, target, M, N, K, hexagon_session):
+    f = tvm.build(sch.mod["main"], target=target, name="dense")
+    mod = hexagon_session.load_module(f)
+    dev = hexagon_session.device
+
+    a_np = np.random.uniform(1, 10, size=(M, K)).astype("uint8")
+    b_np = np.random.uniform(1, 10, size=(N, K)).astype("uint8")
+    c_np = np.dot(a_np.astype("int32"), b_np.transpose().astype("int32"))
+
+    packW = np.random.uniform(1, 10, size=(N // 32, (K // 4), 32, 4)).astype("uint8")
+
+    for r_idx in range(N // 32):
+        for ko in range(K // 4):
+            for s_idx in range(32):
+                for t_idx in range(4):
+                    packW[r_idx][ko][s_idx][t_idx] = b_np[r_idx * 32 + s_idx][ko * 4 + t_idx]
+
+    a = tvm.nd.array(a_np, dev)
+    b = tvm.nd.array(packW, dev)
+    c = tvm.nd.array(np.zeros((M, N), dtype="int32"), dev)
+
+    mod(a, b, c)
+    np.testing.assert_equal(c.numpy(), c_np)
+
+    evaluator = mod.time_evaluator(mod.entry_name, dev, number=10)
+    gflops = (N * M * K) * 2 / 1e9
+    time_ms = evaluator(a, b, c).mean * 1e3
+    print("%f ms, %f GOPS" % (time_ms, gflops / (time_ms / 1e3)))
+
+
+@pytest.mark.skip(reason="xgboost not installed on CI")
+@tvm.testing.requires_hexagon
+def test_vrmpy_dense(hexagon_launcher):
+    if hexagon_launcher._serial_number == "simulator":

Review Comment:
   Thanks @masahi @kparzysz-quic - By the way, not using a local RPC server would not make Hexagon a special case. For example, the CUDA target can be used locally without an x86 RPC server if the device is attached to the machine doing the compilation. For the simulator there may be a similar solution, though it would end up testing a different codepath than runs with HW, so it is worth considering carefully.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [tvm] junrushao commented on pull request #12587: [Hexagon] Initial support for meta schedule tuning

Posted by GitBox <gi...@apache.org>.

junrushao commented on PR #12587:
URL: https://github.com/apache/tvm/pull/12587#issuecomment-1226693737

   just a quick typo fix: it's meta schedule not meta scheduler :-) meta schedule is a "meta" schedule that generates basic schedules


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [tvm] masahi commented on a diff in pull request #12587: [Hexagon] Initial support for meta schedule tuning

Posted by GitBox <gi...@apache.org>.

masahi commented on code in PR #12587:
URL: https://github.com/apache/tvm/pull/12587#discussion_r955493687


##########
tests/python/contrib/test_hexagon/test_meta_schedule.py:
##########
@@ -0,0 +1,211 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Test rpc based launcher for hexagon """
+import pytest
+import numpy as np
+import tempfile
+
+import tvm.testing
+from tvm import te
+from tvm import meta_schedule as ms
+from tvm.meta_schedule.arg_info import TensorInfo
+from tvm.meta_schedule.builder import BuilderInput
+from tvm.script import tir as T
+from tvm.tir import FloatImm
+from tvm.tir.tensor_intrin.hexagon import VRMPY_u8u8i32_INTRIN
+from tvm.meta_schedule.runner import RunnerInput
+from tvm.contrib.hexagon.meta_schedule import get_hexagon_local_builder, get_hexagon_rpc_runner
+
+MATMUL_N = 16
+MATMUL_M = 32
+
+
+@tvm.script.ir_module
+class MatmulModule:
+    @T.prim_func
+    def main(a: T.handle, b: T.handle, c: T.handle) -> None:  # pylint: disable=no-self-argument
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, (16, 16), "float32")
+        B = T.match_buffer(b, (16, 16), "float32")
+        C = T.match_buffer(c, (16, 16), "float32")
+        for i, j, k in T.grid(16, 16, 16):
+            with T.block("matmul"):
+                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
+                with T.init():
+                    C[vi, vj] = 0.0
+                C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
+
+
+@tvm.testing.requires_hexagon
+def test_builder_runner(hexagon_launcher):
+    if hexagon_launcher._serial_number == "simulator":
+        pytest.skip(msg="Tuning on simulator not supported.")
+
+    target_hexagon = tvm.target.hexagon("v68", link_params=True)
+    target = tvm.target.Target(target_hexagon, host=target_hexagon)
+    mod = MatmulModule
+
+    builder = get_hexagon_local_builder()
+    runner = get_hexagon_rpc_runner(hexagon_launcher, number=1, repeat=1, min_repeat_ms=0)
+
+    (builder_result,) = builder.build([BuilderInput(mod, target)])
+    assert builder_result.artifact_path is not None
+    assert builder_result.error_msg is None
+
+    runner_input = RunnerInput(
+        builder_result.artifact_path,
+        "llvm",
+        [
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+        ],
+    )
+
+    # Run the module
+    (runner_future,) = runner.run([runner_input])
+    runner_result = runner_future.result()
+
+    assert runner_result.error_msg is None
+    for result in runner_result.run_secs:
+        if isinstance(result, FloatImm):
+            result = result.value
+        assert isinstance(result, float)
+        assert result >= 0.0
+
+
+def dense(m, n, k):
+    X = te.placeholder((m, k), name="X", dtype="uint8")
+    packedW = te.placeholder((n // 32, k // 4, 32, 4), name="packedW", dtype="uint8")
+
+    ak = te.reduce_axis((0, k), name="k")
+    out = te.compute(
+        (m, n),
+        lambda i, j: te.sum(
+            X[i, ak].astype("int32")
+            * packedW[tvm.tir.indexdiv(j, 32), tvm.tir.indexdiv(ak, 4), j % 32, ak % 4].astype(
+                "int32"
+            ),
+            axis=ak,
+        ),
+        name="compute",
+    )
+    return [X, packedW, out]
+
+
+def schedule_dense(sch, block, M, do_tune):
+    a_y, a_x, _ = sch.get_loops(block)[-3:]
+
+    if do_tune:
+        y_factors = sch.sample_perfect_tile(a_y, n=2, max_innermost_factor=128)
+        a_yo, a_yi = sch.split(a_y, factors=y_factors)
+    else:
+        a_yo, a_yi = sch.split(a_y, factors=[None, min(M, 32)])
+
+    a_xo, a_xi = sch.split(a_x, factors=[None, 32])
+    sch.reorder(a_yo, a_xo, a_yi, a_xi)
+
+    a_xi, a_k = sch.get_loops(block)[-2:]
+    a_ko, a_ki = sch.split(a_k, factors=[None, 4])
+    sch.reorder(a_ko, a_xi, a_ki)
+
+    fused = sch.fuse(a_yo, a_xo)
+
+    sch.parallel(fused)
+
+    dec = sch.decompose_reduction(block, a_ko)
+
+    init_loop = sch.get_loops(dec)[-1]
+    sch.vectorize(init_loop)
+
+    sch.tensorize(a_xi, VRMPY_u8u8i32_INTRIN)
+
+
+def verify_dense(sch, target, M, N, K, hexagon_session):
+    f = tvm.build(sch.mod["main"], target=target, name="dense")
+    mod = hexagon_session.load_module(f)
+    dev = hexagon_session.device
+
+    a_np = np.random.uniform(1, 10, size=(M, K)).astype("uint8")
+    b_np = np.random.uniform(1, 10, size=(N, K)).astype("uint8")
+    c_np = np.dot(a_np.astype("int32"), b_np.transpose().astype("int32"))
+
+    packW = np.random.uniform(1, 10, size=(N // 32, (K // 4), 32, 4)).astype("uint8")
+
+    for r_idx in range(N // 32):
+        for ko in range(K // 4):
+            for s_idx in range(32):
+                for t_idx in range(4):
+                    packW[r_idx][ko][s_idx][t_idx] = b_np[r_idx * 32 + s_idx][ko * 4 + t_idx]
+
+    a = tvm.nd.array(a_np, dev)
+    b = tvm.nd.array(packW, dev)
+    c = tvm.nd.array(np.zeros((M, N), dtype="int32"), dev)
+
+    mod(a, b, c)
+    np.testing.assert_equal(c.numpy(), c_np)
+
+    evaluator = mod.time_evaluator(mod.entry_name, dev, number=10)
+    gflops = (N * M * K) * 2 / 1e9
+    time_ms = evaluator(a, b, c).mean * 1e3
+    print("%f ms, %f GOPS" % (time_ms, gflops / (time_ms / 1e3)))
+
+
+@pytest.mark.skip(reason="xgboost not installed on CI")
+@tvm.testing.requires_hexagon
+def test_vrmpy_dense(hexagon_launcher):
+    if hexagon_launcher._serial_number == "simulator":

Review Comment:
   The issue is that meta schedule RPCRunner uses `PopenPoolExecutor`, which requires everything that's passed to it to be pickle-able:
   https://github.com/apache/tvm/blob/9331d9e086ce0b24a333be101ea26f758c216114/python/tvm/contrib/hexagon/meta_schedule.py#L88-L96
   
   If we try to pass `HexagonLauncherSimulator`, we get this error https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/PR-12587/3/pipeline/ , `Pickling an AuthenticationString object is disallowed for security reasons`. Apparently this is coming from trying to pickle https://github.com/apache/tvm/blob/c97895e0ffb512e73c89de7cdee9846f052244fc/python/tvm/contrib/hexagon/build.py#L614.
   
   The question is, can we remove `_server_process` from `HexagonLauncherSimulator`?
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [tvm] masahi commented on a diff in pull request #12587: [Hexagon] Initial support for meta schedule tuning

Posted by GitBox <gi...@apache.org>.

masahi commented on code in PR #12587:
URL: https://github.com/apache/tvm/pull/12587#discussion_r955493687


##########
tests/python/contrib/test_hexagon/test_meta_schedule.py:
##########
@@ -0,0 +1,211 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Test rpc based launcher for hexagon """
+import pytest
+import numpy as np
+import tempfile
+
+import tvm.testing
+from tvm import te
+from tvm import meta_schedule as ms
+from tvm.meta_schedule.arg_info import TensorInfo
+from tvm.meta_schedule.builder import BuilderInput
+from tvm.script import tir as T
+from tvm.tir import FloatImm
+from tvm.tir.tensor_intrin.hexagon import VRMPY_u8u8i32_INTRIN
+from tvm.meta_schedule.runner import RunnerInput
+from tvm.contrib.hexagon.meta_schedule import get_hexagon_local_builder, get_hexagon_rpc_runner
+
+MATMUL_N = 16
+MATMUL_M = 32
+
+
+@tvm.script.ir_module
+class MatmulModule:
+    @T.prim_func
+    def main(a: T.handle, b: T.handle, c: T.handle) -> None:  # pylint: disable=no-self-argument
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, (16, 16), "float32")
+        B = T.match_buffer(b, (16, 16), "float32")
+        C = T.match_buffer(c, (16, 16), "float32")
+        for i, j, k in T.grid(16, 16, 16):
+            with T.block("matmul"):
+                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
+                with T.init():
+                    C[vi, vj] = 0.0
+                C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
+
+
+@tvm.testing.requires_hexagon
+def test_builder_runner(hexagon_launcher):
+    if hexagon_launcher._serial_number == "simulator":
+        pytest.skip(msg="Tuning on simulator not supported.")
+
+    target_hexagon = tvm.target.hexagon("v68", link_params=True)
+    target = tvm.target.Target(target_hexagon, host=target_hexagon)
+    mod = MatmulModule
+
+    builder = get_hexagon_local_builder()
+    runner = get_hexagon_rpc_runner(hexagon_launcher, number=1, repeat=1, min_repeat_ms=0)
+
+    (builder_result,) = builder.build([BuilderInput(mod, target)])
+    assert builder_result.artifact_path is not None
+    assert builder_result.error_msg is None
+
+    runner_input = RunnerInput(
+        builder_result.artifact_path,
+        "llvm",
+        [
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+        ],
+    )
+
+    # Run the module
+    (runner_future,) = runner.run([runner_input])
+    runner_result = runner_future.result()
+
+    assert runner_result.error_msg is None
+    for result in runner_result.run_secs:
+        if isinstance(result, FloatImm):
+            result = result.value
+        assert isinstance(result, float)
+        assert result >= 0.0
+
+
+def dense(m, n, k):
+    X = te.placeholder((m, k), name="X", dtype="uint8")
+    packedW = te.placeholder((n // 32, k // 4, 32, 4), name="packedW", dtype="uint8")
+
+    ak = te.reduce_axis((0, k), name="k")
+    out = te.compute(
+        (m, n),
+        lambda i, j: te.sum(
+            X[i, ak].astype("int32")
+            * packedW[tvm.tir.indexdiv(j, 32), tvm.tir.indexdiv(ak, 4), j % 32, ak % 4].astype(
+                "int32"
+            ),
+            axis=ak,
+        ),
+        name="compute",
+    )
+    return [X, packedW, out]
+
+
+def schedule_dense(sch, block, M, do_tune):
+    a_y, a_x, _ = sch.get_loops(block)[-3:]
+
+    if do_tune:
+        y_factors = sch.sample_perfect_tile(a_y, n=2, max_innermost_factor=128)
+        a_yo, a_yi = sch.split(a_y, factors=y_factors)
+    else:
+        a_yo, a_yi = sch.split(a_y, factors=[None, min(M, 32)])
+
+    a_xo, a_xi = sch.split(a_x, factors=[None, 32])
+    sch.reorder(a_yo, a_xo, a_yi, a_xi)
+
+    a_xi, a_k = sch.get_loops(block)[-2:]
+    a_ko, a_ki = sch.split(a_k, factors=[None, 4])
+    sch.reorder(a_ko, a_xi, a_ki)
+
+    fused = sch.fuse(a_yo, a_xo)
+
+    sch.parallel(fused)
+
+    dec = sch.decompose_reduction(block, a_ko)
+
+    init_loop = sch.get_loops(dec)[-1]
+    sch.vectorize(init_loop)
+
+    sch.tensorize(a_xi, VRMPY_u8u8i32_INTRIN)
+
+
+def verify_dense(sch, target, M, N, K, hexagon_session):
+    f = tvm.build(sch.mod["main"], target=target, name="dense")
+    mod = hexagon_session.load_module(f)
+    dev = hexagon_session.device
+
+    a_np = np.random.uniform(1, 10, size=(M, K)).astype("uint8")
+    b_np = np.random.uniform(1, 10, size=(N, K)).astype("uint8")
+    c_np = np.dot(a_np.astype("int32"), b_np.transpose().astype("int32"))
+
+    packW = np.random.uniform(1, 10, size=(N // 32, (K // 4), 32, 4)).astype("uint8")
+
+    for r_idx in range(N // 32):
+        for ko in range(K // 4):
+            for s_idx in range(32):
+                for t_idx in range(4):
+                    packW[r_idx][ko][s_idx][t_idx] = b_np[r_idx * 32 + s_idx][ko * 4 + t_idx]
+
+    a = tvm.nd.array(a_np, dev)
+    b = tvm.nd.array(packW, dev)
+    c = tvm.nd.array(np.zeros((M, N), dtype="int32"), dev)
+
+    mod(a, b, c)
+    np.testing.assert_equal(c.numpy(), c_np)
+
+    evaluator = mod.time_evaluator(mod.entry_name, dev, number=10)
+    gflops = (N * M * K) * 2 / 1e9
+    time_ms = evaluator(a, b, c).mean * 1e3
+    print("%f ms, %f GOPS" % (time_ms, gflops / (time_ms / 1e3)))
+
+
+@pytest.mark.skip(reason="xgboost not installed on CI")
+@tvm.testing.requires_hexagon
+def test_vrmpy_dense(hexagon_launcher):
+    if hexagon_launcher._serial_number == "simulator":

Review Comment:
   The issue is that meta schedule RPCRunner uses `PopenPoolExecutor`, which requires everything that's passed to it to be pickle-able:
   https://github.com/apache/tvm/blob/9331d9e086ce0b24a333be101ea26f758c216114/python/tvm/contrib/hexagon/meta_schedule.py#L88-L96
   
   If we try to pass `HexagonLauncherSimulator`, we get this error https://ci.tlcpack.ai/blue/organizations/jenkins/tvm/detail/PR-12587/3/pipeline/ , `Pickling an AuthenticationString object is disallowed for security reasons`. Apparently this is coming from trying to pickle https://github.com/apache/tvm/blob/c97895e0ffb512e73c89de7cdee9846f052244fc/python/tvm/contrib/hexagon/build.py#L614.
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [tvm] kparzysz-quic commented on a diff in pull request #12587: [Hexagon] Initial support for meta schedule tuning

Posted by GitBox <gi...@apache.org>.

kparzysz-quic commented on code in PR #12587:
URL: https://github.com/apache/tvm/pull/12587#discussion_r955475138


##########
tests/python/contrib/test_hexagon/test_meta_schedule.py:
##########
@@ -0,0 +1,211 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Test rpc based launcher for hexagon """
+import pytest
+import numpy as np
+import tempfile
+
+import tvm.testing
+from tvm import te
+from tvm import meta_schedule as ms
+from tvm.meta_schedule.arg_info import TensorInfo
+from tvm.meta_schedule.builder import BuilderInput
+from tvm.script import tir as T
+from tvm.tir import FloatImm
+from tvm.tir.tensor_intrin.hexagon import VRMPY_u8u8i32_INTRIN
+from tvm.meta_schedule.runner import RunnerInput
+from tvm.contrib.hexagon.meta_schedule import get_hexagon_local_builder, get_hexagon_rpc_runner
+
+MATMUL_N = 16
+MATMUL_M = 32
+
+
+@tvm.script.ir_module
+class MatmulModule:
+    @T.prim_func
+    def main(a: T.handle, b: T.handle, c: T.handle) -> None:  # pylint: disable=no-self-argument
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, (16, 16), "float32")
+        B = T.match_buffer(b, (16, 16), "float32")
+        C = T.match_buffer(c, (16, 16), "float32")
+        for i, j, k in T.grid(16, 16, 16):
+            with T.block("matmul"):
+                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
+                with T.init():
+                    C[vi, vj] = 0.0
+                C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
+
+
+@tvm.testing.requires_hexagon
+def test_builder_runner(hexagon_launcher):
+    if hexagon_launcher._serial_number == "simulator":
+        pytest.skip(msg="Tuning on simulator not supported.")
+
+    target_hexagon = tvm.target.hexagon("v68", link_params=True)
+    target = tvm.target.Target(target_hexagon, host=target_hexagon)
+    mod = MatmulModule
+
+    builder = get_hexagon_local_builder()
+    runner = get_hexagon_rpc_runner(hexagon_launcher, number=1, repeat=1, min_repeat_ms=0)
+
+    (builder_result,) = builder.build([BuilderInput(mod, target)])
+    assert builder_result.artifact_path is not None
+    assert builder_result.error_msg is None
+
+    runner_input = RunnerInput(
+        builder_result.artifact_path,
+        "llvm",
+        [
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+        ],
+    )
+
+    # Run the module
+    (runner_future,) = runner.run([runner_input])
+    runner_result = runner_future.result()
+
+    assert runner_result.error_msg is None
+    for result in runner_result.run_secs:
+        if isinstance(result, FloatImm):
+            result = result.value
+        assert isinstance(result, float)
+        assert result >= 0.0
+
+
+def dense(m, n, k):
+    X = te.placeholder((m, k), name="X", dtype="uint8")
+    packedW = te.placeholder((n // 32, k // 4, 32, 4), name="packedW", dtype="uint8")
+
+    ak = te.reduce_axis((0, k), name="k")
+    out = te.compute(
+        (m, n),
+        lambda i, j: te.sum(
+            X[i, ak].astype("int32")
+            * packedW[tvm.tir.indexdiv(j, 32), tvm.tir.indexdiv(ak, 4), j % 32, ak % 4].astype(
+                "int32"
+            ),
+            axis=ak,
+        ),
+        name="compute",
+    )
+    return [X, packedW, out]
+
+
+def schedule_dense(sch, block, M, do_tune):
+    a_y, a_x, _ = sch.get_loops(block)[-3:]
+
+    if do_tune:
+        y_factors = sch.sample_perfect_tile(a_y, n=2, max_innermost_factor=128)
+        a_yo, a_yi = sch.split(a_y, factors=y_factors)
+    else:
+        a_yo, a_yi = sch.split(a_y, factors=[None, min(M, 32)])
+
+    a_xo, a_xi = sch.split(a_x, factors=[None, 32])
+    sch.reorder(a_yo, a_xo, a_yi, a_xi)
+
+    a_xi, a_k = sch.get_loops(block)[-2:]
+    a_ko, a_ki = sch.split(a_k, factors=[None, 4])
+    sch.reorder(a_ko, a_xi, a_ki)
+
+    fused = sch.fuse(a_yo, a_xo)
+
+    sch.parallel(fused)
+
+    dec = sch.decompose_reduction(block, a_ko)
+
+    init_loop = sch.get_loops(dec)[-1]
+    sch.vectorize(init_loop)
+
+    sch.tensorize(a_xi, VRMPY_u8u8i32_INTRIN)
+
+
+def verify_dense(sch, target, M, N, K, hexagon_session):
+    f = tvm.build(sch.mod["main"], target=target, name="dense")
+    mod = hexagon_session.load_module(f)
+    dev = hexagon_session.device
+
+    a_np = np.random.uniform(1, 10, size=(M, K)).astype("uint8")
+    b_np = np.random.uniform(1, 10, size=(N, K)).astype("uint8")
+    c_np = np.dot(a_np.astype("int32"), b_np.transpose().astype("int32"))
+
+    packW = np.random.uniform(1, 10, size=(N // 32, (K // 4), 32, 4)).astype("uint8")
+
+    for r_idx in range(N // 32):
+        for ko in range(K // 4):
+            for s_idx in range(32):
+                for t_idx in range(4):
+                    packW[r_idx][ko][s_idx][t_idx] = b_np[r_idx * 32 + s_idx][ko * 4 + t_idx]
+
+    a = tvm.nd.array(a_np, dev)
+    b = tvm.nd.array(packW, dev)
+    c = tvm.nd.array(np.zeros((M, N), dtype="int32"), dev)
+
+    mod(a, b, c)
+    np.testing.assert_equal(c.numpy(), c_np)
+
+    evaluator = mod.time_evaluator(mod.entry_name, dev, number=10)
+    gflops = (N * M * K) * 2 / 1e9
+    time_ms = evaluator(a, b, c).mean * 1e3
+    print("%f ms, %f GOPS" % (time_ms, gflops / (time_ms / 1e3)))
+
+
+@pytest.mark.skip(reason="xgboost not installed on CI")
+@tvm.testing.requires_hexagon
+def test_vrmpy_dense(hexagon_launcher):
+    if hexagon_launcher._serial_number == "simulator":

Review Comment:
   I'm not sure I understand the goal.  What exactly do you want to make pickleable?  Where would it be unpickled?
   
   Right now the simulator fits in the general RPC infrastructure, with the RPC tracker and all.  I guess it would be possible to change this, but I'm concerned that it would make Hexagon a "special case", plus I'm really not sure what we gain by doing this.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [tvm] masahi commented on a diff in pull request #12587: [Hexagon] Initial support for meta scheduler tuning

Posted by GitBox <gi...@apache.org>.

masahi commented on code in PR #12587:
URL: https://github.com/apache/tvm/pull/12587#discussion_r954433668


##########
python/tvm/contrib/hexagon/meta_schedule.py:
##########
@@ -0,0 +1,146 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Meta scheduler tuning utilities for Hexagon."""
+import os
+import tempfile
+from typing import Callable, List, Optional
+from tvm.contrib.popen_pool import PopenPoolExecutor
+from tvm.meta_schedule.utils import cpu_count, derived_object
+from tvm.meta_schedule.builder import LocalBuilder
+from tvm.meta_schedule.runner import (
+    EvaluatorConfig,
+    RunnerInput,
+    RunnerFuture,
+    PyRunner,
+)
+from tvm.meta_schedule.runner.rpc_runner import (
+    default_alloc_argument,
+    default_run_evaluator,
+    RPCRunnerFuture,
+)
+
+from .build import HexagonLauncherRPC
+from .tools import export_module
+
+
+@derived_object
+class HexagonRPCRunner(PyRunner):
+    """RPCRunner for Hexagon. See the documentation of RPCRunner for more details."""
+
+    def __init__(
+        self,
+        hexagon_launcher : HexagonLauncherRPC,
+        evaluator_config: Optional[EvaluatorConfig] = None,
+        cooldown_sec: float = 0.0,
+        alloc_repeat: int = 1,
+        max_workers: Optional[int] = None,
+        initializer: Optional[Callable[[], None]] = None,
+    ):
+        """
+        Parameters
+        ----------
+        hexagon_launcher : HexagonLauncherRPC
+            The RPC launcher for Hexagon. It is needed for creating hexagon.Session
+            object inside the worker function.
+        evaluator_config: EvaluatorConfig
+            The evaluator configuration.
+        cooldown_sec: float
+            The cooldown in seconds.
+        alloc_repeat: int
+            The number of times to random fill the allocation.
+        max_workers: Optional[int] = None
+            The maximum number of connections. Defaults to number of logical CPU cores.
+        initializer: Optional[Callable[[], None]]
+            The initializer function.
+        """
+
+        super().__init__()
+        self.hexagon_launcher = hexagon_launcher
+        self.evaluator_config = EvaluatorConfig._normalized(evaluator_config)
+        self.cooldown_sec = cooldown_sec
+        self.alloc_repeat = alloc_repeat
+        if max_workers is None:
+            max_workers = cpu_count(logical=True)
+        self.pool = PopenPoolExecutor(
+            max_workers=max_workers,
+            timeout=100,
+            initializer=initializer,
+        )
+
+    def run(self, runner_inputs: List[RunnerInput]) -> List[RunnerFuture]:
+        results = []
+        for runner_input in runner_inputs:
+            future = RPCRunnerFuture(
+                future=self.pool.submit(
+                    _worker_func,
+                    self.hexagon_launcher,
+                    self.evaluator_config,
+                    self.alloc_repeat,
+                    str(runner_input.artifact_path),
+                    tuple(arg_info.as_json() for arg_info in runner_input.args_info),
+                ),
+                timeout_sec=100,
+            )
+            results.append(future)
+        return results
+
+
+def _worker_func(hexagon_launcher, evaluator_config, alloc_repeat, artifact_path, args_info):
+    with hexagon_launcher.start_session() as session:
+        device = session.device
+        _, remote_path = os.path.split(artifact_path)
+        uploaded = session.upload(artifact_path, remote_path)
+        rt_mod = session.load_module(uploaded)
+        repeated_args = default_alloc_argument(
+            session,
+            device,
+            args_info,
+            alloc_repeat,
+        )
+        costs = default_run_evaluator(
+            session,
+            rt_mod,
+            device,
+            evaluator_config,
+            repeated_args,
+        )
+    return costs
+
+
+def get_hexagon_local_builder():
+    """Return Hexagon-compatible Builder for meta scheduler."""
+
+    def export_func(mod):
+        binary_path = export_module(mod, tempfile.mkdtemp())
+        return str(binary_path)
+
+    return LocalBuilder(f_export=export_func)
+
+
+def get_hexagon_rpc_runner(hexagon_launcher: HexagonLauncherRPC):
+    """Return Hexagon-compatible RPC Runner for meta scheduler."""
+    evaluator_config = EvaluatorConfig(
+        number=1,
+        repeat=1,
+        min_repeat_ms=0,

Review Comment:
   Oops, I need to fix these params.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [tvm] kparzysz-quic commented on a diff in pull request #12587: [Hexagon] Initial support for meta schedule tuning

Posted by GitBox <gi...@apache.org>.

kparzysz-quic commented on code in PR #12587:
URL: https://github.com/apache/tvm/pull/12587#discussion_r956212176


##########
tests/python/contrib/test_hexagon/test_meta_schedule.py:
##########
@@ -0,0 +1,211 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+""" Test rpc based launcher for hexagon """
+import pytest
+import numpy as np
+import tempfile
+
+import tvm.testing
+from tvm import te
+from tvm import meta_schedule as ms
+from tvm.meta_schedule.arg_info import TensorInfo
+from tvm.meta_schedule.builder import BuilderInput
+from tvm.script import tir as T
+from tvm.tir import FloatImm
+from tvm.tir.tensor_intrin.hexagon import VRMPY_u8u8i32_INTRIN
+from tvm.meta_schedule.runner import RunnerInput
+from tvm.contrib.hexagon.meta_schedule import get_hexagon_local_builder, get_hexagon_rpc_runner
+
+MATMUL_N = 16
+MATMUL_M = 32
+
+
+@tvm.script.ir_module
+class MatmulModule:
+    @T.prim_func
+    def main(a: T.handle, b: T.handle, c: T.handle) -> None:  # pylint: disable=no-self-argument
+        T.func_attr({"global_symbol": "main", "tir.noalias": True})
+        A = T.match_buffer(a, (16, 16), "float32")
+        B = T.match_buffer(b, (16, 16), "float32")
+        C = T.match_buffer(c, (16, 16), "float32")
+        for i, j, k in T.grid(16, 16, 16):
+            with T.block("matmul"):
+                vi, vj, vk = T.axis.remap("SSR", [i, j, k])
+                with T.init():
+                    C[vi, vj] = 0.0
+                C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
+
+
+@tvm.testing.requires_hexagon
+def test_builder_runner(hexagon_launcher):
+    if hexagon_launcher._serial_number == "simulator":
+        pytest.skip(msg="Tuning on simulator not supported.")
+
+    target_hexagon = tvm.target.hexagon("v68", link_params=True)
+    target = tvm.target.Target(target_hexagon, host=target_hexagon)
+    mod = MatmulModule
+
+    builder = get_hexagon_local_builder()
+    runner = get_hexagon_rpc_runner(hexagon_launcher, number=1, repeat=1, min_repeat_ms=0)
+
+    (builder_result,) = builder.build([BuilderInput(mod, target)])
+    assert builder_result.artifact_path is not None
+    assert builder_result.error_msg is None
+
+    runner_input = RunnerInput(
+        builder_result.artifact_path,
+        "llvm",
+        [
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+            TensorInfo("float32", (MATMUL_N, MATMUL_N)),
+        ],
+    )
+
+    # Run the module
+    (runner_future,) = runner.run([runner_input])
+    runner_result = runner_future.result()
+
+    assert runner_result.error_msg is None
+    for result in runner_result.run_secs:
+        if isinstance(result, FloatImm):
+            result = result.value
+        assert isinstance(result, float)
+        assert result >= 0.0
+
+
+def dense(m, n, k):
+    X = te.placeholder((m, k), name="X", dtype="uint8")
+    packedW = te.placeholder((n // 32, k // 4, 32, 4), name="packedW", dtype="uint8")
+
+    ak = te.reduce_axis((0, k), name="k")
+    out = te.compute(
+        (m, n),
+        lambda i, j: te.sum(
+            X[i, ak].astype("int32")
+            * packedW[tvm.tir.indexdiv(j, 32), tvm.tir.indexdiv(ak, 4), j % 32, ak % 4].astype(
+                "int32"
+            ),
+            axis=ak,
+        ),
+        name="compute",
+    )
+    return [X, packedW, out]
+
+
+def schedule_dense(sch, block, M, do_tune):
+    a_y, a_x, _ = sch.get_loops(block)[-3:]
+
+    if do_tune:
+        y_factors = sch.sample_perfect_tile(a_y, n=2, max_innermost_factor=128)
+        a_yo, a_yi = sch.split(a_y, factors=y_factors)
+    else:
+        a_yo, a_yi = sch.split(a_y, factors=[None, min(M, 32)])
+
+    a_xo, a_xi = sch.split(a_x, factors=[None, 32])
+    sch.reorder(a_yo, a_xo, a_yi, a_xi)
+
+    a_xi, a_k = sch.get_loops(block)[-2:]
+    a_ko, a_ki = sch.split(a_k, factors=[None, 4])
+    sch.reorder(a_ko, a_xi, a_ki)
+
+    fused = sch.fuse(a_yo, a_xo)
+
+    sch.parallel(fused)
+
+    dec = sch.decompose_reduction(block, a_ko)
+
+    init_loop = sch.get_loops(dec)[-1]
+    sch.vectorize(init_loop)
+
+    sch.tensorize(a_xi, VRMPY_u8u8i32_INTRIN)
+
+
+def verify_dense(sch, target, M, N, K, hexagon_session):
+    f = tvm.build(sch.mod["main"], target=target, name="dense")
+    mod = hexagon_session.load_module(f)
+    dev = hexagon_session.device
+
+    a_np = np.random.uniform(1, 10, size=(M, K)).astype("uint8")
+    b_np = np.random.uniform(1, 10, size=(N, K)).astype("uint8")
+    c_np = np.dot(a_np.astype("int32"), b_np.transpose().astype("int32"))
+
+    packW = np.random.uniform(1, 10, size=(N // 32, (K // 4), 32, 4)).astype("uint8")
+
+    for r_idx in range(N // 32):
+        for ko in range(K // 4):
+            for s_idx in range(32):
+                for t_idx in range(4):
+                    packW[r_idx][ko][s_idx][t_idx] = b_np[r_idx * 32 + s_idx][ko * 4 + t_idx]
+
+    a = tvm.nd.array(a_np, dev)
+    b = tvm.nd.array(packW, dev)
+    c = tvm.nd.array(np.zeros((M, N), dtype="int32"), dev)
+
+    mod(a, b, c)
+    np.testing.assert_equal(c.numpy(), c_np)
+
+    evaluator = mod.time_evaluator(mod.entry_name, dev, number=10)
+    gflops = (N * M * K) * 2 / 1e9
+    time_ms = evaluator(a, b, c).mean * 1e3
+    print("%f ms, %f GOPS" % (time_ms, gflops / (time_ms / 1e3)))
+
+
+@pytest.mark.skip(reason="xgboost not installed on CI")
+@tvm.testing.requires_hexagon
+def test_vrmpy_dense(hexagon_launcher):
+    if hexagon_launcher._serial_number == "simulator":

Review Comment:
   Thanks, that makes sense.  Could we keep the current setup with the x86 server, and add another way to execute the simulator?  I think there is value in being able to test the HW code path via simulator (and tests wouldn't need to be changed---right now they are set up to only depend on the device serial to decide whether they should run on HW or sim).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [tvm] csullivan merged pull request #12587: [Hexagon] Initial support for meta schedule tuning

Posted by GitBox <gi...@apache.org>.

csullivan merged PR #12587:
URL: https://github.com/apache/tvm/pull/12587


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [tvm] masahi commented on a diff in pull request #12587: [Hexagon] Initial support for meta scheduler tuning

Posted by GitBox <gi...@apache.org>.

masahi commented on code in PR #12587:
URL: https://github.com/apache/tvm/pull/12587#discussion_r954433668


##########
python/tvm/contrib/hexagon/meta_schedule.py:
##########
@@ -0,0 +1,146 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Meta scheduler tuning utilities for Hexagon."""
+import os
+import tempfile
+from typing import Callable, List, Optional
+from tvm.contrib.popen_pool import PopenPoolExecutor
+from tvm.meta_schedule.utils import cpu_count, derived_object
+from tvm.meta_schedule.builder import LocalBuilder
+from tvm.meta_schedule.runner import (
+    EvaluatorConfig,
+    RunnerInput,
+    RunnerFuture,
+    PyRunner,
+)
+from tvm.meta_schedule.runner.rpc_runner import (
+    default_alloc_argument,
+    default_run_evaluator,
+    RPCRunnerFuture,
+)
+
+from .build import HexagonLauncherRPC
+from .tools import export_module
+
+
+@derived_object
+class HexagonRPCRunner(PyRunner):
+    """RPCRunner for Hexagon. See the documentation of RPCRunner for more details."""
+
+    def __init__(
+        self,
+        hexagon_launcher : HexagonLauncherRPC,
+        evaluator_config: Optional[EvaluatorConfig] = None,
+        cooldown_sec: float = 0.0,
+        alloc_repeat: int = 1,
+        max_workers: Optional[int] = None,
+        initializer: Optional[Callable[[], None]] = None,
+    ):
+        """
+        Parameters
+        ----------
+        hexagon_launcher : HexagonLauncherRPC
+            The RPC launcher for Hexagon. It is needed for creating hexagon.Session
+            object inside the worker function.
+        evaluator_config: EvaluatorConfig
+            The evaluator configuration.
+        cooldown_sec: float
+            The cooldown in seconds.
+        alloc_repeat: int
+            The number of times to random fill the allocation.
+        max_workers: Optional[int] = None
+            The maximum number of connections. Defaults to number of logical CPU cores.
+        initializer: Optional[Callable[[], None]]
+            The initializer function.
+        """
+
+        super().__init__()
+        self.hexagon_launcher = hexagon_launcher
+        self.evaluator_config = EvaluatorConfig._normalized(evaluator_config)
+        self.cooldown_sec = cooldown_sec
+        self.alloc_repeat = alloc_repeat
+        if max_workers is None:
+            max_workers = cpu_count(logical=True)
+        self.pool = PopenPoolExecutor(
+            max_workers=max_workers,
+            timeout=100,
+            initializer=initializer,
+        )
+
+    def run(self, runner_inputs: List[RunnerInput]) -> List[RunnerFuture]:
+        results = []
+        for runner_input in runner_inputs:
+            future = RPCRunnerFuture(
+                future=self.pool.submit(
+                    _worker_func,
+                    self.hexagon_launcher,
+                    self.evaluator_config,
+                    self.alloc_repeat,
+                    str(runner_input.artifact_path),
+                    tuple(arg_info.as_json() for arg_info in runner_input.args_info),
+                ),
+                timeout_sec=100,
+            )
+            results.append(future)
+        return results
+
+
+def _worker_func(hexagon_launcher, evaluator_config, alloc_repeat, artifact_path, args_info):
+    with hexagon_launcher.start_session() as session:
+        device = session.device
+        _, remote_path = os.path.split(artifact_path)
+        uploaded = session.upload(artifact_path, remote_path)
+        rt_mod = session.load_module(uploaded)
+        repeated_args = default_alloc_argument(
+            session,
+            device,
+            args_info,
+            alloc_repeat,
+        )
+        costs = default_run_evaluator(
+            session,
+            rt_mod,
+            device,
+            evaluator_config,
+            repeated_args,
+        )
+    return costs
+
+
+def get_hexagon_local_builder():
+    """Return Hexagon-compatible Builder for meta scheduler."""
+
+    def export_func(mod):
+        binary_path = export_module(mod, tempfile.mkdtemp())
+        return str(binary_path)
+
+    return LocalBuilder(f_export=export_func)
+
+
+def get_hexagon_rpc_runner(hexagon_launcher: HexagonLauncherRPC):
+    """Return Hexagon-compatible RPC Runner for meta scheduler."""
+    evaluator_config = EvaluatorConfig(
+        number=1,
+        repeat=1,
+        min_repeat_ms=0,

Review Comment:
   Oops, I need to fix these params.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org