Posted to commits@tvm.apache.org by GitBox <gi...@apache.org> on 2021/10/20 16:41:52 UTC

[GitHub] [tvm] Mousius commented on a change in pull request #9331: [4/10] Code generation for Conv2D via CMSIS-NN

Mousius commented on a change in pull request #9331:
URL: https://github.com/apache/tvm/pull/9331#discussion_r732945527



##########
File path: src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
##########
@@ -43,23 +44,153 @@ class RelayToTIRVisitor : public MixedModeVisitor {
   inline IntImm ToArg(int32_t value) { return IntImm(DataType::Int(32), value); }
 
   void CreatePrimFuncForExtern(Array<tir::Var> func_signature,
-                               tvm::Array<PrimExpr> call_extern_args) {
+                               tvm::Array<PrimExpr> call_extern_args, int context_buffer_size = 0) {
     Map<String, ObjectRef> dict_attrs;
     dict_attrs.Set("global_symbol", func_name_);
     dict_attrs.Set("tir.noalias", Bool(true));
 
     tir::Stmt body = tir::Evaluate(
         tvm::tir::Call(DataType::Int(8), tir::builtin::call_extern(), call_extern_args));
 
+    if (context_buffer_size) {
+      // TODO(@ashutosh-arm) while supporting MVE, we need to move allocation through TVMBAW

Review comment:
       Isn't this already `TVMBAW`? We just need to parse the flags to get the right sizes?
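   For reference, a minimal sketch of what routing this allocation through `TVMBackendAllocWorkspace` (TVMBAW) could look like in the generated code; this is illustrative only, assuming the standard TVM C backend API rather than what the PR currently emits:

   ```cpp
   // Illustrative: allocate the CMSIS-NN context buffer via the TVM backend
   // workspace API instead of a plain tir::Allocate-backed buffer.
   int8_t* context_buffer = (int8_t*)TVMBackendAllocWorkspace(
       /*device_type=*/kDLCPU, /*device_id=*/0,
       /*nbytes=*/(uint64_t)context_buffer_size,
       /*dtype_code_hint=*/0, /*dtype_bits_hint=*/8);
   if (context_buffer == NULL) return -1;
   // ... invoke the CMSIS-NN kernel, passing context_buffer via cmsis_nn_context ...
   if (TVMBackendFreeWorkspace(kDLCPU, 0, context_buffer) != 0) return -1;
   ```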

##########
File path: python/tvm/relay/op/contrib/cmsisnn.py
##########
@@ -47,42 +47,93 @@ def partition_for_cmsisnn(mod, params=None, **opts):
     if params:
         mod["main"] = bind_params_by_name(mod["main"], params)
 
+    tvm._ffi._init_api("relay.ext.cmsisnn.transform", __name__)

Review comment:
       I think we can put this at the top of the file, since it imports into whichever scope it's in? It's likely to only ever be called once, but it'd be good to guarantee that.

##########
File path: src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
##########
@@ -43,23 +44,153 @@ class RelayToTIRVisitor : public MixedModeVisitor {
   inline IntImm ToArg(int32_t value) { return IntImm(DataType::Int(32), value); }
 
   void CreatePrimFuncForExtern(Array<tir::Var> func_signature,
-                               tvm::Array<PrimExpr> call_extern_args) {
+                               tvm::Array<PrimExpr> call_extern_args, int context_buffer_size = 0) {
     Map<String, ObjectRef> dict_attrs;
     dict_attrs.Set("global_symbol", func_name_);
     dict_attrs.Set("tir.noalias", Bool(true));
 
     tir::Stmt body = tir::Evaluate(
         tvm::tir::Call(DataType::Int(8), tir::builtin::call_extern(), call_extern_args));
 
+    if (context_buffer_size) {
+      // TODO(@ashutosh-arm) while supporting MVE, we need to move allocation through TVMBAW
+      tir::Var buffer_var("context_buffer", PointerType(PrimType(DataType::Int(8)), "global"));
+      body = tir::Allocate(buffer_var, DataType::Int(8), {context_buffer_size}, tir::const_true(),
+                           body);
+    }
+
     primfunc_ = tir::PrimFunc(func_signature, body, VoidType(), Map<tir::Var, tir::Buffer>(),
                               DictAttrs(dict_attrs));
   }
 
+  void EmitConv2D(const Expr& expr) {
+    const CallNode* clip_call = nullptr;
+    const CallNode* requantize_call = nullptr;
+    const CallNode* bias_add_call = nullptr;
+    const CallNode* conv2d_call = nullptr;
+    auto* final_call = expr.as<CallNode>();
+    auto* final_op = final_call->op.as<OpNode>();
+    if (final_op->name == "clip") {
+      clip_call = final_call;
+      requantize_call = clip_call->args[0].as<CallNode>();
+    } else {
+      requantize_call = final_call;
+    }
+    auto* requantize_input = requantize_call->args[0].as<CallNode>();
+    auto* requantize_input_op = requantize_input->op.as<OpNode>();
+    if (requantize_input_op->name == "nn.bias_add") {
+      bias_add_call = requantize_input;
+      conv2d_call = bias_add_call->args[0].as<CallNode>();
+    } else {
+      conv2d_call = requantize_input;
+    }
+
+    // TIR variables are created in the order they appear in the Relay partitioned function
+    // %1 = qnn.conv2d(%input, %weight_const_0, input_zero_point_scalar,
+    //                 %cmsisnn_multiplier_const_1, %input_scale_scalar, %weight_scale_const_2)
+    // %2 = nn.bias_add(%1, %bias_const_3, axis=3)
+    // %3 = qnn.requantize(%2, %input_scale_const_4, %cmsisnn_shift_const_5,
+    //                     %output_scale_scalar, %output_zero_point_scalar)
+    // clip(%3, a_min=%min_scalar, a_max=%max_scalar)
+    auto in_var = tir::Var("input", DataType::Handle(8));
+    auto const_var0 = tir::Var("filter", DataType::Handle(8));         // weight
+    auto const_var1 = tir::Var("multiplier", DataType::Handle(32));    // quant multiplier
+    auto const_var2 = tir::Var("filter_scale", DataType::Handle(32));  // weight scale
+    auto const_var3 = tir::Var("bias", DataType::Handle(32));          // bias
+    auto const_var4 = tir::Var("input_scale", DataType::Handle(32));   // input_scale * weight_scale
+    auto const_var5 = tir::Var("shift", DataType::Handle(32));         // quant shift
+    auto out_var = tir::Var("output", DataType::Handle(8));

Review comment:
       We should name these variables better rather than adding comments, plus I don't think we need `auto` types here?
   ```suggestion
    tir::Var in_var("input", DataType::Handle(8));
    tir::Var filter("filter", DataType::Handle(8));
    tir::Var multiplier("multiplier", DataType::Handle(32));
    tir::Var filter_scale("filter_scale", DataType::Handle(32));
    tir::Var bias("bias", DataType::Handle(32));
    tir::Var input_scale("input_scale", DataType::Handle(32));
    tir::Var shift("shift", DataType::Handle(32));
    tir::Var output("output", DataType::Handle(8));
   ```

##########
File path: src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
##########
@@ -43,23 +44,153 @@ class RelayToTIRVisitor : public MixedModeVisitor {
   inline IntImm ToArg(int32_t value) { return IntImm(DataType::Int(32), value); }
 
   void CreatePrimFuncForExtern(Array<tir::Var> func_signature,
-                               tvm::Array<PrimExpr> call_extern_args) {
+                               tvm::Array<PrimExpr> call_extern_args, int context_buffer_size = 0) {
     Map<String, ObjectRef> dict_attrs;
     dict_attrs.Set("global_symbol", func_name_);
     dict_attrs.Set("tir.noalias", Bool(true));
 
     tir::Stmt body = tir::Evaluate(
         tvm::tir::Call(DataType::Int(8), tir::builtin::call_extern(), call_extern_args));
 
+    if (context_buffer_size) {
+      // TODO(@ashutosh-arm) while supporting MVE, we need to move allocation through TVMBAW
+      tir::Var buffer_var("context_buffer", PointerType(PrimType(DataType::Int(8)), "global"));
+      body = tir::Allocate(buffer_var, DataType::Int(8), {context_buffer_size}, tir::const_true(),
+                           body);
+    }
+
     primfunc_ = tir::PrimFunc(func_signature, body, VoidType(), Map<tir::Var, tir::Buffer>(),
                               DictAttrs(dict_attrs));
   }
 
+  void EmitConv2D(const Expr& expr) {
+    const CallNode* clip_call = nullptr;
+    const CallNode* requantize_call = nullptr;
+    const CallNode* bias_add_call = nullptr;
+    const CallNode* conv2d_call = nullptr;
+    auto* final_call = expr.as<CallNode>();
+    auto* final_op = final_call->op.as<OpNode>();
+    if (final_op->name == "clip") {
+      clip_call = final_call;
+      requantize_call = clip_call->args[0].as<CallNode>();
+    } else {
+      requantize_call = final_call;
+    }
+    auto* requantize_input = requantize_call->args[0].as<CallNode>();
+    auto* requantize_input_op = requantize_input->op.as<OpNode>();
+    if (requantize_input_op->name == "nn.bias_add") {
+      bias_add_call = requantize_input;
+      conv2d_call = bias_add_call->args[0].as<CallNode>();
+    } else {
+      conv2d_call = requantize_input;
+    }
+
+    // TIR variables are created in the order they appear in the Relay partitioned function
+    // %1 = qnn.conv2d(%input, %weight_const_0, input_zero_point_scalar,
+    //                 %cmsisnn_multiplier_const_1, %input_scale_scalar, %weight_scale_const_2)
+    // %2 = nn.bias_add(%1, %bias_const_3, axis=3)
+    // %3 = qnn.requantize(%2, %input_scale_const_4, %cmsisnn_shift_const_5,
+    //                     %output_scale_scalar, %output_zero_point_scalar)
+    // clip(%3, a_min=%min_scalar, a_max=%max_scalar)
+    auto in_var = tir::Var("input", DataType::Handle(8));
+    auto const_var0 = tir::Var("filter", DataType::Handle(8));         // weight
+    auto const_var1 = tir::Var("multiplier", DataType::Handle(32));    // quant multiplier
+    auto const_var2 = tir::Var("filter_scale", DataType::Handle(32));  // weight scale
+    auto const_var3 = tir::Var("bias", DataType::Handle(32));          // bias
+    auto const_var4 = tir::Var("input_scale", DataType::Handle(32));   // input_scale * weight_scale
+    auto const_var5 = tir::Var("shift", DataType::Handle(32));         // quant shift
+    auto out_var = tir::Var("output", DataType::Handle(8));
+
+    // Individual arguments to the structs arguments of the CMSIS-NN API are filled into call_extern
+    // https://github.com/ARM-software/CMSIS_5/blob/def6f800f95661eb3451d317f7d0dde504f6020d/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c#L50
+
+    // prepare cmsis_nn_conv_params
+    auto* conv2d_attrs = conv2d_call->attrs.as<Conv2DAttrs>();

Review comment:
       ```suggestion
    const Conv2DAttrs* conv2d_attrs = conv2d_call->attrs.as<Conv2DAttrs>();
   ```

##########
File path: tests/python/contrib/test_cmsisnn/test_conv2d.py
##########
@@ -0,0 +1,303 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""CMSIS-NN integration tests: Conv2D"""
+import itertools
+import numpy as np
+import pytest
+import tvm
+from tvm import relay
+from tvm.relay.op.contrib import cmsisnn
+
+
+from tests.python.relay.aot.aot_test_utils import (
+    AOTTestModel,
+    AOT_CORSTONE300_RUNNER,
+    AOT_DEFAULT_RUNNER,
+    generate_ref_data,
+    compile_and_run,
+)
+from utils import (
+    skip_if_no_reference_system,
+    make_module,
+    count_num_calls,
+    get_range_for_dtype_str,
+    get_same_padding,
+    get_conv2d_qnn_params,
+    make_qnn_relu,
+)
+
+
+def make_model(
+    shape,
+    kernel_shape,
+    input_zp,
+    input_sc,
+    kernel_zp,
+    kernel_sc,
+    output_zp,
+    output_sc,
+    padding,
+    strides,
+    dilation,
+    groups,
+    dtype,
+    kernel_dtype,
+    out_channels,
+    weight_format,
+    enable_bias,
+    relu_type,
+):
+    """Return a model and any parameters it may have"""
+    h_index = weight_format.index("H")
+    w_index = weight_format.index("W")
+    kernel_h = kernel_shape[h_index]
+    kernel_w = kernel_shape[w_index]
+    a = relay.var("in0", shape=shape, dtype=dtype)
+    p = (0, 0, 0, 0)
+    if padding == "SAME":
+        p = get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides)
+        a = relay.nn.pad(
+            a,
+            pad_width=[(0, 0), (p[0], p[2]), (p[1], p[3]), (0, 0)],
+            pad_value=input_zp,
+            pad_mode="constant",
+        )
+        shape = (shape[0], shape[1] + p[0] + p[2], shape[2] + p[1] + p[3], shape[3])
+
+    weight_shape = (kernel_h, kernel_w, shape[3] // groups, out_channels)
+    w = tvm.nd.array(
+        np.random.randint(
+            np.iinfo(kernel_dtype).min,
+            high=np.iinfo(kernel_dtype).max,
+            size=weight_shape,
+            dtype=kernel_dtype,
+        )
+    )
+    weights = relay.const(w, kernel_dtype)
+    conv = relay.qnn.op.conv2d(
+        a,
+        weights,
+        input_zero_point=relay.const(input_zp, "int32"),
+        kernel_zero_point=relay.const(kernel_zp, "int32"),
+        input_scale=relay.const(input_sc, "float32"),
+        kernel_scale=relay.const(kernel_sc, "float32"),
+        kernel_size=(kernel_h, kernel_w),
+        data_layout="NHWC",
+        kernel_layout=weight_format,
+        dilation=dilation,
+        strides=strides,
+        groups=groups,
+        channels=out_channels,
+        padding=p,
+        out_dtype="int32",
+    )
+    b = tvm.nd.array(np.random.randint(0, high=10, size=(out_channels,), dtype="int32"))
+    bc = relay.const(b, "int32")

Review comment:
       Genuinely unsure what `bc` means in this context.

##########
File path: tests/python/contrib/test_cmsisnn/test_conv2d.py
##########
@@ -0,0 +1,303 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""CMSIS-NN integration tests: Conv2D"""
+import itertools
+import numpy as np
+import pytest
+import tvm
+from tvm import relay
+from tvm.relay.op.contrib import cmsisnn
+
+
+from tests.python.relay.aot.aot_test_utils import (
+    AOTTestModel,
+    AOT_CORSTONE300_RUNNER,
+    AOT_DEFAULT_RUNNER,
+    generate_ref_data,
+    compile_and_run,
+)
+from utils import (
+    skip_if_no_reference_system,
+    make_module,
+    count_num_calls,
+    get_range_for_dtype_str,
+    get_same_padding,
+    get_conv2d_qnn_params,
+    make_qnn_relu,
+)
+
+
+def make_model(
+    shape,
+    kernel_shape,
+    input_zp,
+    input_sc,
+    kernel_zp,
+    kernel_sc,
+    output_zp,
+    output_sc,
+    padding,
+    strides,
+    dilation,
+    groups,
+    dtype,
+    kernel_dtype,
+    out_channels,
+    weight_format,
+    enable_bias,
+    relu_type,
+):
+    """Return a model and any parameters it may have"""
+    h_index = weight_format.index("H")
+    w_index = weight_format.index("W")
+    kernel_h = kernel_shape[h_index]
+    kernel_w = kernel_shape[w_index]
+    a = relay.var("in0", shape=shape, dtype=dtype)

Review comment:
       We should use more descriptive names?
   
   ```suggestion
       input = relay.var("input", shape=shape, dtype=dtype)
   ```

##########
File path: src/relay/backend/contrib/cmsisnn/extract_constants.cc
##########
@@ -0,0 +1,158 @@
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/transform.h>
+#include <tvm/runtime/ndarray.h>
+
+#include "../../../qnn/utils.h"
+#include "../../../transforms/pattern_utils.h"
+
+namespace tvm {
+namespace relay {
+namespace contrib {
+namespace cmsisnn {
+
+class ExtractConstantsMutator : public MixedModeMutator {
+ public:
+  explicit ExtractConstantsMutator(IRModule& mod) : mod_(mod) {}
+
+ private:
+  String gen_var_name() { return "tvm_var_extract_const_" + std::to_string(var_count_++); }

Review comment:
       Should we make this more CMSIS-NN specific?
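   For example, something like this (the exact prefix is hypothetical):

   ```cpp
   // Hypothetical naming scheme that ties the extracted constants to CMSIS-NN.
   String gen_var_name() { return "tvm_cmsisnn_extracted_const_" + std::to_string(var_count_++); }
   ```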

##########
File path: src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
##########
@@ -43,23 +44,153 @@ class RelayToTIRVisitor : public MixedModeVisitor {
   inline IntImm ToArg(int32_t value) { return IntImm(DataType::Int(32), value); }
 
   void CreatePrimFuncForExtern(Array<tir::Var> func_signature,
-                               tvm::Array<PrimExpr> call_extern_args) {
+                               tvm::Array<PrimExpr> call_extern_args, int context_buffer_size = 0) {
     Map<String, ObjectRef> dict_attrs;
     dict_attrs.Set("global_symbol", func_name_);
     dict_attrs.Set("tir.noalias", Bool(true));
 
     tir::Stmt body = tir::Evaluate(
         tvm::tir::Call(DataType::Int(8), tir::builtin::call_extern(), call_extern_args));
 
+    if (context_buffer_size) {
+      // TODO(@ashutosh-arm) while supporting MVE, we need to move allocation through TVMBAW
+      tir::Var buffer_var("context_buffer", PointerType(PrimType(DataType::Int(8)), "global"));
+      body = tir::Allocate(buffer_var, DataType::Int(8), {context_buffer_size}, tir::const_true(),
+                           body);
+    }
+
     primfunc_ = tir::PrimFunc(func_signature, body, VoidType(), Map<tir::Var, tir::Buffer>(),
                               DictAttrs(dict_attrs));
   }
 
+  void EmitConv2D(const Expr& expr) {
+    const CallNode* clip_call = nullptr;
+    const CallNode* requantize_call = nullptr;
+    const CallNode* bias_add_call = nullptr;
+    const CallNode* conv2d_call = nullptr;
+    auto* final_call = expr.as<CallNode>();
+    auto* final_op = final_call->op.as<OpNode>();

Review comment:
       ```suggestion
    const CallNode* final_call = expr.as<CallNode>();
    const OpNode* final_op = final_call->op.as<OpNode>();
   ```

##########
File path: src/relay/backend/contrib/cmsisnn/extract_constants.cc
##########
@@ -0,0 +1,158 @@
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/transform.h>
+#include <tvm/runtime/ndarray.h>
+
+#include "../../../qnn/utils.h"
+#include "../../../transforms/pattern_utils.h"
+
+namespace tvm {
+namespace relay {
+namespace contrib {
+namespace cmsisnn {
+
+class ExtractConstantsMutator : public MixedModeMutator {
+ public:
+  explicit ExtractConstantsMutator(IRModule& mod) : mod_(mod) {}
+
+ private:
+  String gen_var_name() { return "tvm_var_extract_const_" + std::to_string(var_count_++); }
+
+  Expr VisitExpr_(const FunctionNode* func) final {
+    Function final_func = GetRef<Function>(func);
+    ++func_nesting_level_;
+    auto new_body = VisitExpr(func->body);
+    --func_nesting_level_;
+    if (!new_body.same_as(func->body)) {
+      final_func = Function(FreeVars(new_body), new_body, func->ret_type,
+                            FreeTypeVars(new_body, mod_), func->attrs);
+      function_to_constants_.Set(GetRef<Function>(func), constants_within_function_);
+      constants_within_function_.clear();
+    }
+    return final_func;
+  }
+
+  Expr Rewrite_(const CallNode* call, const Expr& post) final {
+    Expr final_call = post;
+    auto* post_call = post.as<CallNode>();
+    if (post_call == nullptr) {
+      return final_call;
+    }
+
+    // Replace Constant arguments with Vars for ML Operators
+    // Perform this for non-main Call Nodes only
+    if (func_nesting_level_ && call->op.as<OpNode>()) {
+      Array<Expr> new_args;
+      for (auto& arg : post_call->args) {
+        auto* const_arg = arg.as<ConstantNode>();
+        if (const_arg && !const_arg->is_scalar()) {
+          Var var_arg = Var(gen_var_name(), const_arg->tensor_type());
+          new_args.push_back(var_arg);
+          constants_within_function_.push_back(GetRef<Constant>(const_arg));
+        } else {
+          new_args.push_back(arg);
+        }
+      }
+      final_call = Call(call->op, new_args, call->attrs, {});
+    }
+
+    // Since the constants are kicked out of partitioned functions
+    // a new call to global function is needed
+    if (auto* glob_var_node = post_call->op.as<GlobalVarNode>()) {
+      auto glob_var = GetRef<GlobalVar>(glob_var_node);
+      auto glob_func = Downcast<Function>(mod_->Lookup(glob_var));
+      auto new_glob_func = VisitExpr(glob_func);
+      if (!new_glob_func.same_as(glob_func)) {
+        mod_->Update(glob_var, Downcast<Function>(new_glob_func));
+        Array<Expr> new_args = post_call->args;
+        ICHECK(function_to_constants_.find(glob_func) != function_to_constants_.end());
+        for (auto constant : function_to_constants_.at(glob_func)) {
+          new_args.push_back(constant);
+        }
+        final_call = Call(glob_var, new_args);
+      }
+    }
+
+    // Since the constants are kicked out of the local partitioned functions

Review comment:
       We should consider a less aggressive verb for extracting constants.

##########
File path: src/relay/backend/contrib/cmsisnn/generate_constants.cc
##########
@@ -0,0 +1,230 @@
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <tvm/relay/attrs/nn.h>
+#include <tvm/relay/attrs/transform.h>
+#include <tvm/relay/expr_functor.h>
+#include <tvm/relay/transform.h>
+#include <tvm/runtime/ndarray.h>
+
+#include "../../../qnn/utils.h"
+#include "../../../transforms/pattern_utils.h"
+
+namespace tvm {
+namespace relay {
+Expr MakeTranspose(Expr data, Array<Integer> axes);

Review comment:
       You should be able to include this from `../../../op/make_op.h` 
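   i.e., assuming the declaration lives there, the local forward declaration could become:

   ```cpp
   // Hypothetical: reuse the shared declaration of MakeTranspose instead of
   // re-declaring it locally.
   #include "../../../op/make_op.h"
   ```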

##########
File path: src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
##########
@@ -43,23 +44,153 @@ class RelayToTIRVisitor : public MixedModeVisitor {
   inline IntImm ToArg(int32_t value) { return IntImm(DataType::Int(32), value); }
 
   void CreatePrimFuncForExtern(Array<tir::Var> func_signature,
-                               tvm::Array<PrimExpr> call_extern_args) {
+                               tvm::Array<PrimExpr> call_extern_args, int context_buffer_size = 0) {
     Map<String, ObjectRef> dict_attrs;
     dict_attrs.Set("global_symbol", func_name_);
     dict_attrs.Set("tir.noalias", Bool(true));
 
     tir::Stmt body = tir::Evaluate(
         tvm::tir::Call(DataType::Int(8), tir::builtin::call_extern(), call_extern_args));
 
+    if (context_buffer_size) {
+      // TODO(@ashutosh-arm) while supporting MVE, we need to move allocation through TVMBAW
+      tir::Var buffer_var("context_buffer", PointerType(PrimType(DataType::Int(8)), "global"));
+      body = tir::Allocate(buffer_var, DataType::Int(8), {context_buffer_size}, tir::const_true(),
+                           body);
+    }
+
     primfunc_ = tir::PrimFunc(func_signature, body, VoidType(), Map<tir::Var, tir::Buffer>(),
                               DictAttrs(dict_attrs));
   }
 
+  void EmitConv2D(const Expr& expr) {
+    const CallNode* clip_call = nullptr;
+    const CallNode* requantize_call = nullptr;
+    const CallNode* bias_add_call = nullptr;
+    const CallNode* conv2d_call = nullptr;
+    auto* final_call = expr.as<CallNode>();
+    auto* final_op = final_call->op.as<OpNode>();
+    if (final_op->name == "clip") {
+      clip_call = final_call;
+      requantize_call = clip_call->args[0].as<CallNode>();
+    } else {
+      requantize_call = final_call;
+    }
+    auto* requantize_input = requantize_call->args[0].as<CallNode>();
+    auto* requantize_input_op = requantize_input->op.as<OpNode>();
+    if (requantize_input_op->name == "nn.bias_add") {
+      bias_add_call = requantize_input;
+      conv2d_call = bias_add_call->args[0].as<CallNode>();
+    } else {
+      conv2d_call = requantize_input;
+    }
+
+    // TIR variables are created in the order they appear in the Relay partitioned function
+    // %1 = qnn.conv2d(%input, %weight_const_0, input_zero_point_scalar,
+    //                 %cmsisnn_multiplier_const_1, %input_scale_scalar, %weight_scale_const_2)
+    // %2 = nn.bias_add(%1, %bias_const_3, axis=3)
+    // %3 = qnn.requantize(%2, %input_scale_const_4, %cmsisnn_shift_const_5,
+    //                     %output_scale_scalar, %output_zero_point_scalar)
+    // clip(%3, a_min=%min_scalar, a_max=%max_scalar)
+    auto in_var = tir::Var("input", DataType::Handle(8));
+    auto const_var0 = tir::Var("filter", DataType::Handle(8));         // weight
+    auto const_var1 = tir::Var("multiplier", DataType::Handle(32));    // quant multiplier
+    auto const_var2 = tir::Var("filter_scale", DataType::Handle(32));  // weight scale
+    auto const_var3 = tir::Var("bias", DataType::Handle(32));          // bias
+    auto const_var4 = tir::Var("input_scale", DataType::Handle(32));   // input_scale * weight_scale
+    auto const_var5 = tir::Var("shift", DataType::Handle(32));         // quant shift
+    auto out_var = tir::Var("output", DataType::Handle(8));
+
+    // Individual arguments to the structs arguments of the CMSIS-NN API are filled into call_extern
+    // https://github.com/ARM-software/CMSIS_5/blob/def6f800f95661eb3451d317f7d0dde504f6020d/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c#L50
+
+    // prepare cmsis_nn_conv_params
+    auto* conv2d_attrs = conv2d_call->attrs.as<Conv2DAttrs>();
+    int32_t input_offset = -GetScalarFromConstant<int32_t>(conv2d_call->args[2]);
+    int32_t output_offset = GetScalarFromConstant<int32_t>(requantize_call->args[4]);
+    int32_t stride_w = qnn::get_const_int(conv2d_attrs->strides[1]);
+    int32_t stride_h = qnn::get_const_int(conv2d_attrs->strides[0]);
+    int32_t padding_w = qnn::get_const_int(conv2d_attrs->padding[1]);
+    int32_t padding_h = qnn::get_const_int(conv2d_attrs->padding[0]);
+    int32_t dilation_w = qnn::get_const_int(conv2d_attrs->dilation[1]);
+    int32_t dilation_h = qnn::get_const_int(conv2d_attrs->dilation[0]);
+    int32_t clip_min, clip_max;
+    if (clip_call) {
+      auto* clip_attrs = clip_call->attrs.as<ClipAttrs>();
+      clip_min = clip_attrs->a_min;
+      clip_max = clip_attrs->a_max;
+    } else {
+      clip_min = -128;
+      clip_max = 127;
+    }
+
+    // cmsis_nn_dims *input_dims
+    auto input_shape = conv2d_call->args[0]->type_as<TensorTypeNode>()->shape;
+    int32_t input_n = qnn::get_const_int(input_shape[0]);
+    int32_t input_h = qnn::get_const_int(input_shape[1]);
+    int32_t input_w = qnn::get_const_int(input_shape[2]);
+    int32_t input_c = qnn::get_const_int(input_shape[3]);
+
+    // cmsis_nn_dims *filter_dims (OHWI)
+    auto filter_shape = conv2d_call->args[1]->type_as<TensorTypeNode>()->shape;

Review comment:
       ```suggestion
       Array<PrimExpr> filter_shape = conv2d_call->args[1]->type_as<TensorTypeNode>()->shape;
   ```

##########
File path: src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
##########
@@ -43,23 +44,153 @@ class RelayToTIRVisitor : public MixedModeVisitor {
   inline IntImm ToArg(int32_t value) { return IntImm(DataType::Int(32), value); }
 
   void CreatePrimFuncForExtern(Array<tir::Var> func_signature,
-                               tvm::Array<PrimExpr> call_extern_args) {
+                               tvm::Array<PrimExpr> call_extern_args, int context_buffer_size = 0) {
     Map<String, ObjectRef> dict_attrs;
     dict_attrs.Set("global_symbol", func_name_);
     dict_attrs.Set("tir.noalias", Bool(true));
 
     tir::Stmt body = tir::Evaluate(
         tvm::tir::Call(DataType::Int(8), tir::builtin::call_extern(), call_extern_args));
 
+    if (context_buffer_size) {
+      // TODO(@ashutosh-arm) while supporting MVE, we need to move allocation through TVMBAW
+      tir::Var buffer_var("context_buffer", PointerType(PrimType(DataType::Int(8)), "global"));
+      body = tir::Allocate(buffer_var, DataType::Int(8), {context_buffer_size}, tir::const_true(),
+                           body);
+    }
+
     primfunc_ = tir::PrimFunc(func_signature, body, VoidType(), Map<tir::Var, tir::Buffer>(),
                               DictAttrs(dict_attrs));
   }
 
+  void EmitConv2D(const Expr& expr) {
+    const CallNode* clip_call = nullptr;
+    const CallNode* requantize_call = nullptr;
+    const CallNode* bias_add_call = nullptr;
+    const CallNode* conv2d_call = nullptr;
+    auto* final_call = expr.as<CallNode>();
+    auto* final_op = final_call->op.as<OpNode>();
+    if (final_op->name == "clip") {
+      clip_call = final_call;
+      requantize_call = clip_call->args[0].as<CallNode>();
+    } else {
+      requantize_call = final_call;
+    }
+    auto* requantize_input = requantize_call->args[0].as<CallNode>();
+    auto* requantize_input_op = requantize_input->op.as<OpNode>();
+    if (requantize_input_op->name == "nn.bias_add") {
+      bias_add_call = requantize_input;
+      conv2d_call = bias_add_call->args[0].as<CallNode>();
+    } else {
+      conv2d_call = requantize_input;
+    }
+
+    // TIR variables are created in the order they appear in the Relay partitioned function
+    // %1 = qnn.conv2d(%input, %weight_const_0, input_zero_point_scalar,
+    //                 %cmsisnn_multiplier_const_1, %input_scale_scalar, %weight_scale_const_2)
+    // %2 = nn.bias_add(%1, %bias_const_3, axis=3)
+    // %3 = qnn.requantize(%2, %input_scale_const_4, %cmsisnn_shift_const_5,
+    //                     %output_scale_scalar, %output_zero_point_scalar)
+    // clip(%3, a_min=%min_scalar, a_max=%max_scalar)
+    auto in_var = tir::Var("input", DataType::Handle(8));
+    auto const_var0 = tir::Var("filter", DataType::Handle(8));         // weight
+    auto const_var1 = tir::Var("multiplier", DataType::Handle(32));    // quant multiplier
+    auto const_var2 = tir::Var("filter_scale", DataType::Handle(32));  // weight scale
+    auto const_var3 = tir::Var("bias", DataType::Handle(32));          // bias
+    auto const_var4 = tir::Var("input_scale", DataType::Handle(32));   // input_scale * weight_scale
+    auto const_var5 = tir::Var("shift", DataType::Handle(32));         // quant shift
+    auto out_var = tir::Var("output", DataType::Handle(8));
+
+    // Individual arguments to the structs arguments of the CMSIS-NN API are filled into call_extern
+    // https://github.com/ARM-software/CMSIS_5/blob/def6f800f95661eb3451d317f7d0dde504f6020d/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c#L50
+
+    // prepare cmsis_nn_conv_params
+    auto* conv2d_attrs = conv2d_call->attrs.as<Conv2DAttrs>();
+    int32_t input_offset = -GetScalarFromConstant<int32_t>(conv2d_call->args[2]);
+    int32_t output_offset = GetScalarFromConstant<int32_t>(requantize_call->args[4]);
+    int32_t stride_w = qnn::get_const_int(conv2d_attrs->strides[1]);
+    int32_t stride_h = qnn::get_const_int(conv2d_attrs->strides[0]);
+    int32_t padding_w = qnn::get_const_int(conv2d_attrs->padding[1]);
+    int32_t padding_h = qnn::get_const_int(conv2d_attrs->padding[0]);
+    int32_t dilation_w = qnn::get_const_int(conv2d_attrs->dilation[1]);
+    int32_t dilation_h = qnn::get_const_int(conv2d_attrs->dilation[0]);
+    int32_t clip_min, clip_max;
+    if (clip_call) {
+      auto* clip_attrs = clip_call->attrs.as<ClipAttrs>();

Review comment:
       ```suggestion
      const ClipAttrs* clip_attrs = clip_call->attrs.as<ClipAttrs>();
   ```

##########
File path: src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
##########
@@ -43,23 +44,153 @@ class RelayToTIRVisitor : public MixedModeVisitor {
   inline IntImm ToArg(int32_t value) { return IntImm(DataType::Int(32), value); }
 
   void CreatePrimFuncForExtern(Array<tir::Var> func_signature,
-                               tvm::Array<PrimExpr> call_extern_args) {
+                               tvm::Array<PrimExpr> call_extern_args, int context_buffer_size = 0) {
     Map<String, ObjectRef> dict_attrs;
     dict_attrs.Set("global_symbol", func_name_);
     dict_attrs.Set("tir.noalias", Bool(true));
 
     tir::Stmt body = tir::Evaluate(
         tvm::tir::Call(DataType::Int(8), tir::builtin::call_extern(), call_extern_args));
 
+    if (context_buffer_size) {
+      // TODO(@ashutosh-arm) while supporting MVE, we need to move allocation through TVMBAW
+      tir::Var buffer_var("context_buffer", PointerType(PrimType(DataType::Int(8)), "global"));
+      body = tir::Allocate(buffer_var, DataType::Int(8), {context_buffer_size}, tir::const_true(),
+                           body);
+    }
+
     primfunc_ = tir::PrimFunc(func_signature, body, VoidType(), Map<tir::Var, tir::Buffer>(),
                               DictAttrs(dict_attrs));
   }
 
+  void EmitConv2D(const Expr& expr) {
+    const CallNode* clip_call = nullptr;
+    const CallNode* requantize_call = nullptr;
+    const CallNode* bias_add_call = nullptr;
+    const CallNode* conv2d_call = nullptr;
+    auto* final_call = expr.as<CallNode>();
+    auto* final_op = final_call->op.as<OpNode>();
+    if (final_op->name == "clip") {
+      clip_call = final_call;
+      requantize_call = clip_call->args[0].as<CallNode>();
+    } else {
+      requantize_call = final_call;
+    }
+    auto* requantize_input = requantize_call->args[0].as<CallNode>();
+    auto* requantize_input_op = requantize_input->op.as<OpNode>();
+    if (requantize_input_op->name == "nn.bias_add") {
+      bias_add_call = requantize_input;
+      conv2d_call = bias_add_call->args[0].as<CallNode>();
+    } else {
+      conv2d_call = requantize_input;
+    }
+
+    // TIR variables are created in the order they appear in the Relay partitioned function
+    // %1 = qnn.conv2d(%input, %weight_const_0, input_zero_point_scalar,
+    //                 %cmsisnn_multiplier_const_1, %input_scale_scalar, %weight_scale_const_2)
+    // %2 = nn.bias_add(%1, %bias_const_3, axis=3)
+    // %3 = qnn.requantize(%2, %input_scale_const_4, %cmsisnn_shift_const_5,
+    //                     %output_scale_scalar, %output_zero_point_scalar)
+    // clip(%3, a_min=%min_scalar, a_max=%max_scalar)
+    auto in_var = tir::Var("input", DataType::Handle(8));
+    auto const_var0 = tir::Var("filter", DataType::Handle(8));         // weight
+    auto const_var1 = tir::Var("multiplier", DataType::Handle(32));    // quant multiplier
+    auto const_var2 = tir::Var("filter_scale", DataType::Handle(32));  // weight scale
+    auto const_var3 = tir::Var("bias", DataType::Handle(32));          // bias
+    auto const_var4 = tir::Var("input_scale", DataType::Handle(32));   // input_scale * weight_scale
+    auto const_var5 = tir::Var("shift", DataType::Handle(32));         // quant shift
+    auto out_var = tir::Var("output", DataType::Handle(8));
+
+    // Individual arguments to the structs arguments of the CMSIS-NN API are filled into call_extern
+    // https://github.com/ARM-software/CMSIS_5/blob/def6f800f95661eb3451d317f7d0dde504f6020d/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c#L50
+
+    // prepare cmsis_nn_conv_params
+    auto* conv2d_attrs = conv2d_call->attrs.as<Conv2DAttrs>();
+    int32_t input_offset = -GetScalarFromConstant<int32_t>(conv2d_call->args[2]);
+    int32_t output_offset = GetScalarFromConstant<int32_t>(requantize_call->args[4]);
+    int32_t stride_w = qnn::get_const_int(conv2d_attrs->strides[1]);
+    int32_t stride_h = qnn::get_const_int(conv2d_attrs->strides[0]);
+    int32_t padding_w = qnn::get_const_int(conv2d_attrs->padding[1]);
+    int32_t padding_h = qnn::get_const_int(conv2d_attrs->padding[0]);
+    int32_t dilation_w = qnn::get_const_int(conv2d_attrs->dilation[1]);
+    int32_t dilation_h = qnn::get_const_int(conv2d_attrs->dilation[0]);
+    int32_t clip_min, clip_max;
+    if (clip_call) {
+      auto* clip_attrs = clip_call->attrs.as<ClipAttrs>();
+      clip_min = clip_attrs->a_min;
+      clip_max = clip_attrs->a_max;
+    } else {
+      clip_min = -128;
+      clip_max = 127;
+    }
+
+    // cmsis_nn_dims *input_dims

Review comment:
       It might be worth constructing helper functions to generate these pseudo-structs, just to make it easier to see?
   
   ```cpp
   Array<IntImm> dimensions = CMSISNNDimensions(input_shape);
   ```
   
   and then at the end doing:
   
   ```cpp
   Concat(extern_args, dimensions, woofles);
   ```
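   A minimal sketch of such a helper (the name `CMSISNNDimensions` is hypothetical; it reuses `qnn::get_const_int` from this file):

   ```cpp
   // Hypothetical helper: packs a 4-d shape (NHWC / OHWI) into the four int32
   // arguments that stand in for a cmsis_nn_dims struct in the call_extern.
   Array<IntImm> CMSISNNDimensions(const Array<PrimExpr>& shape) {
     ICHECK_EQ(shape.size(), 4) << "cmsis_nn_dims expects a 4-d shape";
     Array<IntImm> dims;
     for (const PrimExpr& dim : shape) {
       dims.push_back(IntImm(DataType::Int(32), qnn::get_const_int(dim)));
     }
     return dims;
   }
   ```

   The per-struct arrays could then be folded into `call_ext_args` (e.g. via `tvm::runtime::Concat`), as the PR already does for `scalar_args`.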

##########
File path: src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
##########
@@ -43,23 +44,153 @@ class RelayToTIRVisitor : public MixedModeVisitor {
   inline IntImm ToArg(int32_t value) { return IntImm(DataType::Int(32), value); }
 
   void CreatePrimFuncForExtern(Array<tir::Var> func_signature,
-                               tvm::Array<PrimExpr> call_extern_args) {
+                               tvm::Array<PrimExpr> call_extern_args, int context_buffer_size = 0) {
     Map<String, ObjectRef> dict_attrs;
     dict_attrs.Set("global_symbol", func_name_);
     dict_attrs.Set("tir.noalias", Bool(true));
 
     tir::Stmt body = tir::Evaluate(
         tvm::tir::Call(DataType::Int(8), tir::builtin::call_extern(), call_extern_args));
 
+    if (context_buffer_size) {
+      // TODO(@ashutosh-arm) while supporting MVE, we need to move allocation through TVMBAW
+      tir::Var buffer_var("context_buffer", PointerType(PrimType(DataType::Int(8)), "global"));
+      body = tir::Allocate(buffer_var, DataType::Int(8), {context_buffer_size}, tir::const_true(),
+                           body);
+    }
+
     primfunc_ = tir::PrimFunc(func_signature, body, VoidType(), Map<tir::Var, tir::Buffer>(),
                               DictAttrs(dict_attrs));
   }
 
+  void EmitConv2D(const Expr& expr) {
+    const CallNode* clip_call = nullptr;
+    const CallNode* requantize_call = nullptr;
+    const CallNode* bias_add_call = nullptr;
+    const CallNode* conv2d_call = nullptr;
+    auto* final_call = expr.as<CallNode>();
+    auto* final_op = final_call->op.as<OpNode>();
+    if (final_op->name == "clip") {
+      clip_call = final_call;
+      requantize_call = clip_call->args[0].as<CallNode>();
+    } else {
+      requantize_call = final_call;
+    }
+    auto* requantize_input = requantize_call->args[0].as<CallNode>();
+    auto* requantize_input_op = requantize_input->op.as<OpNode>();

Review comment:
       ```suggestion
    const CallNode* requantize_input = requantize_call->args[0].as<CallNode>();
    const OpNode* requantize_input_op = requantize_input->op.as<OpNode>();
   ```

##########
File path: src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
##########
@@ -43,23 +44,153 @@ class RelayToTIRVisitor : public MixedModeVisitor {
   inline IntImm ToArg(int32_t value) { return IntImm(DataType::Int(32), value); }
 
   void CreatePrimFuncForExtern(Array<tir::Var> func_signature,
-                               tvm::Array<PrimExpr> call_extern_args) {
+                               tvm::Array<PrimExpr> call_extern_args, int context_buffer_size = 0) {
     Map<String, ObjectRef> dict_attrs;
     dict_attrs.Set("global_symbol", func_name_);
     dict_attrs.Set("tir.noalias", Bool(true));
 
     tir::Stmt body = tir::Evaluate(
         tvm::tir::Call(DataType::Int(8), tir::builtin::call_extern(), call_extern_args));
 
+    if (context_buffer_size) {
+      // TODO(@ashutosh-arm) while supporting MVE, we need to move allocation through TVMBAW
+      tir::Var buffer_var("context_buffer", PointerType(PrimType(DataType::Int(8)), "global"));
+      body = tir::Allocate(buffer_var, DataType::Int(8), {context_buffer_size}, tir::const_true(),
+                           body);
+    }
+
     primfunc_ = tir::PrimFunc(func_signature, body, VoidType(), Map<tir::Var, tir::Buffer>(),
                               DictAttrs(dict_attrs));
   }
 
+  void EmitConv2D(const Expr& expr) {
+    const CallNode* clip_call = nullptr;
+    const CallNode* requantize_call = nullptr;
+    const CallNode* bias_add_call = nullptr;
+    const CallNode* conv2d_call = nullptr;
+    auto* final_call = expr.as<CallNode>();
+    auto* final_op = final_call->op.as<OpNode>();
+    if (final_op->name == "clip") {
+      clip_call = final_call;
+      requantize_call = clip_call->args[0].as<CallNode>();
+    } else {
+      requantize_call = final_call;
+    }
+    auto* requantize_input = requantize_call->args[0].as<CallNode>();
+    auto* requantize_input_op = requantize_input->op.as<OpNode>();
+    if (requantize_input_op->name == "nn.bias_add") {
+      bias_add_call = requantize_input;
+      conv2d_call = bias_add_call->args[0].as<CallNode>();
+    } else {
+      conv2d_call = requantize_input;
+    }
+
+    // TIR variables are created in the order they appear in the Relay partitioned function
+    // %1 = qnn.conv2d(%input, %weight_const_0, input_zero_point_scalar,
+    //                 %cmsisnn_multiplier_const_1, %input_scale_scalar, %weight_scale_const_2)
+    // %2 = nn.bias_add(%1, %bias_const_3, axis=3)
+    // %3 = qnn.requantize(%2, %input_scale_const_4, %cmsisnn_shift_const_5,
+    //                     %output_scale_scalar, %output_zero_point_scalar)
+    // clip(%3, a_min=%min_scalar, a_max=%max_scalar)
+    auto in_var = tir::Var("input", DataType::Handle(8));
+    auto const_var0 = tir::Var("filter", DataType::Handle(8));         // weight
+    auto const_var1 = tir::Var("multiplier", DataType::Handle(32));    // quant multiplier
+    auto const_var2 = tir::Var("filter_scale", DataType::Handle(32));  // weight scale
+    auto const_var3 = tir::Var("bias", DataType::Handle(32));          // bias
+    auto const_var4 = tir::Var("input_scale", DataType::Handle(32));   // input_scale * weight_scale
+    auto const_var5 = tir::Var("shift", DataType::Handle(32));         // quant shift
+    auto out_var = tir::Var("output", DataType::Handle(8));
+
+    // Individual arguments to the structs arguments of the CMSIS-NN API are filled into call_extern
+    // https://github.com/ARM-software/CMSIS_5/blob/def6f800f95661eb3451d317f7d0dde504f6020d/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c#L50
+
+    // prepare cmsis_nn_conv_params
+    auto* conv2d_attrs = conv2d_call->attrs.as<Conv2DAttrs>();
+    int32_t input_offset = -GetScalarFromConstant<int32_t>(conv2d_call->args[2]);
+    int32_t output_offset = GetScalarFromConstant<int32_t>(requantize_call->args[4]);
+    int32_t stride_w = qnn::get_const_int(conv2d_attrs->strides[1]);
+    int32_t stride_h = qnn::get_const_int(conv2d_attrs->strides[0]);
+    int32_t padding_w = qnn::get_const_int(conv2d_attrs->padding[1]);
+    int32_t padding_h = qnn::get_const_int(conv2d_attrs->padding[0]);
+    int32_t dilation_w = qnn::get_const_int(conv2d_attrs->dilation[1]);
+    int32_t dilation_h = qnn::get_const_int(conv2d_attrs->dilation[0]);
+    int32_t clip_min, clip_max;
+    if (clip_call) {
+      auto* clip_attrs = clip_call->attrs.as<ClipAttrs>();
+      clip_min = clip_attrs->a_min;
+      clip_max = clip_attrs->a_max;
+    } else {
+      clip_min = -128;
+      clip_max = 127;
+    }
+
+    // cmsis_nn_dims *input_dims
+    auto input_shape = conv2d_call->args[0]->type_as<TensorTypeNode>()->shape;
+    int32_t input_n = qnn::get_const_int(input_shape[0]);
+    int32_t input_h = qnn::get_const_int(input_shape[1]);
+    int32_t input_w = qnn::get_const_int(input_shape[2]);
+    int32_t input_c = qnn::get_const_int(input_shape[3]);
+
+    // cmsis_nn_dims *filter_dims (OHWI)
+    auto filter_shape = conv2d_call->args[1]->type_as<TensorTypeNode>()->shape;
+    int32_t filter_n = qnn::get_const_int(filter_shape[0]);
+    int32_t filter_h = qnn::get_const_int(filter_shape[1]);
+    int32_t filter_w = qnn::get_const_int(filter_shape[2]);
+    int32_t filter_c = qnn::get_const_int(filter_shape[3]);
+
+    // cmsis_nn_dims *bias_dims
+    int32_t bias_n = 1;
+    int32_t bias_h = 1;
+    int32_t bias_w = 1;
+    int32_t bias_c = qnn::get_const_int(filter_shape[0]);
+
+    // cmsis_nn_dims *output_dims
+    auto output_shape = conv2d_call->type_as<TensorTypeNode>()->shape;

Review comment:
       ```suggestion
       Array<PrimExpr> output_shape = conv2d_call->type_as<TensorTypeNode>()->shape;
   ```

##########
File path: src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
##########
@@ -43,23 +44,153 @@ class RelayToTIRVisitor : public MixedModeVisitor {
   inline IntImm ToArg(int32_t value) { return IntImm(DataType::Int(32), value); }
 
   void CreatePrimFuncForExtern(Array<tir::Var> func_signature,
-                               tvm::Array<PrimExpr> call_extern_args) {
+                               tvm::Array<PrimExpr> call_extern_args, int context_buffer_size = 0) {
     Map<String, ObjectRef> dict_attrs;
     dict_attrs.Set("global_symbol", func_name_);
     dict_attrs.Set("tir.noalias", Bool(true));
 
     tir::Stmt body = tir::Evaluate(
         tvm::tir::Call(DataType::Int(8), tir::builtin::call_extern(), call_extern_args));
 
+    if (context_buffer_size) {
+      // TODO(@ashutosh-arm) while supporting MVE, we need to move allocation through TVMBAW
+      tir::Var buffer_var("context_buffer", PointerType(PrimType(DataType::Int(8)), "global"));
+      body = tir::Allocate(buffer_var, DataType::Int(8), {context_buffer_size}, tir::const_true(),
+                           body);
+    }
+
     primfunc_ = tir::PrimFunc(func_signature, body, VoidType(), Map<tir::Var, tir::Buffer>(),
                               DictAttrs(dict_attrs));
   }
 
+  void EmitConv2D(const Expr& expr) {
+    const CallNode* clip_call = nullptr;
+    const CallNode* requantize_call = nullptr;
+    const CallNode* bias_add_call = nullptr;
+    const CallNode* conv2d_call = nullptr;
+    auto* final_call = expr.as<CallNode>();
+    auto* final_op = final_call->op.as<OpNode>();
+    if (final_op->name == "clip") {
+      clip_call = final_call;
+      requantize_call = clip_call->args[0].as<CallNode>();
+    } else {
+      requantize_call = final_call;
+    }
+    auto* requantize_input = requantize_call->args[0].as<CallNode>();
+    auto* requantize_input_op = requantize_input->op.as<OpNode>();
+    if (requantize_input_op->name == "nn.bias_add") {
+      bias_add_call = requantize_input;
+      conv2d_call = bias_add_call->args[0].as<CallNode>();
+    } else {
+      conv2d_call = requantize_input;
+    }
+
+    // TIR variables are created in the order they appear in the Relay partitioned function
+    // %1 = qnn.conv2d(%input, %weight_const_0, input_zero_point_scalar,
+    //                 %cmsisnn_multiplier_const_1, %input_scale_scalar, %weight_scale_const_2)
+    // %2 = nn.bias_add(%1, %bias_const_3, axis=3)
+    // %3 = qnn.requantize(%2, %input_scale_const_4, %cmsisnn_shift_const_5,
+    //                     %output_scale_scalar, %output_zero_point_scalar)
+    // clip(%3, a_min=%min_scalar, a_max=%max_scalar)
+    auto in_var = tir::Var("input", DataType::Handle(8));
+    auto const_var0 = tir::Var("filter", DataType::Handle(8));         // weight
+    auto const_var1 = tir::Var("multiplier", DataType::Handle(32));    // quant multiplier
+    auto const_var2 = tir::Var("filter_scale", DataType::Handle(32));  // weight scale
+    auto const_var3 = tir::Var("bias", DataType::Handle(32));          // bias
+    auto const_var4 = tir::Var("input_scale", DataType::Handle(32));   // input_scale * weight_scale
+    auto const_var5 = tir::Var("shift", DataType::Handle(32));         // quant shift
+    auto out_var = tir::Var("output", DataType::Handle(8));
+
+    // Individual arguments to the structs arguments of the CMSIS-NN API are filled into call_extern
+    // https://github.com/ARM-software/CMSIS_5/blob/def6f800f95661eb3451d317f7d0dde504f6020d/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c#L50
+
+    // prepare cmsis_nn_conv_params
+    auto* conv2d_attrs = conv2d_call->attrs.as<Conv2DAttrs>();
+    int32_t input_offset = -GetScalarFromConstant<int32_t>(conv2d_call->args[2]);
+    int32_t output_offset = GetScalarFromConstant<int32_t>(requantize_call->args[4]);
+    int32_t stride_w = qnn::get_const_int(conv2d_attrs->strides[1]);
+    int32_t stride_h = qnn::get_const_int(conv2d_attrs->strides[0]);
+    int32_t padding_w = qnn::get_const_int(conv2d_attrs->padding[1]);
+    int32_t padding_h = qnn::get_const_int(conv2d_attrs->padding[0]);
+    int32_t dilation_w = qnn::get_const_int(conv2d_attrs->dilation[1]);
+    int32_t dilation_h = qnn::get_const_int(conv2d_attrs->dilation[0]);
+    int32_t clip_min, clip_max;
+    if (clip_call) {
+      auto* clip_attrs = clip_call->attrs.as<ClipAttrs>();
+      clip_min = clip_attrs->a_min;
+      clip_max = clip_attrs->a_max;
+    } else {
+      clip_min = -128;
+      clip_max = 127;
+    }
+
+    // cmsis_nn_dims *input_dims
+    auto input_shape = conv2d_call->args[0]->type_as<TensorTypeNode>()->shape;

Review comment:
       We should try not to use `auto` unless the type gets very long (see https://google.github.io/styleguide/cppguide.html#Type_deduction); it took me a minute to figure out what type this was going to be.
   
   ```suggestion
       Array<PrimExpr> input_shape = conv2d_call->args[0]->type_as<TensorTypeNode>()->shape;
   ```

##########
File path: src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
##########
@@ -43,23 +44,153 @@ class RelayToTIRVisitor : public MixedModeVisitor {
   inline IntImm ToArg(int32_t value) { return IntImm(DataType::Int(32), value); }
 
   void CreatePrimFuncForExtern(Array<tir::Var> func_signature,
-                               tvm::Array<PrimExpr> call_extern_args) {
+                               tvm::Array<PrimExpr> call_extern_args, int context_buffer_size = 0) {
     Map<String, ObjectRef> dict_attrs;
     dict_attrs.Set("global_symbol", func_name_);
     dict_attrs.Set("tir.noalias", Bool(true));
 
     tir::Stmt body = tir::Evaluate(
         tvm::tir::Call(DataType::Int(8), tir::builtin::call_extern(), call_extern_args));
 
+    if (context_buffer_size) {
+      // TODO(@ashutosh-arm) while supporting MVE, we need to move allocation through TVMBAW
+      tir::Var buffer_var("context_buffer", PointerType(PrimType(DataType::Int(8)), "global"));
+      body = tir::Allocate(buffer_var, DataType::Int(8), {context_buffer_size}, tir::const_true(),
+                           body);
+    }
+
     primfunc_ = tir::PrimFunc(func_signature, body, VoidType(), Map<tir::Var, tir::Buffer>(),
                               DictAttrs(dict_attrs));
   }
 
+  void EmitConv2D(const Expr& expr) {
+    const CallNode* clip_call = nullptr;
+    const CallNode* requantize_call = nullptr;
+    const CallNode* bias_add_call = nullptr;
+    const CallNode* conv2d_call = nullptr;
+    auto* final_call = expr.as<CallNode>();
+    auto* final_op = final_call->op.as<OpNode>();
+    if (final_op->name == "clip") {
+      clip_call = final_call;
+      requantize_call = clip_call->args[0].as<CallNode>();
+    } else {
+      requantize_call = final_call;
+    }
+    auto* requantize_input = requantize_call->args[0].as<CallNode>();
+    auto* requantize_input_op = requantize_input->op.as<OpNode>();
+    if (requantize_input_op->name == "nn.bias_add") {
+      bias_add_call = requantize_input;
+      conv2d_call = bias_add_call->args[0].as<CallNode>();
+    } else {
+      conv2d_call = requantize_input;
+    }
+
+    // TIR variables are created in the order they appear in the Relay partitioned function
+    // %1 = qnn.conv2d(%input, %weight_const_0, input_zero_point_scalar,
+    //                 %cmsisnn_multiplier_const_1, %input_scale_scalar, %weight_scale_const_2)
+    // %2 = nn.bias_add(%1, %bias_const_3, axis=3)
+    // %3 = qnn.requantize(%2, %input_scale_const_4, %cmsisnn_shift_const_5,
+    //                     %output_scale_scalar, %output_zero_point_scalar)
+    // clip(%3, a_min=%min_scalar, a_max=%max_scalar)
+    auto in_var = tir::Var("input", DataType::Handle(8));
+    auto const_var0 = tir::Var("filter", DataType::Handle(8));         // weight
+    auto const_var1 = tir::Var("multiplier", DataType::Handle(32));    // quant multiplier
+    auto const_var2 = tir::Var("filter_scale", DataType::Handle(32));  // weight scale
+    auto const_var3 = tir::Var("bias", DataType::Handle(32));          // bias
+    auto const_var4 = tir::Var("input_scale", DataType::Handle(32));   // input_scale * weight_scale
+    auto const_var5 = tir::Var("shift", DataType::Handle(32));         // quant shift
+    auto out_var = tir::Var("output", DataType::Handle(8));
+
+    // Individual arguments to the structs arguments of the CMSIS-NN API are filled into call_extern
+    // https://github.com/ARM-software/CMSIS_5/blob/def6f800f95661eb3451d317f7d0dde504f6020d/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_wrapper_s8.c#L50
+
+    // prepare cmsis_nn_conv_params
+    auto* conv2d_attrs = conv2d_call->attrs.as<Conv2DAttrs>();
+    int32_t input_offset = -GetScalarFromConstant<int32_t>(conv2d_call->args[2]);
+    int32_t output_offset = GetScalarFromConstant<int32_t>(requantize_call->args[4]);
+    int32_t stride_w = qnn::get_const_int(conv2d_attrs->strides[1]);
+    int32_t stride_h = qnn::get_const_int(conv2d_attrs->strides[0]);
+    int32_t padding_w = qnn::get_const_int(conv2d_attrs->padding[1]);
+    int32_t padding_h = qnn::get_const_int(conv2d_attrs->padding[0]);
+    int32_t dilation_w = qnn::get_const_int(conv2d_attrs->dilation[1]);
+    int32_t dilation_h = qnn::get_const_int(conv2d_attrs->dilation[0]);
+    int32_t clip_min, clip_max;
+    if (clip_call) {
+      auto* clip_attrs = clip_call->attrs.as<ClipAttrs>();
+      clip_min = clip_attrs->a_min;
+      clip_max = clip_attrs->a_max;
+    } else {
+      clip_min = -128;
+      clip_max = 127;
+    }
+
+    // cmsis_nn_dims *input_dims
+    auto input_shape = conv2d_call->args[0]->type_as<TensorTypeNode>()->shape;
+    int32_t input_n = qnn::get_const_int(input_shape[0]);
+    int32_t input_h = qnn::get_const_int(input_shape[1]);
+    int32_t input_w = qnn::get_const_int(input_shape[2]);
+    int32_t input_c = qnn::get_const_int(input_shape[3]);
+
+    // cmsis_nn_dims *filter_dims (OHWI)
+    auto filter_shape = conv2d_call->args[1]->type_as<TensorTypeNode>()->shape;
+    int32_t filter_n = qnn::get_const_int(filter_shape[0]);
+    int32_t filter_h = qnn::get_const_int(filter_shape[1]);
+    int32_t filter_w = qnn::get_const_int(filter_shape[2]);
+    int32_t filter_c = qnn::get_const_int(filter_shape[3]);
+
+    // cmsis_nn_dims *bias_dims
+    int32_t bias_n = 1;
+    int32_t bias_h = 1;
+    int32_t bias_w = 1;
+    int32_t bias_c = qnn::get_const_int(filter_shape[0]);
+
+    // cmsis_nn_dims *output_dims
+    auto output_shape = conv2d_call->type_as<TensorTypeNode>()->shape;
+    int32_t output_n = qnn::get_const_int(output_shape[0]);
+    int32_t output_h = qnn::get_const_int(output_shape[1]);
+    int32_t output_w = qnn::get_const_int(output_shape[2]);
+    int32_t output_c = qnn::get_const_int(output_shape[3]);
+
+    tvm::Array<PrimExpr> call_ext_args = {tir::StringImm("arm_convolve_wrapper_s8"), in_var,
+                                          const_var0, const_var1};
+    if (bias_add_call) {
+      call_ext_args.push_back(const_var3);
+    }
+    call_ext_args.push_back(const_var5);
+    call_ext_args.push_back(out_var);
+
+    tvm::Array<PrimExpr> scalar_args = {
+        ToArg(input_offset), ToArg(output_offset), ToArg(stride_w),   ToArg(stride_h),
+        ToArg(padding_w),    ToArg(padding_h),     ToArg(dilation_w), ToArg(dilation_h),
+        ToArg(clip_min),     ToArg(clip_max),      ToArg(input_n),    ToArg(input_h),
+        ToArg(input_w),      ToArg(input_c),       ToArg(filter_n),   ToArg(filter_h),
+        ToArg(filter_w),     ToArg(filter_c),      ToArg(bias_n),     ToArg(bias_h),
+        ToArg(bias_w),       ToArg(bias_c),        ToArg(output_n),   ToArg(output_h),
+        ToArg(output_w),     ToArg(output_c),
+    };
+
+    call_ext_args = tvm::runtime::Concat(call_ext_args, scalar_args);
+
+    Array<tir::Var> func_signature{in_var, const_var0, const_var1, const_var2};
+    if (bias_add_call) {
+      func_signature.push_back(const_var3);
+    }
+    func_signature.push_back(const_var4);
+    func_signature.push_back(const_var5);
+    func_signature.push_back(out_var);
+
+    // https://github.com/ARM-software/CMSIS_5/blob/d788fd583984388553391de18afd8b4d2a146868/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c#L367
+    size_t context_buffer_size = (2 * input_c * filter_w * filter_h) * (int32_t)sizeof(int16_t);
+
+    CreatePrimFuncForExtern(func_signature, call_ext_args, context_buffer_size);
+  }
+
   void EmitSoftMax(const Expr& expr) {
     auto* quantize_call = expr.as<CallNode>();
     auto* softmax_call = quantize_call->args[0].as<CallNode>();
     auto* dequant_call = softmax_call->args[0].as<CallNode>();
-    const float quant_scale = GetScalarFromConstant<float>(dequant_call->args[1]);
+    auto* scale_const = dequant_call->args[1].as<ConstantNode>();

Review comment:
      ```suggestion
      const ConstantNode* scale_const = dequant_call->args[1].as<ConstantNode>();
      ```
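
      (`as<ConstantNode>()` returns a `const` pointer and can be `nullptr`, so spelling the type out makes both properties visible at the call site.)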

##########
File path: src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc
##########
@@ -54,6 +54,162 @@ class CodeGenCMSISNN : public CodeGenC {
   }
 
  private:
+  /*!  * \brief Emit the CMSIS-NN context buffer */
+  void VisitStmt_(const AllocateNode* op) {
+    context_buffer_name_ = op->buffer_var->name_hint;
+    context_buffer_size_ = op->constant_allocation_size();
+    CodeGenC::VisitStmt_(op);
+  }
+
+  /*!  * \brief Emits CMSIS-NN APIs for every call_extern */
+  void VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT(*)
+    if (!op->op.same_as(builtin::call_extern())) {
+      return;
+    }
+    std::string cmsis_func_name = op->args[0].as<StringImmNode>()->value;
+    if (cmsis_func_name == "arm_softmax_s8" || cmsis_func_name == "arm_elementwise_mul_s8" ||
+        cmsis_func_name == "arm_elementwise_add_s8") {
+      CodeGenC::VisitExpr_(op, os);
+    } else if (cmsis_func_name == "arm_convolve_wrapper_s8") {
+      EmitConv2D(op);
+    }
+    return;
+  }
+
+  /*!  * \brief Emits cmsis_nn_context struct */
+  std::string EmitCMSISNNContext(std::ostream& os, std::string buf_name, int buf_size) {
+    std::string struct_name = "context";
+    PrintIndent();
+    os << "cmsis_nn_context " << struct_name << "= {" << buf_name << "," << buf_size << "};\n";
+    return struct_name;
+  }
+
+  /*!  * \brief Emits cmsis_nn_conv_params struct */
+  std::string EmitCMSISNNConvParams(std::ostream& os, int32_t input_offset, int32_t output_offset,
+                                    int32_t stride_w, int32_t stride_h, int32_t padding_w,
+                                    int32_t padding_h, int32_t dilation_w, int32_t dilation_h,
+                                    int32_t clip_min, int32_t clip_max) {
+    std::string struct_name = "conv_params";
+    PrintIndent();
+    os << "cmsis_nn_tile stride = {" << stride_w << "," << stride_h << "};\n";
+    PrintIndent();
+    os << "cmsis_nn_tile padding = {" << padding_w << "," << padding_h << "};\n";
+    PrintIndent();
+    os << "cmsis_nn_tile dilation = {" << dilation_w << "," << dilation_h << "};\n";
+    PrintIndent();
+    os << "cmsis_nn_activation activation = {" << clip_min << "," << clip_max << "};\n";
+    PrintIndent();
+    os << "cmsis_nn_conv_params " << struct_name << " = {" << input_offset << ", " << output_offset
+       << ", stride, padding, dilation, activation};\n";
+    return struct_name;
+  }
+
+  /*!  * \brief Emits cmsis_nn_per_channel_quant_params struct */
+  std::string EmitCMSISNNPerChannelQuantParams(std::ostream& os, std::string multiplier,
+                                               std::string shift) {
+    std::string struct_name = "quant_params";
+    PrintIndent();
+    os << "cmsis_nn_per_channel_quant_params " << struct_name << " = {" << multiplier << ", "
+       << shift << "};\n";
+    return struct_name;
+  }
+
+  /*!  * \brief Emits cmsis_nn_dims struct */
+  std::string EmitCMSISNNDims(std::ostream& os, std::string tensor_type, int32_t n, int32_t h,
+                              int32_t w, int32_t c) {
+    std::string struct_name = tensor_type + "_dims";
+    PrintIndent();
+    os << "cmsis_nn_dims " << struct_name << " = {" << n << "," << h << "," << w << "," << c
+       << "};\n";
+    return struct_name;
+  }
+
+  /*!  * \brief Emits CMSIS-NN APIs for every call_extern */
+  void EmitConv2D(const CallNode* op) {
+    static const int max_num_args = 33;
+    std::string cmsis_func_name = op->args[0].as<StringImmNode>()->value;
+
+    bool bias_enabled = false;
+    if (op->args.size() == max_num_args) {
+      bias_enabled = true;
+    }
+
+    auto get_var_name = [](const CallNode* op, int id) {
+      return op->args[id].as<VarNode>()->name_hint.c_str();
+    };
+    auto get_arg_value = [](const CallNode* op, int id) {
+      return op->args[id].as<IntImmNode>()->value;
+    };
+    int arg_id = 0;
+    std::string input_data = get_var_name(op, ++arg_id);
+    std::string filter_data = get_var_name(op, ++arg_id);
+    std::string multiplier = get_var_name(op, ++arg_id);
+    std::string bias_data("0x0");
+    if (bias_enabled) {
+      bias_data = get_var_name(op, ++arg_id);
+    }
+    std::string shift = get_var_name(op, ++arg_id);
+    std::string output_data = get_var_name(op, ++arg_id);
+
+    int input_offset = get_arg_value(op, ++arg_id);
+    int output_offset = get_arg_value(op, ++arg_id);
+    int stride_w = get_arg_value(op, ++arg_id);
+    int stride_h = get_arg_value(op, ++arg_id);
+    int padding_w = get_arg_value(op, ++arg_id);
+    int padding_h = get_arg_value(op, ++arg_id);
+    int dilation_w = get_arg_value(op, ++arg_id);
+    int dilation_h = get_arg_value(op, ++arg_id);
+    int clip_min = get_arg_value(op, ++arg_id);
+    int clip_max = get_arg_value(op, ++arg_id);
+    int input_n = get_arg_value(op, ++arg_id);
+    int input_h = get_arg_value(op, ++arg_id);
+    int input_w = get_arg_value(op, ++arg_id);
+    int input_c = get_arg_value(op, ++arg_id);
+    int filter_n = get_arg_value(op, ++arg_id);
+    int filter_h = get_arg_value(op, ++arg_id);
+    int filter_w = get_arg_value(op, ++arg_id);
+    int filter_c = get_arg_value(op, ++arg_id);
+    int bias_n = get_arg_value(op, ++arg_id);
+    int bias_h = get_arg_value(op, ++arg_id);
+    int bias_w = get_arg_value(op, ++arg_id);
+    int bias_c = get_arg_value(op, ++arg_id);
+    int output_n = get_arg_value(op, ++arg_id);
+    int output_h = get_arg_value(op, ++arg_id);
+    int output_w = get_arg_value(op, ++arg_id);
+    int output_c = get_arg_value(op, ++arg_id);
+
+    // TODO(ashutosh-arm) for mve code, need to look for tir allocate

Review comment:
       Done now?

##########
File path: src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
##########
@@ -223,19 +353,25 @@ class RelayToTIRVisitor : public MixedModeVisitor {
 
     auto comp_name = func->GetAttr<String>(attr::kComposite);
     if (comp_name.defined()) {
-      if (comp_name == "cmsisnn.quantized_softmax") {
+      if (comp_name == "cmsisnn.qnn_conv2d") {
+        EmitConv2D(func->body);
+      }
+      if (comp_name == "cmsisnn.qnn_softmax") {
         EmitSoftMax(func->body);
       }
-      if (comp_name == "cmsisnn.quantized_mul") {
+      if (comp_name == "cmsisnn.qnn_mul") {
         EmitMul(func->body);
       }
-      if (comp_name == "cmsisnn.quantized_add") {
+      if (comp_name == "cmsisnn.qnn_add") {
         EmitAdd(func->body);
       }
     }
   }
 
  public:
+  int32_t kScaledDiffIntegerBits = 5;

Review comment:
      Do all of these need to be public now? Also, I believe we could mark them as `constexpr`?
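
      A minimal sketch of that, assuming the constants are only needed inside the visitor (only `kScaledDiffIntegerBits` is shown; the others would follow the same pattern):
    ```cpp
    class RelayToTIRVisitor : public MixedModeVisitor {
     private:
      // Known at compile time, so constexpr: no per-instance storage or runtime init.
      static constexpr int32_t kScaledDiffIntegerBits = 5;
    };
    ```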

##########
File path: src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc
##########
@@ -54,6 +54,162 @@ class CodeGenCMSISNN : public CodeGenC {
   }
 
  private:
+  /*!  * \brief Emit the CMSIS-NN context buffer */
+  void VisitStmt_(const AllocateNode* op) {
+    context_buffer_name_ = op->buffer_var->name_hint;
+    context_buffer_size_ = op->constant_allocation_size();
+    CodeGenC::VisitStmt_(op);
+  }
+
+  /*!  * \brief Emits CMSIS-NN APIs for every call_extern */
+  void VisitExpr_(const CallNode* op, std::ostream& os) {  // NOLINT(*)
+    if (!op->op.same_as(builtin::call_extern())) {
+      return;
+    }
+    std::string cmsis_func_name = op->args[0].as<StringImmNode>()->value;
+    if (cmsis_func_name == "arm_softmax_s8" || cmsis_func_name == "arm_elementwise_mul_s8" ||
+        cmsis_func_name == "arm_elementwise_add_s8") {
+      CodeGenC::VisitExpr_(op, os);
+    } else if (cmsis_func_name == "arm_convolve_wrapper_s8") {
+      EmitConv2D(op);
+    }
+    return;
+  }
+
+  /*!  * \brief Emits cmsis_nn_context struct */
+  std::string EmitCMSISNNContext(std::ostream& os, std::string buf_name, int buf_size) {
+    std::string struct_name = "context";
+    PrintIndent();
+    os << "cmsis_nn_context " << struct_name << "= {" << buf_name << "," << buf_size << "};\n";
+    return struct_name;
+  }
+
+  /*!  * \brief Emits cmsis_nn_conv_params struct */
+  std::string EmitCMSISNNConvParams(std::ostream& os, int32_t input_offset, int32_t output_offset,
+                                    int32_t stride_w, int32_t stride_h, int32_t padding_w,
+                                    int32_t padding_h, int32_t dilation_w, int32_t dilation_h,
+                                    int32_t clip_min, int32_t clip_max) {
+    std::string struct_name = "conv_params";
+    PrintIndent();
+    os << "cmsis_nn_tile stride = {" << stride_w << "," << stride_h << "};\n";
+    PrintIndent();
+    os << "cmsis_nn_tile padding = {" << padding_w << "," << padding_h << "};\n";
+    PrintIndent();
+    os << "cmsis_nn_tile dilation = {" << dilation_w << "," << dilation_h << "};\n";
+    PrintIndent();
+    os << "cmsis_nn_activation activation = {" << clip_min << "," << clip_max << "};\n";
+    PrintIndent();
+    os << "cmsis_nn_conv_params " << struct_name << " = {" << input_offset << ", " << output_offset
+       << ", stride, padding, dilation, activation};\n";
+    return struct_name;
+  }
+
+  /*!  * \brief Emits cmsis_nn_per_channel_quant_params struct */
+  std::string EmitCMSISNNPerChannelQuantParams(std::ostream& os, std::string multiplier,
+                                               std::string shift) {
+    std::string struct_name = "quant_params";
+    PrintIndent();
+    os << "cmsis_nn_per_channel_quant_params " << struct_name << " = {" << multiplier << ", "
+       << shift << "};\n";
+    return struct_name;
+  }
+
+  /*!  * \brief Emits cmsis_nn_dims struct */
+  std::string EmitCMSISNNDims(std::ostream& os, std::string tensor_type, int32_t n, int32_t h,
+                              int32_t w, int32_t c) {
+    std::string struct_name = tensor_type + "_dims";
+    PrintIndent();
+    os << "cmsis_nn_dims " << struct_name << " = {" << n << "," << h << "," << w << "," << c
+       << "};\n";
+    return struct_name;
+  }
+
+  /*!  * \brief Emits CMSIS-NN APIs for every call_extern */
+  void EmitConv2D(const CallNode* op) {
+    static const int max_num_args = 33;
+    std::string cmsis_func_name = op->args[0].as<StringImmNode>()->value;
+
+    bool bias_enabled = false;
+    if (op->args.size() == max_num_args) {
+      bias_enabled = true;
+    }
+
+    auto get_var_name = [](const CallNode* op, int id) {
+      return op->args[id].as<VarNode>()->name_hint.c_str();
+    };
+    auto get_arg_value = [](const CallNode* op, int id) {
+      return op->args[id].as<IntImmNode>()->value;
+    };
+    int arg_id = 0;
+    std::string input_data = get_var_name(op, ++arg_id);
+    std::string filter_data = get_var_name(op, ++arg_id);
+    std::string multiplier = get_var_name(op, ++arg_id);
+    std::string bias_data("0x0");
+    if (bias_enabled) {
+      bias_data = get_var_name(op, ++arg_id);
+    }
+    std::string shift = get_var_name(op, ++arg_id);
+    std::string output_data = get_var_name(op, ++arg_id);
+
+    int input_offset = get_arg_value(op, ++arg_id);
+    int output_offset = get_arg_value(op, ++arg_id);
+    int stride_w = get_arg_value(op, ++arg_id);
+    int stride_h = get_arg_value(op, ++arg_id);
+    int padding_w = get_arg_value(op, ++arg_id);
+    int padding_h = get_arg_value(op, ++arg_id);
+    int dilation_w = get_arg_value(op, ++arg_id);
+    int dilation_h = get_arg_value(op, ++arg_id);
+    int clip_min = get_arg_value(op, ++arg_id);
+    int clip_max = get_arg_value(op, ++arg_id);
+    int input_n = get_arg_value(op, ++arg_id);
+    int input_h = get_arg_value(op, ++arg_id);
+    int input_w = get_arg_value(op, ++arg_id);
+    int input_c = get_arg_value(op, ++arg_id);
+    int filter_n = get_arg_value(op, ++arg_id);
+    int filter_h = get_arg_value(op, ++arg_id);
+    int filter_w = get_arg_value(op, ++arg_id);
+    int filter_c = get_arg_value(op, ++arg_id);
+    int bias_n = get_arg_value(op, ++arg_id);
+    int bias_h = get_arg_value(op, ++arg_id);
+    int bias_w = get_arg_value(op, ++arg_id);
+    int bias_c = get_arg_value(op, ++arg_id);
+    int output_n = get_arg_value(op, ++arg_id);
+    int output_h = get_arg_value(op, ++arg_id);
+    int output_w = get_arg_value(op, ++arg_id);
+    int output_c = get_arg_value(op, ++arg_id);
+
+    // TODO(ashutosh-arm) for mve code, need to look for tir allocate
+    std::string context = EmitCMSISNNContext(stream, context_buffer_name_, context_buffer_size_);
+    std::string conv_params =
+        EmitCMSISNNConvParams(stream, input_offset, output_offset, stride_w, stride_h, padding_w,
+                              padding_h, dilation_w, dilation_h, clip_min, clip_max);
+    std::string quant_params = EmitCMSISNNPerChannelQuantParams(stream, multiplier, shift);
+    std::string input_dim = EmitCMSISNNDims(stream, "input", input_n, input_h, input_w, input_c);
+    std::string filter_dim =
+        EmitCMSISNNDims(stream, "filter", filter_n, filter_h, filter_w, filter_c);
+    std::string bias_dim = EmitCMSISNNDims(stream, "bias", bias_n, bias_h, bias_w, bias_c);
+    std::string output_dim =
+        EmitCMSISNNDims(stream, "output", output_n, output_h, output_w, output_c);
+
+    PrintIndent();
+    stream << "arm_status status = ";
+    stream << cmsis_func_name << "(";
+    stream << "&" << context << ", ";
+    stream << "&" << conv_params << ", ";
+    stream << "&" << quant_params << ", ";
+    stream << "&" << input_dim << ", " << input_data << ", ";
+    stream << "&" << filter_dim << ", " << filter_data << ", ";
+    stream << "&" << bias_dim << ", " << bias_data << ", ";
+    stream << "&" << output_dim << ", " << output_data << ");\n";
+    PrintIndent();
+    stream << "if (status != ARM_MATH_SUCCESS) {\n";
+    PrintIndent();
+    PrintIndent();
+    stream << "printf(\"Failed during execution of " << cmsis_func_name << "().\");\n";

Review comment:
      We can't assume there'll be `printf` available; this function should return `0` on success or `-1` on error instead.
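
      For illustration, a sketch of the error path without `printf`, assuming the enclosing generated function returns `int` with 0 for success (the struct and buffer names stand in for the emitted ones):
    ```c
    arm_status status = arm_convolve_wrapper_s8(&context, &conv_params, &quant_params,
                                                &input_dims, input_data,
                                                &filter_dims, filter_data,
                                                &bias_dims, bias_data,
                                                &output_dims, output_data);
    if (status != ARM_MATH_SUCCESS) {
      return -1;  /* no printf on bare-metal targets */
    }
    return 0;
    ```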

##########
File path: src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc
##########
@@ -69,6 +225,10 @@ class CodeGenCMSISNN : public CodeGenC {
     ss << "}\n";
     ss << "#endif\n";
   }
+
+ private:

Review comment:
       Isn't this block still private from above?
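
      A reduced sketch of the point (member names from the diff; everything else trimmed):
    ```cpp
    class CodeGenCMSISNN : public CodeGenC {
     private:
      void EmitConv2D(const CallNode* op);
      // An access specifier applies until the next one appears, so members
      // declared here are still private and a second `private:` label is redundant.
      std::string context_buffer_name_;
      int context_buffer_size_;
    };
    ```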

##########
File path: tests/python/contrib/test_cmsisnn/test_conv2d.py
##########
@@ -0,0 +1,303 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""CMSIS-NN integration tests: Conv2D"""
+import itertools
+import numpy as np
+import pytest
+import tvm
+from tvm import relay
+from tvm.relay.op.contrib import cmsisnn
+
+
+from tests.python.relay.aot.aot_test_utils import (
+    AOTTestModel,
+    AOT_CORSTONE300_RUNNER,
+    AOT_DEFAULT_RUNNER,
+    generate_ref_data,
+    compile_and_run,
+)
+from utils import (
+    skip_if_no_reference_system,
+    make_module,
+    count_num_calls,
+    get_range_for_dtype_str,
+    get_same_padding,
+    get_conv2d_qnn_params,
+    make_qnn_relu,
+)
+
+
+def make_model(
+    shape,
+    kernel_shape,
+    input_zp,
+    input_sc,
+    kernel_zp,
+    kernel_sc,
+    output_zp,
+    output_sc,
+    padding,
+    strides,
+    dilation,
+    groups,
+    dtype,
+    kernel_dtype,
+    out_channels,
+    weight_format,
+    enable_bias,
+    relu_type,
+):
+    """Return a model and any parameters it may have"""
+    h_index = weight_format.index("H")
+    w_index = weight_format.index("W")
+    kernel_h = kernel_shape[h_index]
+    kernel_w = kernel_shape[w_index]
+    a = relay.var("in0", shape=shape, dtype=dtype)
+    p = (0, 0, 0, 0)
+    if padding == "SAME":
+        p = get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides)
+        a = relay.nn.pad(
+            a,
+            pad_width=[(0, 0), (p[0], p[2]), (p[1], p[3]), (0, 0)],
+            pad_value=input_zp,
+            pad_mode="constant",
+        )
+        shape = (shape[0], shape[1] + p[0] + p[2], shape[2] + p[1] + p[3], shape[3])
+
+    weight_shape = (kernel_h, kernel_w, shape[3] // groups, out_channels)
+    w = tvm.nd.array(
+        np.random.randint(
+            np.iinfo(kernel_dtype).min,
+            high=np.iinfo(kernel_dtype).max,
+            size=weight_shape,
+            dtype=kernel_dtype,
+        )
+    )
+    weights = relay.const(w, kernel_dtype)
+    conv = relay.qnn.op.conv2d(
+        a,
+        weights,
+        input_zero_point=relay.const(input_zp, "int32"),
+        kernel_zero_point=relay.const(kernel_zp, "int32"),
+        input_scale=relay.const(input_sc, "float32"),
+        kernel_scale=relay.const(kernel_sc, "float32"),
+        kernel_size=(kernel_h, kernel_w),
+        data_layout="NHWC",
+        kernel_layout=weight_format,
+        dilation=dilation,
+        strides=strides,
+        groups=groups,
+        channels=out_channels,
+        padding=p,
+        out_dtype="int32",
+    )
+    b = tvm.nd.array(np.random.randint(0, high=10, size=(out_channels,), dtype="int32"))
+    bc = relay.const(b, "int32")
+    bias = conv
+    if enable_bias:
+        bias = relay.nn.bias_add(conv, bc, axis=3)

Review comment:
      This logic is a bit weird; it'd be clearer if it didn't have the `bias = conv` placeholder in there, maybe something like:
   ```suggestion
       last_op = relay.nn.bias_add(conv, bc, axis=3) if enable_bias else conv
   ```
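
      (A conditional expression also avoids binding two different Relay expressions to the same name, which keeps the construction easier to follow.)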

##########
File path: tests/python/contrib/test_cmsisnn/test_conv2d.py
##########
@@ -0,0 +1,303 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""CMSIS-NN integration tests: Conv2D"""
+import itertools
+import numpy as np
+import pytest
+import tvm
+from tvm import relay
+from tvm.relay.op.contrib import cmsisnn
+
+
+from tests.python.relay.aot.aot_test_utils import (
+    AOTTestModel,
+    AOT_CORSTONE300_RUNNER,
+    AOT_DEFAULT_RUNNER,
+    generate_ref_data,
+    compile_and_run,
+)
+from utils import (
+    skip_if_no_reference_system,
+    make_module,
+    count_num_calls,
+    get_range_for_dtype_str,
+    get_same_padding,
+    get_conv2d_qnn_params,
+    make_qnn_relu,
+)
+
+
+def make_model(
+    shape,
+    kernel_shape,
+    input_zp,
+    input_sc,
+    kernel_zp,
+    kernel_sc,
+    output_zp,
+    output_sc,
+    padding,
+    strides,
+    dilation,
+    groups,
+    dtype,
+    kernel_dtype,
+    out_channels,
+    weight_format,
+    enable_bias,
+    relu_type,
+):
+    """Return a model and any parameters it may have"""
+    h_index = weight_format.index("H")
+    w_index = weight_format.index("W")
+    kernel_h = kernel_shape[h_index]
+    kernel_w = kernel_shape[w_index]
+    a = relay.var("in0", shape=shape, dtype=dtype)
+    p = (0, 0, 0, 0)
+    if padding == "SAME":
+        p = get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides)
+        a = relay.nn.pad(
+            a,
+            pad_width=[(0, 0), (p[0], p[2]), (p[1], p[3]), (0, 0)],
+            pad_value=input_zp,
+            pad_mode="constant",
+        )
+        shape = (shape[0], shape[1] + p[0] + p[2], shape[2] + p[1] + p[3], shape[3])
+
+    weight_shape = (kernel_h, kernel_w, shape[3] // groups, out_channels)
+    w = tvm.nd.array(
+        np.random.randint(
+            np.iinfo(kernel_dtype).min,
+            high=np.iinfo(kernel_dtype).max,
+            size=weight_shape,
+            dtype=kernel_dtype,
+        )
+    )
+    weights = relay.const(w, kernel_dtype)
+    conv = relay.qnn.op.conv2d(
+        a,
+        weights,
+        input_zero_point=relay.const(input_zp, "int32"),
+        kernel_zero_point=relay.const(kernel_zp, "int32"),
+        input_scale=relay.const(input_sc, "float32"),
+        kernel_scale=relay.const(kernel_sc, "float32"),
+        kernel_size=(kernel_h, kernel_w),
+        data_layout="NHWC",
+        kernel_layout=weight_format,
+        dilation=dilation,
+        strides=strides,
+        groups=groups,
+        channels=out_channels,
+        padding=p,
+        out_dtype="int32",
+    )
+    b = tvm.nd.array(np.random.randint(0, high=10, size=(out_channels,), dtype="int32"))
+    bc = relay.const(b, "int32")
+    bias = conv
+    if enable_bias:
+        bias = relay.nn.bias_add(conv, bc, axis=3)
+    requant_input_sc = [sc * input_sc for sc in kernel_sc]
+    req = relay.qnn.op.requantize(
+        bias,
+        relay.const(requant_input_sc, "float32"),
+        relay.const(0, "int32"),
+        relay.const(output_sc, "float32"),
+        relay.const(output_zp, "int32"),
+        out_dtype=dtype,
+    )
+    relu = make_qnn_relu(req, relu_type, output_sc, output_zp, dtype)
+    params = {"w": w, "b": b}
+    return relu, params
+
+
+@tvm.testing.requires_cmsisnn
+@pytest.mark.parametrize("ifm_shape", [(1, 28, 28, 12), (1, 64, 100, 4)])
+@pytest.mark.parametrize("kernel_size", [(3, 3)])
+@pytest.mark.parametrize("padding", ["SAME", "VALID"])
+@pytest.mark.parametrize("strides, dilation", [((2, 2), (1, 1)), ((1, 1), (1, 1))])
+@pytest.mark.parametrize("enable_bias", [True, False])
+@pytest.mark.parametrize("relu_type", ["NONE", "RELU"])
+@pytest.mark.parametrize(
+    "in_zp, in_sc, k_sc, out_channels",
+    [(10, 0.0128, [0.11, 0.22], 2), (-64, 1, [1, 0.0256, 1.37], 3)],
+)
+def test_op_int8(
+    ifm_shape,
+    kernel_size,
+    padding,
+    strides,
+    dilation,
+    enable_bias,
+    relu_type,
+    in_zp,
+    in_sc,
+    k_sc,
+    out_channels,
+):
+    interface_api = "c"
+    use_unpacked_api = True
+    test_runner = AOT_CORSTONE300_RUNNER
+
+    k_zp = 0
+    groups = 1
+    weight_format = "HWIO"
+    kernel_h = kernel_size[0]
+    kernel_w = kernel_size[1]
+    dtype = "int8"
+    in_min, in_max = get_range_for_dtype_str(dtype)
+
+    weight_shape = None
+    if weight_format == "HWIO":
+        weight_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels)
+    else:
+        weight_shape = (kernel_h, kernel_w, ifm_shape[3], out_channels)
+
+    out_sc, out_zp = get_conv2d_qnn_params(
+        weight_shape, in_sc, in_zp, k_sc, k_zp, dtype, dtype, dtype, False
+    )
+
+    model, params = make_model(
+        ifm_shape,
+        weight_shape,
+        in_zp,
+        in_sc,
+        k_zp,
+        k_sc,
+        out_zp,
+        out_sc,
+        padding,
+        strides,
+        dilation,
+        groups,
+        dtype,
+        dtype,
+        out_channels,
+        weight_format,
+        enable_bias,
+        relu_type,
+    )
+    orig_mod = make_module(model)
+    cmsisnn_mod = cmsisnn.partition_for_cmsisnn(orig_mod, params)
+
+    # validate pattern matching
+    attrs = [
+        cmsisnn_mod[var.name_hint].attrs
+        for var in cmsisnn_mod.get_global_vars()
+        if cmsisnn_mod[var.name_hint].attrs
+    ]
+    assert any(attrs), "At least one function with external attributes was expected."
+
+    compilers = [
+        key == "Compiler" and value == "cmsisnn" for attr in attrs for key, value in attr.items()
+    ]
+    assert any(compilers), "Module does not contain function for cmsisnn target."
+
+    assert count_num_calls(orig_mod) == count_num_calls(
+        cmsisnn_mod
+    ), "Number of calls changed during partitioning"
+
+    # validate the output
+    np.random.seed(0)
+    inputs = {
+        "in0": np.random.randint(in_min, high=in_max, size=ifm_shape, dtype="int8"),
+    }
+    output_list = generate_ref_data(orig_mod["main"], inputs, params)
+    compile_and_run(
+        AOTTestModel(
+            module=cmsisnn_mod,
+            inputs=inputs,
+            outputs=output_list,
+            params=params,
+            output_tolerance=1,
+        ),
+        test_runner,
+        interface_api,
+        use_unpacked_api,
+    )
+
+
+def parameterize_for_invalid_model(test):
+    in_dtype = ["uint8", "int8"]
+    kernel_dtype = ["uint8", "int8"]
+    kernel_zero_point = [-33, 10, 0]
+    all_combinations = itertools.product(in_dtype, kernel_dtype, kernel_zero_point)
+    all_combinations = filter(
+        lambda parameters: not (
+            parameters[0] == "int8" and parameters[1] == "int8" and parameters[2] == 0
+        ),
+        all_combinations,
+    )
+    return pytest.mark.parametrize(
+        ["in_dtype", "kernel_dtype", "kernel_zero_point"],
+        all_combinations,
+    )(test)
+
+
+@parameterize_for_invalid_model
+def test_invalid_parameters(
+    in_dtype,
+    kernel_dtype,
+    kernel_zero_point,
+):
+    ifm_shape = (1, 28, 28, 12)
+    out_channels = 2
+    in_sc = 1
+    in_zp = 24
+    k_sc = [0.11, 0.0237]
+    in_min, in_max = get_range_for_dtype_str(in_dtype)
+
+    kernel_layout = "HWIO"
+    kernel_shape = [3, 3, ifm_shape[3], out_channels]
+    out_sc, out_zp = get_conv2d_qnn_params(
+        kernel_shape, in_sc, in_zp, k_sc, kernel_zero_point, in_dtype, kernel_dtype, in_dtype, False
+    )
+    model, params = make_model(
+        shape=ifm_shape,
+        kernel_shape=kernel_shape,
+        input_zp=in_zp,
+        input_sc=in_sc,
+        kernel_zp=kernel_zero_point,
+        kernel_sc=k_sc,
+        output_zp=out_zp,
+        output_sc=out_sc,
+        padding="SAME",
+        strides=(1, 1),
+        dilation=(1, 1),
+        groups=1,
+        dtype=in_dtype,
+        kernel_dtype=kernel_dtype,
+        out_channels=out_channels,
+        weight_format=kernel_layout,
+        enable_bias=True,
+        relu_type="NONE",
+    )
+    orig_mod = make_module(model)
+    cmsisnn_mod = cmsisnn.partition_for_cmsisnn(orig_mod, params)
+
+    # print(cmsisnn_mod.astext(False))

Review comment:
       `print` snuck in?

##########
File path: tests/python/contrib/test_cmsisnn/test_networks.py
##########
@@ -92,7 +92,6 @@ def test_cnn_small():
 
     orig_mod, params = convert_to_relay(tflite_model_buf, input_data, "input")
     cmsisnn_mod = cmsisnn.partition_for_cmsisnn(orig_mod, params)
-

Review comment:
      Any particular reason to remove this line?

##########
File path: tests/python/contrib/test_cmsisnn/test_conv2d.py
##########
@@ -0,0 +1,303 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""CMSIS-NN integration tests: Conv2D"""
+import itertools
+import numpy as np
+import pytest
+import tvm
+from tvm import relay
+from tvm.relay.op.contrib import cmsisnn
+
+
+from tests.python.relay.aot.aot_test_utils import (
+    AOTTestModel,
+    AOT_CORSTONE300_RUNNER,
+    AOT_DEFAULT_RUNNER,
+    generate_ref_data,
+    compile_and_run,
+)
+from utils import (
+    skip_if_no_reference_system,
+    make_module,
+    count_num_calls,
+    get_range_for_dtype_str,
+    get_same_padding,
+    get_conv2d_qnn_params,
+    make_qnn_relu,
+)
+
+
+def make_model(
+    shape,
+    kernel_shape,
+    input_zp,
+    input_sc,
+    kernel_zp,
+    kernel_sc,
+    output_zp,
+    output_sc,
+    padding,
+    strides,
+    dilation,
+    groups,
+    dtype,
+    kernel_dtype,
+    out_channels,
+    weight_format,
+    enable_bias,
+    relu_type,
+):
+    """Return a model and any parameters it may have"""
+    h_index = weight_format.index("H")
+    w_index = weight_format.index("W")
+    kernel_h = kernel_shape[h_index]
+    kernel_w = kernel_shape[w_index]
+    a = relay.var("in0", shape=shape, dtype=dtype)
+    p = (0, 0, 0, 0)
+    if padding == "SAME":
+        p = get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides)
+        a = relay.nn.pad(
+            a,
+            pad_width=[(0, 0), (p[0], p[2]), (p[1], p[3]), (0, 0)],
+            pad_value=input_zp,
+            pad_mode="constant",
+        )
+        shape = (shape[0], shape[1] + p[0] + p[2], shape[2] + p[1] + p[3], shape[3])
+
+    weight_shape = (kernel_h, kernel_w, shape[3] // groups, out_channels)
+    w = tvm.nd.array(
+        np.random.randint(
+            np.iinfo(kernel_dtype).min,
+            high=np.iinfo(kernel_dtype).max,
+            size=weight_shape,
+            dtype=kernel_dtype,
+        )
+    )
+    weights = relay.const(w, kernel_dtype)
+    conv = relay.qnn.op.conv2d(
+        a,
+        weights,
+        input_zero_point=relay.const(input_zp, "int32"),
+        kernel_zero_point=relay.const(kernel_zp, "int32"),
+        input_scale=relay.const(input_sc, "float32"),
+        kernel_scale=relay.const(kernel_sc, "float32"),
+        kernel_size=(kernel_h, kernel_w),
+        data_layout="NHWC",
+        kernel_layout=weight_format,
+        dilation=dilation,
+        strides=strides,
+        groups=groups,
+        channels=out_channels,
+        padding=p,
+        out_dtype="int32",
+    )
+    b = tvm.nd.array(np.random.randint(0, high=10, size=(out_channels,), dtype="int32"))
+    bc = relay.const(b, "int32")
+    bias = conv
+    if enable_bias:
+        bias = relay.nn.bias_add(conv, bc, axis=3)
+    requant_input_sc = [sc * input_sc for sc in kernel_sc]
+    req = relay.qnn.op.requantize(
+        bias,
+        relay.const(requant_input_sc, "float32"),
+        relay.const(0, "int32"),
+        relay.const(output_sc, "float32"),
+        relay.const(output_zp, "int32"),
+        out_dtype=dtype,
+    )
+    relu = make_qnn_relu(req, relu_type, output_sc, output_zp, dtype)
+    params = {"w": w, "b": b}
+    return relu, params
+
+
+@tvm.testing.requires_cmsisnn
+@pytest.mark.parametrize("ifm_shape", [(1, 28, 28, 12), (1, 64, 100, 4)])
+@pytest.mark.parametrize("kernel_size", [(3, 3)])
+@pytest.mark.parametrize("padding", ["SAME", "VALID"])
+@pytest.mark.parametrize("strides, dilation", [((2, 2), (1, 1)), ((1, 1), (1, 1))])
+@pytest.mark.parametrize("enable_bias", [True, False])
+@pytest.mark.parametrize("relu_type", ["NONE", "RELU"])
+@pytest.mark.parametrize(
+    "in_zp, in_sc, k_sc, out_channels",
+    [(10, 0.0128, [0.11, 0.22], 2), (-64, 1, [1, 0.0256, 1.37], 3)],
+)
+def test_op_int8(
+    ifm_shape,
+    kernel_size,
+    padding,
+    strides,
+    dilation,
+    enable_bias,
+    relu_type,
+    in_zp,
+    in_sc,
+    k_sc,
+    out_channels,
+):
+    interface_api = "c"
+    use_unpacked_api = True
+    test_runner = AOT_CORSTONE300_RUNNER
+
+    k_zp = 0

Review comment:
      ```suggestion
      zero_point = 0
      ```

##########
File path: tests/python/contrib/test_cmsisnn/test_conv2d.py
##########
@@ -0,0 +1,303 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""CMSIS-NN integration tests: Conv2D"""
+import itertools
+import numpy as np
+import pytest
+import tvm
+from tvm import relay
+from tvm.relay.op.contrib import cmsisnn
+
+
+from tests.python.relay.aot.aot_test_utils import (
+    AOTTestModel,
+    AOT_CORSTONE300_RUNNER,
+    AOT_DEFAULT_RUNNER,
+    generate_ref_data,
+    compile_and_run,
+)
+from utils import (
+    skip_if_no_reference_system,
+    make_module,
+    count_num_calls,
+    get_range_for_dtype_str,
+    get_same_padding,
+    get_conv2d_qnn_params,
+    make_qnn_relu,
+)
+
+
+def make_model(
+    shape,
+    kernel_shape,
+    input_zp,
+    input_sc,
+    kernel_zp,
+    kernel_sc,
+    output_zp,
+    output_sc,
+    padding,
+    strides,
+    dilation,
+    groups,
+    dtype,
+    kernel_dtype,
+    out_channels,
+    weight_format,
+    enable_bias,
+    relu_type,
+):
+    """Return a model and any parameters it may have"""
+    h_index = weight_format.index("H")
+    w_index = weight_format.index("W")
+    kernel_h = kernel_shape[h_index]
+    kernel_w = kernel_shape[w_index]
+    a = relay.var("in0", shape=shape, dtype=dtype)
+    p = (0, 0, 0, 0)
+    if padding == "SAME":
+        p = get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides)
+        a = relay.nn.pad(
+            a,
+            pad_width=[(0, 0), (p[0], p[2]), (p[1], p[3]), (0, 0)],
+            pad_value=input_zp,
+            pad_mode="constant",
+        )
+        shape = (shape[0], shape[1] + p[0] + p[2], shape[2] + p[1] + p[3], shape[3])
+
+    weight_shape = (kernel_h, kernel_w, shape[3] // groups, out_channels)
+    w = tvm.nd.array(
+        np.random.randint(
+            np.iinfo(kernel_dtype).min,
+            high=np.iinfo(kernel_dtype).max,
+            size=weight_shape,
+            dtype=kernel_dtype,
+        )
+    )
+    weights = relay.const(w, kernel_dtype)
+    conv = relay.qnn.op.conv2d(
+        a,
+        weights,
+        input_zero_point=relay.const(input_zp, "int32"),
+        kernel_zero_point=relay.const(kernel_zp, "int32"),
+        input_scale=relay.const(input_sc, "float32"),
+        kernel_scale=relay.const(kernel_sc, "float32"),
+        kernel_size=(kernel_h, kernel_w),
+        data_layout="NHWC",
+        kernel_layout=weight_format,
+        dilation=dilation,
+        strides=strides,
+        groups=groups,
+        channels=out_channels,
+        padding=p,
+        out_dtype="int32",
+    )
+    b = tvm.nd.array(np.random.randint(0, high=10, size=(out_channels,), dtype="int32"))
+    bc = relay.const(b, "int32")
+    bias = conv
+    if enable_bias:
+        bias = relay.nn.bias_add(conv, bc, axis=3)
+    requant_input_sc = [sc * input_sc for sc in kernel_sc]
+    req = relay.qnn.op.requantize(
+        bias,
+        relay.const(requant_input_sc, "float32"),
+        relay.const(0, "int32"),
+        relay.const(output_sc, "float32"),
+        relay.const(output_zp, "int32"),
+        out_dtype=dtype,
+    )
+    relu = make_qnn_relu(req, relu_type, output_sc, output_zp, dtype)
+    params = {"w": w, "b": b}
+    return relu, params
+
+
+@tvm.testing.requires_cmsisnn
+@pytest.mark.parametrize("ifm_shape", [(1, 28, 28, 12), (1, 64, 100, 4)])
+@pytest.mark.parametrize("kernel_size", [(3, 3)])
+@pytest.mark.parametrize("padding", ["SAME", "VALID"])
+@pytest.mark.parametrize("strides, dilation", [((2, 2), (1, 1)), ((1, 1), (1, 1))])
+@pytest.mark.parametrize("enable_bias", [True, False])
+@pytest.mark.parametrize("relu_type", ["NONE", "RELU"])
+@pytest.mark.parametrize(
+    "in_zp, in_sc, k_sc, out_channels",
+    [(10, 0.0128, [0.11, 0.22], 2), (-64, 1, [1, 0.0256, 1.37], 3)],
+)
+def test_op_int8(
+    ifm_shape,
+    kernel_size,
+    padding,
+    strides,
+    dilation,
+    enable_bias,
+    relu_type,
+    in_zp,
+    in_sc,
+    k_sc,
+    out_channels,
+):
+    interface_api = "c"
+    use_unpacked_api = True
+    test_runner = AOT_CORSTONE300_RUNNER
+
+    k_zp = 0
+    groups = 1
+    weight_format = "HWIO"
+    kernel_h = kernel_size[0]
+    kernel_w = kernel_size[1]
+    dtype = "int8"
+    in_min, in_max = get_range_for_dtype_str(dtype)
+
+    weight_shape = None
+    if weight_format == "HWIO":
+        weight_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels)
+    else:
+        weight_shape = (kernel_h, kernel_w, ifm_shape[3], out_channels)
+
+    out_sc, out_zp = get_conv2d_qnn_params(
+        weight_shape, in_sc, in_zp, k_sc, k_zp, dtype, dtype, dtype, False
+    )
+
+    model, params = make_model(
+        ifm_shape,
+        weight_shape,
+        in_zp,
+        in_sc,
+        k_zp,
+        k_sc,
+        out_zp,
+        out_sc,
+        padding,
+        strides,
+        dilation,
+        groups,
+        dtype,
+        dtype,
+        out_channels,
+        weight_format,
+        enable_bias,
+        relu_type,
+    )
+    orig_mod = make_module(model)
+    cmsisnn_mod = cmsisnn.partition_for_cmsisnn(orig_mod, params)
+
+    # validate pattern matching
+    attrs = [
+        cmsisnn_mod[var.name_hint].attrs
+        for var in cmsisnn_mod.get_global_vars()
+        if cmsisnn_mod[var.name_hint].attrs
+    ]
+    assert any(attrs), "At least one function with external attributes was expected."
+
+    compilers = [
+        key == "Compiler" and value == "cmsisnn" for attr in attrs for key, value in attr.items()
+    ]
+    assert any(compilers), "Module does not contain function for cmsisnn target."
+
+    assert count_num_calls(orig_mod) == count_num_calls(
+        cmsisnn_mod
+    ), "Number of calls changed during partitioning"
+
+    # validate the output
+    np.random.seed(0)

Review comment:
      This pollutes the global `numpy` random state; if we want to fix the seed, we should use [the numpy Random Generator](https://numpy.org/doc/stable/reference/random/generator.html) to isolate it.

      Also, maybe `42` rather than `0`? :smile_cat:
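
      Something like this, as a sketch (`Generator.integers` excludes `high` by default, matching the `np.random.randint` call above):
    ```python
    rng = np.random.default_rng(42)  # isolated generator; leaves global state untouched
    inputs = {
        "in0": rng.integers(in_min, high=in_max, size=ifm_shape, dtype="int8"),
    }
    ```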



