You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain-text copy.
Posted to commits@tvm.apache.org by ma...@apache.org on 2021/11/26 09:31:25 UTC
[tvm] branch main updated: [5/10] Code generation for Depthwise Convolution via CMSIS-NN (#9409)

This is an automated email from the ASF dual-hosted git repository.

manupa pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new 52edc9a  [5/10] Code generation for Depthwise Convolution via CMSIS-NN (#9409)
52edc9a is described below

commit 52edc9a4fe230cd533e6ee3511924af21adecb07
Author: Ashutosh Parkhi <86...@users.noreply.github.com>
AuthorDate: Fri Nov 26 09:31:02 2021 +0000

    [5/10] Code generation for Depthwise Convolution via CMSIS-NN (#9409)
    
    This PR adds support for depthwise convolution via CMSIS-NN.
---
 python/tvm/relay/op/contrib/cmsisnn.py             |  11 +
 .../backend/contrib/cmsisnn/generate_constants.cc  |  21 +-
 src/relay/backend/contrib/cmsisnn/relay_to_tir.cc  |  43 ++--
 .../backend/contrib/cmsisnn/tir_to_runtime.cc      | 222 +++++++++++++--------
 tests/python/contrib/test_cmsisnn/test_conv2d.py   | 147 ++++++++++++--
 5 files changed, 328 insertions(+), 116 deletions(-)

diff --git a/python/tvm/relay/op/contrib/cmsisnn.py b/python/tvm/relay/op/contrib/cmsisnn.py
index 34efb1d..5e0ad27 100644
--- a/python/tvm/relay/op/contrib/cmsisnn.py
+++ b/python/tvm/relay/op/contrib/cmsisnn.py
@@ -123,6 +123,16 @@ def pattern_table():
         kernel_zp = conv2d.args[3].data.numpy()
         kernel_zp = [kernel_zp] if kernel_zp.ndim == 0 else kernel_zp
 
+        # check if depthwise Conv2D
+        kernel_layout = conv2d.attrs.kernel_layout
+        pos_o = kernel_layout.index("O")
+        groups = conv2d.attrs.groups
+        is_depthwise = False
+        if groups == int(conv2d_input.checked_type.shape[3]) and groups == int(
+            conv2d_weight.checked_type.shape[pos_o]
+        ):
+            is_depthwise = True
+
         return (
             conv2d.attrs.out_dtype == "int32"
             and conv2d.attrs.padding[2] == 0
@@ -132,6 +142,7 @@ def pattern_table():
             and pattern.checked_type.dtype == "int8"
             and bias_dtype == "int32"
             and all([zp == 0 for zp in kernel_zp])
+            and (not is_depthwise or bias_add is not None)
         )
 
     def binary_op_pattern(op):
diff --git a/src/relay/backend/contrib/cmsisnn/generate_constants.cc b/src/relay/backend/contrib/cmsisnn/generate_constants.cc
index 0231e8b..2e12697 100644
--- a/src/relay/backend/contrib/cmsisnn/generate_constants.cc
+++ b/src/relay/backend/contrib/cmsisnn/generate_constants.cc
@@ -105,11 +105,20 @@ class GenerateConstantsMutator : public MixedModeMutator {
       conv2d_call = requantize_input;
     }
 
-    // Transpose weights: HWIO -> OHWI
     auto* conv2d_attrs = conv2d_call->attrs.as<Conv2DAttrs>();
-    tvm::Attrs new_conv2d_attrs;
-    Expr transposed_kernel =
-        ConvertKernelLayout(conv2d_call->args[1], conv2d_attrs, &new_conv2d_attrs);
+    tvm::Attrs new_conv2d_attrs = conv2d_call->attrs;
+    Expr conv2d_kernel = conv2d_call->args[1];
+
+    Array<PrimExpr> input_shape = conv2d_call->args[0]->type_as<TensorTypeNode>()->shape;
+    Array<PrimExpr> kernel_shape = conv2d_call->args[1]->type_as<TensorTypeNode>()->shape;
+    std::string kernel_layout = conv2d_attrs->kernel_layout.c_str();
+    int kernel_pos_o = kernel_layout.find("O");
+    int groups = conv2d_attrs->groups;
+    if (groups != qnn::get_const_int(input_shape[3]) ||
+        groups != qnn::get_const_int(kernel_shape[kernel_pos_o])) {
+      // Transpose weights: HWIO -> OHWI for Conv2D
+      conv2d_kernel = ConvertKernelLayout(conv2d_call->args[1], conv2d_attrs, &new_conv2d_attrs);
+    }
 
     // Obtain input and output scales from Relay's Requantization
     int64_t out_channels = conv2d_attrs->channels.as<IntImmNode>()->value;
@@ -153,11 +162,11 @@ class GenerateConstantsMutator : public MixedModeMutator {
       req_inp_scale = Constant(req_inp_scale_nda);
     }
 
-    // Replace existing weights (HWIO) with the transposed ones (OHWI)
+    // Replace existing weights (HWIO) with the transposed ones (OHWI) for Conv2D
     // Substitute Conv2D weight_zero_point with the CMSIS-NN multiplier
     // Substitute Requantize input_zero_point with CMSIS-NN shift
     // Conv2D arguments: data, weight, input_zp, weight_zp, input_sc, weight_sc
-    Array<Expr> conv2d_args = {conv2d_call->args[0], transposed_kernel,    conv2d_call->args[2],
+    Array<Expr> conv2d_args = {conv2d_call->args[0], conv2d_kernel,        conv2d_call->args[2],
                                multiplier_const,     conv2d_call->args[4], weight_scale};
     Call ret_call = Call(conv2d_call->op, conv2d_args, new_conv2d_attrs, {});
     if (bias_add_call) {
diff --git a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
index 1b639dd..6683527 100644
--- a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
+++ b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
@@ -146,6 +146,9 @@ class RelayToTIRVisitor : public MixedModeMutator {
     int32_t padding_h = qnn::get_const_int(conv2d_attrs->padding[0]);
     int32_t dilation_w = qnn::get_const_int(conv2d_attrs->dilation[1]);
     int32_t dilation_h = qnn::get_const_int(conv2d_attrs->dilation[0]);
+    int32_t out_channels = qnn::get_const_int(conv2d_attrs->channels);
+    int32_t groups = conv2d_attrs->groups;
+    std::string kernel_layout = conv2d_attrs->kernel_layout.c_str();
     int32_t clip_min, clip_max;
     if (clip_call) {
       const ClipAttrs* clip_attrs = clip_call->attrs.as<ClipAttrs>();
@@ -156,14 +159,6 @@ class RelayToTIRVisitor : public MixedModeMutator {
       clip_max = 127;
     }
 
-    tvm::Array<PrimExpr> call_ext_args = {tir::StringImm("arm_convolve_wrapper_s8"), input, filter,
-                                          multiplier};
-    if (bias_add_call) {
-      call_ext_args.push_back(bias);
-    }
-    call_ext_args.push_back(shift);
-    call_ext_args.push_back(output);
-
     tvm::Array<PrimExpr> scalar_args = {ToArg(input_offset), ToArg(output_offset), ToArg(stride_w),
                                         ToArg(stride_h),     ToArg(padding_w),     ToArg(padding_h),
                                         ToArg(dilation_w),   ToArg(dilation_h),    ToArg(clip_min),
@@ -173,18 +168,42 @@ class RelayToTIRVisitor : public MixedModeMutator {
     Array<PrimExpr> input_shape = conv2d_call->args[0]->type_as<TensorTypeNode>()->shape;
     Array<PrimExpr> input_dims = CMSISNNDimensions(input_shape);
 
-    // cmsis_nn_dims *filter_dims (OHWI)
+    // cmsis_nn_dims *filter_dims (OHWI for Conv2D and IHWO for depthwise)
     Array<PrimExpr> filter_shape = conv2d_call->args[1]->type_as<TensorTypeNode>()->shape;
     Array<PrimExpr> filter_dims = CMSISNNDimensions(filter_shape);
 
-    // cmsis_nn_dims *bias_dims (1,1,1,output_channels)
-    Array<PrimExpr> bias_shape{1, 1, 1, filter_shape[0]};
+    // cmsis_nn_dims *bias_dims
+    Array<PrimExpr> bias_shape{1, 1, 1, out_channels};
     Array<PrimExpr> bias_dims = CMSISNNDimensions(bias_shape);
 
-    // cmsis_nn_dims *output_dims (NHWC)
+    // cmsis_nn_dims *output_dims (same order as input_dims)
     Array<PrimExpr> output_shape = conv2d_call->type_as<TensorTypeNode>()->shape;
     Array<PrimExpr> output_dims = CMSISNNDimensions(output_shape);
 
+    int32_t depth_multiplier = -1;
+    int kernel_pos_o = kernel_layout.find("O");
+    if (groups == qnn::get_const_int(input_shape[3]) &&
+        groups == qnn::get_const_int(filter_shape[kernel_pos_o])) {
+      int kernel_pos_i = kernel_layout.find("I");
+      depth_multiplier = qnn::get_const_int(filter_shape[kernel_pos_i]);
+    }
+    scalar_args.push_back(ToArg(depth_multiplier));
+
+    // original filter_layout for depthwise is HWOI
+    std::string cmsisnn_api = "arm_convolve_wrapper_s8";
+    if (depth_multiplier != -1) {
+      cmsisnn_api = "arm_depthwise_conv_wrapper_s8";
+      Array<PrimExpr> depthwise_filter_shape{1, filter_shape[0], filter_shape[1], out_channels};
+      filter_dims = CMSISNNDimensions(depthwise_filter_shape);
+    }
+
+    tvm::Array<PrimExpr> call_ext_args = {tir::StringImm(cmsisnn_api), input, filter, multiplier};
+    if (bias_add_call) {
+      call_ext_args.push_back(bias);
+    }
+    call_ext_args.push_back(shift);
+    call_ext_args.push_back(output);
+
     // https://github.com/ARM-software/CMSIS_5/blob/d788fd583984388553391de18afd8b4d2a146868/CMSIS/NN/Source/ConvolutionFunctions/arm_convolve_s8.c#L367
     std::string context_buffer_name = "NULL";
     size_t context_buffer_size =
diff --git a/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc b/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc
index b243af6..85923b3 100644
--- a/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc
+++ b/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc
@@ -39,7 +39,6 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost {
     decl_stream << "#include <stdio.h>\n";
     decl_stream << "#include <stdlib.h>\n";
     decl_stream << "#include <dlpack/dlpack.h>\n";
-    decl_stream << "#include <tvm/runtime/crt/module.h>\n";
     decl_stream << "#include <arm_nnfunctions.h>\n";
     decl_stream << "#include <arm_nn_types.h>\n";
     CodeGenCHost::Init(output_ssa, emit_asserts, target_str);
@@ -53,6 +52,35 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost {
   void AddFunction(const PrimFunc& prim_func) { CodeGenC::AddFunction(prim_func); }
 
  private:
+  /*!  * \brief CMSIS-NN context buffer info */
+  struct CMSISNNContextBuffer {
+    std::string name;
+    int size;
+  };
+
+  /*!  * \brief CMSIS-NN buffer dimensions */
+  struct CMSISNNDims {
+    int n;
+    int h;
+    int w;
+    int c;
+  };
+
+  /*!  * \brief CMSIS-NN Conv2D and Depthwise parameters */
+  struct Conv2DParams {
+    int input_offset;
+    int output_offset;
+    int stride_w;
+    int stride_h;
+    int padding_w;
+    int padding_h;
+    int dilation_w;
+    int dilation_h;
+    int clip_min;
+    int clip_max;
+    int depth_multiplier;
+  };
+
   /*!  * \brief Emit the CMSIS-NN context buffer */
   void VisitStmt_(const AllocateNode* op) {
     context_buffer_name_ = op->buffer_var->name_hint;
@@ -70,38 +98,46 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost {
     if (cmsis_func_name == "arm_softmax_s8" || cmsis_func_name == "arm_elementwise_mul_s8" ||
         cmsis_func_name == "arm_elementwise_add_s8") {
       CodeGenC::VisitExpr_(op, os);
-    } else if (cmsis_func_name == "arm_convolve_wrapper_s8") {
+    } else if (cmsis_func_name == "arm_convolve_wrapper_s8" ||
+               cmsis_func_name == "arm_depthwise_conv_wrapper_s8") {
       EmitConv2D(op);
     }
     return;
   }
 
   /*!  * \brief Emits cmsis_nn_context struct */
-  std::string EmitCMSISNNContext(std::ostream& os, std::string buf_name, int buf_size) {
+  std::string EmitCMSISNNContext(std::ostream& os, CMSISNNContextBuffer context_buffer) {
     std::string struct_name = "context";
     PrintIndent();
-    os << "cmsis_nn_context " << struct_name << "= {" << buf_name << "," << buf_size << "};\n";
+    os << "cmsis_nn_context " << struct_name << "= {" << context_buffer.name << ","
+       << context_buffer.size << "};\n";
     return struct_name;
   }
 
   /*!  * \brief Emits cmsis_nn_conv_params struct */
-  std::string EmitCMSISNNConvParams(std::ostream& os, int32_t input_offset, int32_t output_offset,
-                                    int32_t stride_w, int32_t stride_h, int32_t padding_w,
-                                    int32_t padding_h, int32_t dilation_w, int32_t dilation_h,
-                                    int32_t clip_min, int32_t clip_max) {
-    std::string struct_name = "conv_params";
+  std::string EmitCMSISNNConvParams(std::ostream& os, Conv2DParams params) {
+    std::string struct_name = "cmsis_nn_conv_params";
+    std::string instance_name = "conv_params";
+    if (params.depth_multiplier != -1) {
+      struct_name = "cmsis_nn_dw_conv_params";
+    }
     PrintIndent();
-    os << "cmsis_nn_tile stride = {" << stride_w << "," << stride_h << "};\n";
+    os << "cmsis_nn_tile stride = {" << params.stride_w << "," << params.stride_h << "};\n";
     PrintIndent();
-    os << "cmsis_nn_tile padding = {" << padding_w << "," << padding_h << "};\n";
+    os << "cmsis_nn_tile padding = {" << params.padding_w << "," << params.padding_h << "};\n";
     PrintIndent();
-    os << "cmsis_nn_tile dilation = {" << dilation_w << "," << dilation_h << "};\n";
+    os << "cmsis_nn_tile dilation = {" << params.dilation_w << "," << params.dilation_h << "};\n";
     PrintIndent();
-    os << "cmsis_nn_activation activation = {" << clip_min << "," << clip_max << "};\n";
+    os << "cmsis_nn_activation activation = {" << params.clip_min << "," << params.clip_max
+       << "};\n";
     PrintIndent();
-    os << "cmsis_nn_conv_params " << struct_name << " = {" << input_offset << ", " << output_offset
-       << ", stride, padding, dilation, activation};\n";
-    return struct_name;
+    os << struct_name << " " << instance_name << " = {" << params.input_offset << ", "
+       << params.output_offset;
+    if (params.depth_multiplier != -1) {
+      os << ", " << params.depth_multiplier;
+    }
+    os << ", stride, padding, dilation, activation};\n";
+    return instance_name;
   }
 
   /*!  * \brief Emits cmsis_nn_per_channel_quant_params struct */
@@ -115,83 +151,109 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost {
   }
 
   /*!  * \brief Emits cmsis_nn_dims struct */
-  std::string EmitCMSISNNDims(std::ostream& os, std::string tensor_type, int32_t n, int32_t h,
-                              int32_t w, int32_t c) {
+  std::string EmitCMSISNNDims(std::ostream& os, std::string tensor_type, CMSISNNDims dims) {
     std::string struct_name = tensor_type + "_dims";
     PrintIndent();
-    os << "cmsis_nn_dims " << struct_name << " = {" << n << "," << h << "," << w << "," << c
-       << "};\n";
+    os << "cmsis_nn_dims " << struct_name << " = {" << dims.n << "," << dims.h << "," << dims.w
+       << "," << dims.c << "};\n";
     return struct_name;
   }
 
+  /*!  * \brief Deduces variable name from call_extern argument resting at id */
+  std::string VarNameFromArg(const CallNode* op, int id) {
+    return op->args[id].as<VarNode>()->name_hint.c_str();
+  }
+
+  /*!  * \brief Deduces value from call_extern argument resting at id */
+  int ValueFromArg(const CallNode* op, int id) { return op->args[id].as<IntImmNode>()->value; }
+
+  /*!  * \brief extracts CMSIS-NN context buffer information */
+  CMSISNNContextBuffer extract_context_buffer_info(const CallNode* op, int base_pos) {
+    CMSISNNContextBuffer context_buffer;
+    context_buffer.name = op->args[base_pos].as<StringImmNode>()->value;
+    context_buffer.size = ValueFromArg(op, base_pos + 1);
+    return context_buffer;
+  }
+
+  /*!  * \brief extracts CMSIS-NN conv2d parameters from call_extern */
+  Conv2DParams extract_conv2d_params(const CallNode* op, int base_pos) {
+    Conv2DParams conv2d_params;
+    conv2d_params.input_offset = ValueFromArg(op, base_pos);
+    conv2d_params.output_offset = ValueFromArg(op, ++base_pos);
+    conv2d_params.stride_w = ValueFromArg(op, ++base_pos);
+    conv2d_params.stride_h = ValueFromArg(op, ++base_pos);
+    conv2d_params.padding_w = ValueFromArg(op, ++base_pos);
+    conv2d_params.padding_h = ValueFromArg(op, ++base_pos);
+    conv2d_params.dilation_w = ValueFromArg(op, ++base_pos);
+    conv2d_params.dilation_h = ValueFromArg(op, ++base_pos);
+    conv2d_params.clip_min = ValueFromArg(op, ++base_pos);
+    conv2d_params.clip_max = ValueFromArg(op, ++base_pos);
+    conv2d_params.depth_multiplier = ValueFromArg(op, ++base_pos);
+    return conv2d_params;
+  }
+
+  /*!  * \brief extracts CMSIS-NN buffer dimensions from call_extern */
+  CMSISNNDims extract_buffer_dims(const CallNode* op, int base_pos) {
+    CMSISNNDims dims;
+    dims.n = ValueFromArg(op, base_pos);
+    dims.h = ValueFromArg(op, ++base_pos);
+    dims.w = ValueFromArg(op, ++base_pos);
+    dims.c = ValueFromArg(op, ++base_pos);
+    return dims;
+  }
+
   /*!  * \brief Emits CMSIS-NN APIs for every call_extern */
   void EmitConv2D(const CallNode* op) {
-    static const int max_num_args = 35;
-    std::string cmsis_func_name = op->args[0].as<StringImmNode>()->value;
+    // Position of various arguments relative to buffers in the call_extern
+    enum CallExternArgPos {
+      CONTEXT_BUFFER_POS = 1,
+      CONV2D_PARAMS_POS = 3,
+      INPUT_DIM_POS = 14,
+      FILTER_DIM_POS = 18,
+      BIAS_DIM_POS = 22,
+      OUTPUT_DIM_POS = 26,
+      MAX_NUM_ARGS = 36
+    };
 
-    bool bias_enabled = false;
-    if (op->args.size() == max_num_args) {
-      bias_enabled = true;
-    }
+    std::string cmsis_func_name = op->args[0].as<StringImmNode>()->value;
 
-    auto get_var_name = [](const CallNode* op, int id) {
-      return op->args[id].as<VarNode>()->name_hint.c_str();
-    };
-    auto get_arg_value = [](const CallNode* op, int id) {
-      return op->args[id].as<IntImmNode>()->value;
-    };
+    // extract buffer names from call_extern
     int arg_id = 0;
-    std::string input_data = get_var_name(op, ++arg_id);
-    std::string filter_data = get_var_name(op, ++arg_id);
-    std::string multiplier = get_var_name(op, ++arg_id);
-    std::string bias_data("0x0");
-    if (bias_enabled) {
-      bias_data = get_var_name(op, ++arg_id);
+    std::string input_data = VarNameFromArg(op, ++arg_id);
+    std::string filter_data = VarNameFromArg(op, ++arg_id);
+    std::string multiplier = VarNameFromArg(op, ++arg_id);
+    std::string bias_data("NULL");
+    if (op->args.size() == CallExternArgPos::MAX_NUM_ARGS) {
+      bias_data = VarNameFromArg(op, ++arg_id);
     }
-    std::string shift = get_var_name(op, ++arg_id);
-    std::string output_data = get_var_name(op, ++arg_id);
-
-    std::string context_buffer_name = op->args[++arg_id].as<StringImmNode>()->value;
-    int context_buffer_size = get_arg_value(op, ++arg_id);
-    int input_offset = get_arg_value(op, ++arg_id);
-    int output_offset = get_arg_value(op, ++arg_id);
-    int stride_w = get_arg_value(op, ++arg_id);
-    int stride_h = get_arg_value(op, ++arg_id);
-    int padding_w = get_arg_value(op, ++arg_id);
-    int padding_h = get_arg_value(op, ++arg_id);
-    int dilation_w = get_arg_value(op, ++arg_id);
-    int dilation_h = get_arg_value(op, ++arg_id);
-    int clip_min = get_arg_value(op, ++arg_id);
-    int clip_max = get_arg_value(op, ++arg_id);
-    int input_n = get_arg_value(op, ++arg_id);
-    int input_h = get_arg_value(op, ++arg_id);
-    int input_w = get_arg_value(op, ++arg_id);
-    int input_c = get_arg_value(op, ++arg_id);
-    int filter_n = get_arg_value(op, ++arg_id);
-    int filter_h = get_arg_value(op, ++arg_id);
-    int filter_w = get_arg_value(op, ++arg_id);
-    int filter_c = get_arg_value(op, ++arg_id);
-    int bias_n = get_arg_value(op, ++arg_id);
-    int bias_h = get_arg_value(op, ++arg_id);
-    int bias_w = get_arg_value(op, ++arg_id);
-    int bias_c = get_arg_value(op, ++arg_id);
-    int output_n = get_arg_value(op, ++arg_id);
-    int output_h = get_arg_value(op, ++arg_id);
-    int output_w = get_arg_value(op, ++arg_id);
-    int output_c = get_arg_value(op, ++arg_id);
-
-    std::string context = EmitCMSISNNContext(stream, context_buffer_name, context_buffer_size);
-    std::string conv_params =
-        EmitCMSISNNConvParams(stream, input_offset, output_offset, stride_w, stride_h, padding_w,
-                              padding_h, dilation_w, dilation_h, clip_min, clip_max);
+    std::string shift = VarNameFromArg(op, ++arg_id);
+    std::string output_data = VarNameFromArg(op, ++arg_id);
+
+    // extract CMSIS-NN API parameters
+    int context_buffer_pos = arg_id + CallExternArgPos::CONTEXT_BUFFER_POS;
+    int conv2d_params_pos = arg_id + CallExternArgPos::CONV2D_PARAMS_POS;
+    int input_dim_pos = arg_id + CallExternArgPos::INPUT_DIM_POS;
+    int filter_dim_pos = arg_id + CallExternArgPos::FILTER_DIM_POS;
+    int bias_dim_pos = arg_id + CallExternArgPos::BIAS_DIM_POS;
+    int output_dim_pos = arg_id + CallExternArgPos::OUTPUT_DIM_POS;
+
+    CMSISNNContextBuffer context_buffer = extract_context_buffer_info(op, context_buffer_pos);
+    Conv2DParams conv2d_params = extract_conv2d_params(op, conv2d_params_pos);
+    CMSISNNDims input_dims = extract_buffer_dims(op, input_dim_pos);
+    CMSISNNDims filter_dims = extract_buffer_dims(op, filter_dim_pos);
+    CMSISNNDims bias_dims = extract_buffer_dims(op, bias_dim_pos);
+    CMSISNNDims output_dims = extract_buffer_dims(op, output_dim_pos);
+
+    // Emit CMSIS-NN API arguments
+    std::string context = EmitCMSISNNContext(stream, context_buffer);
+    std::string conv_params = EmitCMSISNNConvParams(stream, conv2d_params);
     std::string quant_params = EmitCMSISNNPerChannelQuantParams(stream, multiplier, shift);
-    std::string input_dim = EmitCMSISNNDims(stream, "input", input_n, input_h, input_w, input_c);
-    std::string filter_dim =
-        EmitCMSISNNDims(stream, "filter", filter_n, filter_h, filter_w, filter_c);
-    std::string bias_dim = EmitCMSISNNDims(stream, "bias", bias_n, bias_h, bias_w, bias_c);
-    std::string output_dim =
-        EmitCMSISNNDims(stream, "output", output_n, output_h, output_w, output_c);
+    std::string input_dim = EmitCMSISNNDims(stream, "input", input_dims);
+    std::string filter_dim = EmitCMSISNNDims(stream, "filter", filter_dims);
+    std::string bias_dim = EmitCMSISNNDims(stream, "bias", bias_dims);
+    std::string output_dim = EmitCMSISNNDims(stream, "output", output_dims);
 
+    // Emit CMSIS-NN API
     PrintIndent();
     stream << "arm_status status = ";
     stream << cmsis_func_name << "(";
diff --git a/tests/python/contrib/test_cmsisnn/test_conv2d.py b/tests/python/contrib/test_cmsisnn/test_conv2d.py
index 243197e..8d62763 100644
--- a/tests/python/contrib/test_cmsisnn/test_conv2d.py
+++ b/tests/python/contrib/test_cmsisnn/test_conv2d.py
@@ -67,31 +67,30 @@ def make_model(
     w_index = weight_format.index("W")
     kernel_h = kernel_shape[h_index]
     kernel_w = kernel_shape[w_index]
-    a = relay.var("input", shape=shape, dtype=dtype)
+    invar = relay.var("input", shape=shape, dtype=dtype)
     p = (0, 0, 0, 0)
     if padding == "SAME":
         p = get_same_padding((shape[1], shape[2]), (kernel_h, kernel_w), dilation, strides)
-        a = relay.nn.pad(
-            a,
+        invar = relay.nn.pad(
+            invar,
             pad_width=[(0, 0), (p[0], p[2]), (p[1], p[3]), (0, 0)],
             pad_value=input_zero_point,
             pad_mode="constant",
         )
         shape = (shape[0], shape[1] + p[0] + p[2], shape[2] + p[1] + p[3], shape[3])
 
-    weight_shape = (kernel_h, kernel_w, shape[3] // groups, out_channels)
     rng = np.random.default_rng(12321)
     w = tvm.nd.array(
         rng.integers(
             np.iinfo(kernel_dtype).min,
             high=np.iinfo(kernel_dtype).max,
-            size=weight_shape,
+            size=kernel_shape,
             dtype=kernel_dtype,
         )
     )
     weight_const = relay.const(w, kernel_dtype)
     conv = relay.qnn.op.conv2d(
-        a,
+        invar,
         weight_const,
         input_zero_point=relay.const(input_zero_point, "int32"),
         kernel_zero_point=relay.const(kernel_zero_point, "int32"),
@@ -128,14 +127,14 @@ def make_model(
 @pytest.mark.parametrize("ifm_shape", [(1, 28, 28, 12), (1, 64, 100, 4)])
 @pytest.mark.parametrize("kernel_size", [(3, 3)])
 @pytest.mark.parametrize("padding", ["SAME", "VALID"])
-@pytest.mark.parametrize("strides, dilation", [((2, 2), (1, 1)), ((1, 1), (1, 1))])
+@pytest.mark.parametrize("strides, dilation", [((1, 1), (1, 1))])
+@pytest.mark.parametrize("relu_type", ["RELU"])
 @pytest.mark.parametrize("enable_bias", [True, False])
-@pytest.mark.parametrize("relu_type", ["NONE", "RELU"])
 @pytest.mark.parametrize(
     "input_zero_point, input_scale, kernel_scale, out_channels",
     [(10, 0.0128, [0.11, 0.22], 2), (-64, 1, [1, 0.0256, 1.37], 3)],
 )
-def test_op_int8(
+def test_conv2d_int8(
     ifm_shape,
     kernel_size,
     padding,
@@ -152,22 +151,134 @@ def test_op_int8(
     use_unpacked_api = True
     test_runner = AOT_CORSTONE300_RUNNER
 
-    kernel_zero_point = 0
+    dtype = "int8"
     groups = 1
     weight_format = "HWIO"
     kernel_h = kernel_size[0]
     kernel_w = kernel_size[1]
+    kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels)
+    kernel_zero_point = 0
+    in_min, in_max = get_range_for_dtype_str(dtype)
+
+    output_scale, output_zero_point = get_conv2d_qnn_params(
+        kernel_shape,
+        input_scale,
+        input_zero_point,
+        kernel_scale,
+        kernel_zero_point,
+        dtype,
+        dtype,
+        dtype,
+    )
+
+    model, params = make_model(
+        ifm_shape,
+        kernel_shape,
+        input_zero_point,
+        input_scale,
+        kernel_zero_point,
+        kernel_scale,
+        output_zero_point,
+        output_scale,
+        padding,
+        strides,
+        dilation,
+        groups,
+        dtype,
+        dtype,
+        out_channels,
+        weight_format,
+        enable_bias,
+        relu_type,
+    )
+    orig_mod = make_module(model)
+    cmsisnn_mod = cmsisnn.partition_for_cmsisnn(orig_mod, params)
+
+    # validate pattern matching
+    attrs = [
+        cmsisnn_mod[var.name_hint].attrs
+        for var in cmsisnn_mod.get_global_vars()
+        if cmsisnn_mod[var.name_hint].attrs
+    ]
+    assert any(attrs), "At least one function with external attributes was expected."
+
+    compilers = [
+        key == "Compiler" and value == "cmsis-nn" for attr in attrs for key, value in attr.items()
+    ]
+    assert any(compilers), "Module does not contain function for cmsis-nn target."
+
+    assert count_num_calls(orig_mod) == count_num_calls(
+        cmsisnn_mod
+    ), "Number of calls changed during partitioning"
+
+    # validate the output
+    rng = np.random.default_rng(12345)
+    inputs = {"input": rng.integers(in_min, high=in_max, size=ifm_shape, dtype=dtype)}
+    output_list = generate_ref_data(orig_mod["main"], inputs, params)
+    compile_and_run(
+        AOTTestModel(
+            module=cmsisnn_mod,
+            inputs=inputs,
+            outputs=output_list,
+            params=params,
+            output_tolerance=1,
+        ),
+        test_runner,
+        interface_api,
+        use_unpacked_api,
+    )
+
+
+@tvm.testing.requires_cmsisnn
+@pytest.mark.parametrize("ifm_shape", [(1, 28, 28, 12), (1, 64, 100, 4)])
+@pytest.mark.parametrize("kernel_size", [(3, 3)])
+@pytest.mark.parametrize("padding", ["SAME", "VALID"])
+@pytest.mark.parametrize("strides, dilation", [((1, 1), (1, 1))])
+@pytest.mark.parametrize("relu_type", ["RELU"])
+@pytest.mark.parametrize(
+    "depth_multiplier, enable_bias",
+    [(1, True), (3, True)],
+)
+@pytest.mark.parametrize(
+    "input_zero_point, input_scale, kernel_scale, out_channels",
+    [(10, 0.0128, [0.11, 0.22], 2), (-64, 1, [1, 0.0256, 1.37], 3)],
+)
+def test_depthwise_int8(
+    ifm_shape,
+    kernel_size,
+    padding,
+    strides,
+    dilation,
+    enable_bias,
+    relu_type,
+    input_zero_point,
+    input_scale,
+    kernel_scale,
+    out_channels,
+    depth_multiplier,
+):
+    interface_api = "c"
+    use_unpacked_api = True
+    test_runner = AOT_CORSTONE300_RUNNER
+
     dtype = "int8"
+    groups = 1
+    weight_format = "HWIO"
+    kernel_h = kernel_size[0]
+    kernel_w = kernel_size[1]
+    kernel_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels)
+    kernel_zero_point = 0
     in_min, in_max = get_range_for_dtype_str(dtype)
 
-    weight_shape = None
-    if weight_format == "HWIO":
-        weight_shape = (kernel_h, kernel_w, ifm_shape[3] // groups, out_channels)
-    else:
-        weight_shape = (kernel_h, kernel_w, ifm_shape[3], out_channels)
+    groups = ifm_shape[3]
+    weight_format = "HWOI"
+    kernel_shape = (kernel_h, kernel_w, ifm_shape[3], depth_multiplier)
+    out_channels = ifm_shape[3] * depth_multiplier
+    ks_len = len(kernel_scale)
+    kernel_scale = [kernel_scale[i % ks_len] for i in range(out_channels)]
 
     output_scale, output_zero_point = get_conv2d_qnn_params(
-        weight_shape,
+        kernel_shape,
         input_scale,
         input_zero_point,
         kernel_scale,
@@ -175,12 +286,12 @@ def test_op_int8(
         dtype,
         dtype,
         dtype,
-        False,
+        True,
     )
 
     model, params = make_model(
         ifm_shape,
-        weight_shape,
+        kernel_shape,
         input_zero_point,
         input_scale,
         kernel_zero_point,