Posted to commits@tvm.apache.org by lu...@apache.org on 2023/08/09 14:07:12 UTC

[tvm] branch main updated: [CMSIS-NN] Support for Softmax Int16 operator (#15407)

This is an automated email from the ASF dual-hosted git repository.

lukhut pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new 77b71fc830 [CMSIS-NN] Support for Softmax Int16 operator (#15407)
77b71fc830 is described below

commit 77b71fc8304467ba6a86433066b1a86eb8c225c6
Author: Codrut-Grigore Irimie <78...@users.noreply.github.com>
AuthorDate: Wed Aug 9 17:07:03 2023 +0300

    [CMSIS-NN] Support for Softmax Int16 operator (#15407)
    
    * Support for int16 Softmax in CMSIS-NN
    * Supporting integration test
---
 python/tvm/relay/op/contrib/cmsisnn.py             |  14 +-
 src/relay/backend/contrib/cmsisnn/compute_luts.cc  |  76 +++++++++++
 src/relay/backend/contrib/cmsisnn/compute_luts.h   |  55 ++++++++
 src/relay/backend/contrib/cmsisnn/relay_to_tir.cc  | 151 +++++++++++++++++----
 .../backend/contrib/cmsisnn/tir_to_runtime.cc      |  56 ++++++++
 tests/python/contrib/test_cmsisnn/test_softmax.py  |  43 ++++++
 6 files changed, 364 insertions(+), 31 deletions(-)
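
For context, the pattern this commit offloads is the QNN softmax sequence
qnn.dequantize -> nn.softmax -> qnn.quantize, with the output quantization fixed
to scale 1 / 32768 and zero point 0 for int16 (previously only the int8 variant
with scale 1 / 256 and zero point -128 was supported). A minimal Relay sketch of
such a graph, with an illustrative shape and input quantization mirroring the
new test below:

    import tvm
    from tvm import relay

    inp = relay.var("in0", shape=(1, 16, 16, 3), dtype="int16")
    deq = relay.qnn.op.dequantize(
        inp, relay.const(1.0 / 32768, "float32"), relay.const(0, "int32")
    )
    soft = relay.nn.softmax(deq)
    out = relay.qnn.op.quantize(
        soft,
        relay.const(1.0 / 32768, "float32"),  # fixed output scale
        relay.const(0, "int32"),              # fixed output zero point
        out_dtype="int16",
    )
    mod = tvm.IRModule.from_expr(relay.Function([inp], out))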

diff --git a/python/tvm/relay/op/contrib/cmsisnn.py b/python/tvm/relay/op/contrib/cmsisnn.py
index cf32947446..ed620f0ff1 100644
--- a/python/tvm/relay/op/contrib/cmsisnn.py
+++ b/python/tvm/relay/op/contrib/cmsisnn.py
@@ -86,11 +86,21 @@ def pattern_table():
         zero_point = pattern.args[2].data.numpy().item(0)
 
         # check for dtypes of quantize and dequantize
-        return (
+        if (
             (scale == 1.0 / 256 and zero_point == -128)
             and pattern.attrs.out_dtype == "int8"
             and dequantize_call.args[0].checked_type.dtype == "int8"
-        )
+        ):
+            return True
+
+        if (
+            (scale == 1.0 / 32768 and zero_point == 0)
+            and pattern.attrs.out_dtype == "int16"
+            and dequantize_call.args[0].checked_type.dtype == "int16"
+        ):
+            return True
+
+        return False
 
     def qnn_conv2d_pattern(with_pad):
         """Create pattern for qnn.conv2D with optional pad and/or optional fused relu."""
diff --git a/src/relay/backend/contrib/cmsisnn/compute_luts.cc b/src/relay/backend/contrib/cmsisnn/compute_luts.cc
new file mode 100644
index 0000000000..13dcb395b3
--- /dev/null
+++ b/src/relay/backend/contrib/cmsisnn/compute_luts.cc
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*!
+ * \file src/relay/backend/contrib/cmsisnn/compute_luts.cc
+ * \brief Creates lookup tables (LUTs) at different bit widths to accelerate operator computations.
+ */
+
+#include "compute_luts.h"
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+
+namespace tvm {
+namespace relay {
+namespace contrib {
+namespace cmsisnn {
+
+void CalculateLUTInt16(int key_zero_point, float key_scale, int value_zero_point, float value_scale,
+                       float (*func)(float), const int steps, int16_t* lut) {
+  const float value_min = static_cast<float>(std::numeric_limits<int16_t>::min());
+  const float value_max = static_cast<float>(std::numeric_limits<int16_t>::max());
+  const float key_min_deq = key_scale * (std::numeric_limits<int16_t>::min() - key_zero_point);
+  const float key_max_deq = key_scale * (std::numeric_limits<int16_t>::max() - key_zero_point);
+  const float value_min_deq =
+      value_scale * (std::numeric_limits<int16_t>::min() - value_zero_point);
+  const float value_max_deq =
+      value_scale * (std::numeric_limits<int16_t>::max() - value_zero_point);
+
+  const float step_size_deq = (key_max_deq - key_min_deq) / (steps - 1);
+  const float half_step_size_deq = step_size_deq / 2;
+
+  const float value_inv_quantizing =
+      (std::numeric_limits<int16_t>::max() - std::numeric_limits<int16_t>::min() + 1) /
+      (value_max_deq - value_min_deq);
+
+  for (int i = 0; i < steps - 1; i++) {
+    float value_deq = func(key_min_deq + i * step_size_deq);
+    float mid_value_deq = func(key_min_deq + i * step_size_deq + half_step_size_deq);
+    float next_value_deq = func(key_min_deq + (i + 1) * step_size_deq);
+
+    float value = std::round(value_deq * value_inv_quantizing);
+    float mid_value = std::round(mid_value_deq * value_inv_quantizing);
+    float next_value = std::round(next_value_deq * value_inv_quantizing);
+    float mid_interp_value = std::round((value + next_value) / 2);
+
+    float mid_err = mid_interp_value - mid_value;
+    float bias = std::round(mid_err / 2);
+
+    lut[i] = static_cast<int16_t>(std::max(std::min(value - bias, value_max), value_min));
+  }
+
+  lut[steps - 1] = static_cast<int16_t>(
+      std::max(std::min(func(value_max_deq) * value_inv_quantizing, value_max), value_min));
+}
+
+}  // namespace cmsisnn
+}  // namespace contrib
+}  // namespace relay
+}  // namespace tvm
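
The loop above builds each entry from three samples (left edge, midpoint, and
right edge of a step) and then biases the stored value by half of the error that
linear interpolation between neighbouring entries would otherwise make at the
midpoint. A NumPy sketch of the same procedure, assuming, as the code does, that
downstream kernels interpolate linearly between adjacent LUT entries:

    import numpy as np

    def calculate_lut_int16(key_zp, key_scale, val_zp, val_scale, func, steps=513):
        i16 = np.iinfo(np.int16)
        key_min_deq = key_scale * (i16.min - key_zp)
        key_max_deq = key_scale * (i16.max - key_zp)
        val_min_deq = val_scale * (i16.min - val_zp)
        val_max_deq = val_scale * (i16.max - val_zp)
        step = (key_max_deq - key_min_deq) / (steps - 1)
        inv_q = 65536.0 / (val_max_deq - val_min_deq)

        lut = np.empty(steps, dtype=np.int16)
        for i in range(steps - 1):
            value = np.round(func(key_min_deq + i * step) * inv_q)
            mid = np.round(func(key_min_deq + (i + 0.5) * step) * inv_q)
            nxt = np.round(func(key_min_deq + (i + 1) * step) * inv_q)
            # halve the error linear interpolation would make at the midpoint
            bias = np.round((np.round((value + nxt) / 2) - mid) / 2)
            lut[i] = np.clip(value - bias, i16.min, i16.max)
        lut[steps - 1] = np.clip(func(val_max_deq) * inv_q, i16.min, i16.max)
        return lut

    # the parameters relay_to_tir.cc (below) uses for the exp LUT
    exp_lut = calculate_lut_int16(32767, 10.0 / 65535, 0, 2.0 / 65535, np.exp)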
diff --git a/src/relay/backend/contrib/cmsisnn/compute_luts.h b/src/relay/backend/contrib/cmsisnn/compute_luts.h
new file mode 100644
index 0000000000..eca4127e40
--- /dev/null
+++ b/src/relay/backend/contrib/cmsisnn/compute_luts.h
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/backend/contrib/cmsisnn/compute_luts.h
+ * \brief CMSIS-NN LUTs calculation functions
+ */
+
+#ifndef TVM_RELAY_BACKEND_CONTRIB_CMSISNN_COMPUTE_LUTS_H_
+#define TVM_RELAY_BACKEND_CONTRIB_CMSISNN_COMPUTE_LUTS_H_
+
+#include <cstdint>
+
+namespace tvm {
+namespace relay {
+namespace contrib {
+namespace cmsisnn {
+
+/*!
+ * \brief Populates an int16 LUT based on the quantization parameters of its keys and values
+ * and on the transformation function being tabulated
+ *
+ * \param key_zero_point - zero point of table's keys
+ * \param key_scale - scale of the table's keys
+ * \param value_zero_point - zero point of table's values
+ * \param value_scale - scale of the table's values
+ * \param func - function pointer of the transformation performed by the LUT
+ * \param steps - total number of entries in the table
+ * \param lut - int16_t array storing the values of the LUT
+ */
+void CalculateLUTInt16(int key_zero_point, float key_scale, int value_zero_point, float value_scale,
+                       float (*func)(float), const int steps, int16_t* lut);
+
+}  // namespace cmsisnn
+}  // namespace contrib
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_BACKEND_CONTRIB_CMSISNN_COMPUTE_LUTS_H_
diff --git a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
index 33547f4bd8..49800195f6 100644
--- a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
+++ b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc
@@ -30,6 +30,7 @@
 #include "../../../transforms/pattern_utils.h"
 #include "buffer_size.h"
 #include "compiler_attrs.h"
+#include "compute_luts.h"
 #include "convolutions.h"
 
 namespace tvm {
@@ -89,11 +90,17 @@ class RelayToTIRVisitor : public MixedModeMutator {
  private:
   inline IntImm ToArg(int32_t value) { return IntImm(DataType::Int(32), value); }
 
-  void CreatePrimFuncForExtern(const GlobalVar& global_var, Array<tir::Var> func_signature,
-                               const Map<tir::Var, tir::Buffer>& buffer_map,
-                               tvm::Array<PrimExpr> call_extern_args,
-                               PrimExpr context_buffer_var = PrimExpr(),
-                               int context_buffer_size = 0, int num_bits = 8) {
+  // Struct used to allocate a constant NDArray for a TIR buffer variable
+  struct tir_input_constant_buffers {
+    tir::Var buffer_var;
+    tvm::runtime::NDArray ndarray;
+  };
+
+  void CreatePrimFuncForExtern(
+      const GlobalVar& global_var, Array<tir::Var> func_signature,
+      const Map<tir::Var, tir::Buffer>& buffer_map, tvm::Array<PrimExpr> call_extern_args,
+      PrimExpr context_buffer_var = PrimExpr(), int context_buffer_size = 0, int num_bits = 8,
+      std::vector<tir_input_constant_buffers> context_const_buffer_vars = {}) {
     Map<String, ObjectRef> dict_attrs;
     dict_attrs.Set(tvm::attr::kGlobalSymbol, global_var->name_hint);
     dict_attrs.Set(tvm::attr::kTarget, target_);
@@ -107,8 +114,22 @@ class RelayToTIRVisitor : public MixedModeMutator {
                            {context_buffer_size}, tir::const_true(), body);
     }
 
+    for (int i = 0; i < static_cast<int>(context_const_buffer_vars.size()); i++) {
+      int bits = context_const_buffer_vars[i].ndarray.DataType().bits();
+
+      Array<PrimExpr> extents;
+      for (int shape : context_const_buffer_vars[i].ndarray.Shape()) {
+        extents.push_back(PrimExpr(shape));
+      }
+
+      body = tir::AllocateConst(Downcast<tir::Var>(context_const_buffer_vars[i].buffer_var),
+                                DataType::Int(bits), extents, context_const_buffer_vars[i].ndarray,
+                                body);
+    }
+
     tir::PrimFunc replacement_func(func_signature, body, VoidType(), buffer_map,
                                    DictAttrs(dict_attrs));
+
     ir_module_->Add(global_var, replacement_func);
   }
 
@@ -505,6 +526,7 @@ class RelayToTIRVisitor : public MixedModeMutator {
     const CallNode* softmax_call = quantize_call->args[0].as<CallNode>();
     const CallNode* dequant_call = softmax_call->args[0].as<CallNode>();
     const float quant_scale = GetScalarFromConstant<float>(dequant_call->args[1]);
+    const auto bit_width = quantize_call->type_as<TensorTypeNode>()->dtype.bits();
 
     // assuming layout as NHWC
     auto shape = quantize_call->type_as<TensorTypeNode>()->shape;
@@ -517,36 +539,107 @@ class RelayToTIRVisitor : public MixedModeMutator {
 
     // calculate multiplier and shift for CMSIS-NN softmax API
     // Note: TensorFlow Lite Micro assumptions
-    // Output zero point and scale are fixed to -128 and 1 / 256
+    // Output zero point and scale are fixed to -128 and 1 / 256 in the case of an int8 operator
+    // or to 0 and 1 / 32768 in the case of an int16 operator
     // kScaledDiffIntegerBits, kInputBits, kBeta are described on the following github page
     // https://github.com/tensorflow/tflite-micro/blob/d97cd0908d8cf5021e9d86f05a49888bee28c2a4/tensorflow/lite/micro/kernels/softmax_common.cc#L47
-    double beta_multiplier = (kBeta * quant_scale * (1 << (31 - kInputBits)));
-    beta_multiplier = std::min<double>(beta_multiplier, (1ll << 31) - 1.0);
-    auto mult_shift_pair = tvm::relay::qnn::GetFixedPointMultiplierShift(beta_multiplier);
-    int32_t mult = std::get<0>(mult_shift_pair);
-    int32_t shift = std::get<1>(mult_shift_pair);
-    int32_t diff_min = (1 << kScaledDiffIntegerBits) - 1;
-    diff_min <<= (31 - kScaledDiffIntegerBits);
-    diff_min >>= shift;
-    diff_min *= -1;
+
+    int32_t mult;
+    int32_t shift;
+    int32_t diff_min = 0;
+
+    std::vector<tir_input_constant_buffers> softmax_params(2);
+    Device dev{DLDeviceType::kDLCPU, 0};
+
+    if (bit_width == 8) {
+      double beta_multiplier = (kBeta * quant_scale * (1 << (31 - kInputBits)));
+      beta_multiplier = std::min<double>(beta_multiplier, (1ll << 31) - 1.0);
+      auto mult_shift_pair = tvm::relay::qnn::GetFixedPointMultiplierShift(beta_multiplier);
+      mult = std::get<0>(mult_shift_pair);
+      shift = std::get<1>(mult_shift_pair);
+      diff_min = (1 << kScaledDiffIntegerBits) - 1;
+      diff_min <<= (31 - kScaledDiffIntegerBits);
+      diff_min >>= shift;
+      diff_min *= -1;
+    } else {  // bit_width == 16
+      double scale_beta_rescale = quant_scale * kBeta / (10.0 / 65535.0);
+      auto mult_shift_pair = tvm::relay::qnn::GetFixedPointMultiplierShift(scale_beta_rescale);
+      mult = std::get<0>(mult_shift_pair);
+      shift = std::get<1>(mult_shift_pair);
+
+      const int kLUTEntries = 513;
+      int16_t softmax_s16_exp_lut[kLUTEntries];
+      int16_t softmax_s16_one_by_one_lut[kLUTEntries];
+
+      const int range_int16 =
+          std::numeric_limits<int16_t>::max() - std::numeric_limits<int16_t>::min();
+      int exp_zero_point = std::numeric_limits<int16_t>::max();
+      float exp_scale = 10.0f / range_int16;
+
+      int one_by_one_zero_point = std::numeric_limits<int16_t>::min();
+      float one_by_one_scale = 1.0f / range_int16;
+
+      int lut_value_zero_point = 0;
+      float lut_value_scale = 2.0f / range_int16;
+
+      CalculateLUTInt16(
+          exp_zero_point, exp_scale, lut_value_zero_point, lut_value_scale,
+          [](float key) { return std::exp(key); }, kLUTEntries, softmax_s16_exp_lut);
+      CalculateLUTInt16(
+          one_by_one_zero_point, one_by_one_scale, lut_value_zero_point, lut_value_scale,
+          [](float key) { return 1.0f / (1.0f + key); }, kLUTEntries, softmax_s16_one_by_one_lut);
+
+      // first LUT
+      softmax_params[0].buffer_var =
+          tir::Var("exp_lut", PointerType(PrimType(DataType::Int(bit_width)), "global.workspace"));
+      softmax_params[0].ndarray =
+          runtime::NDArray::Empty({kLUTEntries}, DataType::Int(bit_width), dev);
+      softmax_params[0].ndarray.CopyFromBytes(softmax_s16_exp_lut, sizeof(int16_t) * kLUTEntries);
+
+      // second LUT
+      softmax_params[1].buffer_var = tir::Var(
+          "one_by_one_lut", PointerType(PrimType(DataType::Int(bit_width)), "global.workspace"));
+      softmax_params[1].ndarray =
+          runtime::NDArray::Empty({kLUTEntries}, DataType::Int(bit_width), dev);
+      softmax_params[1].ndarray.CopyFromBytes(softmax_s16_one_by_one_lut,
+                                              sizeof(int16_t) * kLUTEntries);
+    }
 
     BufferCreator buffer_creator;
-    tir::Var in_var = buffer_creator.CreateBufferVar("input", DataType::Handle(8));
-    tir::Var out_var = buffer_creator.CreateBufferVar("output", DataType::Handle(8));
+    tir::Var in_var = buffer_creator.CreateBufferVar("input", DataType::Handle(bit_width));
+    tir::Var out_var = buffer_creator.CreateBufferVar("output", DataType::Handle(bit_width));
+
+    if (bit_width == 8) {
+      tvm::Array<PrimExpr> args = {
+          tir::StringImm("arm_softmax_s" + std::to_string(bit_width)),
+          in_var,
+          ToArg(num_rows),
+          ToArg(row_size),
+          ToArg(mult),
+          ToArg(shift),
+          ToArg(diff_min),
+          out_var,
+      };
 
-    tvm::Array<PrimExpr> args = {
-        tir::StringImm("arm_softmax_s8"),
-        in_var,
-        ToArg(num_rows),
-        ToArg(row_size),
-        ToArg(mult),
-        ToArg(shift),
-        ToArg(diff_min),
-        out_var,
-    };
+      CreatePrimFuncForExtern(global_var, buffer_creator.GetPrimFuncParams(),
+                              buffer_creator.GetBufferMap(), args);
+    } else {  // bit_width == 16
+      tvm::Array<PrimExpr> args = {
+          tir::StringImm("arm_softmax_s" + std::to_string(bit_width)),
+          in_var,
+          ToArg(num_rows),
+          ToArg(row_size),
+          ToArg(mult),
+          ToArg(shift),
+          softmax_params[0].buffer_var,
+          softmax_params[1].buffer_var,
+          out_var,
+      };
 
-    CreatePrimFuncForExtern(global_var, buffer_creator.GetPrimFuncParams(),
-                            buffer_creator.GetBufferMap(), args);
+      CreatePrimFuncForExtern(global_var, buffer_creator.GetPrimFuncParams(),
+                              buffer_creator.GetBufferMap(), args, PrimExpr(), 0, 16,
+                              softmax_params);
+    }
   }
 
   struct BinaryElementwiseClipPattern {
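
Both branches above reduce a floating-point rescale factor to a fixed-point
multiplier and shift via tvm::relay::qnn::GetFixedPointMultiplierShift. A rough
Python equivalent of that decomposition, a sketch rather than TVM's exact
implementation, with an illustrative quant_scale and assuming the kInputBits = 5
and kBeta = 1.0 constants referenced in the comment above:

    import math

    def fixed_point_multiplier_shift(x):
        """Approximate x as mult * 2**(shift - 31), mult being a Q31 integer."""
        if x == 0.0:
            return 0, 0
        sig, shift = math.frexp(x)  # x == sig * 2**shift, 0.5 <= |sig| < 1
        mult = round(sig * (1 << 31))
        if mult == 1 << 31:         # rounding pushed the significand up to 1.0
            mult //= 2
            shift += 1
        return mult, shift

    quant_scale, k_beta, k_input_bits = 0.003, 1.0, 5

    # int8 path: beta * scale, scaled up into the Q(31 - kInputBits) domain
    mult, shift = fixed_point_multiplier_shift(
        min(k_beta * quant_scale * (1 << (31 - k_input_bits)), (1 << 31) - 1.0)
    )

    # int16 path: rescale from the input domain into the exp LUT key domain,
    # whose scale is 10 / 65535 (see the LUT parameters above)
    mult16, shift16 = fixed_point_multiplier_shift(quant_scale * k_beta / (10.0 / 65535.0))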
diff --git a/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc b/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc
index ea2eabd767..6febfe3486 100644
--- a/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc
+++ b/src/relay/backend/contrib/cmsisnn/tir_to_runtime.cc
@@ -99,6 +99,11 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost {
     int clip_max;
   };
 
+  struct CMSISNNSoftmaxLutS16 {
+    std::string exp_lut_name;
+    std::string one_by_one_lut_name;
+  };
+
   using codegen::CodeGenCHost::VisitStmt_;
 
   /*!  * \brief Emits CMSIS-NN APIs for every call_extern */
@@ -107,6 +112,7 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost {
       CodeGenCHost::VisitExpr_(op, os);
       return;
     }
+
     std::string cmsis_func_name = op->args[0].as<StringImmNode>()->value;
     if (cmsis_func_name == "arm_softmax_s8" || cmsis_func_name == "arm_elementwise_mul_s8" ||
         cmsis_func_name == "arm_elementwise_add_s8" ||
@@ -124,6 +130,8 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost {
     } else if (cmsis_func_name == "arm_avgpool_s8" || cmsis_func_name == "arm_avgpool_s16" ||
                cmsis_func_name == "arm_max_pool_s8" || cmsis_func_name == "arm_max_pool_s16") {
       EmitPool2D(op);
+    } else if (cmsis_func_name == "arm_softmax_s16") {
+      EmitSoftmaxInt16(op);
     }
     return;
   }
@@ -220,6 +228,14 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost {
        << "," << dims.c << "};\n";
     return struct_name;
   }
+  /*!  * \brief Emits cmsis_nn_softmax_lut_s16 struct */
+  std::string EmitCMSISNNSoftmaxLutS16(std::ostream& os, CMSISNNSoftmaxLutS16 softmax_params) {
+    std::string struct_name = "softmax_params";
+    PrintIndent();
+    os << "cmsis_nn_softmax_lut_s16 " << struct_name << "= {" << softmax_params.exp_lut_name << ", "
+       << softmax_params.one_by_one_lut_name << "};\n";
+    return struct_name;
+  }
 
   /*!  * \brief Deduces variable name from call_extern argument resting at id */
   std::string VarNameFromArg(const CallNode* op, int id) {
@@ -295,6 +311,14 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost {
     dims.c = ValueFromArg(op, ++base_pos);
     return dims;
   }
+  /*!  * \brief Extracts the CMSIS-NN softmax LUT names from call_extern */
+  CMSISNNSoftmaxLutS16 ExtractSoftmaxLutS16(const CallNode* op, int exp_lut_pos,
+                                            int one_by_one_lut_pos) {
+    CMSISNNSoftmaxLutS16 softmax_params;
+    softmax_params.exp_lut_name = op->args[exp_lut_pos].as<VarNode>()->name_hint;
+    softmax_params.one_by_one_lut_name = op->args[one_by_one_lut_pos].as<VarNode>()->name_hint;
+    return softmax_params;
+  }
 
   /*!  * \brief Emits CMSIS-NN APIs for every call_extern comprising convolution */
   void EmitConv2D(const CallNode* op) {
@@ -472,6 +496,38 @@ class CodeGenCMSISNN : public codegen::CodeGenCHost {
     EmitErrorCheck();
   }
 
+  void EmitSoftmaxInt16(const CallNode* op) {
+    std::string cmsis_func_name = op->args[0].as<StringImmNode>()->value;
+
+    // extract buffer names from call_extern
+    int arg_id = 0;
+    std::string input_data = VarNameFromArg(op, ++arg_id);
+    int num_rows = ValueFromArg(op, ++arg_id);
+    int row_size = ValueFromArg(op, ++arg_id);
+    int multiplier = ValueFromArg(op, ++arg_id);
+    int shift = ValueFromArg(op, ++arg_id);
+    // extract LUT names from call_extern
+    CMSISNNSoftmaxLutS16 softmax_params_buffer =
+        ExtractSoftmaxLutS16(op, arg_id + 1, arg_id + 2);
+    arg_id += 2;
+    std::string output_data = VarNameFromArg(op, ++arg_id);
+
+    // Emit CMSIS-NN API arguments
+    std::string softmax_params = EmitCMSISNNSoftmaxLutS16(stream, softmax_params_buffer);
+
+    PrintIndent();
+    stream << "arm_cmsis_nn_status status = ";
+    stream << cmsis_func_name << "(";
+    stream << input_data << ", ";
+    stream << num_rows << ", ";
+    stream << row_size << ", ";
+    stream << multiplier << ", ";
+    stream << shift << ", ";
+    stream << "&" << softmax_params << ", ";
+    stream << output_data << ");\n";
+    EmitErrorCheck();
+  }
+
   void EmitErrorCheck() {
     auto emit_error = [&](std::string error) {
       if (this->debug_last_error) {
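
Putting the emission together: the struct printed by EmitCMSISNNSoftmaxLutS16
references the exp_lut and one_by_one_lut constants allocated in relay_to_tir.cc,
so for the test's 1x16x16x3 tensor the generated C output is shaped roughly as
follows (buffer names are codegen-chosen placeholders; mult and shift elided):

    cmsis_nn_softmax_lut_s16 softmax_params = {exp_lut, one_by_one_lut};
    arm_cmsis_nn_status status = arm_softmax_s16(
        input_, /*num_rows=*/256, /*row_size=*/3, /*mult=*/..., /*shift=*/...,
        &softmax_params, output_);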
diff --git a/tests/python/contrib/test_cmsisnn/test_softmax.py b/tests/python/contrib/test_cmsisnn/test_softmax.py
index 0316d567ad..82547f44f5 100644
--- a/tests/python/contrib/test_cmsisnn/test_softmax.py
+++ b/tests/python/contrib/test_cmsisnn/test_softmax.py
@@ -91,6 +91,49 @@ def test_op_int8(zero_point, scale, compiler_cpu, cpu_flags):
     )
 
 
+@skip_if_no_reference_system
+@tvm.testing.requires_cmsisnn
+@pytest.mark.parametrize(["zero_point", "scale"], [[0, 1.0 / 32768]])
+@pytest.mark.parametrize(
+    "compiler_cpu, cpu_flags", [("cortex-m55", "+nomve"), ("cortex-m55", ""), ("cortex-m7", "")]
+)
+def test_op_int16(zero_point, scale, compiler_cpu, cpu_flags):
+    """Tests int16 QNN Softmax for CMSIS-NN"""
+    interface_api = "c"
+    use_unpacked_api = True
+
+    dtype = "int16"
+    shape = [1, 16, 16, 3]
+
+    # output scale and zero_point must be fixed
+    model = make_model(shape, dtype, dtype, zero_point, scale, 0, 1.0 / 32768)
+    orig_mod = make_module(model)
+    cmsisnn_mod = cmsisnn.partition_for_cmsisnn(orig_mod)
+
+    # validate pattern matching
+    assert_partitioned_function(orig_mod, cmsisnn_mod)
+
+    # validate the output
+    in_min, in_max = get_dtype_range(dtype)
+    np.random.seed(0)
+    input_data = np.random.randint(in_min, high=in_max, size=shape, dtype=dtype)
+    inputs = {"in0": input_data}
+    params = {}
+    output_list = generate_ref_data(orig_mod["main"], inputs, params)
+    compile_and_run(
+        AOTTestModel(
+            module=cmsisnn_mod,
+            inputs=inputs,
+            outputs=output_list,
+            params=params,
+            output_tolerance=2,
+        ),
+        create_test_runner(compiler_cpu, cpu_flags),
+        interface_api,
+        use_unpacked_api,
+    )
+
+
 def parameterize_for_invalid_model(test):
     """Generates parameters for non int8 input and output of Softmax"""
     in_dtype = ["uint8", "int8"]