Posted to commits@tvm.apache.org by "maekawatoshiki (via GitHub)" <gi...@apache.org> on 2023/05/12 12:25:07 UTC

[GitHub] [tvm] maekawatoshiki opened a new pull request, #14536: [QNN] Implement 'qnn.softmax'

maekawatoshiki opened a new pull request, #14536:
URL: https://github.com/apache/tvm/pull/14536

   This PR implements `qnn.softmax`, the quantized counterpart of `nn.softmax`.
   The implementation is based on the algorithm proposed in https://arxiv.org/pdf/2207.01405.pdf.
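   For intuition, here is a rough NumPy sketch of the shift-based integer steps used in the canonicalization quoted later in this thread (the float normalization and the comparison against a float softmax at the end are purely illustrative and are not part of the PR; the function name and example scale are assumptions):

```python
import numpy as np

# Rough sketch of the integer steps of the qnn.softmax canonicalization
# (per-tensor input_scale <= 1 and zero_point = 0 assumed here). Illustrative only.
def integer_softmax_sketch(q_x, input_scale, input_zero_point, axis=-1):
    x = q_x.astype(np.int32) - input_zero_point       # integer data; real value ~ input_scale * x
    x_0 = np.int32(round(1.0 / input_scale))          # ~ 1/S as an integer
    x = x - x.max(axis=axis, keepdims=True)           # x <= 0 after subtracting the row max
    x_p = x + (x >> 1) - (x >> 4)                     # ~ x * log2(e), since log2(e) ~ 1 + 1/2 - 1/16
    q = np.clip(x_p // -x_0, 0, 20)                   # integer part of the base-2 exponent
    r = x_p - q * -x_0                                # remainder (in (-x_0, 0] when q is not clipped)
    x_b = (r >> 1) + x_0                              # linear approximation of x_0 * 2^(r / x_0)
    exps = x_b.astype(np.int64) << (q.max(axis=axis, keepdims=True) - q)
    # Float normalization for illustration only (the actual op stays in integer arithmetic).
    return exps / exps.sum(axis=axis, keepdims=True)

q_x = np.random.randint(-128, 127, size=(5, 10), dtype=np.int8)
approx = integer_softmax_sketch(q_x, input_scale=0.1, input_zero_point=0, axis=1)
ref = np.exp(0.1 * q_x.astype(np.float32))
ref /= ref.sum(axis=1, keepdims=True)
print("max abs error vs float softmax:", np.abs(approx - ref).max())
```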
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] maekawatoshiki commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "maekawatoshiki (via GitHub)" <gi...@apache.org>.
maekawatoshiki commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1193444298


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
+  const auto const_f32 = [&](float val) { return MakeConstantScalar(DataType::Float(32), val); };
+
+  const auto const_input_scale = new_args[1].as<ConstantNode>();
+  ICHECK(const_input_scale) << "Input scale should be constant.";
+  ICHECK(const_input_scale->is_scalar()) << "Input scale should be scalar.";
+  const float input_scale = static_cast<float*>(const_input_scale->data->data)[0];
+  ICHECK(input_scale <= 1.f) << "Input scale should be less than or equal to 1.";
+
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data = Subtract(Cast(new_args[0], DataType::Int(32)), input_zero_point);
+
+  const Expr x_0 = ConvertDtype(const_f32(std::round(1.f / input_scale)), DataType::Int(32));
+  const Expr max = Max(quantized_data, {axis}, true, false);
+  const Expr x = Subtract(quantized_data, max);
+
+  const int m = 30;
+  const int bits = 8;
+  const Expr x_p = Subtract(Add(x, RightShift(x, const_i32(1))), RightShift(x, const_i32(4)));
+  const Expr q = Clip(Divide(x_p, Negative(x_0)), 0, 20);
+  const Expr max_q = Max(q, {axis}, true, false);
+  const Expr r = Subtract(x_p, Multiply(q, Negative(x_0)));
+  const Expr x_b = Add(RightShift(r, const_i32(1)), x_0);
+  const Expr exps = LeftShift(x_b, Subtract(max_q, q));

Review Comment:
   I guess you mean `S_exp <- S/(2^N)` in the [paper](https://arxiv.org/pdf/2207.01405.pdf) (`N` is `max_q`).
   I think we don't have to do that because `S_exp` is not the same as `S_out`, where `S_out` is `1/2^(8-1)`.





[GitHub] [tvm] ibsidorenko commented on pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "ibsidorenko (via GitHub)" <gi...@apache.org>.
ibsidorenko commented on PR #14536:
URL: https://github.com/apache/tvm/pull/14536#issuecomment-1501988367

   Hi, @maekawatoshiki! Thank you for your PR, really very interesting work!
   But I have some doubts about accuracy. Could you check my comment on the unit test?




[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162058650


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(
+      Round(Divide(MakeConstantScalar(DataType::Float(32), 1.f), input_scale)), DataType::Int(32));
+  const Expr max = Max(quantized_data, {axis}, true, false);
+  const Expr x = Subtract(quantized_data, max);
+
+  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
+  const int n = 8;
+  const int m = 30;
+  const int bits = 8;
+  const Expr x_p = Subtract(Add(x, RightShift(x, const_i32(1))), RightShift(x, const_i32(4)));
+  const Expr q = Divide(Negative(x_p), x_0);

Review Comment:
   > since x_p is 0. or less than 0
   
   ~Why is that? It's `q * log_2_e` right?~
   
   (UPDATE: my confusion has been cleared by https://github.com/apache/tvm/pull/14536#discussion_r1162180893) 
   
   





[GitHub] [tvm] maekawatoshiki commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "maekawatoshiki (via GitHub)" <gi...@apache.org>.
maekawatoshiki commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162356413


##########
tests/python/relay/test_pass_fake_quantization_to_integer.py:
##########
@@ -1114,5 +1114,36 @@ def test_fake_quantize_take():
     compare_fq_to_int(op, [x_np])
 
 
+def test_fake_quantize_softmax():

Review Comment:
   > P.S.
   > I have checked the output after qnn.quantize and see that some values differ by 7. I think it is too much and the accuracy is unsatisfactory... any thoughts?
   
   When all computation is performed in integer-only arithmetic, how big a difference is generally acceptable for the `softmax` operation? I'm not sure about this.
   I'm also not sure whether any other algorithm would outperform the current implementation.
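   (For a rough sense of scale, using the 0.0038 output quantization scale from the earlier version of the test: a difference of 7 quantized steps corresponds to about 7 * 0.0038 ≈ 0.027 in softmax probability.)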





[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1193237954


##########
tests/python/relay/test_pass_fake_quantization_to_integer.py:
##########
@@ -1114,5 +1114,53 @@ def test_fake_quantize_take():
     compare_fq_to_int(op, [x_np])
 
 
+def test_fake_quantize_softmax():
+    shape = [5, 10]
+    x_ = relay.var("x", shape=shape, dtype="int8")
+
+    is_sorted = lambda a: np.all(a[:-1] <= a[1:])
+
+    for scale in [1.0, 0.1, 0.01]:
+        x = relay.qnn.op.dequantize(x_, relay.const(scale), relay.const(0))
+        op = relay.op.nn.softmax(x, axis=1)
+        op = relay.qnn.op.quantize(
+            op, relay.const(1.0 / 256.0), relay.const(-128), out_dtype="int8"
+        )
+
+        x_np = np.random.randint(-128, 127, size=shape, dtype="int8")
+        x_np = np.sort(x_np)
+        args = [x_np]
+
+        mod = tvm.IRModule.from_expr(op)
+        mod = tvm.relay.transform.InferType()(mod)
+        mod_int = tvm.relay.transform.FakeQuantizationToInteger(
+            hard_fail=True, optional_qnn_ops=["nn.softmax"]
+        )(mod)
+        assert not tvm.ir.structural_equal(mod, mod_int)
+
+        result = (
+            relay.create_executor("vm", mod=mod, device=tvm.cpu(), target="llvm")
+            .evaluate()(*args)
+            .numpy()
+        )
+        result_int = (
+            relay.create_executor("vm", mod=mod_int, device=tvm.cpu(), target="llvm")
+            .evaluate()(*args)
+            .numpy()
+        )
+
+        # Check at least the softmax output is in ascending order,
+        # since it is difficult to use allclose due to not-so-good accuracy.
+        for qdq, qop in zip(result, result_int):
+            assert is_sorted(qdq)
+            assert is_sorted(qop)
+
+        try:
+            np.testing.assert_allclose(result_int, result, atol=1)
+        except AssertionError as e:
+            # To see the difference
+            print(e)

Review Comment:
   As long as this implementation is useful for your use case, I'm fine with this. cc @ibsidorenko @AndrewZhaoLuo 





[GitHub] [tvm] maekawatoshiki commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "maekawatoshiki (via GitHub)" <gi...@apache.org>.
maekawatoshiki commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1193101620


##########
tests/python/relay/test_pass_fake_quantization_to_integer.py:
##########
@@ -1114,5 +1114,53 @@ def test_fake_quantize_take():
     compare_fq_to_int(op, [x_np])
 
 
+def test_fake_quantize_softmax():
+    shape = [5, 10]
+    x_ = relay.var("x", shape=shape, dtype="int8")
+
+    is_sorted = lambda a: np.all(a[:-1] <= a[1:])
+
+    for scale in [1.0, 0.1, 0.01]:
+        x = relay.qnn.op.dequantize(x_, relay.const(scale), relay.const(0))
+        op = relay.op.nn.softmax(x, axis=1)
+        op = relay.qnn.op.quantize(
+            op, relay.const(1.0 / 256.0), relay.const(-128), out_dtype="int8"
+        )
+
+        x_np = np.random.randint(-128, 127, size=shape, dtype="int8")
+        x_np = np.sort(x_np)
+        args = [x_np]
+
+        mod = tvm.IRModule.from_expr(op)
+        mod = tvm.relay.transform.InferType()(mod)
+        mod_int = tvm.relay.transform.FakeQuantizationToInteger(
+            hard_fail=True, optional_qnn_ops=["nn.softmax"]
+        )(mod)

Review Comment:
   The pass does not use the fully-integer implementation for `nn.softmax` unless it is listed in `optional_qnn_ops`.
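   For comparison, a minimal sketch of the two invocations, given an `IRModule` `mod` as in the test above (both forms appear verbatim in the tests in this thread; shown here only to highlight the switch):

```python
# Default: the pass does not use the fully-integer implementation for nn.softmax.
mod_default = tvm.relay.transform.FakeQuantizationToInteger(hard_fail=True)(mod)

# Opt-in: nn.softmax is rewritten to the integer-only qnn.softmax.
mod_int = tvm.relay.transform.FakeQuantizationToInteger(
    hard_fail=True, optional_qnn_ops=["nn.softmax"]
)(mod)
```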



##########
tests/python/relay/test_pass_fake_quantization_to_integer.py:
##########
@@ -1114,5 +1114,53 @@ def test_fake_quantize_take():
     compare_fq_to_int(op, [x_np])
 
 
+def test_fake_quantize_softmax():
+    shape = [5, 10]
+    x_ = relay.var("x", shape=shape, dtype="int8")
+
+    is_sorted = lambda a: np.all(a[:-1] <= a[1:])
+
+    for scale in [1.0, 0.1, 0.01]:
+        x = relay.qnn.op.dequantize(x_, relay.const(scale), relay.const(0))
+        op = relay.op.nn.softmax(x, axis=1)
+        op = relay.qnn.op.quantize(
+            op, relay.const(1.0 / 256.0), relay.const(-128), out_dtype="int8"
+        )
+
+        x_np = np.random.randint(-128, 127, size=shape, dtype="int8")
+        x_np = np.sort(x_np)
+        args = [x_np]
+
+        mod = tvm.IRModule.from_expr(op)
+        mod = tvm.relay.transform.InferType()(mod)
+        mod_int = tvm.relay.transform.FakeQuantizationToInteger(
+            hard_fail=True, optional_qnn_ops=["nn.softmax"]
+        )(mod)
+        assert not tvm.ir.structural_equal(mod, mod_int)
+
+        result = (
+            relay.create_executor("vm", mod=mod, device=tvm.cpu(), target="llvm")
+            .evaluate()(*args)
+            .numpy()
+        )
+        result_int = (
+            relay.create_executor("vm", mod=mod_int, device=tvm.cpu(), target="llvm")
+            .evaluate()(*args)
+            .numpy()
+        )
+
+        # Check at least the softmax output is in ascending order,
+        # since it is difficult to use allclose due to not-so-good accuracy.
+        for qdq, qop in zip(result, result_int):
+            assert is_sorted(qdq)
+            assert is_sorted(qop)
+
+        try:
+            np.testing.assert_allclose(result_int, result, atol=1)
+        except AssertionError as e:
+            # To see the difference
+            print(e)

Review Comment:
   What do you think about this?
   I checked that the max absolute difference here is, in most cases, 0 to 6.
   Also, the overall trend of the softmax output didn't differ much between the QOp and QDQ implementations.
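   (For reference, this kind of comparison can be done roughly as follows on the `result` / `result_int` int8 arrays produced by the test above; just an illustrative sketch:)

```python
import numpy as np

# Cast to int32 first so the int8 subtraction cannot wrap around.
diff = np.abs(result_int.astype(np.int32) - result.astype(np.int32))
print("max abs diff (quantized steps):", diff.max(), " mean:", diff.mean())
```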



##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
+  const auto const_f32 = [&](float val) { return MakeConstantScalar(DataType::Float(32), val); };
+
+  const auto const_input_scale = new_args[1].as<ConstantNode>();
+  ICHECK(const_input_scale) << "Input scale should be constant.";
+  ICHECK(const_input_scale->is_scalar()) << "Input scale should be scalar.";
+  const float input_scale = static_cast<float*>(const_input_scale->data->data)[0];
+  ICHECK(input_scale <= 1.f) << "Input scale should be less than or equal to 1.";

Review Comment:
   The assertion fails when the input scale does not meet the condition.





[GitHub] [tvm] AndrewZhaoLuo commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "AndrewZhaoLuo (via GitHub)" <gi...@apache.org>.
AndrewZhaoLuo commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162188063


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(

Review Comment:
   Yeah I think that is another reasonable thing to do.





[GitHub] [tvm] maekawatoshiki commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "maekawatoshiki (via GitHub)" <gi...@apache.org>.
maekawatoshiki commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1161769396


##########
tests/python/relay/test_pass_fake_quantization_to_integer.py:
##########
@@ -1114,5 +1114,36 @@ def test_fake_quantize_take():
     compare_fq_to_int(op, [x_np])
 
 
+def test_fake_quantize_softmax():
+    x = relay.var("x", shape=[5, 10], dtype="float32")
+
+    x = relay.qnn.op.quantize(x, relay.const(0.08), relay.const(-48), out_dtype="int8")
+    x = relay.qnn.op.dequantize(x, relay.const(0.08), relay.const(-48))
+    op = relay.op.nn.softmax(x, axis=1)
+    op = relay.qnn.op.quantize(op, relay.const(0.0038), relay.const(-128), out_dtype="int8")
+    op = relay.qnn.op.dequantize(op, relay.const(0.0038), relay.const(-128))
+
+    x_np = np.random.random_sample([5, 10]).astype(np.float32) * 20.0 - 6
+    args = [x_np]
+
+    mod = tvm.IRModule.from_expr(op)
+    mod = tvm.relay.transform.InferType()(mod)
+    mod_int = tvm.relay.transform.FakeQuantizationToInteger(hard_fail=True)(mod)
+    assert not tvm.ir.structural_equal(mod, mod_int)
+
+    result = (
+        relay.create_executor("vm", mod=mod, device=tvm.cpu(), target="llvm")
+        .evaluate()(*args)
+        .numpy()
+    )
+    result_int = (
+        relay.create_executor("vm", mod=mod_int, device=tvm.cpu(), target="llvm")
+        .evaluate()(*args)
+        .numpy()
+    )
+
+    assert np.allclose(result_int, result, rtol=1, atol=0.05)

Review Comment:
   I verified that `assert np.allclose(result_int, result, atol=0.05)` is fine.





[GitHub] [tvm] ibsidorenko commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "ibsidorenko (via GitHub)" <gi...@apache.org>.
ibsidorenko commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1161842215


##########
tests/python/relay/test_pass_fake_quantization_to_integer.py:
##########
@@ -1114,5 +1114,36 @@ def test_fake_quantize_take():
     compare_fq_to_int(op, [x_np])
 
 
+def test_fake_quantize_softmax():

Review Comment:
   It looks like this test does not allow us to check accuracy in full.
   
   I have printed out the output and found that ~70% of the output values are equal to **0.0** in this test. This is because the output after the `qnn.quantize` operation is equal to **"-128"**. That is not a very interesting/representative case for the **"int8"** data type.
   
   Could you slightly modify this test in the following way:
   
   1. Remove the second `qnn.dequantize`. Let's check the output of `qnn.dequantize` + `softmax` + `qnn.quantize` only.
   2. Play with the QNN parameters (zero point, scale) in such a way that the output from quantize falls in the range [-100, +100], for example, not only **"-128"** as it does now (see the sketch below).
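   Something along these lines, for example (the scale/zero-point values below are only placeholders to be tuned per point 2; this is a sketch of the shape of the test, not exact code):

```python
# Sketch of the suggested test shape: no trailing dequantize, so the int8
# outputs of the reference module and the rewritten module are compared directly.
x = relay.var("x", shape=[5, 10], dtype="float32")
x = relay.qnn.op.quantize(x, relay.const(0.08), relay.const(-48), out_dtype="int8")
x = relay.qnn.op.dequantize(x, relay.const(0.08), relay.const(-48))
op = relay.op.nn.softmax(x, axis=1)
op = relay.qnn.op.quantize(op, relay.const(1.0 / 256.0), relay.const(-128), out_dtype="int8")
# ... build the IRModule, run FakeQuantizationToInteger, and compare the two int8 outputs.
```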
   
   P.S.
   I have checked the output after `qnn.quantize` and see that some values differ by **7**. I think that is too much and the accuracy is unsatisfactory... any thoughts?





[GitHub] [tvm] maekawatoshiki commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "maekawatoshiki (via GitHub)" <gi...@apache.org>.
maekawatoshiki commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1161770329


##########
tests/python/relay/test_pass_fake_quantization_to_integer.py:
##########
@@ -1114,5 +1114,36 @@ def test_fake_quantize_take():
     compare_fq_to_int(op, [x_np])
 
 
+def test_fake_quantize_softmax():
+    x = relay.var("x", shape=[5, 10], dtype="float32")
+
+    x = relay.qnn.op.quantize(x, relay.const(0.08), relay.const(-48), out_dtype="int8")
+    x = relay.qnn.op.dequantize(x, relay.const(0.08), relay.const(-48))
+    op = relay.op.nn.softmax(x, axis=1)
+    op = relay.qnn.op.quantize(op, relay.const(0.0038), relay.const(-128), out_dtype="int8")
+    op = relay.qnn.op.dequantize(op, relay.const(0.0038), relay.const(-128))
+
+    x_np = np.random.random_sample([5, 10]).astype(np.float32) * 20.0 - 6

Review Comment:
   What do you think about this: https://github.com/apache/tvm/pull/14536/commits/35b554875943de8fca899a3b86faae448f53ebf0#diff-3068125dc5426cffbc64c13858f102aa8f7eaee8565a6aaacb4badde71faf7b1





[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162183305


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(

Review Comment:
   Isn't S < 1 by definition? Since it should be 127 (or 255) / max(calib_data) (typically)





[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162183305


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(

Review Comment:
   Isn't S < 1 by definition? Since it should be 1.0 / max(calib_data)





[GitHub] [tvm] Aleksei-grovety commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "Aleksei-grovety (via GitHub)" <gi...@apache.org>.
Aleksei-grovety commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1171182996


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));

Review Comment:
   Is it necessary to cast input_zero_point to int32? It is assumed that input_zero_point is of type int32.





[GitHub] [tvm] maekawatoshiki commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "maekawatoshiki (via GitHub)" <gi...@apache.org>.
maekawatoshiki commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1172013000


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));

Review Comment:
   Indeed, this is a redundant cast. I'll remove it.





[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1161627893


##########
tests/python/relay/test_pass_fake_quantization_to_integer.py:
##########
@@ -1114,5 +1114,36 @@ def test_fake_quantize_take():
     compare_fq_to_int(op, [x_np])
 
 
+def test_fake_quantize_softmax():
+    x = relay.var("x", shape=[5, 10], dtype="float32")
+
+    x = relay.qnn.op.quantize(x, relay.const(0.08), relay.const(-48), out_dtype="int8")
+    x = relay.qnn.op.dequantize(x, relay.const(0.08), relay.const(-48))
+    op = relay.op.nn.softmax(x, axis=1)
+    op = relay.qnn.op.quantize(op, relay.const(0.0038), relay.const(-128), out_dtype="int8")
+    op = relay.qnn.op.dequantize(op, relay.const(0.0038), relay.const(-128))
+
+    x_np = np.random.random_sample([5, 10]).astype(np.float32) * 20.0 - 6
+    args = [x_np]
+
+    mod = tvm.IRModule.from_expr(op)
+    mod = tvm.relay.transform.InferType()(mod)
+    mod_int = tvm.relay.transform.FakeQuantizationToInteger(hard_fail=True)(mod)
+    assert not tvm.ir.structural_equal(mod, mod_int)
+
+    result = (
+        relay.create_executor("vm", mod=mod, device=tvm.cpu(), target="llvm")
+        .evaluate()(*args)
+        .numpy()
+    )
+    result_int = (
+        relay.create_executor("vm", mod=mod_int, device=tvm.cpu(), target="llvm")
+        .evaluate()(*args)
+        .numpy()
+    )
+
+    assert np.allclose(result_int, result, rtol=1, atol=0.05)

Review Comment:
   I think `rtol = 1` is a bit too large... is this a reasonable choice or could there be accuracy issues in the arithmetic? 
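   (For reference, `np.allclose(a, b, rtol=..., atol=...)` checks `|a - b| <= atol + rtol * |b|`, so with `rtol=1` a difference up to roughly `|b| + atol` still passes. A quick illustration:)

```python
import numpy as np

# With rtol=1 the tolerance becomes 0.05 + |b|, so quite different values still "match".
print(np.allclose(0.9, 0.46, rtol=1, atol=0.05))   # True:  |0.9 - 0.46| = 0.44 <= 0.05 + 0.46
print(np.allclose(0.9, 0.46, rtol=0, atol=0.05))   # False: 0.44 > 0.05
```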





[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162171494


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(
+      Round(Divide(MakeConstantScalar(DataType::Float(32), 1.f), input_scale)), DataType::Int(32));
+  const Expr max = Max(quantized_data, {axis}, true, false);
+  const Expr x = Subtract(quantized_data, max);
+
+  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
+  const int n = 8;
+  const int m = 30;
+  const int bits = 8;
+  const Expr x_p = Subtract(Add(x, RightShift(x, const_i32(1))), RightShift(x, const_i32(4)));
+  const Expr q = Divide(Negative(x_p), x_0);
+  const Expr r = Subtract(x_p, Multiply(q, Negative(x_0)));
+  const Expr x_b = Add(RightShift(r, const_i32(1)), x_0);
+  const Expr exps = RightShift(LeftShift(x_b, const_i32(n)), q);

Review Comment:
   Yeah, but the point is to decompose it in a way that the integer part is negative, so that we can "divide" by this integer. I don't see how such a decomposition is possible if the input is positive.
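
   (A tiny illustration of the sign issue, assuming the Algorithm 1 style split `q = (-x_p) // x_0`, `r = x_p + q * x_0`; not the PR code:)

   ```python
   x0 = 12                    # ~ round(1 / S) for an assumed S = 0.08
   for x_p in (-28, 28):      # max-subtracted input vs. a hypothetical positive one
       q = (-x_p) // x0       # non-negative (i.e. a right shift) only when x_p <= 0
       r = x_p + q * x0       # remainder in (-x0, 0]
       print(x_p, q, r)       # prints: -28 2 -4   then   28 -3 -8
   ```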





[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1193235148


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
+  const auto const_f32 = [&](float val) { return MakeConstantScalar(DataType::Float(32), val); };
+
+  const auto const_input_scale = new_args[1].as<ConstantNode>();
+  ICHECK(const_input_scale) << "Input scale should be constant.";
+  ICHECK(const_input_scale->is_scalar()) << "Input scale should be scalar.";
+  const float input_scale = static_cast<float*>(const_input_scale->data->data)[0];
+  ICHECK(input_scale <= 1.f) << "Input scale should be less than or equal to 1.";
+
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data = Subtract(Cast(new_args[0], DataType::Int(32)), input_zero_point);
+
+  const Expr x_0 = ConvertDtype(const_f32(std::round(1.f / input_scale)), DataType::Int(32));
+  const Expr max = Max(quantized_data, {axis}, true, false);
+  const Expr x = Subtract(quantized_data, max);
+
+  const int m = 30;
+  const int bits = 8;
+  const Expr x_p = Subtract(Add(x, RightShift(x, const_i32(1))), RightShift(x, const_i32(4)));
+  const Expr q = Clip(Divide(x_p, Negative(x_0)), 0, 20);
+  const Expr max_q = Max(q, {axis}, true, false);
+  const Expr r = Subtract(x_p, Multiply(q, Negative(x_0)));
+  const Expr x_b = Add(RightShift(r, const_i32(1)), x_0);
+  const Expr exps = LeftShift(x_b, Subtract(max_q, q));

Review Comment:
   If we are left-shifting by `max_q`, shouldn't we update the scale factor at L123 to divide by 2^(max_q - 1) as well?  





[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1193235148


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
+  const auto const_f32 = [&](float val) { return MakeConstantScalar(DataType::Float(32), val); };
+
+  const auto const_input_scale = new_args[1].as<ConstantNode>();
+  ICHECK(const_input_scale) << "Input scale should be constant.";
+  ICHECK(const_input_scale->is_scalar()) << "Input scale should be scalar.";
+  const float input_scale = static_cast<float*>(const_input_scale->data->data)[0];
+  ICHECK(input_scale <= 1.f) << "Input scale should be less than or equal to 1.";
+
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data = Subtract(Cast(new_args[0], DataType::Int(32)), input_zero_point);
+
+  const Expr x_0 = ConvertDtype(const_f32(std::round(1.f / input_scale)), DataType::Int(32));
+  const Expr max = Max(quantized_data, {axis}, true, false);
+  const Expr x = Subtract(quantized_data, max);
+
+  const int m = 30;
+  const int bits = 8;
+  const Expr x_p = Subtract(Add(x, RightShift(x, const_i32(1))), RightShift(x, const_i32(4)));
+  const Expr q = Clip(Divide(x_p, Negative(x_0)), 0, 20);
+  const Expr max_q = Max(q, {axis}, true, false);
+  const Expr r = Subtract(x_p, Multiply(q, Negative(x_0)));
+  const Expr x_b = Add(RightShift(r, const_i32(1)), x_0);
+  const Expr exps = LeftShift(x_b, Subtract(max_q, q));

Review Comment:
   If we are left-shifting by `max_q`, shouldn't we update the output scale to divide by 2^(max_q - 1) as well?  
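
   (A small numeric aside on the bookkeeping, not the PR code: a `2**max_q` factor that is common to a whole row cancels once the row is normalized by its sum, but any fixed-point scale applied before that normalization would indeed have to absorb the same factor.)

   ```python
   import numpy as np

   # Illustration only: a common per-row factor of 2**max_q drops out of the
   # softmax normalization (division by the row sum).
   e = np.array([640.0, 160.0, 32.0])      # roughly 2**max_q * exp(S * x) per element
   print(e / e.sum())                      # normalized probabilities
   print((e / 2**6) / (e / 2**6).sum())    # identical result: the common factor cancels
   ```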





[GitHub] [tvm] maekawatoshiki commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "maekawatoshiki (via GitHub)" <gi...@apache.org>.
maekawatoshiki commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1193467059


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
+  const auto const_f32 = [&](float val) { return MakeConstantScalar(DataType::Float(32), val); };
+
+  const auto const_input_scale = new_args[1].as<ConstantNode>();
+  ICHECK(const_input_scale) << "Input scale should be constant.";
+  ICHECK(const_input_scale->is_scalar()) << "Input scale should be scalar.";
+  const float input_scale = static_cast<float*>(const_input_scale->data->data)[0];
+  ICHECK(input_scale <= 1.f) << "Input scale should be less than or equal to 1.";
+
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data = Subtract(Cast(new_args[0], DataType::Int(32)), input_zero_point);
+
+  const Expr x_0 = ConvertDtype(const_f32(std::round(1.f / input_scale)), DataType::Int(32));
+  const Expr max = Max(quantized_data, {axis}, true, false);
+  const Expr x = Subtract(quantized_data, max);
+
+  const int m = 30;
+  const int bits = 8;
+  const Expr x_p = Subtract(Add(x, RightShift(x, const_i32(1))), RightShift(x, const_i32(4)));
+  const Expr q = Clip(Divide(x_p, Negative(x_0)), 0, 20);
+  const Expr max_q = Max(q, {axis}, true, false);
+  const Expr r = Subtract(x_p, Multiply(q, Negative(x_0)));
+  const Expr x_b = Add(RightShift(r, const_i32(1)), x_0);
+  const Expr exps = LeftShift(x_b, Subtract(max_q, q));

Review Comment:
   Yes, I believe so.





[GitHub] [tvm] masahi merged pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi merged PR #14536:
URL: https://github.com/apache/tvm/pull/14536




[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162183305


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(

Review Comment:
   Isn't S < 1 by definition, since it should (typically) be 1.0 / max(calib_data)?





[GitHub] [tvm] AndrewZhaoLuo commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "AndrewZhaoLuo (via GitHub)" <gi...@apache.org>.
AndrewZhaoLuo commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162149997


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(
+      Round(Divide(MakeConstantScalar(DataType::Float(32), 1.f), input_scale)), DataType::Int(32));
+  const Expr max = Max(quantized_data, {axis}, true, false);
+  const Expr x = Subtract(quantized_data, max);
+
+  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
+  const int n = 8;
+  const int m = 30;
+  const int bits = 8;
+  const Expr x_p = Subtract(Add(x, RightShift(x, const_i32(1))), RightShift(x, const_i32(4)));
+  const Expr q = Divide(Negative(x_p), x_0);
+  const Expr r = Subtract(x_p, Multiply(q, Negative(x_0)));
+  const Expr x_b = Add(RightShift(r, const_i32(1)), x_0);
+  const Expr exps = RightShift(LeftShift(x_b, const_i32(n)), q);

Review Comment:
   I believe the decomposition can be thought of as treating `S * I` as the "real number" represented by `I`. For any real number we can take the integer and decimal components: the integer component in real-number space becomes a shift, and the decimal component has to be quantized using S to get the integer `r` in the paper.
   
   It can always be done, though whether it can be done efficiently is, I think, the issue. If we look at Algorithm 1, when S >> 1 I think the way they calculate `r` and `q` is wrong.
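
   As a rough sketch of that decomposition (illustration only — the names loosely follow Algorithm 1, S < 1 and a max-subtracted, non-positive input are assumed; this is not the PR code):

   ```python
   import numpy as np

   S = 0.08                            # assumed input scale
   x = np.array([-3, -20, -55])        # max-subtracted quantized inputs, always <= 0
   x0 = int(round(1.0 / S))            # integer whose real value S * x0 is ~1.0

   x_p = x + (x >> 1) - (x >> 4)       # ~ x / ln(2), so exp(S*x) ~= 2**(S*x_p)
   q = (-x_p) // x0                    # integer component of -(S*x_p): becomes a right shift
   r = x_p + q * x0                    # remainder ("decimal" part), S*r lies in (-1, 0]
   x_b = (r >> 1) + x0                 # linear approximation of x0 * 2**(S*r)

   n = 8                               # keep n fractional bits before shifting by q
   exps = (x_b << n) >> q
   print(exps * S / (1 << n))          # ~ [0.80, 0.20, 0.01]
   print(np.exp(S * x))                # reference: ~ [0.79, 0.20, 0.01]
   ```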





[GitHub] [tvm] maekawatoshiki commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "maekawatoshiki (via GitHub)" <gi...@apache.org>.
maekawatoshiki commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162350168


##########
tests/python/relay/test_pass_fake_quantization_to_integer.py:
##########
@@ -1114,5 +1114,36 @@ def test_fake_quantize_take():
     compare_fq_to_int(op, [x_np])
 
 
+def test_fake_quantize_softmax():

Review Comment:
   > 2. Play with QNN parameters (zero point, scale) in such a way that the output from quantize will be in the range [-100, +100], for example, not only "-128" like now
   
   I'm not sure why we need to modify the QNN parameters (of `qnn.quantize`).
   I think it's enough to change the value range specified in `x_np = np.random.randint(-128, 127, ...)` so that the `qnn.quantize` output falls in the range [-100, +100].
   (Sorry if my understanding is wrong.)





[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162060645


##########
tests/python/relay/test_pass_fake_quantization_to_integer.py:
##########
@@ -1114,5 +1114,36 @@ def test_fake_quantize_take():
     compare_fq_to_int(op, [x_np])
 
 
+def test_fake_quantize_softmax():
+    x = relay.var("x", shape=[5, 10], dtype="float32")
+
+    x = relay.qnn.op.quantize(x, relay.const(0.08), relay.const(-48), out_dtype="int8")
+    x = relay.qnn.op.dequantize(x, relay.const(0.08), relay.const(-48))
+    op = relay.op.nn.softmax(x, axis=1)
+    op = relay.qnn.op.quantize(op, relay.const(0.0038), relay.const(-128), out_dtype="int8")
+    op = relay.qnn.op.dequantize(op, relay.const(0.0038), relay.const(-128))
+
+    x_np = np.random.random_sample([5, 10]).astype(np.float32) * 20.0 - 6

Review Comment:
   Yes it looks good. We should also test on uint8. A new feature involving subtle arithmetic like this should be tested very thoroughly. See also the remark by @ibsidorenko 





[GitHub] [tvm] ibsidorenko commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "ibsidorenko (via GitHub)" <gi...@apache.org>.
ibsidorenko commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162443317


##########
tests/python/relay/test_pass_fake_quantization_to_integer.py:
##########
@@ -1114,5 +1114,36 @@ def test_fake_quantize_take():
     compare_fq_to_int(op, [x_np])
 
 
+def test_fake_quantize_softmax():

Review Comment:
   Yes, sure, it's up to you. My main concern here is to avoid the case where all (or almost all) output values are equal to "-128" (that is not a representative case for the "int8" data type).





[GitHub] [tvm] AndrewZhaoLuo commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "AndrewZhaoLuo (via GitHub)" <gi...@apache.org>.
AndrewZhaoLuo commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162147717


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(

Review Comment:
   Another option: when decomposing `2^(S * I_p)` into integer and decimal components, you instead decompose
   
   `2^[(S * 2^-n * I_p) * 2^n]`. At compile time we can choose an `n` that makes `S * 2^-n << 1` to get around this problem. You can then apply the decomposition routine to the inner term in parentheses, and the outer `2^n` merely becomes another shift.





[GitHub] [tvm] maekawatoshiki commented on pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "maekawatoshiki (via GitHub)" <gi...@apache.org>.
maekawatoshiki commented on PR #14536:
URL: https://github.com/apache/tvm/pull/14536#issuecomment-1515628400

   I took a look at the latest tensorflow implementation and noticed it is, for some reason, using float arithmetic: https://github.com/tensorflow/tensorflow/blob/98a187a8bfcdcf0c55c16f07b4a06b50e06a9a26/tensorflow/lite/kernels/internal/optimized/optimized_ops.h#L3471-L3477.
   
   I also found another paper proposing quantized softmax (https://arxiv.org/pdf/2101.01321.pdf) and its implementation (https://github.com/kssteven418/I-BERT/blob/1b09c759d6aeb71312df9c6ef74fa268a87c934e/fairseq/quantization/utils/quant_modules.py#L578).
   However, unlike the algorithm proposed in the paper, its implementation appears to use float arithmetic in several places.
   
   I realized that it's difficult to implement an integer-only quantized softmax with satisfactory quality across a variety of input scales.
   
   Let me abandon this PR for now to investigate further; hopefully I'll make another PR.




[GitHub] [tvm] tvm-bot commented on pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "tvm-bot (via GitHub)" <gi...@apache.org>.
tvm-bot commented on PR #14536:
URL: https://github.com/apache/tvm/pull/14536#issuecomment-1500883914

   Thanks for contributing to TVM! Please refer to the contributing guidelines https://tvm.apache.org/docs/contribute/ for useful information and tips. Please request code reviews from [Reviewers](https://github.com/apache/incubator-tvm/blob/master/CONTRIBUTORS.md#reviewers) by @-ing them in a comment.
   
    * cc @ibsidorenko (see [#10317](https://github.com/apache/tvm/issues/10317) for details)
   
   Generated by [tvm-bot](https://github.com/apache/tvm/blob/main/ci/README.md#github-actions)




[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1161628452


##########
tests/python/relay/test_pass_fake_quantization_to_integer.py:
##########
@@ -1114,5 +1114,36 @@ def test_fake_quantize_take():
     compare_fq_to_int(op, [x_np])
 
 
+def test_fake_quantize_softmax():
+    x = relay.var("x", shape=[5, 10], dtype="float32")
+
+    x = relay.qnn.op.quantize(x, relay.const(0.08), relay.const(-48), out_dtype="int8")
+    x = relay.qnn.op.dequantize(x, relay.const(0.08), relay.const(-48))
+    op = relay.op.nn.softmax(x, axis=1)
+    op = relay.qnn.op.quantize(op, relay.const(0.0038), relay.const(-128), out_dtype="int8")
+    op = relay.qnn.op.dequantize(op, relay.const(0.0038), relay.const(-128))
+
+    x_np = np.random.random_sample([5, 10]).astype(np.float32) * 20.0 - 6

Review Comment:
   Please test on more inputs, ideally sampling from the whole int8 / uint8 range. This choice looks very contrived.
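
   For instance (just a sketch, using the scale/zero point from the test above; the float bounds are approximate):

   ```python
   import numpy as np

   # With scale 0.08 and zero point -48, int8 values [-128, 127] dequantize to
   # roughly [-6.4, 14.0], so sampling that whole range touches every quantized level.
   x_np = np.random.uniform(low=-6.4, high=14.0, size=[5, 10]).astype(np.float32)
   ```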





[GitHub] [tvm] AndrewZhaoLuo commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "AndrewZhaoLuo (via GitHub)" <gi...@apache.org>.
AndrewZhaoLuo commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162184110


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(

Review Comment:
   I believe S here is defined as:
   
   ![image](https://user-images.githubusercontent.com/13855451/231023293-7e7c5b3e-6a68-48cf-af54-ccf88585fcd5.png)
   





[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1161382762


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+  const auto* param = attrs.as<SoftmaxAttrs>();
+  ICHECK(param != nullptr) << "SoftmaxAttrs cannot be nullptr.";

Review Comment:
   `param` can be removed since it's not used





[GitHub] [tvm] maekawatoshiki commented on pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "maekawatoshiki (via GitHub)" <gi...@apache.org>.
maekawatoshiki commented on PR #14536:
URL: https://github.com/apache/tvm/pull/14536#issuecomment-1501498359

   Thank you for the review, @masahi. I've just modified the code.




[GitHub] [tvm] ibsidorenko commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "ibsidorenko (via GitHub)" <gi...@apache.org>.
ibsidorenko commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1161842215


##########
tests/python/relay/test_pass_fake_quantization_to_integer.py:
##########
@@ -1114,5 +1114,36 @@ def test_fake_quantize_take():
     compare_fq_to_int(op, [x_np])
 
 
+def test_fake_quantize_softmax():

Review Comment:
   It looks like this test does not allow the accuracy to be checked in full.
   
   I printed out the output and found that ~70% of the output values are equal to **0.0** in this test. This is because the output after the `qnn.quantize` operation is equal to **"-128"**. That is not a very interesting/representative case for the **"int8"** data type.
   
   Can you slightly modify this test in the following way:
   
   1. Remove the second `qnn.dequantize`. Let's check the output of `qnn.dequantize` + `softmax` + `qnn.quantize` only.
   2. Play with the QNN parameters (zero point, scale) in such a way that the output from quantize will be in the range [-100, +100], for example, not only **"-128"** like now (a rough sketch follows below).
   
   P.S.
   I have checked the output after `qnn.quantize` and see that some of the values differ by **7**. I think that is too much and the accuracy is unsatisfactory... any thoughts?
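
   A rough sketch of what I mean (illustration only; the output scale/zero point constants below are placeholders, not tuned values):

   ```python
   from tvm import relay

   # Same graph as in the test, but without the trailing dequantize, and with the
   # output quantization parameters chosen so that the int8 results spread over
   # roughly [-120, 80] instead of collapsing to -128.
   x = relay.var("x", shape=[5, 10], dtype="float32")
   x = relay.qnn.op.quantize(x, relay.const(0.08), relay.const(-48), out_dtype="int8")
   x = relay.qnn.op.dequantize(x, relay.const(0.08), relay.const(-48))
   op = relay.op.nn.softmax(x, axis=1)
   op = relay.qnn.op.quantize(op, relay.const(0.005), relay.const(-120), out_dtype="int8")
   ```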





[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162188161


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(
+      Round(Divide(MakeConstantScalar(DataType::Float(32), 1.f), input_scale)), DataType::Int(32));
+  const Expr max = Max(quantized_data, {axis}, true, false);
+  const Expr x = Subtract(quantized_data, max);
+
+  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
+  const int n = 8;
+  const int m = 30;
+  const int bits = 8;
+  const Expr x_p = Subtract(Add(x, RightShift(x, const_i32(1))), RightShift(x, const_i32(4)));
+  const Expr q = Divide(Negative(x_p), x_0);
+  const Expr r = Subtract(x_p, Multiply(q, Negative(x_0)));
+  const Expr x_b = Add(RightShift(r, const_i32(1)), x_0);
+  const Expr exps = RightShift(LeftShift(x_b, const_i32(n)), q);

Review Comment:
   Ok. But I'm still not sure about my first question https://github.com/apache/tvm/pull/14536#discussion_r1161619325.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] shinh commented on pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "shinh (via GitHub)" <gi...@apache.org>.
shinh commented on PR #14536:
URL: https://github.com/apache/tvm/pull/14536#issuecomment-1539435547

   Would it make sense to add this `qnn.softmax` implementation as an optional feature? By default we wouldn't enable `qnn.softmax`, but users could activate it when they find its precision satisfactory for their use case. To be more specific, I propose the following:
   
   1. Add `@register_optional_fake_quantization_to_integer` and use it in fake_quantization_to_integer.py for `softmax`:
   
   ```
   @register_optional_fake_quantization_to_integer("nn.softmax")
   def softmax(expr, type_map):
     ...
   ```
   
   2. Modify `fake_quantization_to_integer.cc` so that optional rewriters will be ignored unless users explicitly state they want to use quantized softmax by something like
   
   ```
   relay.transform.FakeQuantizationToInteger(optional_qnn_ops={"nn.softmax"})(mod)
   ```
   
   I guess it's OK to relax the unit-test checks if this feature is optional? What are your thoughts?
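   
   For illustration only, a rough Python sketch of what such an optional registry could look like (the names `register_optional_fake_quantization_to_integer` and `optional_qnn_ops` come from the proposal above; everything else is an assumption, not the actual TVM plumbing, which would also need changes on the C++ side):
   
   ```
   # Hypothetical registry: rewrites recorded here are ignored by default.
   _OPTIONAL_FQ2I_REWRITES = {}
   
   def register_optional_fake_quantization_to_integer(op_name):
       def _register(func):
           _OPTIONAL_FQ2I_REWRITES[op_name] = func
           return func
       return _register
   
   def optional_rewrites_for(optional_qnn_ops):
       # Only rewrites explicitly requested by the user are handed to the pass.
       return {op: fn for op, fn in _OPTIONAL_FQ2I_REWRITES.items() if op in optional_qnn_ops}
   ```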


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1193464536


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
+  const auto const_f32 = [&](float val) { return MakeConstantScalar(DataType::Float(32), val); };
+
+  const auto const_input_scale = new_args[1].as<ConstantNode>();
+  ICHECK(const_input_scale) << "Input scale should be constant.";
+  ICHECK(const_input_scale->is_scalar()) << "Input scale should be scalar.";
+  const float input_scale = static_cast<float*>(const_input_scale->data->data)[0];
+  ICHECK(input_scale <= 1.f) << "Input scale should be less than or equal to 1.";
+
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data = Subtract(Cast(new_args[0], DataType::Int(32)), input_zero_point);
+
+  const Expr x_0 = ConvertDtype(const_f32(std::round(1.f / input_scale)), DataType::Int(32));
+  const Expr max = Max(quantized_data, {axis}, true, false);
+  const Expr x = Subtract(quantized_data, max);
+
+  const int m = 30;
+  const int bits = 8;
+  const Expr x_p = Subtract(Add(x, RightShift(x, const_i32(1))), RightShift(x, const_i32(4)));
+  const Expr q = Clip(Divide(x_p, Negative(x_0)), 0, 20);
+  const Expr max_q = Max(q, {axis}, true, false);
+  const Expr r = Subtract(x_p, Multiply(q, Negative(x_0)));
+  const Expr x_b = Add(RightShift(r, const_i32(1)), x_0);
+  const Expr exps = LeftShift(x_b, Subtract(max_q, q));

Review Comment:
   I see, I guess we don't have to compute `S_exp` because the scale factors cancel in the denominator / numerator in `I_out`?
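   
   Concretely (assuming each entry of `exps` stands for `S_exp * exp(x_i)` up to rounding): `softmax_i = exp(x_i) / sum_j exp(x_j) ≈ (S_exp * I_exp_i) / (S_exp * sum_j I_exp_j) = I_exp_i / sum_j I_exp_j`, so `S_exp` never needs to be materialized.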



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] masahi commented on pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on PR #14536:
URL: https://github.com/apache/tvm/pull/14536#issuecomment-1539740828

   @shinh Sounds good to me; as long as things won't break by default, I don't see any problem.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] maekawatoshiki commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "maekawatoshiki (via GitHub)" <gi...@apache.org>.
maekawatoshiki commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1193101620


##########
tests/python/relay/test_pass_fake_quantization_to_integer.py:
##########
@@ -1114,5 +1114,53 @@ def test_fake_quantize_take():
     compare_fq_to_int(op, [x_np])
 
 
+def test_fake_quantize_softmax():
+    shape = [5, 10]
+    x_ = relay.var("x", shape=shape, dtype="int8")
+
+    is_sorted = lambda a: np.all(a[:-1] <= a[1:])
+
+    for scale in [1.0, 0.1, 0.01]:
+        x = relay.qnn.op.dequantize(x_, relay.const(scale), relay.const(0))
+        op = relay.op.nn.softmax(x, axis=1)
+        op = relay.qnn.op.quantize(
+            op, relay.const(1.0 / 256.0), relay.const(-128), out_dtype="int8"
+        )
+
+        x_np = np.random.randint(-128, 127, size=shape, dtype="int8")
+        x_np = np.sort(x_np)
+        args = [x_np]
+
+        mod = tvm.IRModule.from_expr(op)
+        mod = tvm.relay.transform.InferType()(mod)
+        mod_int = tvm.relay.transform.FakeQuantizationToInteger(
+            hard_fail=True, optional_qnn_ops=["nn.softmax"]
+        )(mod)

Review Comment:
   The pass does not use the fully-integer implementation for `nn.softmax` unless it is specified in `optional_qnn_ops`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] maekawatoshiki closed pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "maekawatoshiki (via GitHub)" <gi...@apache.org>.
maekawatoshiki closed pull request #14536: [QNN] Implement 'qnn.softmax'
URL: https://github.com/apache/tvm/pull/14536


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] AndrewZhaoLuo commented on pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "AndrewZhaoLuo (via GitHub)" <gi...@apache.org>.
AndrewZhaoLuo commented on PR #14536:
URL: https://github.com/apache/tvm/pull/14536#issuecomment-1515632315

   Yeah, it is a challenging problem. However, I would not necessarily constrain yourself to a single implementation that can handle all cases.
   
   Feel free to plan for several future specializations with fallbacks if needed, imo. Just make sure the tests reflect this.
   
   Though, I will leave that up to you.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] AndrewZhaoLuo commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "AndrewZhaoLuo (via GitHub)" <gi...@apache.org>.
AndrewZhaoLuo commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162131142


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(

Review Comment:
   ![image](https://user-images.githubusercontent.com/13855451/231009023-1839d40b-be39-4b18-a48b-5dbdec409ce8.png)
   
   This is in `ShiftExp` under `algorithm 1`.
   
   First off, if S > 1 then we already have issues, as `x_0` (`I_0` in the paper) may be 0...
   
   For attention I would expect the output activation range to potentially be very large (see the LLM.int8 paper), so having a high scale factor is not unreasonable for some quantization schemes.
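   
   A minimal numeric illustration of the concern, in plain Python mirroring just the `q` computation of `ShiftExp` (the helper name and values are made up):
   
   ```
   def shiftexp_q(x_p, scale):
       # x_0 corresponds to I_0 = round(1/S) in Algorithm 1
       x_0 = round(1.0 / scale)
       # q = floor(x_p / (-x_0)); this breaks down once x_0 rounds to 0
       return x_p // -x_0
   
   print(shiftexp_q(-20, 0.05))  # x_0 = 20 -> q = 1, as expected
   print(shiftexp_q(-20, 4.0))   # x_0 = 0  -> ZeroDivisionError
   ```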



##########
tests/python/relay/test_pass_fake_quantization_to_integer.py:
##########
@@ -1114,5 +1114,36 @@ def test_fake_quantize_take():
     compare_fq_to_int(op, [x_np])
 
 
+def test_fake_quantize_softmax():
+    shape = [50, 10]
+    x = relay.var("x", shape=shape, dtype="int8")
+
+    x = relay.qnn.op.dequantize(x, relay.const(0.08), relay.const(-48))

Review Comment:
   Things we probably want to test (a parametrized sketch follows below):
   
   1. Different dtypes
   2. Different scale factors
   3. Different distributions along the axis of reduction (e.g. a flat distribution should give flat probabilities, multiple spikes, etc.)
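   
   A rough pytest-style sketch of that coverage (the dtype/scale/pattern choices and the use of the existing `compare_fq_to_int` helper are assumptions for illustration, not a finished test):
   
   ```
   import numpy as np
   import pytest
   import tvm
   from tvm import relay
   
   @pytest.mark.parametrize("dtype", ["int8", "uint8"])
   @pytest.mark.parametrize("scale", [0.01, 0.1, 1.0])
   @pytest.mark.parametrize("pattern", ["flat", "spikes", "sorted"])
   def test_fake_quantize_softmax_cases(dtype, scale, pattern):
       shape = [5, 10]
       low, high = (-128, 127) if dtype == "int8" else (0, 255)
       zp = 0 if dtype == "int8" else 128
   
       x = relay.var("x", shape=shape, dtype=dtype)
       op = relay.qnn.op.dequantize(x, relay.const(scale), relay.const(zp))
       op = relay.op.nn.softmax(op, axis=1)
       op = relay.qnn.op.quantize(op, relay.const(1.0 / 256.0), relay.const(low), out_dtype=dtype)
   
       if pattern == "flat":
           x_np = np.full(shape, 3, dtype=dtype)          # flat input -> flat probabilities
       elif pattern == "spikes":
           x_np = np.random.randint(low, high, size=shape).astype(dtype)
           x_np[:, ::3] = high                            # a few large spikes per row
       else:
           x_np = np.sort(np.random.randint(low, high, size=shape)).astype(dtype)
   
       compare_fq_to_int(op, [x_np])
   ```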



##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(

Review Comment:
   Maybe I am not understanding something, but this seems like an obvious flaw...
   
   I think you can get around this by not rounding I_0 in `algorithm 1` but keeping it a float and rounding only when needed. However, this would introduce runtime FLOPs.
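   
   A tiny sketch of that float variant, just to make the trade-off concrete (hypothetical helper, not proposed code):
   
   ```
   import math
   
   def shiftexp_q_float(x_p, scale):
       # keep 1/S as a float instead of the integer I_0 = round(1/S), so q stays
       # well defined even when S > 1; the cost is a float multiply per element
       return math.floor(-x_p * scale)
   
   print(shiftexp_q_float(-20, 0.05))  # 1, same as the integer path
   print(shiftexp_q_float(-20, 4.0))   # 80, no division by zero
   ```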



##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(

Review Comment:
   Another option is when decomposing `2^(S * I_p)` into integer and decimal components, you instead decompose:
   
   `2^[(S * 2^-n * I_p) * 2^n]`. At runtime we can choose an `n` to make `S * 2^-n << 1` to get around this problem. You can then decompose the internal terms in parentheses and the outer `2^n` now merely becomes another shift. 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] AndrewZhaoLuo commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "AndrewZhaoLuo (via GitHub)" <gi...@apache.org>.
AndrewZhaoLuo commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162147717


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(

Review Comment:
   Another option is when decomposing `2^(S * I_p)` into integer and decimal components, you instead decompose:
   
   `2^[(S * 2^-n * I_p) * 2^n]`. At runtime we can choose an `n` to make `S * 2^-n << 1` to get around this problem. You can then apply the decomposition routine to the internal terms in parentheses and the outer `2^n` now merely becomes another shift. 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] maekawatoshiki commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "maekawatoshiki (via GitHub)" <gi...@apache.org>.
maekawatoshiki commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162182323


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(
+      Round(Divide(MakeConstantScalar(DataType::Float(32), 1.f), input_scale)), DataType::Int(32));
+  const Expr max = Max(quantized_data, {axis}, true, false);
+  const Expr x = Subtract(quantized_data, max);
+
+  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
+  const int n = 8;
+  const int m = 30;
+  const int bits = 8;
+  const Expr x_p = Subtract(Add(x, RightShift(x, const_i32(1))), RightShift(x, const_i32(4)));
+  const Expr q = Divide(Negative(x_p), x_0);

Review Comment:
   `x_p` is the same as `I_p` in Algorithm 1 from the paper.
   And here: https://github.com/apache/tvm/blob/35b554875943de8fca899a3b86faae448f53ebf0/src/relay/qnn/op/softmax.cc#L102-L103 we subtract the max value along the axis.
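   
   (For context: if I read Algorithm 1 correctly, `x + (x >> 1) - (x >> 4)` is a fixed-point approximation of `x * log2(e)`, since `1 + 1/2 - 1/16 = 1.4375 ≈ 1.4427 = log2(e)`; that is what turns the base-e exponent into a base-2 one that the later shifts can handle.)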



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1161388083


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+  const auto* param = attrs.as<SoftmaxAttrs>();
+  ICHECK(param != nullptr) << "SoftmaxAttrs cannot be nullptr.";
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(
+      Round(Divide(MakeConstantScalar(DataType::Float(32), 1.f), input_scale)), DataType::Int(32));
+  const Expr max = Max(quantized_data, {axis}, true, false);
+  const Expr x = Subtract(quantized_data, max);
+
+  const int n = 8;
+  const int m = 30;
+  const int bits = 8;
+  const Expr x_p = Subtract(Add(x, RightShift(x, MakeConstantScalar(DataType::Int(32), 1))),
+                            RightShift(x, MakeConstantScalar(DataType::Int(32), 4)));
+  const Expr q = Divide(Negative(x_p), x_0);
+  const Expr r = Subtract(x_p, Multiply(q, Negative(x_0)));
+  const Expr x_b = Add(RightShift(r, MakeConstantScalar(DataType::Int(32), 1)), x_0);
+  const Expr exps = RightShift(LeftShift(x_b, MakeConstantScalar(DataType::Int(32), n)), q);
+  const Expr sums = Sum(exps, {axis}, true, false);
+  const Expr output =
+      RightShift(Multiply(Divide(MakeConstantScalar(DataType::Int(32), 1 << m), sums), exps),
+                 MakeConstantScalar(DataType::Int(32), m - (bits - 1)));
+  const Expr requantized =
+      Requantize(output, arg_types[0].as<TensorTypeNode>()->shape,
+                 MakeConstantScalar(DataType::Float(32), 1.f / (1 << (bits - 1))),
+                 MakeConstantScalar(DataType::Int(32), 0), output_scale, output_zero_point,

Review Comment:
   I suggest creating a helper function to clean up the many `MakeConstantScalar(DataType::Int(32), ...)` calls.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] maekawatoshiki commented on pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "maekawatoshiki (via GitHub)" <gi...@apache.org>.
maekawatoshiki commented on PR #14536:
URL: https://github.com/apache/tvm/pull/14536#issuecomment-1504925657

   I appreciate your review!
   
   I found a quantized softmax implementation in TensorFlow Lite (https://github.com/tensorflow/tensorflow/blob/305fec9fddc3bdb5bb574a134b955bf4b07fd795/tensorflow/lite/kernels/internal/reference/softmax.h#L60), so I'm going to try it and compare its accuracy with my current implementation.
   
   Besides, from https://github.com/apache/tvm/pull/14536#discussion_r1162144248 and https://github.com/apache/tvm/pull/14536#discussion_r1162443317, I acknowledge that the current unit tests need to be improved.
   
   
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1161639758


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(
+      Round(Divide(MakeConstantScalar(DataType::Float(32), 1.f), input_scale)), DataType::Int(32));
+  const Expr max = Max(quantized_data, {axis}, true, false);
+  const Expr x = Subtract(quantized_data, max);
+
+  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
+  const int n = 8;
+  const int m = 30;
+  const int bits = 8;
+  const Expr x_p = Subtract(Add(x, RightShift(x, const_i32(1))), RightShift(x, const_i32(4)));
+  const Expr q = Divide(Negative(x_p), x_0);
+  const Expr r = Subtract(x_p, Multiply(q, Negative(x_0)));
+  const Expr x_b = Add(RightShift(r, const_i32(1)), x_0);
+  const Expr exps = RightShift(LeftShift(x_b, const_i32(n)), q);

Review Comment:
   Actually, I'm not sure their eq. 14 is correct for a positive exponent, since we cannot decompose `S * I` into `(-q)` + a fraction with `q` positive... thoughts?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@tvm.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1161624685


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(
+      Round(Divide(MakeConstantScalar(DataType::Float(32), 1.f), input_scale)), DataType::Int(32));
+  const Expr max = Max(quantized_data, {axis}, true, false);
+  const Expr x = Subtract(quantized_data, max);
+
+  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
+  const int n = 8;
+  const int m = 30;
+  const int bits = 8;
+  const Expr x_p = Subtract(Add(x, RightShift(x, const_i32(1))), RightShift(x, const_i32(4)));
+  const Expr q = Divide(Negative(x_p), x_0);

Review Comment:
   floor missing after divide?
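
Assuming `x_p` is never positive (the inputs are already max-subtracted) and `x_0 > 0`, a small plain-Python sketch of why the integer divide already behaves like a floor here:

```python
# Illustrative values only: the numerator -x_p is non-negative, and for
# non-negative operands truncating division agrees with floor division.
x_0 = 26  # e.g. round(1 / input_scale) for a scale around 0.038
for x_p in (0, -1, -25, -26, -300):
    q_trunc = int(-x_p / x_0)   # C-style truncation toward zero
    q_floor = (-x_p) // x_0     # floor division
    assert q_trunc == q_floor
```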





[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162187457


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(

Review Comment:
   We can fall back to the fp32-based impl when S > 1.
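
As a rough illustration (plain Python, not the canonicalization itself) of why such a guard makes sense: `x_0 = round(1 / S)` is the fixed-point stand-in for 1.0, and it loses all fractional resolution once S approaches 1.

```python
for S in (0.01, 0.1, 1.0, 1.5, 2.5):
    x_0 = round(1.0 / S)
    print(S, x_0)  # 100, 10, 1, 1, 0 -- a guard could check x_0 before taking the integer path
```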





[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1161619325


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(
+      Round(Divide(MakeConstantScalar(DataType::Float(32), 1.f), input_scale)), DataType::Int(32));
+  const Expr max = Max(quantized_data, {axis}, true, false);
+  const Expr x = Subtract(quantized_data, max);
+
+  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
+  const int n = 8;
+  const int m = 30;
+  const int bits = 8;
+  const Expr x_p = Subtract(Add(x, RightShift(x, const_i32(1))), RightShift(x, const_i32(4)));
+  const Expr q = Divide(Negative(x_p), x_0);
+  const Expr r = Subtract(x_p, Multiply(q, Negative(x_0)));
+  const Expr x_b = Add(RightShift(r, const_i32(1)), x_0);
+  const Expr exps = RightShift(LeftShift(x_b, const_i32(n)), q);

Review Comment:
   Could `q` be bigger than `n`? If so, aren't we ending up doing a left shift by a negative number? 
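
A quick numeric check of the `exps` expression, assuming `n = 8` as in the patch: the value is `(x_b << n) >> q`, so `q > n` just drives the result toward 0 (the intended behaviour for very negative exponents), provided `x_b << n` itself still fits in int32.

```python
n = 8
x_b = 300  # illustrative positive fixed-point value
for q in (0, 4, 8, 12, 20):
    print(q, (x_b << n) >> q)  # 76800, 4800, 300, 18, 0
```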





[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162058650


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(
+      Round(Divide(MakeConstantScalar(DataType::Float(32), 1.f), input_scale)), DataType::Int(32));
+  const Expr max = Max(quantized_data, {axis}, true, false);
+  const Expr x = Subtract(quantized_data, max);
+
+  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
+  const int n = 8;
+  const int m = 30;
+  const int bits = 8;
+  const Expr x_p = Subtract(Add(x, RightShift(x, const_i32(1))), RightShift(x, const_i32(4)));
+  const Expr q = Divide(Negative(x_p), x_0);

Review Comment:
   > since x_p is 0. or less than 0
   
   Why is that? It's `q * log_2_e` right? 
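
For reference, a plain-Python check of the shift pattern behind `x_p`: `x + (x >> 1) - (x >> 4)` multiplies by 1 + 1/2 - 1/16 = 1.4375, a cheap stand-in for log2(e) = 1.4427, and it stays non-positive whenever `x` is non-positive (here `x` is already max-subtracted).

```python
import math

for x in (0, -1, -10, -255):
    x_p = x + (x >> 1) - (x >> 4)
    print(x, x_p, round(x * math.log2(math.e), 1))  # -> (0, 0, 0.0), (-1, -1, -1.4), (-10, -14, -14.4), (-255, -367, -367.9)
```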
   
   





[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1161634023


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(
+      Round(Divide(MakeConstantScalar(DataType::Float(32), 1.f), input_scale)), DataType::Int(32));
+  const Expr max = Max(quantized_data, {axis}, true, false);
+  const Expr x = Subtract(quantized_data, max);
+
+  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
+  const int n = 8;
+  const int m = 30;
+  const int bits = 8;
+  const Expr x_p = Subtract(Add(x, RightShift(x, const_i32(1))), RightShift(x, const_i32(4)));
+  const Expr q = Divide(Negative(x_p), x_0);
+  const Expr r = Subtract(x_p, Multiply(q, Negative(x_0)));
+  const Expr x_b = Add(RightShift(r, const_i32(1)), x_0);
+  const Expr exps = RightShift(LeftShift(x_b, const_i32(n)), q);

Review Comment:
   From the paper, it's not clear to me why it is always safe to right-shift by q. I don't see an obvious bound on it.
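
A back-of-the-envelope bound, assuming int8 inputs as required by the type relation: after max subtraction `x >= -255`, so `x_p >= -367` (using the 1.4375 factor), and with `x_0 = round(1 / S)` the quotient `q = (-x_p) // x_0` is roughly at most `367 * S`. Small softmax input scales keep `q` small, but nothing in the op itself enforces that.

```python
for S in (0.01, 0.05, 0.25, 1.0):
    x_0 = max(round(1.0 / S), 1)
    print(S, 367 // x_0)  # worst-case q: 3, 18, 91, 367
```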





[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1161624685


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(
+      Round(Divide(MakeConstantScalar(DataType::Float(32), 1.f), input_scale)), DataType::Int(32));
+  const Expr max = Max(quantized_data, {axis}, true, false);
+  const Expr x = Subtract(quantized_data, max);
+
+  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
+  const int n = 8;
+  const int m = 30;
+  const int bits = 8;
+  const Expr x_p = Subtract(Add(x, RightShift(x, const_i32(1))), RightShift(x, const_i32(4)));
+  const Expr q = Divide(Negative(x_p), x_0);

Review Comment:
   floor missing after divide? I don't quite understand why they negate here.





[GitHub] [tvm] AndrewZhaoLuo commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "AndrewZhaoLuo (via GitHub)" <gi...@apache.org>.
AndrewZhaoLuo commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162184110


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(

Review Comment:
   I believe S here is defined as:
   
   ![image](https://user-images.githubusercontent.com/13855451/231023293-7e7c5b3e-6a68-48cf-af54-ccf88585fcd5.png)
   
   Which can be arbitrary depending on the range m.
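
The exact equation in the screenshot is not reproduced here, so treat the formula below as an assumption; for a typical symmetric uniform-quantization scale it would be S = 2m / (2^b - 1) for a clipping range [-m, m] with b bits, which indeed grows with m and can exceed 1:

```python
def scale(m, b=8):
    # assumed symmetric uniform-quantization scale, for illustration only
    return 2 * m / (2 ** b - 1)

print(scale(10))   # ~0.078
print(scale(200))  # ~1.57
```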
   





[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162171494


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(
+      Round(Divide(MakeConstantScalar(DataType::Float(32), 1.f), input_scale)), DataType::Int(32));
+  const Expr max = Max(quantized_data, {axis}, true, false);
+  const Expr x = Subtract(quantized_data, max);
+
+  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
+  const int n = 8;
+  const int m = 30;
+  const int bits = 8;
+  const Expr x_p = Subtract(Add(x, RightShift(x, const_i32(1))), RightShift(x, const_i32(4)));
+  const Expr q = Divide(Negative(x_p), x_0);
+  const Expr r = Subtract(x_p, Multiply(q, Negative(x_0)));
+  const Expr x_b = Add(RightShift(r, const_i32(1)), x_0);
+  const Expr exps = RightShift(LeftShift(x_b, const_i32(n)), q);

Review Comment:
   Yeah but the point is to decompose in a way that the integer part is negative, so that we can "divide" (right shift) by this integer. I don't see how such decomposition is possible if the input is positive.





[GitHub] [tvm] maekawatoshiki commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "maekawatoshiki (via GitHub)" <gi...@apache.org>.
maekawatoshiki commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162350168


##########
tests/python/relay/test_pass_fake_quantization_to_integer.py:
##########
@@ -1114,5 +1114,36 @@ def test_fake_quantize_take():
     compare_fq_to_int(op, [x_np])
 
 
+def test_fake_quantize_softmax():

Review Comment:
   > 2. Play with QNN parameters (zero point, scale) in such a way that output from quantize will be in the range [-100, +100] for example. Not only "-128" like now
   
   I'm not sure why we need to modify QNN parameters (of `qnn.quantize`).
   I think that changing the value range specified in `x_np = np.random.randint(-128, 127, ...)` should be enough to keep the `qnn.quantize` output in the range [-100, +100].
   (Sorry if my understanding is wrong)
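
For concreteness, the kind of change being suggested amounts to narrowing the sampled interval while leaving the QNN parameters untouched (illustrative values):

```python
import numpy as np

shape = [5, 10]
x_np = np.sort(np.random.randint(-100, 100, size=shape, dtype="int8"))
```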





[GitHub] [tvm] AndrewZhaoLuo commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "AndrewZhaoLuo (via GitHub)" <gi...@apache.org>.
AndrewZhaoLuo commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1163070211


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(
+      Round(Divide(MakeConstantScalar(DataType::Float(32), 1.f), input_scale)), DataType::Int(32));
+  const Expr max = Max(quantized_data, {axis}, true, false);
+  const Expr x = Subtract(quantized_data, max);
+
+  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
+  const int n = 8;
+  const int m = 30;
+  const int bits = 8;
+  const Expr x_p = Subtract(Add(x, RightShift(x, const_i32(1))), RightShift(x, const_i32(4)));
+  const Expr q = Divide(Negative(x_p), x_0);
+  const Expr r = Subtract(x_p, Multiply(q, Negative(x_0)));
+  const Expr x_b = Add(RightShift(r, const_i32(1)), x_0);
+  const Expr exps = RightShift(LeftShift(x_b, const_i32(n)), q);

Review Comment:
   Ah yes, I think it can, which is bad. I think the way around this might be exploring decomposing `2^[(S * 2^-k * I_p) * 2^k]` instead of `2^(S * I_p)` for some well-chosen k.
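
One possible reading of that suggestion, sketched in plain floating-point Python (so only the algebra, not the fixed-point details): shrink the exponent by 2^-k, approximate 2^u for the shrunken u, and then square the result k times, so no single right shift has to cover the whole magnitude of `S * I_p`.

```python
def exp2_by_squaring(t, k=3):
    u = t / 2 ** k       # t = S * I_p, assumed <= 0
    y = 2.0 ** u         # stand-in for the fixed-point 2**u approximation
    for _ in range(k):   # (2**u) ** (2**k) == 2**t
        y = y * y
    return y

print(exp2_by_squaring(-11.7), 2.0 ** -11.7)  # both ~3.0e-04
```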





[GitHub] [tvm] AndrewZhaoLuo commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "AndrewZhaoLuo (via GitHub)" <gi...@apache.org>.
AndrewZhaoLuo commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162180893


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(
+      Round(Divide(MakeConstantScalar(DataType::Float(32), 1.f), input_scale)), DataType::Int(32));
+  const Expr max = Max(quantized_data, {axis}, true, false);
+  const Expr x = Subtract(quantized_data, max);
+
+  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
+  const int n = 8;
+  const int m = 30;
+  const int bits = 8;
+  const Expr x_p = Subtract(Add(x, RightShift(x, const_i32(1))), RightShift(x, const_i32(4)));
+  const Expr q = Divide(Negative(x_p), x_0);
+  const Expr r = Subtract(x_p, Multiply(q, Negative(x_0)));
+  const Expr x_b = Add(RightShift(r, const_i32(1)), x_0);
+  const Expr exps = RightShift(LeftShift(x_b, const_i32(n)), q);

Review Comment:
   Hmm, I believe it is due to the fact that the integers are all normalized by subtracting the maximum value (eq. 12). Therefore all values are <= 0.
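
A tiny check of that normalization step:

```python
import numpy as np

x = np.random.randint(-128, 128, size=(2, 5))
x_norm = x - x.max(axis=1, keepdims=True)  # eq. 12: subtract the per-row max
assert (x_norm <= 0).all()                 # every entry is now non-positive
```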





[GitHub] [tvm] masahi commented on pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on PR #14536:
URL: https://github.com/apache/tvm/pull/14536#issuecomment-1501679252

   cc @AndrewZhaoLuo @ibsidorenko @Icemist please help review.




[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1193237555


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.

Review Comment:
   Please document that this implementation is highly experimental.





[GitHub] [tvm] masahi commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "masahi (via GitHub)" <gi...@apache.org>.
masahi commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1193237684


##########
tests/python/relay/test_pass_fake_quantization_to_integer.py:
##########
@@ -1114,5 +1114,53 @@ def test_fake_quantize_take():
     compare_fq_to_int(op, [x_np])
 
 
+def test_fake_quantize_softmax():
+    shape = [5, 10]
+    x_ = relay.var("x", shape=shape, dtype="int8")
+
+    is_sorted = lambda a: np.all(a[:-1] <= a[1:])
+
+    for scale in [1.0, 0.1, 0.01]:
+        x = relay.qnn.op.dequantize(x_, relay.const(scale), relay.const(0))
+        op = relay.op.nn.softmax(x, axis=1)
+        op = relay.qnn.op.quantize(
+            op, relay.const(1.0 / 256.0), relay.const(-128), out_dtype="int8"
+        )
+
+        x_np = np.random.randint(-128, 127, size=shape, dtype="int8")
+        x_np = np.sort(x_np)
+        args = [x_np]
+
+        mod = tvm.IRModule.from_expr(op)
+        mod = tvm.relay.transform.InferType()(mod)
+        mod_int = tvm.relay.transform.FakeQuantizationToInteger(
+            hard_fail=True, optional_qnn_ops=["nn.softmax"]
+        )(mod)
+        assert not tvm.ir.structural_equal(mod, mod_int)
+
+        result = (
+            relay.create_executor("vm", mod=mod, device=tvm.cpu(), target="llvm")
+            .evaluate()(*args)
+            .numpy()
+        )
+        result_int = (
+            relay.create_executor("vm", mod=mod_int, device=tvm.cpu(), target="llvm")
+            .evaluate()(*args)
+            .numpy()
+        )
+
+        # Check at least the softmax output is in ascending order,
+        # since it is difficult to use allclose due to not-so-good accuracy.
+        for qdq, qop in zip(result, result_int):
+            assert is_sorted(qdq)
+            assert is_sorted(qop)

Review Comment:
   Why should they be sorted?





[GitHub] [tvm] maekawatoshiki commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "maekawatoshiki (via GitHub)" <gi...@apache.org>.
maekawatoshiki commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1193416091


##########
tests/python/relay/test_pass_fake_quantization_to_integer.py:
##########
@@ -1114,5 +1114,53 @@ def test_fake_quantize_take():
     compare_fq_to_int(op, [x_np])
 
 
+def test_fake_quantize_softmax():
+    shape = [5, 10]
+    x_ = relay.var("x", shape=shape, dtype="int8")
+
+    is_sorted = lambda a: np.all(a[:-1] <= a[1:])
+
+    for scale in [1.0, 0.1, 0.01]:
+        x = relay.qnn.op.dequantize(x_, relay.const(scale), relay.const(0))
+        op = relay.op.nn.softmax(x, axis=1)
+        op = relay.qnn.op.quantize(
+            op, relay.const(1.0 / 256.0), relay.const(-128), out_dtype="int8"
+        )
+
+        x_np = np.random.randint(-128, 127, size=shape, dtype="int8")
+        x_np = np.sort(x_np)
+        args = [x_np]
+
+        mod = tvm.IRModule.from_expr(op)
+        mod = tvm.relay.transform.InferType()(mod)
+        mod_int = tvm.relay.transform.FakeQuantizationToInteger(
+            hard_fail=True, optional_qnn_ops=["nn.softmax"]
+        )(mod)
+        assert not tvm.ir.structural_equal(mod, mod_int)
+
+        result = (
+            relay.create_executor("vm", mod=mod, device=tvm.cpu(), target="llvm")
+            .evaluate()(*args)
+            .numpy()
+        )
+        result_int = (
+            relay.create_executor("vm", mod=mod_int, device=tvm.cpu(), target="llvm")
+            .evaluate()(*args)
+            .numpy()
+        )
+
+        # Check at least the softmax output is in ascending order,
+        # since it is difficult to use allclose due to not-so-good accuracy.
+        for qdq, qop in zip(result, result_int):
+            assert is_sorted(qdq)
+            assert is_sorted(qop)

Review Comment:
   The input is sorted at L1131 (`x_np = np.sort(x_np)`), so each row of the softmax output should also come out in ascending order.
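   
   For illustration, here is a minimal numpy sketch of the same monotonicity argument, separate from the actual test (the shape, scale, and names are just for this example):
   
   ```python
   import numpy as np
   
   # Row-sorted int8 input, like `x_np = np.sort(x_np)` in the test.
   x = np.sort(np.random.randint(-128, 127, size=(5, 10)).astype("int8"))
   
   # Float softmax along axis=1. exp() and the per-row normalization are
   # monotonic, so a row that is ascending on input stays ascending on output.
   e = np.exp(x.astype("float32") * 0.1)   # 0.1 is one of the dequantize scales tried
   probs = e / e.sum(axis=1, keepdims=True)
   
   is_sorted = lambda a: np.all(a[:-1] <= a[1:])
   assert all(is_sorted(row) for row in probs)
   ```
   
   The quantized path only approximates these values, which is why the test checks this ordering property instead of `allclose`.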





[GitHub] [tvm] maekawatoshiki commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "maekawatoshiki (via GitHub)" <gi...@apache.org>.
maekawatoshiki commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1193445645


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.

Review Comment:
   Added comment





[GitHub] [tvm] maekawatoshiki commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "maekawatoshiki (via GitHub)" <gi...@apache.org>.
maekawatoshiki commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1161725841


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(
+      Round(Divide(MakeConstantScalar(DataType::Float(32), 1.f), input_scale)), DataType::Int(32));
+  const Expr max = Max(quantized_data, {axis}, true, false);
+  const Expr x = Subtract(quantized_data, max);
+
+  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
+  const int n = 8;
+  const int m = 30;
+  const int bits = 8;
+  const Expr x_p = Subtract(Add(x, RightShift(x, const_i32(1))), RightShift(x, const_i32(4)));
+  const Expr q = Divide(Negative(x_p), x_0);

Review Comment:
   >  I don't quite understand why they negate here.
   
   From my understanding, since `x_p` is zero or negative, `Divide(Negative(x_p), x_0)` yields a non-negative quotient.
   
   > floor missing after divide?
   
   Since this is integer arithmetic, I think an explicit floor is not needed: after the negation both operands are non-negative, so the truncating division already gives the floor.
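   
   A minimal Python sketch of that quotient/remainder step (the concrete numbers are just an example, not taken from the PR):
   
   ```python
   # Mirrors q = Divide(Negative(x_p), x_0) and
   # r = Subtract(x_p, Multiply(q, Negative(x_0))) from the canonicalization.
   x_0 = round(1.0 / 0.1)   # round(1 / input_scale); e.g. scale = 0.1 -> x_0 = 10
   x_p = -37                # x_p <= 0 after the row max has been subtracted
   
   q = (-x_p) // x_0        # both operands are non-negative, so truncation == floor
   r = x_p + q * x_0        # same as x_p - q * (-x_0); remainder lies in (-x_0, 0]
   
   assert q >= 0 and -x_0 < r <= 0
   print(q, r)              # 3 -7 for this example
   ```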





[GitHub] [tvm] AndrewZhaoLuo commented on a diff in pull request #14536: [QNN] Implement 'qnn.softmax'

Posted by "AndrewZhaoLuo (via GitHub)" <gi...@apache.org>.
AndrewZhaoLuo commented on code in PR #14536:
URL: https://github.com/apache/tvm/pull/14536#discussion_r1162180893


##########
src/relay/qnn/op/softmax.cc:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/qnn/op/softmax.cc
+ * \brief QNN softmax operator.
+ */
+#include <tvm/relay/analysis.h>
+#include <tvm/relay/op_attr_types.h>
+
+#include "op_common.h"
+#include "tvm/ir/expr.h"
+#include "tvm/relay/attrs/nn.h"
+#include "tvm/relay/type.h"
+#include "tvm/runtime/data_type.h"
+#include "tvm/runtime/logging.h"
+#include "tvm/topi/reduction.h"
+
+namespace tvm {
+namespace relay {
+namespace qnn {
+
+bool QnnSoftmaxRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
+                   const TypeReporter& reporter) {
+  // Expected Types: input, scale, zero_point, output_scale, output_zero_point, output
+  ICHECK_EQ(types.size(), 6);
+  const auto* x = types[0].as<TensorTypeNode>();
+  if (x == nullptr) return false;
+  ICHECK(x->dtype == DataType::Int(8))
+      << "Expected quantized softmax type(int8) for input but was " << x->dtype;
+
+  // Check the types of scale and zero points.
+  for (size_t i = 1; i < 5; ++i) {
+    if (types[i].as<IncompleteTypeNode>()) {
+      return false;
+    }
+  }
+
+  ICHECK(IsScalarType(types[1], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[2], DataType::Int(32)));    // zero_point
+  ICHECK(IsScalarType(types[3], DataType::Float(32)));  // scale
+  ICHECK(IsScalarType(types[4], DataType::Int(32)));    // zero_point
+
+  // Assign types for scale and zero points.
+  reporter->Assign(types[1], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[2], TensorType({}, DataType::Int(32)));    // zero_point
+  reporter->Assign(types[3], TensorType({}, DataType::Float(32)));  // scale
+  reporter->Assign(types[4], TensorType({}, DataType::Int(32)));    // zero_point
+
+  // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay
+  // IdentityRel infer type function.
+  Array<Type> tensor_types = {types[0], types[5]};
+  return IdentityRel(tensor_types, 2, attrs, reporter);
+}
+
+// Positional relay function to create quantized softmax operator used by frontend FFI.
+Expr MakeQuantizedSoftmax(Expr x, int axis, Expr scale, Expr zero_point, Expr output_scale,
+                          Expr output_zero_point) {
+  auto attrs = make_object<SoftmaxAttrs>();
+  attrs->axis = axis;
+  static const Op& op = Op::Get("qnn.softmax");
+  return Call(op, {x, scale, zero_point, output_scale, output_zero_point}, Attrs(attrs), {});
+}
+
+/*
+ * \brief Canonicalizes the QNN softmax op.
+ */
+Expr QnnSoftmaxCanonicalize(const Attrs& attrs, const Array<Expr>& new_args,
+                            const Array<tvm::relay::Type>& arg_types) {
+  // Expected: input, scale, zero_point, output_scale, output_zero_point
+  ICHECK_EQ(new_args.size(), 5);
+
+  const Expr input_scale = new_args[1];
+  const Expr input_zero_point = new_args[2];
+  const Expr output_scale = new_args[3];
+  const Expr output_zero_point = new_args[4];
+  const int axis = attrs.as<SoftmaxAttrs>()->axis;
+
+  // Refer to the Algorithm 1 in https://arxiv.org/pdf/2207.01405.pdf
+
+  const Expr quantized_data =
+      Subtract(Cast(new_args[0], DataType::Int(32)), Cast(input_zero_point, DataType::Int(32)));
+
+  const Expr x_0 = ConvertDtype(
+      Round(Divide(MakeConstantScalar(DataType::Float(32), 1.f), input_scale)), DataType::Int(32));
+  const Expr max = Max(quantized_data, {axis}, true, false);
+  const Expr x = Subtract(quantized_data, max);
+
+  const auto const_i32 = [&](int32_t val) { return MakeConstantScalar(DataType::Int(32), val); };
+  const int n = 8;
+  const int m = 30;
+  const int bits = 8;
+  const Expr x_p = Subtract(Add(x, RightShift(x, const_i32(1))), RightShift(x, const_i32(4)));
+  const Expr q = Divide(Negative(x_p), x_0);
+  const Expr r = Subtract(x_p, Multiply(q, Negative(x_0)));
+  const Expr x_b = Add(RightShift(r, const_i32(1)), x_0);
+  const Expr exps = RightShift(LeftShift(x_b, const_i32(n)), q);

Review Comment:
   Hmm, I believe it is because the integers are all normalized by subtracting the maximum value, so every value is <= 0 going into this step.
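   
   For what it's worth, here is a small Python sketch of this part of Algorithm 1 with one example value (Python's `>>` is an arithmetic shift, like the int32 `RightShift` used here):
   
   ```python
   n = 8
   x_0 = 10                       # round(1 / input_scale), e.g. scale = 0.1
   x = -23                        # already <= 0: the row max was subtracted first
   
   x_p = x + (x >> 1) - (x >> 4)  # ~ x * 1.4375 ~ x * log2(e), done with shifts
   q = (-x_p) // x_0              # q >= 0 precisely because x_p <= 0
   r = x_p + q * x_0
   x_b = (r >> 1) + x_0
   exp_q = (x_b << n) >> q        # fixed-point x_b * 2**(n - q)
   
   assert q >= 0                  # so the final right shift is well defined
   print(q, exp_q)                # 3 256
   ```
   
   For x = 0 the same formula gives x_0 << n = 2560, and 256 / 2560 = 0.1 is close to exp(-23 * 0.1) ≈ 0.100, so the normalization by the row max is exactly what keeps `q` non-negative throughout.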


