Posted to commits@mxnet.apache.org by ak...@apache.org on 2021/08/10 07:00:37 UTC

[incubator-mxnet] branch v1.x updated: [FEATURE] Asymmetric fc fc (#20430)

This is an automated email from the ASF dual-hosted git repository.

akarbown pushed a commit to branch v1.x
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/v1.x by this push:
     new b1853e8  [FEATURE] Asymmetric fc fc (#20430)
b1853e8 is described below

commit b1853e8e305e2cb9f7d6f2cea8a6cec7fe6f01f7
Author: Sylwester Fraczek <sy...@intel.com>
AuthorDate: Tue Aug 10 08:58:58 2021 +0200

    [FEATURE] Asymmetric fc fc (#20430)
    
    * add fc->u8->fc fuse
    
    fix lint errors
    
    * Refactor shifted quantization
    
    * refactoring shifted quantization function
    
    * small refactoring and renaming
    
    * Code cleaning and reorganising in files
    
    * Sanity fixup
    
    * flag fixup
    
    * another ci fix
    
    remove unused function and change include path to relative
    
    * fix ci
    
    add default: to the switch statement because clang reports an error
    
    * enable shifted quant fc with_eltwise
    
    * fixed fc-fc after 'fuse fc+sum for quantization' change
    
    * lint fixes
    
    * add #if MXNET_USE_MKLDNN to RescaleWeights
    
    * move FCInputIndex constructor to header
    
    * add FCInputIndex and add MXNET_USE_MKLDNN
    
    * move functions from header to cc
    
    * LOG running shifted quantization only when enabled
    
    * review fixes and other fixes
    
    * formatting
    
    * review fixes
    
    * clang-formatted
    
    * fix CI
    
    Co-authored-by: DominikaJedynak <do...@gmail.com>
    Co-authored-by: Dominika Jedynak <do...@intel.com>
---
 python/mxnet/contrib/quantization.py               |   2 +-
 .../nn/mkldnn/mkldnn_fully_connected-inl.h         |  79 ++++++
 src/operator/nn/mkldnn/mkldnn_fully_connected.cc   |   9 +
 .../quantization/asymmetric_quantize_graph_pass.cc | 277 +++++++++++++++++++++
 src/operator/quantization/quantize_graph_pass.cc   | 183 +-------------
 src/operator/quantization/quantize_graph_pass.h    |  65 +++++
 src/operator/subgraph/mkldnn/mkldnn_fc-inl.h       |   3 +
 src/operator/subgraph/mkldnn/mkldnn_fc.cc          |  88 +------
 tests/python/quantization/test_quantization.py     |  84 ++++++-
 9 files changed, 529 insertions(+), 261 deletions(-)

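For context, the new fc->u8->fc path works like the existing quantize->fc
shifted quantization: when a quantized FullyConnected has a negative
min_calib_range, its int8 output is shifted into the uint8 range with a
linear post-op and the downstream consumer is adjusted so the numbers stay
equivalent. A rough sketch of the scale/shift arithmetic (illustrative
Python, not MXNet code; the calibration values below are made up):

    def shifted_output_params(min_calib, max_calib, u8_range=255.0):
        # mirrors data_scale = kUint8Range / (max_data - min_data) in the patch
        scale = u8_range / (max_calib - min_calib)
        shift = -min_calib * scale   # appended as an eltwise_linear post-op
        return scale, shift

    scale, shift = shifted_output_params(-2.0, 6.0)
    # the fused FC then reports min_output = 0 and
    # max_output = max_calib - min_calib, i.e. an unsigned (u8) output range
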
diff --git a/python/mxnet/contrib/quantization.py b/python/mxnet/contrib/quantization.py
index 82d1b95..94ead97 100644
--- a/python/mxnet/contrib/quantization.py
+++ b/python/mxnet/contrib/quantization.py
@@ -900,7 +900,7 @@ def quantize_net_v2(network, quantized_dtype='auto', quantize_mode='full', quant
     while True:
         try:
             network(*data_nd)
-        except TypeError:
+        except (TypeError, ValueError):
             del data_nd[-1]
             del calib_data.provide_data[-1]
             continue
diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected-inl.h b/src/operator/nn/mkldnn/mkldnn_fully_connected-inl.h
index 352f7d9..d2ccdef 100644
--- a/src/operator/nn/mkldnn/mkldnn_fully_connected-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_fully_connected-inl.h
@@ -30,6 +30,7 @@
 #if MXNET_USE_MKLDNN == 1
 
 #include <memory>
+#include <unordered_map>
 #include <string>
 #include <vector>
 
@@ -47,6 +48,7 @@ struct MKLDNNFCParam : public dmlc::Parameter<MKLDNNFCParam> {
   float sum_scale = 1.0f;
   dmlc::optional<float> min_calib_range;  // min float value calculated from calibration dataset
   dmlc::optional<float> max_calib_range;  // max float value calculated from calibration dataset
+  dmlc::optional<bool> shifted_output;
   dmlc::optional<bool> channel_wise_quantize;
 
   DMLC_DECLARE_PARAMETER(MKLDNNFCParam) {
@@ -73,6 +75,9 @@ struct MKLDNNFCParam : public dmlc::Parameter<MKLDNNFCParam> {
             "The maximum scalar value in the form of float32 obtained "
             "through calibration. If present, it will be used to by "
             "quantized fullyconnected op to calculate primitive scale");
+    DMLC_DECLARE_FIELD(shifted_output)
+        .set_default(dmlc::optional<bool>())
+        .describe("Whether quantized output should be shifted to u8.");
     DMLC_DECLARE_FIELD(channel_wise_quantize)
         .set_default(dmlc::optional<bool>())
         .describe("Whether support channel-wise-quantize for weight.");
@@ -86,6 +91,80 @@ struct MKLDNNFCFullParam {
   std::vector<float> output_scales = {0.0f};
 };
 
+static inline size_t GetInSumIndex(const MKLDNNFCFullParam& param) {
+  assert(param.mkldnn_param.with_sum);
+  return fullc::kWeight + 1 + (param.default_param.no_bias ? 0 : 1);
+}
+
+class FCInputIndex {
+ public:
+  explicit FCInputIndex(const MKLDNNFCFullParam full_param) {
+    auto& mkldnn_param   = full_param.mkldnn_param;
+    const bool has_bias  = !full_param.default_param.no_bias;
+    const bool quantized = mkldnn_param.quantized;
+    const bool sum_input_quantized =
+        quantized && mkldnn_param.with_sum && !mkldnn_param.enable_float_output;
+    const bool channel_wise = quantized && mkldnn_param.channel_wise_quantize.has_value() &&
+                              mkldnn_param.channel_wise_quantize.value();
+
+    // Calculate position of particular input in the input vector:
+    int index     = 0;
+    data          = index++;
+    weight        = index++;
+    bias          = has_bias ? index++ : 0;
+    num_quantized = index + (sum_input_quantized ? 1 : 0);
+    sum           = mkldnn_param.with_sum ? index++ : 0;
+    num_base      = index;
+
+    data_min   = quantized ? index++ : 0;
+    data_max   = quantized ? index++ : 0;
+    weight_min = (quantized && !channel_wise) ? index++ : 0;
+    weight_max = (quantized && !channel_wise) ? index++ : 0;
+    bias_min   = (quantized && !channel_wise && has_bias) ? index++ : 0;
+    bias_max   = (quantized && !channel_wise && has_bias) ? index++ : 0;
+    sum_min    = sum_input_quantized ? index++ : 0;
+    sum_max    = sum_input_quantized ? index++ : 0;
+    num_total  = index;
+  }
+
+  // true if sum input is used and it is float number
+  bool IsSumInputFloat() const {
+    return (sum && !sum_min);
+  }
+  int GetTotal() const {
+    return num_total;
+  }
+  int GetBase() const {
+    return num_base;
+  }
+
+  // return number of standard inputs which are quantized (represented as
+  // integer)
+  int GetQuantized() const {
+    return num_quantized;
+  }
+
+  // Represent index of particular input in the input vector:
+  int data;
+  int weight;
+  int bias;
+  int sum;
+  int data_min;
+  int data_max;
+  int weight_min;
+  int weight_max;
+  int bias_min;
+  int bias_max;
+  int sum_min;
+  int sum_max;
+
+ private:
+  int num_base;       // Number of standard inputs
+  int num_total;      // Number of total inputs: standard + additional needed for
+                      // quantization
+  int num_quantized;  // Number of standard inputs which are quantized
+};
+
 mkldnn::inner_product_forward::primitive_desc GetFCFwdImpl(const MKLDNNFCFullParam& full_param,
                                                            const bool is_train,
                                                            const NDArray& data,
diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc
index 814ca45..4e0aa5a 100644
--- a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc
+++ b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc
@@ -28,6 +28,7 @@
 #include <unordered_map>
 
 #include "mkldnn_fully_connected-inl.h"
+#include "../../quantization/quantization_utils.h"
 
 namespace mxnet {
 namespace op {
@@ -55,6 +56,14 @@ mkldnn::inner_product_forward::primitive_desc GetFCFwdImpl(const MKLDNNFCFullPar
                        full_param.eltwise_param.alpha,
                        full_param.eltwise_param.beta);
   }
+  if (full_param.mkldnn_param.shifted_output.has_value() &&
+      full_param.mkldnn_param.shifted_output.value()) {
+    auto max = full_param.mkldnn_param.max_calib_range.value();
+    auto min = full_param.mkldnn_param.min_calib_range.value();
+    float scale = GetQuantizeScale(mshadow::kUint8, 0, max - min);
+    float shift = -min * scale;
+    ops.append_eltwise(1.f, dnnl::algorithm::eltwise_linear, 1.f, shift);
+  }
   if (full_param.mkldnn_param.with_sum) {
     ops.append_sum(full_param.mkldnn_param.sum_scale);
   }
diff --git a/src/operator/quantization/asymmetric_quantize_graph_pass.cc b/src/operator/quantization/asymmetric_quantize_graph_pass.cc
new file mode 100644
index 0000000..185448b
--- /dev/null
+++ b/src/operator/quantization/asymmetric_quantize_graph_pass.cc
@@ -0,0 +1,277 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2021 by Contributors
+ * \file asymmetric_quantize_graph_pass.cc
+ * \brief
+ */
+#if MXNET_USE_MKLDNN == 1
+#include "quantize_graph_pass.h"
+
+namespace mxnet {
+namespace op {
+namespace asym_quant {
+
+using nnvm::Graph;
+using nnvm::ObjectPtr;
+
+template <bool require_bias>
+static bool IsOneDNNFullyConnected(const ObjectPtr& n) {
+  if (n->op() == Op::Get("_sg_mkldnn_fully_connected")) {
+    auto const& param = nnvm::get<MKLDNNFCFullParam>(n->attrs.parsed);
+    FCInputIndex idx(param);
+    if (!(param.mkldnn_param.channel_wise_quantize.has_value() &&
+          param.mkldnn_param.channel_wise_quantize.value())) {
+      return !require_bias ||
+             (param.default_param.no_bias == false && n->inputs[idx.bias].node->is_variable());
+    }
+  }
+  return false;
+}
+
+static bool IsQuantize(const ObjectPtr& n) {
+  if (n->op() == Op::Get("_contrib_quantize_v2")) {
+    auto const& param = nnvm::get<QuantizeV2Param>(n->attrs.parsed);
+    if (param.min_calib_range.has_value() && param.min_calib_range.value() < 0.0f) {
+      return true;
+    }
+  }
+  return false;
+}
+
+static NDArray* FindInArgByName(const Graph& g, const std::string& name) {
+  const std::vector<std::string>& in_arg_names =
+      g.GetAttr<std::vector<std::string>>("in_arg_names");
+  size_t i = std::distance(in_arg_names.begin(),
+                           std::find(in_arg_names.begin(), in_arg_names.end(), name));
+  if (i == in_arg_names.size()) {
+    LOG(FATAL) << name << " not found in in_arg_names";
+  }
+  return g.GetAttr<NDArray**>("in_args")[i];
+}
+
+// Rescales weights, min_weight and max_weight. Returns bias_int32_rescale.
+static float RescaleWeights(const Graph& g, const ObjectPtr& fc, NDArray* weight_tensor) {
+  FCInputIndex idx(nnvm::get<MKLDNNFCFullParam>(fc->attrs.parsed));
+
+  float* min_weight =
+      FindInArgByName(g, fc->inputs[idx.weight_min].node->attrs.name)->data().dptr<float>();
+  float* max_weight =
+      FindInArgByName(g, fc->inputs[idx.weight_max].node->attrs.name)->data().dptr<float>();
+  float min_bias =
+      *FindInArgByName(g, fc->inputs[idx.bias_min].node->attrs.name)->data().dptr<float>();
+  float max_bias =
+      *FindInArgByName(g, fc->inputs[idx.bias_max].node->attrs.name)->data().dptr<float>();
+
+  float min_data           = std::stof(fc->inputs[idx.data].node->attrs.dict.at("min_calib_range"));
+  float max_data           = std::stof(fc->inputs[idx.data].node->attrs.dict.at("max_calib_range"));
+  float data_scale_        = kUint8Range / (max_data - min_data);
+  float weight_scale       = GetQuantizeScale(mshadow::kInt8, *min_weight, *max_weight);
+  float bias_scale         = GetQuantizeScale(mshadow::kInt8, min_bias, max_bias);
+  float bias_int32_rescale = data_scale_ * weight_scale / bias_scale;
+
+  // // TODO(zhennan): mkldnn has bug to handle INT_MAX in bias, so set the
+  // // maximum value of bias to INT_MAX / 2.
+  float bias_max_rescale =
+      mshadow::red::limits::MaxValue<int32_t>() / 2 / MaxAbs(min_bias, max_bias) / bias_scale;
+  if (bias_int32_rescale > bias_max_rescale) {
+    LOG(INFO) << "RESCALING WEIGHTS in shifted quantization because bias scale "
+                 "is too big in layer "
+              << fc->attrs.name;
+    // avoid overflow on bias
+    bias_int32_rescale   = bias_max_rescale;
+    float weight_rescale = bias_int32_rescale * bias_scale / data_scale_ / weight_scale;
+
+    size_t weight_size = weight_tensor->shape().Size();
+    int8_t* weight_ptr = weight_tensor->data().dptr<int8_t>();
+    for (int32_t i = 0; i < static_cast<int32_t>(weight_size); ++i) {
+      weight_ptr[i] = std::round(weight_ptr[i] * weight_rescale);
+    }
+    *min_weight *= weight_rescale;
+    *max_weight *= weight_rescale;
+  }
+  return bias_int32_rescale;
+}
+
+static void ShiftBias(int32_t* bias_ptr_int32,
+                      size_t bias_size,
+                      NDArray* weight_tensor,
+                      int32_t shift_value) {
+  CHECK_EQ(static_cast<size_t>(weight_tensor->shape()[0]), bias_size);
+  int8_t* weight_ptr = weight_tensor->data().dptr<int8_t>();
+  for (dim_t i = 0; i < weight_tensor->shape()[0]; ++i) {
+    for (dim_t j = 0; j < weight_tensor->shape()[1]; j++) {
+      bias_ptr_int32[i] -= shift_value * (*weight_ptr++);
+    }
+  }
+}
+
+enum class Pattern { QuantizeFc, FcFc, None };
+
+static Pattern FindPattern(const ObjectPtr& node) {
+  if (IsOneDNNFullyConnected<true>(node)) {
+    if (IsQuantize(node->inputs[0].node)) {
+      return Pattern::QuantizeFc;
+    } else if (IsOneDNNFullyConnected<false>(node->inputs[0].node)) {
+      return Pattern::FcFc;
+    }
+  }
+  return Pattern::None;
+}
+
+static void QuantizeFcShiftedQuantization(const ObjectPtr& node,
+                                          Graph&& g,
+                                          std::vector<NDArray*>* new_arg_vector,
+                                          std::vector<std::string>* new_arg_names) {
+  ObjectPtr& quantize       = node->inputs[0].node;
+  ObjectPtr& bias_node      = node->inputs[2].node;
+  std::string bias_name_old = bias_node->attrs.name;
+  NDArray* bias_in_arg_ptr  = FindInArgByName(g, bias_name_old);
+  if (bias_in_arg_ptr->dtype() != mshadow::kInt8)
+    return;
+  std::string bias_name_s32 = bias_node->attrs.name + "_s32";
+  bias_node                 = CreateNode("nullptr", bias_name_s32);
+  new_arg_names->push_back(bias_name_s32);
+
+  quantize->attrs.dict["shifted"] = "True";
+  if (quantize->op()->attr_parser)
+    quantize->op()->attr_parser(&(quantize->attrs));
+
+  NDArray* weight_tensor = FindInArgByName(g, node->inputs[1].node->attrs.name);
+
+  float bias_int32_rescale = RescaleWeights(g, node, weight_tensor);
+
+  new_arg_vector->push_back(new NDArray(
+      kDefaultStorage, bias_in_arg_ptr->shape(), Context::CPU(), false, mshadow::kInt32));
+  int32_t* bias_ptr_int32 = new_arg_vector->back()->data().dptr<int32_t>();
+  size_t bias_size        = bias_in_arg_ptr->shape().Size();
+  int8_t* bias_ptr_old    = bias_in_arg_ptr->data().dptr<int8_t>();
+
+  for (size_t i = 0; i < bias_size; ++i) {
+    bias_ptr_int32[i] = static_cast<int32_t>(std::round(bias_ptr_old[i] * bias_int32_rescale));
+  }
+  float min_data      = std::stof(quantize->attrs.dict.at("min_calib_range"));
+  float max_data      = std::stof(quantize->attrs.dict.at("max_calib_range"));
+  float data_scale    = kUint8Range / (max_data - min_data);
+  int32_t shift_value = static_cast<int32_t>(std::round(data_scale * -min_data));
+  ShiftBias(bias_ptr_int32, bias_size, weight_tensor, shift_value);
+}
+
+static void FcFcShiftedQuantization(const ObjectPtr& node,
+                                    Graph&& g,
+                                    std::vector<NDArray*>* new_arg_vector,
+                                    std::vector<std::string>* new_arg_names) {
+  ObjectPtr& first_fc       = node->inputs[0].node;
+  ObjectPtr& bias_node      = node->inputs[2].node;
+  std::string bias_name_old = bias_node->attrs.name;
+  NDArray* bias_in_arg_ptr  = FindInArgByName(g, bias_name_old);
+  if (bias_in_arg_ptr->dtype() != mshadow::kInt8)
+    return;
+  std::string bias_name_s32 = bias_node->attrs.name + "_s32";
+  bias_node                 = CreateNode("nullptr", bias_name_s32);
+  new_arg_names->push_back(bias_name_s32);
+
+  first_fc->attrs.dict["shifted_output"] = "True";
+  if (first_fc->op()->attr_parser)
+    first_fc->op()->attr_parser(&(first_fc->attrs));
+
+  NDArray* weight_tensor = FindInArgByName(g, node->inputs[1].node->attrs.name);
+
+  float bias_int32_rescale = RescaleWeights(g, node, weight_tensor);
+
+  new_arg_vector->push_back(new NDArray(
+      kDefaultStorage, bias_in_arg_ptr->shape(), Context::CPU(), false, mshadow::kInt32));
+
+  int32_t* bias_ptr_int32 = new_arg_vector->back()->data().dptr<int32_t>();
+  size_t bias_size        = bias_in_arg_ptr->shape().Size();
+  int8_t* bias_ptr_old    = bias_in_arg_ptr->data().dptr<int8_t>();
+
+  for (size_t i = 0; i < bias_size; ++i) {
+    bias_ptr_int32[i] = static_cast<int32_t>(std::round(bias_ptr_old[i] * bias_int32_rescale));
+  }
+
+  float min_data      = std::stof(first_fc->attrs.dict.at("min_calib_range"));
+  float max_data      = std::stof(first_fc->attrs.dict.at("max_calib_range"));
+  float data_scale    = kUint8Range / (max_data - min_data);
+  int32_t shift_value = static_cast<int32_t>(std::round(data_scale * -min_data));
+  ShiftBias(bias_ptr_int32, bias_size, weight_tensor, shift_value);
+}
+
+static Graph OneDNNShiftedQuantization(Graph&& g) {
+  bool disable_shifted_quant =
+      dmlc::GetEnv("MXNET_DISABLE_SHIFTED_QUANTIZATION_OPTIMIZATIONS", true);
+  bool quantize_fc = !dmlc::GetEnv("MXNET_DISABLE_SHIFTED_QUANTIZE_FC_OPTIMIZATION", false);
+  bool fc_fc       = !dmlc::GetEnv("MXNET_DISABLE_SHIFTED_FC_FC_OPTIMIZATION", false);
+  if (!disable_shifted_quant) {
+    LOG(INFO) << "Running OneDNN shifted quantization";
+  }
+  // No change to aux params
+  g.attrs["new_aux_names"] = std::make_shared<nnvm::any>(std::vector<std::string>());
+  g.attrs["new_aux"]       = std::make_shared<nnvm::any>(std::vector<NDArray*>());
+
+  // New args to replace the old
+  std::vector<std::string> new_arg_names;
+  std::vector<NDArray*> new_arg_vector;
+
+  if (!disable_shifted_quant) {
+    unsigned quantize_fc_counter = 0;
+    unsigned fc_fc_counter       = 0;
+    DFSVisit(g.outputs, [&](const ObjectPtr& node) {
+      Pattern p = FindPattern(node);
+      switch (p) {
+        case Pattern::QuantizeFc:
+          if (quantize_fc) {
+            QuantizeFcShiftedQuantization(
+                node, std::forward<Graph>(g), &new_arg_vector, &new_arg_names);
+            ++quantize_fc_counter;
+          }
+          break;
+        case Pattern::FcFc:
+          if (fc_fc) {
+            FcFcShiftedQuantization(node, std::forward<Graph>(g), &new_arg_vector, &new_arg_names);
+            ++fc_fc_counter;
+          }
+          break;
+        default:
+          break;
+      }
+    });
+    if (quantize_fc_counter > 0) {
+      LOG(INFO) << "applied shifted quantization on QUANTIZE->FC " << quantize_fc_counter
+                << " times";
+    }
+    if (fc_fc_counter > 0) {
+      LOG(INFO) << "applied shifted quantization on FC->FC " << fc_fc_counter << " times";
+    }
+  }
+  g.attrs["new_arg_names"] = std::make_shared<nnvm::any>(new_arg_names);
+  g.attrs["new_args"]      = std::make_shared<nnvm::any>(new_arg_vector);
+  return g;
+}
+
+NNVM_REGISTER_PASS(OneDNNShiftedQuantization)
+    .describe("Enables shifted quantization.")
+    .set_body(OneDNNShiftedQuantization)
+    .set_change_graph(true);
+
+}  // namespace asym_quant
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_USE_MKLDNN == 1
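
The pass above first rescales the int8 bias into int32 (bias_int32_rescale =
data_scale * weight_scale / bias_scale, capped so the rescaled bias stays
below INT32_MAX / 2, rescaling the weights instead when the cap is hit) and
then compensates the bias for the input shift. A small numpy check of that
compensation (illustrative only, not part of the patch; shapes, values and
the shift are made up):

    import numpy as np

    rng   = np.random.default_rng(0)
    x_s8  = rng.integers(-128, 128, size=(2, 8), dtype=np.int32)  # signed activations
    W     = rng.integers(-128, 128, size=(4, 8), dtype=np.int32)  # int8 weights
    b     = rng.integers(-1000, 1000, size=4, dtype=np.int32)     # int32 bias
    shift = 128                          # ~ round(data_scale * -min_data) in the pass

    ref     = x_s8 @ W.T + b             # FC on the original signed input
    x_u8    = x_s8 + shift               # input shifted into the u8 range
    b_comp  = b - shift * W.sum(axis=1)  # exactly what ShiftBias() subtracts
    shifted = x_u8 @ W.T + b_comp

    assert np.array_equal(ref, shifted)  # shifting + bias compensation is lossless
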
diff --git a/src/operator/quantization/quantize_graph_pass.cc b/src/operator/quantization/quantize_graph_pass.cc
index 74da6e9..9a8cbc2 100644
--- a/src/operator/quantization/quantize_graph_pass.cc
+++ b/src/operator/quantization/quantize_graph_pass.cc
@@ -19,20 +19,13 @@
 
 /*!
  *  Copyright (c) 2016 by Contributors
- * \file quantization.cc
+ * \file quantize_graph_pass.cc
  * \brief
  */
 
-#include <mxnet/op_attr_types.h>
-#include <nnvm/graph.h>
-#include <nnvm/pass.h>
-#include <queue>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-#include "quantize_v2-inl.h"
-#include "../nn/mkldnn/mkldnn_fully_connected-inl.h"
-#include "../../common/utils.h"
+#include "quantize_graph_pass.h"
+#include <memory>
+#include <utility>
 
 namespace mxnet {
 namespace op {
@@ -56,20 +49,6 @@ static inline size_t GetNumOutputs(ObjectPtr node) {
   return num_outputs;
 }
 
-ObjectPtr CreateNode(std::string op_name, std::string node_name) {
-  ObjectPtr node = Node::Create();
-  node->attrs.name = node_name;
-  if (op_name == "nullptr") {
-    node->attrs.op = nullptr;
-    // ugly workaround because VariableParam is not exposed
-    node->attrs.parsed =
-      nnvm::Symbol::CreateVariable(node->attrs.name).outputs[0].node->attrs.parsed;
-  } else {
-    node->attrs.op = Op::Get(op_name);
-  }
-  return node;
-}
-
 /*!
  * \brief Insert a node named with node_name holding the op of op_name
  * before the node current and after the node previous.
@@ -580,155 +559,6 @@ Graph SetCalibTableToQuantizedGraph(Graph&& g) {
   return g;
 }
 
-static NDArray* FindInArgByName(const Graph &g, const std::string& name) {
-  const std::vector<std::string>& in_arg_names =
-      g.GetAttr<std::vector<std::string>>("in_arg_names");
-  size_t i = std::distance(in_arg_names.begin(),
-                           std::find(in_arg_names.begin(), in_arg_names.end(), name));
-  if (i == in_arg_names.size()) {
-    LOG(FATAL) << name << " not found in in_arg_names";
-  }
-  return g.GetAttr<NDArray **>("in_args")[i];
-}
-
-static inline bool IsOneDNNFullyConnected(const ObjectPtr& n) {
-#if MXNET_USE_MKLDNN == 1
-  if (n->op() == Op::Get("_sg_mkldnn_fully_connected")) {
-    auto const& param = nnvm::get<MKLDNNFCFullParam>(n->attrs.parsed);
-    if (param.default_param.no_bias == false &&
-        n->inputs[2].node->is_variable()) {
-      if (!(param.mkldnn_param.channel_wise_quantize.has_value() &&
-            param.mkldnn_param.channel_wise_quantize.value())) {
-        return true;
-      }
-    }
-  }
-#endif
-  return false;
-}
-
-static inline bool IsQuantize(const ObjectPtr& n) {
-  if (n->op() == Op::Get("_contrib_quantize_v2")) {
-    auto const &param = nnvm::get<QuantizeV2Param>(n->attrs.parsed);
-    if (param.min_calib_range.has_value() &&
-        param.min_calib_range.value() < 0.0f) {
-      return true;
-    }
-  }
-  return false;
-}
-
-// Rescales weights, min_weight and max_weight. Returns bias_int32_rescale.
-static inline float RescaleWeights(const Graph &g, const ObjectPtr &fc, NDArray* weight_tensor) {
-  ObjectPtr &quantize = fc->inputs[0].node;
-  auto min_data = std::stof(quantize->attrs.dict.at("min_calib_range"));
-  auto max_data = std::stof(quantize->attrs.dict.at("max_calib_range"));
-
-  float *min_weight = FindInArgByName(g, fc->inputs[5].node->attrs.name)->data().dptr<float>();
-  float *max_weight = FindInArgByName(g, fc->inputs[6].node->attrs.name)->data().dptr<float>();
-  float min_bias = *FindInArgByName(g, fc->inputs[7].node->attrs.name)->data().dptr<float>();
-  float max_bias = *FindInArgByName(g, fc->inputs[8].node->attrs.name)->data().dptr<float>();
-
-  float data_scale_ = kUint8Range / (max_data - min_data);
-  float weight_scale = GetQuantizeScale(mshadow::kInt8, *min_weight, *max_weight);
-  float bias_scale = GetQuantizeScale(mshadow::kInt8, min_bias, max_bias);
-  float bias_int32_rescale = data_scale_ * weight_scale / bias_scale;
-
-  // // TODO(zhennan): mkldnn has bug to handle INT_MAX in bias, so set the
-  // // maximum value of bias to INT_MAX / 2.
-  float bias_max_rescale = mshadow::red::limits::MaxValue<int32_t>() / 2 /
-                           MaxAbs(min_bias, max_bias) / bias_scale;
-  if (bias_int32_rescale > bias_max_rescale) {
-    LOG(INFO) << "RESCALING WEIGHTS in shifted quantization because bias scale "
-                 "is too big in layer " << fc->attrs.name;
-    // avoid overflow on bias
-    bias_int32_rescale = bias_max_rescale;
-    float weight_rescale =
-        bias_int32_rescale * bias_scale / data_scale_ / weight_scale;
-
-    size_t weight_size = weight_tensor->shape().Size();
-    int8_t *weight_ptr = weight_tensor->data().dptr<int8_t>();
-    for (int32_t i = 0; i < static_cast<int32_t>(weight_size); ++i) {
-      weight_ptr[i] = std::round(weight_ptr[i] * weight_rescale);
-    }
-    *min_weight *= weight_rescale;
-    *max_weight *= weight_rescale;
-  }
-  return bias_int32_rescale;
-}
-
-static inline void ShiftBias(int32_t* bias_ptr_int32, size_t bias_size,
-                             NDArray* weight_tensor, int32_t shift_value) {
-  CHECK_EQ(static_cast<size_t>(weight_tensor->shape()[0]), bias_size);
-  int8_t* weight_ptr = weight_tensor->data().dptr<int8_t>();
-  for (dim_t i = 0; i < weight_tensor->shape()[0]; ++i) {
-    for (dim_t j = 0; j < weight_tensor->shape()[1]; j++) {
-      bias_ptr_int32[i] -= shift_value * (*weight_ptr++);
-    }
-  }
-}
-
-Graph OneDNNShiftedQuantization(Graph&& g) {
-  bool disable_shifted_quant =
-      dmlc::GetEnv("MXNET_DISABLE_SHIFTED_QUANTIZATION_OPTIMIZATIONS", true);
-  LOG(INFO) << "Running OneDNN shifted quantization: " << !disable_shifted_quant;
-  // No change to aux params
-  g.attrs["new_aux_names"] = std::make_shared<nnvm::any>(std::vector<std::string>());
-  g.attrs["new_aux"] = std::make_shared<nnvm::any>(std::vector<NDArray *>());
-
-  // New args to replace the old
-  std::vector<std::string> new_arg_names;
-  std::vector<NDArray *> new_arg_vector;
-
-#if MXNET_USE_MKLDNN == 1
-  if (!disable_shifted_quant) {
-    DFSVisit(g.outputs, [&](const ObjectPtr &fc) {
-      // Find Quantize->FC pattern and rescale bias from int8 to int32 and shift
-      if (IsOneDNNFullyConnected(fc)) {
-        ObjectPtr &quantize = fc->inputs[0].node;
-        if (IsQuantize(quantize)) {
-          ObjectPtr& bias_node = fc->inputs[2].node;
-          std::string bias_name_old = bias_node->attrs.name;
-          NDArray* bias_in_arg_ptr = FindInArgByName(g, bias_name_old);
-          if (bias_in_arg_ptr->dtype() != mshadow::kInt8) return;
-          std::string bias_name_s32 = bias_node->attrs.name + "_s32";
-          bias_node = CreateNode("nullptr", bias_name_s32);
-          new_arg_names.push_back(bias_name_s32);
-
-          quantize->attrs.dict["shifted"] = "True";
-          if (quantize->op()->attr_parser) quantize->op()->attr_parser(&(quantize->attrs));
-
-          NDArray *weight_tensor = FindInArgByName(g, fc->inputs[1].node->attrs.name);
-
-          float bias_int32_rescale = RescaleWeights(g, fc, weight_tensor);
-
-          new_arg_vector.push_back(
-              new NDArray(kDefaultStorage, bias_in_arg_ptr->shape(),
-                          Context::CPU(), false, mshadow::kInt32));
-          int32_t *bias_ptr_int32 = new_arg_vector.back()->data().dptr<int32_t>();
-          size_t bias_size = bias_in_arg_ptr->shape().Size();
-          int8_t *bias_ptr_old = bias_in_arg_ptr->data().dptr<int8_t>();
-
-          for (size_t i = 0; i < bias_size; ++i) {
-            bias_ptr_int32[i] = static_cast<int32_t>(
-                std::round(bias_ptr_old[i] * bias_int32_rescale));
-          }
-          float min_data = std::stof(quantize->attrs.dict.at("min_calib_range"));
-          float max_data = std::stof(quantize->attrs.dict.at("max_calib_range"));
-          float data_scale = kUint8Range / (max_data - min_data);
-          int32_t shift_value = static_cast<int32_t>(std::round(data_scale * -min_data));
-          ShiftBias(bias_ptr_int32, bias_size, weight_tensor, shift_value);
-          LOG(INFO) << "applied shifted quantization on QUANTIZE->FC";
-        }
-      }
-    });
-  }
-#endif
-  g.attrs["new_arg_names"] = std::make_shared<nnvm::any>(new_arg_names);
-  g.attrs["new_args"] = std::make_shared<nnvm::any>(new_arg_vector);
-  return g;
-}
-
 NNVM_REGISTER_PASS(QuantizeGraph)
 .describe("")
 .set_body(QuantizeGraph)
@@ -740,10 +570,5 @@ NNVM_REGISTER_PASS(SetCalibTableToQuantizedGraph)
 .set_body(SetCalibTableToQuantizedGraph)
 .set_change_graph(true);
 
-NNVM_REGISTER_PASS(OneDNNShiftedQuantization)
-.describe("Enables shifted quantization.")
-.set_body(OneDNNShiftedQuantization)
-.set_change_graph(true);
-
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/quantization/quantize_graph_pass.h b/src/operator/quantization/quantize_graph_pass.h
new file mode 100644
index 0000000..cd24854
--- /dev/null
+++ b/src/operator/quantization/quantize_graph_pass.h
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ *  Copyright (c) 2021 by Contributors
+ * \file quantize_graph_pass.h
+ * \brief
+ */
+#ifndef MXNET_OPERATOR_QUANTIZATION_QUANTIZE_GRAPH_PASS_H_
+#define MXNET_OPERATOR_QUANTIZATION_QUANTIZE_GRAPH_PASS_H_
+
+#include <mxnet/op_attr_types.h>
+#include <nnvm/graph.h>
+#include <nnvm/pass.h>
+#include <queue>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include <string>
+#include "quantize_v2-inl.h"
+#include "../nn/mkldnn/mkldnn_fully_connected-inl.h"
+#include "../../common/utils.h"
+
+namespace mxnet {
+namespace op {
+
+using nnvm::Symbol;
+using nnvm::Node;
+using nnvm::ObjectPtr;
+using nnvm::NodeEntry;
+using nnvm::Graph;
+
+inline ObjectPtr CreateNode(std::string op_name, std::string node_name) {
+  ObjectPtr node = Node::Create();
+  node->attrs.name = node_name;
+  if (op_name == "nullptr") {
+    node->attrs.op = nullptr;
+    // ugly workaround because VariableParam is not exposed
+    node->attrs.parsed =
+      nnvm::Symbol::CreateVariable(node->attrs.name).outputs[0].node->attrs.parsed;
+  } else {
+    node->attrs.op = Op::Get(op_name);
+  }
+  return node;
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_QUANTIZATION_QUANTIZE_GRAPH_PASS_H_
diff --git a/src/operator/subgraph/mkldnn/mkldnn_fc-inl.h b/src/operator/subgraph/mkldnn/mkldnn_fc-inl.h
index 4a39bf0..806ecfe 100644
--- a/src/operator/subgraph/mkldnn/mkldnn_fc-inl.h
+++ b/src/operator/subgraph/mkldnn/mkldnn_fc-inl.h
@@ -65,6 +65,9 @@ static inline bool IsOutputUint8(const MKLDNNFCFullParam& full_param) {
        alg == mkldnn::algorithm::eltwise_sqrt || alg == mkldnn::algorithm::eltwise_exp ||
        alg == mkldnn::algorithm::eltwise_abs)) {
     return true;
+  } else if (full_param.mkldnn_param.shifted_output.has_value() &&
+             full_param.mkldnn_param.shifted_output.value()) {
+               return true;
   }
 
   return false;
diff --git a/src/operator/subgraph/mkldnn/mkldnn_fc.cc b/src/operator/subgraph/mkldnn/mkldnn_fc.cc
index e4baa0c..5578106 100644
--- a/src/operator/subgraph/mkldnn/mkldnn_fc.cc
+++ b/src/operator/subgraph/mkldnn/mkldnn_fc.cc
@@ -46,80 +46,6 @@
 namespace mxnet {
 namespace op {
 
-static inline size_t GetInSumIndex(const MKLDNNFCFullParam& param) {
-  assert(param.mkldnn_param.with_sum);
-  return fullc::kWeight + 1 + (param.default_param.no_bias ? 0 : 1);
-}
-
-class FCInputIndex {
- public:
-  explicit FCInputIndex(const MKLDNNFCFullParam full_param) {
-    auto& mkldnn_param   = full_param.mkldnn_param;
-    const bool has_bias  = !full_param.default_param.no_bias;
-    const bool quantized = mkldnn_param.quantized;
-    const bool sum_input_quantized =
-        quantized && mkldnn_param.with_sum && !mkldnn_param.enable_float_output;
-    const bool channel_wise = quantized && mkldnn_param.channel_wise_quantize.has_value() &&
-                              mkldnn_param.channel_wise_quantize.value();
-
-    // Calculate position of particular input in the input vector:
-    int index     = 0;
-    data          = index++;
-    weight        = index++;
-    bias          = has_bias ? index++ : 0;
-    num_quantized = index + (sum_input_quantized ? 1 : 0);
-    sum           = mkldnn_param.with_sum ? index++ : 0;
-    num_base      = index;
-
-    data_min   = quantized ? index++ : 0;
-    data_max   = quantized ? index++ : 0;
-    weight_min = (quantized && !channel_wise) ? index++ : 0;
-    weight_max = (quantized && !channel_wise) ? index++ : 0;
-    bias_min   = (quantized && !channel_wise && has_bias) ? index++ : 0;
-    bias_max   = (quantized && !channel_wise && has_bias) ? index++ : 0;
-    sum_min    = sum_input_quantized ? index++ : 0;
-    sum_max    = sum_input_quantized ? index++ : 0;
-    num_total  = index;
-  }
-
-  // true if sum input is used and it is float number
-  bool IsSumInputFloat() const {
-    return (sum && !sum_min);
-  }
-  int GetTotal() const {
-    return num_total;
-  }
-  int GetBase() const {
-    return num_base;
-  }
-
-  // return number of standard inputs which are quantized (represented as
-  // integer)
-  int GetQuantized() const {
-    return num_quantized;
-  }
-
-  // Represent index of particular input in the input vector:
-  int data;
-  int weight;
-  int bias;
-  int sum;
-  int data_min;
-  int data_max;
-  int weight_min;
-  int weight_max;
-  int bias_min;
-  int bias_max;
-  int sum_min;
-  int sum_max;
-
- private:
-  int num_base;       // Number of standard inputs
-  int num_total;      // Number of total inputs: standard + additional needed for
-                      // quantization
-  int num_quantized;  // Number of standard inputs which are quantized
-};
-
 class SgMKLDNNFCOp {
  public:
   explicit SgMKLDNNFCOp(const nnvm::NodeAttrs& attrs)
@@ -547,10 +473,16 @@ void SgMKLDNNFCOp::Forward(const OpContext& ctx,
   MKLDNNStream::Get()->Submit();
 
   if (mkldnn_param.quantized && !mkldnn_param.enable_float_output) {
-    float* min_output_ptr = out_data[out_min_index].data().dptr<float>();
-    float* max_output_ptr = out_data[out_max_index].data().dptr<float>();
-    *min_output_ptr       = cached_min_output_;
-    *max_output_ptr       = cached_max_output_;
+    float *min_output_ptr = out_data[out_min_index].data().dptr<float>();
+    float *max_output_ptr = out_data[out_max_index].data().dptr<float>();
+
+    if (mkldnn_param.shifted_output.has_value() && mkldnn_param.shifted_output.value()) {
+      *min_output_ptr = 0;
+      *max_output_ptr = cached_max_output_ - cached_min_output_;
+    } else {
+      *min_output_ptr = cached_min_output_;
+      *max_output_ptr = cached_max_output_;
+    }
   }
 }
 
diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py
index 6c1878a..637f1fe 100644
--- a/tests/python/quantization/test_quantization.py
+++ b/tests/python/quantization/test_quantization.py
@@ -1255,7 +1255,7 @@ def test_get_optimal_thresholds():
 
 
 @with_seed()
-def test_onednn_shifted_quantization():
+def test_onednn_shifted_quantize_fc():
     batch_size = 1
     if not is_test_for_mkldnn():
         print("Test only for mkldnn")
@@ -1303,7 +1303,7 @@ def test_onednn_shifted_quantization():
         return fc_layer, quantize_attrs
 
     def get_fc_layer():
-        fc_layer = mx.gluon.nn.Dense(5, use_bias=True, flatten=True,
+        fc_layer = mx.gluon.nn.Dense(20, use_bias=True, flatten=True,
                                      weight_initializer=mx.initializer.Normal(),
                                      bias_initializer=mx.initializer.Normal())
         fc_layer.initialize()
@@ -1330,7 +1330,7 @@ def test_onednn_shifted_quantization():
         min_range = mx.nd.min(out).asscalar()
         max_range = mx.nd.max(out).asscalar()
         atol = 0.1 * max(abs(min_range), abs(max_range))
-        assert_almost_equal_with_err(out.asnumpy(), out_q.asnumpy(), rtol=0.1, atol=atol, etol=0.2)
+        assert_almost_equal_with_err(out_q.asnumpy(), out.asnumpy(), rtol=0.1, atol=atol, etol=0.2)
 
         if qdtype == 'auto':
             assert quantize_attrs['shifted'] == 'True'
@@ -1348,6 +1348,84 @@ def test_onednn_shifted_quantization():
             check(i, qdtype)
 
 
+@with_seed()
+def test_onednn_shifted_quantize_fc_fc():
+    batch_size = 2
+    if not is_test_for_mkldnn():
+        print("Test only for mkldnn")
+        return
+
+    def get_fc_fc_layers(with_eltwise):
+        class Net(mx.gluon.nn.HybridBlock):
+            def __init__(self):
+                super(Net, self).__init__()
+                self.fc1 = mx.gluon.nn.Dense(20, use_bias=True, flatten=True,
+                                             weight_initializer=mx.initializer.Normal(),
+                                             bias_initializer=mx.initializer.Normal())
+                self.relu = mx.gluon.nn.Activation('relu') if with_eltwise else None
+                self.fc2 = mx.gluon.nn.Dense(20, use_bias=True, flatten=True,
+                                             weight_initializer=mx.initializer.Normal(),
+                                             bias_initializer=mx.initializer.Normal())
+
+            def hybrid_forward(self, F, x):
+                out = self.fc1(x)
+                if self.relu is not None:
+                    out = self.relu(out)
+                out = self.fc2(out)
+                return out
+
+        net = Net()
+        net.initialize()
+        return net
+
+    def quantize_net(with_eltwise, qdtype, net, random_data):
+        calib_data = NDArrayIter(data=random_data, batch_size=batch_size)
+        calib_data = DummyIter(calib_data)
+        net = mx.contrib.quant.quantize_net(net, quantize_mode='smart',
+                                            quantized_dtype=qdtype,
+                                            exclude_layers=None,
+                                            exclude_layers_match=[],
+                                            calib_data=calib_data,
+                                            calib_mode='naive',
+                                            num_calib_examples=1,
+                                            ctx=mx.current_context())
+        net.hybridize(static_alloc=True, static_shape=True)
+        out = net(random_data)
+        out.wait_to_read()
+
+        _, sym = net._cached_graph
+        fc0_name = "quantized_sg_mkldnn_fully_connected%s_0" %("_eltwise" if with_eltwise else "")
+        fc0_attrs = sym.attr_dict()[fc0_name]
+
+        if qdtype == 'auto':
+            assert fc0_attrs['shifted_output'] == 'True'
+        else:
+            assert 'shifted_output' not in fc0_attrs
+
+        return out
+
+    def check(with_eltwise, qdtype, random_data):
+        net_ref = get_fc_fc_layers(with_eltwise)
+        out_ref = net_ref(random_data)
+        out_ref.wait_to_read()
+
+        out_q = quantize_net(with_eltwise, qdtype, net_ref, random_data)
+
+        min_range = mx.nd.min(out_ref).asscalar()
+        max_range = mx.nd.max(out_ref).asscalar()
+        atol = 0.1 * max(abs(min_range), abs(max_range))
+        assert_almost_equal_with_err(out_q.asnumpy(), out_ref.asnumpy(), rtol=0.1, atol=atol, etol=0.2)
+
+    with environment({'MXNET_DISABLE_SHIFTED_QUANTIZATION_OPTIMIZATIONS': '0',
+                      'MXNET_DISABLE_SHIFTED_QUANTIZE_FC_OPTIMIZATION': '0'}):
+        for with_eltwise in [False, True]:
+            for qdtype in ['int8', 'uint8', 'auto']:
+                print("with_eltwise:", with_eltwise)
+                print("qdtype:", qdtype)
+                data = mx.nd.random_uniform(low=0 if qdtype == 'uint8' else -1, high=1, shape=(batch_size, 10))
+                check(with_eltwise, qdtype, data)
+
+
 if __name__ == "__main__":
     import nose
     nose.runmodule()