Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2020/07/24 18:47:08 UTC

[GitHub] [incubator-mxnet] kpuatamazon commented on a change in pull request #17559: [MXNET-1446] Quantization: intgemm matrix multiply wrappers

kpuatamazon commented on a change in pull request #17559:
URL: https://github.com/apache/incubator-mxnet/pull/17559#discussion_r460228415



##########
File path: src/operator/contrib/intgemm/prepare_weight_op.cc
##########
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file prepare_weight_op.cc
+ * \brief Converts weight matrices to intgemm's representation.
+ */
+
+#include <mxnet/operator_util.h>
+#include <vector>
+#include "../../mshadow_op.h"
+#include "../../mxnet_op.h"
+#include "../../operator_common.h"
+#include "../../tensor/init_op.h"
+
+#include "../../../../3rdparty/intgemm/aligned.h"
+#include "../../../../3rdparty/intgemm/intgemm.h"
+
+namespace mxnet {
+namespace op {
+
+struct PrepareWeightParam : public dmlc::Parameter<PrepareWeightParam> {
+  bool already_quantized;
+  DMLC_DECLARE_PARAMETER(PrepareWeightParam) {
+    DMLC_DECLARE_FIELD(already_quantized).set_default(false)
+    .describe("Is the weight matrix already quantized?");
+  }
+};
+DMLC_REGISTER_PARAMETER(PrepareWeightParam);
+
+bool PrepareWeightOpShape(const nnvm::NodeAttrs& attrs,
+                    mxnet::ShapeVector* in_attrs,
+                    mxnet::ShapeVector* out_attrs) {
+  // Optional maximum parameter.
+  CHECK_GE(in_attrs->size(), 1U) << "Need at least weight to quantize.";
+  CHECK_LE(in_attrs->size(), 2U) << "weight and maximum for scaling.";
+  CHECK_EQ(out_attrs->size(), 1U);
+
+  SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
+  SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0));
+
+  if (in_attrs->size() == 2U) {
+    SHAPE_ASSIGN_CHECK(*in_attrs, 1, mxnet::TShape(1, 1));
+  }
+  return shape_is_known(out_attrs->at(0));
+}
+
+bool PrepareWeightOpType(const nnvm::NodeAttrs& attrs,
+                   std::vector<int>* in_attrs,
+                   std::vector<int>* out_attrs) {
+  TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8);
+  CHECK_GE(in_attrs->size(), 1U) << "Need at least weight to quantize.";
+  CHECK_LE(in_attrs->size(), 2U) << "weight and maximum for scaling.";
+  if (in_attrs->size() == 1U) {
+    TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kInt8);
+  } else if (in_attrs->size() == 2U) {
+    TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32);
+    TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32);
+  }
+  return true;
+}
+
+bool PrepareWeightOpStorageType(const nnvm::NodeAttrs& attrs,
+                          const int dev_mask,
+                          DispatchMode* dispatch_mode,
+                          std::vector<int>* in_attrs,
+                          std::vector<int>* out_attrs) {
+  CHECK_GE(in_attrs->size(), 1U) << "Need at least weight to quantize.";
+  CHECK_LE(in_attrs->size(), 2U) << "weight and maximum for scaling.";
+  CHECK_EQ(out_attrs->size(), 1U);
+  STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kDefaultStorage);
+  STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 0, kDefaultStorage);
+  if (in_attrs->size() == 2U) {
+    STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 1, kDefaultStorage);
+  }
+  DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx);
+  return true;
+}
+
+void PrepareWeightOpForwardCPU(const nnvm::NodeAttrs& attrs,
+                          const OpContext& ctx,
+                          const std::vector<TBlob>& inputs,
+                          const std::vector<OpReqType>& req,
+                          const std::vector<TBlob>& outputs) {
+  const PrepareWeightParam& params = nnvm::get<PrepareWeightParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), params.already_quantized ? 1U : 2U);
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  CHECK_EQ(req[0], kWriteTo) << "intgemm only overwrites";
+
+  const TBlob &in = inputs.front();
+  const TBlob &out = outputs.front();
+  CHECK_EQ(out.type_flag_, mshadow::kInt8);
+  CHECK(in.CheckContiguous());
+  CHECK(out.CheckContiguous());
+  size_t B_cols = in.shape_.ProdShape(0, in.shape_.ndim() - 1);
+  size_t inner = in.shape_[in.shape_.ndim() - 1];
+  CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) <<
+    "intgemm requires the inner dimension to be a multiple of " << ::intgemm::Int8::tile_info.b_rows;
+  CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) <<
+    "intgemm requires the output dimension (the product of all but the last dimension of the "
+    "weight matrix) to be a multiple of " << ::intgemm::Int8::tile_info.b_cols << ".";
+
+  int8_t *quantB = out.dptr<int8_t>();
+  CHECK(in.type_flag_ == mshadow::kFloat32 || in.type_flag_ == mshadow::kInt8) <<
+    "Expected either 32-bit values to be quantized or 8-bit values to rearrange.";
+  if (in.type_flag_ == mshadow::kInt8) {
+    const int8_t *B = in.dptr<int8_t>();
+    ::intgemm::Int8::PrepareBQuantizedTransposed(B, quantB, inner, B_cols);
+  } else if (in.type_flag_ == mshadow::kFloat32) {
+    const float *B = in.dptr<float>();
+    // TODO(kpuatamazon): eliminate transpose here with https://github.com/kpu/intgemm/pull/56
+    intgemm::AlignedVector<float> B_transpose(inner * B_cols);
+    for (size_t i = 0; i < inner; ++i) {
+      for (size_t j = 0; j < B_cols; ++j) {
+        B_transpose[i * B_cols + j] = B[i + inner * j];
+      }
+    }
+    ::intgemm::Int8::PrepareB(
+        B_transpose.begin(),
+        quantB,
+        127.0 / *inputs[1].dptr<float>(),
+        inner,
+        B_cols);
+  }
+}
+
+NNVM_REGISTER_OP(_contrib_intgemm_prepare_weight)
+.describe(R"code(This operator converts a weight matrix in column-major format to intgemm's internal fast representation of weight matrices.  MXNet customarily stores weight matrices in column-major (transposed) format. This operator is not meant to be fast; it is meant to be run offline to quantize a model.

Review comment:
       I'm interested in your suggestions on how best to do this.  Keep in mind we do have a use case for storing 8-bit quantized models on disk.  
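       As a concrete illustration of that on-disk use case, here is a minimal Python sketch (not part of the patch). It assumes this PR exposes the operator as `mx.nd.contrib.intgemm_prepare_weight` with the `already_quantized` flag from the diff above; the file name and the manual int8 conversion are placeholders, not the intended API.

```python
# Offline weight preparation sketch.  Assumptions (not confirmed by this diff):
# the operator is exposed as mx.nd.contrib.intgemm_prepare_weight; the file
# name and the manual int8 conversion are placeholders.
import mxnet as mx

# Column-major weight: output dim 8, inner dim 64 (both satisfy intgemm's
# multiple-of requirements checked in the forward pass above).
weight = mx.nd.random.uniform(-0.5, 0.5, shape=(8, 64))
maximum = mx.nd.max(mx.nd.abs(weight))        # scaling statistic, shape (1,)

# Path 1: quantize float32 weights (scale = 127 / maximum) and rearrange
# into intgemm's CPU-specific layout in one call.
prepared = mx.nd.contrib.intgemm_prepare_weight(weight, maximum)

# Path 2: keep plain int8 weights on disk and rearrange only at load time,
# since intgemm's layout depends on the CPU it runs on.
scale = 127.0 / maximum.asscalar()
int8_weight = (weight * scale).astype('int8')  # rough stand-in for a stored int8 weight
mx.nd.save("weight_int8.nd", [int8_weight])

loaded = mx.nd.load("weight_int8.nd")[0]
prepared_from_disk = mx.nd.contrib.intgemm_prepare_weight(loaded, already_quantized=True)
```

       The second path is the on-disk scenario: the stored tensor stays portable int8, and the CPU-specific rearrangement happens once at load.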




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org