Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/01/27 06:28:57 UTC

[GitHub] reminisce commented on a change in pull request #9552: [REQUEST FOR REVIEW | DO NOT MERGE] Model Quantization with Calibration

reminisce commented on a change in pull request #9552: [REQUEST FOR REVIEW | DO NOT MERGE] Model Quantization with Calibration
URL: https://github.com/apache/incubator-mxnet/pull/9552#discussion_r164265094
 
 

 ##########
 File path: python/mxnet/quantization.py
 ##########
 @@ -0,0 +1,467 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import absolute_import
+
+try:
+    from scipy import stats
+except ImportError:
+    stats = None
+
+import numpy as np
+import ctypes
+import logging
+import os
+from .base import _LIB, check_call
+from .base import c_array, c_str, mx_uint, c_str_array
+from .base import NDArrayHandle, SymbolHandle
+from .symbol import Symbol, load
+from . import ndarray as nd
+from .ndarray import NDArray
+from .io import DataIter
+from .context import cpu, Context
+from .module import Module
+
+
+def _quantize_params(qsym, params):
+    """Given a quantized symbol and a dict of params that have not been quantized, generate quantized params.
+    Currently only supports quantizing the arg_params with names of `weight` or `bias`, not aux_params.
+    If `qsym` contains symbols that are excluded from being quantized, their corresponding params will
+    not be quantized, but saved together with quantized params of the symbols that have been quantized.
+
+    Parameters
+    ----------
+    qsym : Symbol
+        Quantized symbol from FP32 symbol.
+    params : dict of str->NDArray
+    """
+    inputs_name = qsym.list_arguments()
+    quantized_params = {}
+    for name in inputs_name:
+        if name.endswith(('weight_quantize', 'bias_quantize')):
+            original_name = name[:-len('_quantize')]
+            param = params[original_name]
+            val, vmin, vmax = nd.contrib.quantize(data=param, min_range=nd.min(param),
+                                                  max_range=nd.max(param), out_type='int8')
+            quantized_params[name] = val
+            quantized_params[name+'_min'] = vmin
+            quantized_params[name+'_max'] = vmax
+        elif name in params:
+            quantized_params[name] = params[name]
+    return quantized_params
+
+
+def _quantize_symbol(sym, excluded_symbols=None, offline_params=None):
+    """Given a symbol object representing a neural network of data type FP32, quantize it into a INT8 network.
+
+    Parameters
+    ----------
+    sym : Symbol
+        FP32 neural network symbol.
+    excluded_symbols : list of symbols
+        Nodes in the network that users do not want to replace with a symbol of INT8 data type.
+    offline_params : list of strs
+        Names of the parameters that users want to quantize offline. It is always recommended to quantize
+        parameters offline so that quantizing them during inference can be avoided.
+    """
+    num_excluded_symbols = 0
+    excluded_handles = []
+    if excluded_symbols is not None:
+        assert isinstance(excluded_symbols, list)
+        num_excluded_symbols = len(excluded_symbols)
+        for s in excluded_symbols:
+            excluded_handles.append(s.handle)
+
+    num_offline = 0
+    offline = []
+    if offline_params is not None:
+        num_offline = len(offline_params)
+        for k in offline_params:
+            offline.append(c_str(k))
+
+    out = SymbolHandle()
+    check_call(_LIB.MXQuantizeSymbol(sym.handle,
+                                     ctypes.byref(out),
+                                     mx_uint(num_excluded_symbols),
+                                     c_array(SymbolHandle, excluded_handles),
+                                     mx_uint(num_offline),
+                                     c_array(ctypes.c_char_p, offline)))
+    return Symbol(out)
+
+
+class _LayerOutputCollector(object):
+    """Saves layer output NDArray in a dict with layer names as keys and lists of NDArrays as values.
+    The collected NDArrays will be used for calculating the optimal thresholds for quantization using
+    KL divergence."""
+    def __init__(self, include_layer=None, logger=None):
+        self.nd_dict = {}
+        self.include_layer = include_layer
+        self.logger = logger
+
+    def collect(self, name, ndarray):
+        if self.include_layer is not None and not self.include_layer(name):
+            return
+        handle = ctypes.cast(ndarray, NDArrayHandle)
+        ndarray = NDArray(handle, writable=False).copyto(cpu())
+        if self.logger is not None:
+            self.logger.info("Collecting layer %s output of shape %s" % (name, ndarray.shape))
+        if name in self.nd_dict:
+            self.nd_dict[name].append(ndarray)
+        else:
+            self.nd_dict[name] = [ndarray]
+
+
+class _LayerOutputMinMaxCollector(object):
+    """Saves layer output min and max values in a dict with layer names as keys.
+    The collected min and max values will be directly used as thresholds for quantization.
+    """
+    def __init__(self, include_layer=None, logger=None):
+        self.min_max_dict = {}
+        self.include_layer = include_layer
+        self.logger = logger
+
+    def collect(self, name, ndarray):
+        if self.include_layer is not None and not self.include_layer(name):
+            return
+        handle = ctypes.cast(ndarray, NDArrayHandle)
+        ndarray = NDArray(handle, writable=False)
+        min_range = nd.min(ndarray).asscalar()
+        max_range = nd.max(ndarray).asscalar()
+        if name in self.min_max_dict:
+            cur_min_max = self.min_max_dict[name]
+            self.min_max_dict[name] = (min(cur_min_max[0], min_range), max(cur_min_max[1], max_range))
+        else:
+            self.min_max_dict[name] = (min_range, max_range)
+        if self.logger is not None:
+            self.logger.info("Collecting layer %s output min_range=%f, max_range=%f" % (name, min_range, max_range))
+
+
+def _calibrate_quantized_sym(qsym, th_dict):
+    """Given a dictionary containing the thresholds for quantizing the layers, set the thresholds into
+    the quantized symbol as the params of requantize operators.
+    """
+    if th_dict is None or len(th_dict) == 0:
+        return qsym
+    num_layer_outputs = len(th_dict)
+    layer_output_names = []
+    min_vals = []
+    max_vals = []
+    for k, v in th_dict.items():
+        layer_output_names.append(k)
+        min_vals.append(v[0])
+        max_vals.append(v[1])
+
+    calibrated_sym = SymbolHandle()
+    check_call(_LIB.MXSetCalibTableToQuantizedSymbol(qsym.handle,
+                                                     mx_uint(num_layer_outputs),
+                                                     c_str_array(layer_output_names),
+                                                     c_array(ctypes.c_float, min_vals),
+                                                     c_array(ctypes.c_float, max_vals),
+                                                     ctypes.byref(calibrated_sym)))
+    return Symbol(calibrated_sym)
+
+
+def _collect_layer_statistics(mod, data, collector, max_num_examples=None, logger=None):
+    if not isinstance(data, DataIter):
+        raise ValueError('Only supports data of type DataIter, received type %s' % str(type(data)))
+    mod._exec_group.execs[0].set_monitor_callback(collector.collect)
+    num_batches = 0
+    num_examples = 0
+    for batch in data:
+        mod.forward(data_batch=batch, is_train=False)
+        num_batches += 1
+        num_examples += data.batch_size
+        if max_num_examples is not None and num_examples >= max_num_examples:
+            break
+    if logger is not None:
+        logger.info("Collected statistics from %d batches with batch_size=%d" % (num_batches, data.batch_size))
+
+
+def _collect_layer_output_min_max(mod, data, include_layer=None, max_num_examples=None, logger=None):
+    """Collect min and max values from layer outputs and save them in a dictionary mapped by layer names."""
+    collector = _LayerOutputMinMaxCollector(include_layer=include_layer, logger=logger)
+    _collect_layer_statistics(mod, data, collector, max_num_examples, logger)
+    return collector.min_max_dict
+
+
+def _collect_layer_outputs(mod, data, include_layer=None, max_num_examples=None, logger=None):
+    """Collect layer outputs and save them in a dictionary mapped by layer names."""
+    collector = _LayerOutputCollector(include_layer=include_layer, logger=logger)
+    _collect_layer_statistics(mod, data, collector, max_num_examples, logger)
+    return collector.nd_dict
+
+
+def _smooth_distribution(p, eps=0.0001):
+    """Given a discrete distribution (may have not been normalized to 1), smooth it by replacing zeros
+    with eps multiplied by a scaling factor and taking the corresponding amount off the non-zero values.
+    Ref: http://web.engr.illinois.edu/~hanj/cs412/bk3/KL-divergence.pdf
+    """
+    is_zeros = (p == 0).astype(np.float32)
+    is_nonzeros = (p != 0).astype(np.float32)
+    n_zeros = is_zeros.sum()
+    n_nonzeros = p.size - n_zeros
+    eps1 = eps * float(n_zeros) / float(n_nonzeros)
+    assert eps1 < 1.0, 'n_zeros=%d, n_nonzeros=%d, eps1=%f' % (n_zeros, n_nonzeros, eps1)
+    hist = p.astype(np.float32)
+    hist += eps * is_zeros + (-eps1) * is_nonzeros
+    assert (hist <= 0).sum() == 0
+    return hist
+
+
+def _get_optimal_threshold(arr, num_bins=8001, num_quantized_bins=255):
 
 Review comment:
  - `num_quantized_bins` represents the number of representable values in the int8 range. If we wanted to use 4 bits for the quantized values, num_quantized_bins would be 15.
  - `num_bins`: I tried different numbers of bins, from 500 to 40,000, and it has little effect on the optimal thresholds, so I picked a value in between. Too few bins might not be suitable given how large the tensors are, while too many bins increase the compute time of the KL divergence. Here is a good article explaining rules for choosing the number of bins: http://www.statisticshowto.com/choose-bin-sizes-statistics/
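
  To make the interplay of the two parameters concrete, here is a minimal, self-contained sketch (not the code in this PR; the helper `num_quantized_bins_for`, the synthetic Laplace data, and the single-candidate evaluation are illustrative assumptions) of how a histogram with `num_bins` buckets can be compared against a `num_quantized_bins`-bucket approximation via KL divergence:

  ```python
  import numpy as np
  from scipy import stats

  def num_quantized_bins_for(num_bits):
      # Hypothetical helper: number of representable values in a symmetric
      # signed range, e.g. 8 bits -> [-127, 127] -> 255, 4 bits -> [-7, 7] -> 15.
      return 2 ** num_bits - 1

  # Build a float histogram of a layer output with `num_bins` buckets.
  arr = np.random.laplace(size=100000).astype(np.float32)
  num_bins = 8001
  num_quantized_bins = num_quantized_bins_for(8)  # 255 for int8
  max_abs = float(np.abs(arr).max())
  hist, edges = np.histogram(arr, bins=num_bins, range=(-max_abs, max_abs))

  # Evaluate one candidate threshold: keep the central `i` bins as the
  # reference distribution p (folding the clipped tails into its end bins),
  # collapse p into `num_quantized_bins` buckets and expand it back as q,
  # then measure how much information quantization would lose via KL(p || q).
  i = 5001                     # candidate width in bins (odd, centered on zero)
  center = num_bins // 2
  lo, hi = center - i // 2, center + i // 2 + 1
  p = hist[lo:hi].astype(np.float64)
  p[0] += hist[:lo].sum()
  p[-1] += hist[hi:].sum()

  q = np.zeros_like(p)
  idx = np.linspace(0, p.size, num_quantized_bins + 1).astype(int)
  for j in range(num_quantized_bins):
      total = p[idx[j]:idx[j + 1]].sum()
      q[idx[j]:idx[j + 1]] = total / (idx[j + 1] - idx[j])

  divergence = stats.entropy(p / p.sum(), q / q.sum())
  threshold = i / 2 * (edges[1] - edges[0])  # candidate threshold in float units
  ```

  In the actual search this evaluation would be repeated over many candidate widths, and the threshold whose quantized approximation gives the smallest divergence would be chosen; a larger `num_bins` only refines the resolution of that search, which is why it has little effect on the result.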

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services