You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2020/03/22 19:50:56 UTC

[GitHub] [incubator-mxnet] sxjscience commented on a change in pull request #17841: Gluon data 2.0: c++ dataloader and built-in image/bbox transforms

sxjscience commented on a change in pull request #17841: Gluon data 2.0: c++ dataloader and built-in image/bbox transforms
URL: https://github.com/apache/incubator-mxnet/pull/17841#discussion_r396133747
 
 

 ##########
 File path: python/mxnet/gluon/data/batchify.py
 ##########
 @@ -0,0 +1,433 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# coding: utf-8
+# pylint: disable=
+"""Batchify function."""
+from __future__ import absolute_import
+
+import warnings
+import numpy as np
+
+from ...context import Context, cpu
+from ... import ndarray as nd
+from ... import numpy as _np
+from ...util import is_np_array
+
+class Stack(object):
+    r"""Stack the input data samples to construct the batch.
+
+    The N input samples must have the same shape/length and will be stacked to construct a batch.
+
+    NOTE(review): `__call__` treats a sample that is a tuple or list as a record of fields and
+    batchifies each field separately, so the first example below (samples given as plain
+    Python lists) looks like it would return a list of four length-3 arrays rather than a
+    single 3x4 array -- confirm which behavior is intended.
+
+    Parameters
+    ----------
+    use_shared_mem : bool, default False
+        If True, the output batch is created in the ``cpu_shared`` context.
+
+    Examples
+    --------
+    >>> from mxnet.gluon.data import batchify
+    >>> # Stack multiple lists
+    >>> a = [1, 2, 3, 4]
+    >>> b = [4, 5, 6, 8]
+    >>> c = [8, 9, 1, 2]
+    >>> batchify.Stack()([a, b, c])
+    [[1. 2. 3. 4.]
+     [4. 5. 6. 8.]
+     [8. 9. 1. 2.]]
+    <NDArray 3x4 @cpu(0)>
+    >>> # Stack multiple numpy.ndarrays
+    >>> import numpy as np
+    >>> a = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
+    >>> b = np.array([[5, 6, 7, 8], [1, 2, 3, 4]])
+    >>> batchify.Stack()([a, b])
+    [[[1. 2. 3. 4.]
+      [5. 6. 7. 8.]]
+     [[5. 6. 7. 8.]
+      [1. 2. 3. 4.]]]
+    <NDArray 2x2x4 @cpu(0)>
+    >>> # Stack multiple NDArrays
+    >>> import mxnet as mx
+    >>> a = mx.nd.array([[1, 2, 3, 4], [5, 6, 7, 8]])
+    >>> b = mx.nd.array([[5, 6, 7, 8], [1, 2, 3, 4]])
+    >>> batchify.Stack()([a, b])
+    [[[1. 2. 3. 4.]
+      [5. 6. 7. 8.]]
+     [[5. 6. 7. 8.]
+      [1. 2. 3. 4.]]]
+    <NDArray 2x2x4 @cpu(0)>
+    """
+    def __init__(self, use_shared_mem=False):
+        self._use_shared_mem = use_shared_mem
+
+    def __call__(self, data):
+        """Batchify the input data.
+
+        The type of the first sample, ``data[0]``, selects how the whole batch is handled.
+
+        Parameters
+        ----------
+        data : list
+            The input data samples
+
+        Returns
+        -------
+        batch_data : NDArray or list
+            A single stacked array, or, when the samples are tuples/lists, a list with one
+            batchified result per field.
+        """
+        # Pick the array module matching the currently active array semantics.
+        _arr = _np if is_np_array() else nd
+        if isinstance(data[0], _arr.NDArray):
+            dtype = data[0].dtype
+            if self._use_shared_mem:
+                # Pre-allocate the output in shared memory and stack directly into it.
+                out = _arr.empty((len(data),) + data[0].shape, dtype=dtype,
+                                  ctx=Context('cpu_shared', 0))
+                return _arr.stack(*data, out=out)
+            else:
+                return _arr.stack(*data)
+        elif isinstance(data[0], (tuple, list)):
+            # Transpose samples into per-field groups and batchify each group recursively.
+            data = zip(*data)
+            return [self.__call__(i) for i in data]
+        else:
+            # Anything else goes through numpy first; its inferred dtype is kept.
+            out = np.asarray(data)
+            dtype = out.dtype
+            if self._use_shared_mem:
+                return _arr.array(out, ctx=Context('cpu_shared', 0), dtype=dtype)
+            else:
+                return _arr.array(out, dtype=dtype)
+
+    def __mx_handle__(self):
+        """Return a new ``StackBatchify`` object from the ``_internal`` module.
+
+        NOTE(review): presumably the backend counterpart of this batchify function -- confirm.
+        """
+        # Imported lazily, at call time rather than at module import time.
+        from ._internal import StackBatchify
+        return StackBatchify()
+
+def _pad_arrs_to_max_length(arrs, pad_val, use_shared_mem, dtype):
+    """Inner Implementation of the Pad batchify
+    Parameters
+    ----------
+    arrs : list
+    pad_val : number
+    use_shared_mem : bool, default False
+    Returns
+    -------
+    ret : NDArray
+    original_length : NDArray
+    """
+    _arr = _np if is_np_array() else nd
+    if isinstance(arrs[0], _arr.NDArray):
+        dtype = arrs[0].dtype if dtype is None else dtype
+        arrs = [arr.asnumpy() for arr in arrs]
+    elif not isinstance(arrs[0], np.ndarray):
+        arrs = [np.asarray(ele) for ele in arrs]
+    else:
+        dtype = arrs[0].dtype if dtype is None else dtype
+
+    ret_shape = list(arrs[0].shape)
+    original_length = []
+    for pad_axis in range(len(ret_shape)):
+        curr_lengths = [ele.shape[pad_axis] for ele in arrs]
+        original_length.append(curr_lengths)
+        max_size = max(curr_lengths)
+        ret_shape[pad_axis] = max_size
+    ret_shape = (len(arrs), ) + tuple(ret_shape)
+
+    ret = np.full(shape=ret_shape, fill_value=pad_val, dtype=dtype)
+
+    for i, arr in enumerate(arrs):
+        if arr.shape == ret_shape[1:]:
+            ret[i] = arr
+        else:
+            slices = [slice(None) for _ in range(arr.ndim)]
+            for pad_axis in range(arr.ndim):
+                slices[pad_axis] = slice(0, arr.shape[pad_axis])
+                assert slices[pad_axis].start != slices[pad_axis].stop
+            slices = [slice(i, i + 1)] + slices
+            ret[tuple(slices)] = arr
+
+
+    ctx = Context('cpu_shared', 0) if use_shared_mem else cpu()
+    ret = _arr.array(ret, ctx=ctx, dtype=dtype)
+    original_length = _arr.array(original_length, ctx=ctx, dtype=np.int32)
+
+    return ret, original_length
+
+
+class Pad(object):
+    """Pad the input ndarrays along the specific padding axis and stack them to get the output.
+    Input of the function will be N samples. Each sample should contain a single element that
+    can be 1) numpy.ndarray, 2) mxnet.nd.NDArray, 3) list of numbers.
+    You can set the `pad_val` to determine the padding value.
+
+    The arrays will be padded to the largest dimensions(at most 5 dimensions to pad) and then
+    stacked to form the final output. In addition, the function will output the original dimensions
+    at the `axis` if ret_length is turned on.
+    Parameters
+    ----------
+    pad_val : float or int, default None
+        The padding value.
+    ret_length : bool, default False
+        Whether to return the valid length in the output.
+    dtype : str or numpy.dtype, default None
+        The value type of the output. If it is set to None, the input data type is used.
+    Examples
+    --------
+    >>> from gluoncv.data import batchify
+    >>> # Inputs are multiple lists
+    >>> a = [1, 2, 3, 4]
+    >>> b = [4, 5, 6]
+    >>> c = [8, 2]
+    >>> batchify.Pad()([a, b, c])
+    [[ 1  2  3  4]
+     [ 4  5  6  0]
+     [ 8  2  0  0]]
+    <NDArray 3x4 @cpu(0)>
+    >>> # Also output the lengths
+    >>> a = [1, 2, 3, 4]
+    >>> b = [4, 5, 6]
+    >>> c = [8, 2]
+    >>> batchify.Pad(ret_length=True)([a, b, c])
+    (
+     [[1 2 3 4]
+      [4 5 6 0]
+      [8 2 0 0]]
+     <NDArray 3x4 @cpu(0)>,
+     [4 3 2]
+     <NDArray 3 @cpu(0)>)
+    >>> # Inputs are multiple ndarrays
+    >>> import numpy as np
+    >>> a = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
+    >>> b = np.array([[5, 8], [1, 2]])
+    >>> batchify.Pad(pad_val=-1)([a, b])
+    [[[ 1  2  3  4]
+      [ 5  6  7  8]]
+     [[ 5  8 -1 -1]
+      [ 1  2 -1 -1]]]
+    <NDArray 2x2x4 @cpu(0)>
+    >>> # Inputs are multiple NDArrays
+    >>> import mxnet as mx
+    >>> a = nd.array([[1, 2, 3, 4], [5, 6, 7, 8]])
+    >>> b = nd.array([[5, 8], [1, 2]])
+    >>> batchify.Pad(pad_val=-1)([a, b])
+    [[[ 1.  2.  3.  4.]
+      [ 5.  6.  7.  8.]]
+     [[ 5.  8. -1. -1.]
+      [ 1.  2. -1. -1.]]]
+    <NDArray 2x2x4 @cpu(0)>
+    """
+    def __init__(self, pad_val=None, ret_length=False, dtype=None, use_shared_mem=False):
 
 Review comment:
   In the new numpy version of GluonNLP, I removed `ret_length` and renamed `pad_val` to `val`. I also removed the warning. In addition, there is a `round_to` flag, as in https://github.com/dmlc/gluon-nlp/blob/3f7465ab5d0f926c2c6f424644daaa30668570ba/src/gluonnlp/data/batchify/batchify.py#L222

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services