You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by jx...@apache.org on 2018/01/03 19:54:01 UTC

[incubator-mxnet] branch master updated: Contrib operators for object-detection bounding box related stuffs (#8603)

This is an automated email from the ASF dual-hosted git repository.

jxie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new 7e7cf3a  Contrib operators for object-detection bounding box related stuffs (#8603)
7e7cf3a is described below

commit 7e7cf3a65ae8c7b6fe80b3eedc89d5220b92815b
Author: Joshua Z. Zhang <ch...@gmail.com>
AuthorDate: Wed Jan 3 11:53:58 2018 -0800

    Contrib operators for object-detection bounding box related stuffs (#8603)
    
    * add box_nms, box_iou, bipartite_matching
    
    * add more test cases
---
 src/operator/contrib/bounding_box-inl.h        | 730 +++++++++++++++++++++++++
 src/operator/contrib/bounding_box.cc           | 194 +++++++
 src/operator/contrib/bounding_box.cu           |  50 ++
 tests/python/unittest/test_contrib_operator.py | 231 ++++++++
 4 files changed, 1205 insertions(+)

diff --git a/src/operator/contrib/bounding_box-inl.h b/src/operator/contrib/bounding_box-inl.h
new file mode 100644
index 0000000..24ee8e7
--- /dev/null
+++ b/src/operator/contrib/bounding_box-inl.h
@@ -0,0 +1,730 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file bounding_box-inl.h
+ * \brief bounding box util functions and operators
+ * \author Joshua Zhang
+*/
+#ifndef MXNET_OPERATOR_CONTRIB_BOUNDING_BOX_INL_H_
+#define MXNET_OPERATOR_CONTRIB_BOUNDING_BOX_INL_H_
+#include <mxnet/operator_util.h>
+#include <dmlc/optional.h>
+#include <nnvm/tuple.h>
+#include <vector>
+#include <utility>
+#include <string>
+#include <algorithm>
+#include "../mshadow_op.h"
+#include "../mxnet_op.h"
+#include "../operator_common.h"
+#include "../tensor/sort_op.h"
+
+namespace mxnet {
+namespace op {
+namespace box_common_enum {
+// Box coordinate encodings: kCorner = [xmin, ymin, xmax, ymax],
+// kCenter = [x, y, width, height].
+enum BoxType {kCorner, kCenter};
+}
+namespace box_nms_enum {
+// Input / output / resource slot aliases for the box_nms operator.
+enum BoxNMSOpInputs {kData};
+enum BoxNMSOpOutputs {kOut, kTemp};
+enum BoxNMSOpResource {kTempSpace};
+}  // namespace box_nms_enum
+
+// Parameters of the box_nms operator: suppression threshold, optional topk
+// restriction, the column layout of each box record (coordinates, score,
+// optional class id), and the input/output coordinate encodings.
+struct BoxNMSParam : public dmlc::Parameter<BoxNMSParam> {
+  float overlap_thresh;  // IoU above which the lower-scored box is suppressed
+  int topk;              // restrict nms to the topk highest scores (-1: all)
+  int coord_start;       // first of the 4 consecutive coordinate columns
+  int score_index;       // column holding the confidence score
+  int id_index;          // column holding the class id (-1: disabled)
+  bool force_suppress;   // if true, suppress regardless of class id
+  int in_format;         // box_common_enum::BoxType of the input boxes
+  int out_format;        // box_common_enum::BoxType of the output boxes
+  DMLC_DECLARE_PARAMETER(BoxNMSParam) {
+    DMLC_DECLARE_FIELD(overlap_thresh).set_default(0.5)
+    .describe("Overlapping(IoU) threshold to suppress object with smaller score.");
+    DMLC_DECLARE_FIELD(topk).set_default(-1)
+    .describe("Apply nms to topk boxes with descending scores, -1 to no restriction.");
+    DMLC_DECLARE_FIELD(coord_start).set_default(2)
+    .describe("Start index of the consecutive 4 coordinates.");
+    DMLC_DECLARE_FIELD(score_index).set_default(1)
+    .describe("Index of the scores/confidence of boxes.");
+    DMLC_DECLARE_FIELD(id_index).set_default(-1)
+    .describe("Optional, index of the class categories, -1 to disable.");
+    DMLC_DECLARE_FIELD(force_suppress).set_default(false)
+    .describe("Optional, if set false and id_index is provided, nms will only apply"
+    " to boxes belongs to the same category");
+    DMLC_DECLARE_FIELD(in_format).set_default(box_common_enum::kCorner)
+    .add_enum("corner", box_common_enum::kCorner)
+    .add_enum("center", box_common_enum::kCenter)
+    .describe("The input box encoding type. \n"
+        " \"corner\" means boxes are encoded as [xmin, ymin, xmax, ymax],"
+        " \"center\" means boxes are encodes as [x, y, width, height].");
+    DMLC_DECLARE_FIELD(out_format).set_default(box_common_enum::kCorner)
+    .add_enum("corner", box_common_enum::kCorner)
+    .add_enum("center", box_common_enum::kCenter)
+    .describe("The output box encoding type. \n"
+        " \"corner\" means boxes are encoded as [xmin, ymin, xmax, ymax],"
+        " \"center\" means boxes are encodes as [x, y, width, height].");
+  }
+};  // BoxNMSParam
+
+// Shape inference for box_nms.
+// Validates that score_index / id_index do not collide with the 4 coordinate
+// columns, then assigns: output 0 = input shape (the filtered boxes), and
+// output 1 = input shape with the last dim collapsed to 1 (the per-box index
+// record consumed by the backward pass).
+inline bool BoxNMSShape(const nnvm::NodeAttrs& attrs,
+                           std::vector<TShape> *in_attrs,
+                           std::vector<TShape> *out_attrs) {
+  const BoxNMSParam& param = nnvm::get<BoxNMSParam>(attrs.parsed);
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 2U);
+  // shapes not known yet on either side: defer inference
+  if (in_attrs->at(0).ndim() == 0U && out_attrs->at(0).ndim() == 0U) {
+    return false;
+  }
+
+  TShape& ishape = (*in_attrs)[0];
+  int indim = ishape.ndim();
+  CHECK(indim >= 2)
+    << "input must have dim >= 2"
+    << " the last two dimensions are num_box and box_width "
+    << ishape << " provided";
+  int width_elem = ishape[indim - 1];
+  // at minimum: score + 4 coordinates, +1 when a class-id column is enabled
+  int expected = 5;
+  // NOTE(review): `> 0` here vs `>= 0` in the conflict checks below — an
+  // id_index of 0 is validated but does not raise the expected element
+  // count; confirm the asymmetry is intended.
+  if (param.id_index > 0) {
+    expected += 1;
+  }
+  CHECK_GE(width_elem, expected)
+    << "the last dimension must have at least 5 elements"
+    << " namely (score, coordinates x 4) "
+    << width_elem << " provided, " << expected << " expected.";
+  // check indices
+  int coord_start = param.coord_start;
+  int coord_end = param.coord_start + 3;
+  int score_index = param.score_index;
+  CHECK(score_index >= 0 && score_index < width_elem)
+    << "score_index: " << score_index << " out of range: (0, "
+    << width_elem << ")";
+  CHECK(score_index < coord_start || score_index > coord_end)
+    << "score_index: " << score_index << " conflict with coordinates: ("
+    << coord_start << ", " << coord_end << ").";
+  CHECK(coord_start >= 0 && coord_end < width_elem)
+    << "coordinates: (" << coord_start << ", " << coord_end
+    << ") out of range:: (0, " << width_elem << ")";
+  if (param.id_index >= 0) {
+    int id_index = param.id_index;
+    CHECK(id_index >= 0 && id_index < width_elem)
+      << "id_index: " << id_index << " out of range: (0, "
+      << width_elem << ")";
+    CHECK(id_index < coord_start || id_index > coord_end)
+      << "id_index: " << id_index << " conflict with coordinates: ("
+      << coord_start << ", " << coord_end << ").";
+    CHECK_NE(id_index, score_index)
+      << "id_index: " << id_index << " conflict with score_index: " << score_index;
+  }
+  TShape oshape = ishape;
+  oshape[indim - 1] = 1;
+  SHAPE_ASSIGN_CHECK(*out_attrs, 0, ishape);  // out_shape[0] == in_shape
+  SHAPE_ASSIGN_CHECK(*out_attrs, 1, oshape);  // out_shape[1]
+  return true;
+}
+
+// Only the first output (the filtered boxes) is user-visible; the second
+// (the index record) exists solely for the backward pass.
+inline uint32_t BoxNMSNumVisibleOutputs(const NodeAttrs& attrs) {
+  return static_cast<uint32_t>(1);
+}
+
+// Kernel: in-place convert box i from corner [xmin, ymin, xmax, ymax] to
+// center [x, y, w, h] encoding. A negative first coordinate is taken as an
+// invalid/suppressed box (the nms output fills such slots with -1) and the
+// entry is left untouched.
+struct corner_to_center {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType *data, int stride) {
+    int index = i * stride;
+    DType left = data[index];
+    if (left < 0) return;  // suppressed/invalid sentinel
+    DType top = data[index+1];
+    DType right = data[index+2];
+    DType bot = data[index+3];
+    data[index] = (left + right) / 2;
+    data[index+1] = (top + bot) / 2;
+    data[index+2] = right - left;
+    data[index+3] = bot - top;
+  }
+};
+
+// Kernel: in-place convert box i from center [x, y, w, h] to corner
+// [xmin, ymin, xmax, ymax] encoding. A negative first coordinate is taken
+// as an invalid/suppressed box and the entry is left untouched.
+struct center_to_corner {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType *data, int stride) {
+    int index = i * stride;
+    DType x = data[index];
+    if (x < 0) return;  // suppressed/invalid sentinel
+    DType y = data[index+1];
+    DType width = data[index+2] / 2;
+    DType height = data[index+3] / 2;
+    data[index] = x - width;
+    data[index+1] = y - height;
+    data[index+2] = x + width;
+    data[index+3] = y + height;
+  }
+};
+
+// Area of a single box under the given encoding (corner or center).
+// Degenerate boxes (negative width or height) yield area 0 so they can never
+// win an IoU comparison.
+template<typename DType>
+MSHADOW_XINLINE DType BoxArea(const DType *box, int encode) {
+  DType a1 = box[0];
+  DType a2 = box[1];
+  DType a3 = box[2];
+  DType a4 = box[3];
+  DType width, height;
+  if (box_common_enum::kCorner == encode) {
+    width = a3 - a1;
+    height = a4 - a2;
+  } else {
+    // center encoding stores width/height directly
+    width = a3;
+    height = a4;
+  }
+  if (width < 0 || height < 0) {
+    return DType(0);
+  } else {
+    return width * height;
+  }
+}
+
+// compute areas specialized for nms to reduce computation
+// One thread per (batch, topk-candidate). The area is written at the box's
+// original (pre-sort) index within `out`, so nms_impl can later look it up
+// through the same sorted-index indirection.
+struct compute_area {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType *out, const DType *in,
+                                  const DType *indices, int topk, int num_elem,
+                                  int stride, int encode) {
+    int b = i / topk;  // batch
+    int k = i % topk;  // rank within the batch's topk candidates
+    int index = static_cast<int>(indices[b * num_elem + k]);
+    int in_index = index * stride;
+    out[index] = BoxArea(in + in_index, encode);
+  }
+};
+
+// compute line intersect along either height or width
+// Reads elements 0 and 2 of each pointer, so calling with `box` measures the
+// x/width overlap and with `box + 1` the y/height overlap. Returns 0 when
+// the segments do not overlap.
+template<typename DType>
+MSHADOW_XINLINE DType Intersect(const DType *a, const DType *b, int encode) {
+  DType a1 = a[0];
+  DType a2 = a[2];
+  DType b1 = b[0];
+  DType b2 = b[2];
+  DType w;
+  if (box_common_enum::kCorner == encode) {
+    DType left = a1 > b1 ? a1 : b1;
+    DType right = a2 < b2 ? a2 : b2;
+    w = right - left;
+  } else {
+    // center encoding: reconstruct the segment endpoints from center + extent
+    DType aw = a2 / 2;
+    DType bw = b2 / 2;
+    DType al = a1 - aw;
+    DType ar = a1 + aw;
+    DType bl = b1 - bw;
+    DType br = b1 + bw;
+    DType left = bl > al ? bl : al;
+    DType right = br < ar ? br : ar;
+    w = right - left;
+  }
+  return w > 0 ? w : DType(0);
+}
+
+/*!
+   * \brief Implementation of the non-maximum suppression operation
+   *
+   * \param i the launched thread index
+   * \param index sorted index in descending order
+   * \param input the input of nms op
+   * \param areas pre-computed box areas
+   * \param k nms topk number
+   * \param ref compare reference position
+   * \param num number of input boxes in each batch
+   * \param stride input stride, usually 6 (id-score-x1-y1-x2-y2)
+   * \param offset_box box offset, usually 2
+   * \param thresh nms threshold
+   * \param force force suppress regardless of class id
+   * \param offset_id class id offset, used when force == false, usually 0
+   * \param encode box encoding type, corner(0) or center(1)
+   * \tparam DType the data type
+   */
+struct nms_impl {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType *index, const DType *input,
+                                  const DType *areas, int k, int ref, int num,
+                                  int stride, int offset_box, int offset_id,
+                                  float thresh, bool force, int encode) {
+    int b = i / k;  // batch
+    int pos = i % k + ref + 1;  // position
+    if (index[b * num + ref] < 0) return;  // reference has been suppressed
+    if (index[b * num + pos] < 0) return;  // self been suppressed
+    int ref_offset = static_cast<int>(index[b * num + ref]) * stride + offset_box;
+    int pos_offset = static_cast<int>(index[b * num + pos]) * stride + offset_box;
+    if (!force && offset_id >=0) {
+      int ref_id = static_cast<int>(input[ref_offset - offset_box + offset_id]);
+      int pos_id = static_cast<int>(input[pos_offset - offset_box + offset_id]);
+      if (ref_id != pos_id) return;  // different class
+    }
+    // IoU = (x-overlap * y-overlap) / (areaA + areaB - intersection);
+    // the `+ 1` shifts Intersect from the x to the y coordinate pair.
+    DType intersect = Intersect(input + ref_offset, input + pos_offset, encode);
+    intersect *= Intersect(input + ref_offset + 1, input + pos_offset + 1, encode);
+    int ref_area_offset = static_cast<int>(index[b * num + ref]);
+    int pos_area_offset = static_cast<int>(index[b * num + pos]);
+    DType iou = intersect / (areas[ref_area_offset] + areas[pos_area_offset] -
+      intersect);
+    if (iou > thresh) {
+      // mark suppressed by negating the index slot
+      index[b * num + pos] = -1;
+    }
+  }
+};
+
+// Kernel: one thread per batch. Compacts the surviving boxes (index >= 0)
+// to the front of the output in score order and stores each survivor's
+// original input index in `record` for the backward pass. Trailing slots
+// keep the -1 pre-fill written by the caller.
+struct nms_assign {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType *out, DType *record, const DType *input,
+                                  const DType *index, int k, int num, int stride) {
+    int count = 0;  // number of survivors copied so far for batch i
+    for (int j = 0; j < k; ++j) {
+      int location = static_cast<int>(index[i * num + j]);
+      if (location >= 0) {
+        // copy to output
+        int out_location = (i * num + count) * stride;
+        int in_location = location * stride;
+        for (int s = 0; s < stride; ++s) {
+          out[out_location + s] = input[in_location + s];
+        }
+        // keep the index in the record for backward
+        record[i * num + count] = location;
+        ++count;
+      }
+    }
+  }
+};
+
+
+// Kernel: scatter each output-gradient row back to its original input
+// position using the forward `record`. Suppressed entries (record < 0)
+// receive no gradient; the caller pre-zeroes in_grad.
+struct nms_backward {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType *in_grad, const DType *out_grad,
+                                  const DType *record, int num, int stride) {
+    int index = static_cast<int>(record[i]);
+    if (index < 0) return;  // this slot was suppressed in forward
+    int loc = index * stride;
+    int from_loc = i * stride;
+    for (int j = 0; j < stride; ++j) {
+      in_grad[loc + j] = out_grad[from_loc + j];
+    }
+  }
+};
+
+/*!
+ * \brief Forward pass of box_nms.
+ *
+ * Pipeline: (1) view the input as (batch, num_elem, width_elem);
+ * (2) build per-batch descending-score orderings of box indices;
+ * (3) pre-compute candidate areas; (4) suppress overlapping boxes round by
+ * round with nms_impl; (5) compact survivors into `out` and store their
+ * source indices in `record` for backward; (6) convert the coordinate
+ * encoding if in_format != out_format.
+ */
+template<typename xpu>
+void BoxNMSForward(const nnvm::NodeAttrs& attrs,
+                const OpContext& ctx,
+                const std::vector<TBlob>& inputs,
+                const std::vector<OpReqType>& req,
+                const std::vector<TBlob>& outputs) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  using namespace mxnet_op;
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 2U) << "BoxNMS output: [output, temp]";
+  const BoxNMSParam& param = nnvm::get<BoxNMSParam>(attrs.parsed);
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  TShape in_shape = inputs[box_nms_enum::kData].shape_;
+  int indim = in_shape.ndim();
+  // all leading dims collapse into a single batch dim
+  int num_batch = indim <= 2? 1 : in_shape.ProdShape(0, indim - 2);
+  int num_elem = in_shape[indim - 2];
+  int width_elem = in_shape[indim - 1];
+  MSHADOW_SGL_DBL_TYPE_SWITCH(outputs[0].type_flag_, DType, {
+    Tensor<xpu, 3, DType> data = inputs[box_nms_enum::kData]
+     .get_with_shape<xpu, 3, DType>(Shape3(num_batch, num_elem, width_elem), s);
+    Tensor<xpu, 3, DType> out = outputs[box_nms_enum::kOut]
+     .get_with_shape<xpu, 3, DType>(Shape3(num_batch, num_elem, width_elem), s);
+    Tensor<xpu, 3, DType> record = outputs[box_nms_enum::kTemp]
+     .get_with_shape<xpu, 3, DType>(Shape3(num_batch, num_elem, 1), s);
+
+    // prepare workspace
+    // layout: [sorted_index | scores | batch_id | areas], each num_batch*num_elem,
+    // plus a full input copy when writing in place (data would be clobbered)
+    Shape<1> sort_index_shape = Shape1(num_batch * num_elem);
+    Shape<3> buffer_shape = Shape3(num_batch, num_elem, width_elem);
+    index_t workspace_size = 4 * sort_index_shape.Size();
+    if (req[0] == kWriteInplace) {
+      workspace_size += buffer_shape.Size();
+    }
+    Tensor<xpu, 1, DType> workspace = ctx.requested[box_nms_enum::kTempSpace]
+      .get_space_typed<xpu, 1, DType>(Shape1(workspace_size), s);
+    Tensor<xpu, 1, DType> sorted_index(workspace.dptr_, sort_index_shape, s);
+    Tensor<xpu, 1, DType> scores(sorted_index.dptr_ + sorted_index.MSize(),
+      sort_index_shape, s);
+    Tensor<xpu, 1, DType> batch_id(scores.dptr_ + scores.MSize(), sort_index_shape,
+      s);
+    Tensor<xpu, 1, DType> areas(batch_id.dptr_ + batch_id.MSize(), sort_index_shape, s);
+    Tensor<xpu, 3, DType> buffer = data;
+    if (req[0] == kWriteInplace) {
+      // make copy
+      buffer = Tensor<xpu, 3, DType>(areas.dptr_ + areas.MSize(), buffer_shape, s);
+      buffer = F<mshadow_op::identity>(data);
+    }
+
+    // column indices of the score / coordinate / class-id fields
+    int score_index = param.score_index;
+    int coord_start = param.coord_start;
+    int id_index = param.id_index;
+
+    // sort topk
+    int topk = param.topk < 0? num_elem : std::min(num_elem, param.topk);
+    if (topk < 1) {
+      // nothing to suppress: pass input through, record is the identity map
+      out = F<mshadow_op::identity>(buffer);
+      record = reshape(range<DType>(0, num_batch * num_elem), record.shape_);
+      return;
+    }
+    // global sort by score (descending), then stable re-sort by batch id so
+    // indices end up grouped per batch while staying score-ordered within
+    // each batch; batch_id is recomputed because the first stable sort only
+    // reorders `scores`, the second reorders `sorted_index`.
+    scores = reshape(slice<2>(buffer, score_index, score_index + 1), scores.shape_);
+    sorted_index = range<DType>(0, num_batch * num_elem);
+    mxnet::op::SortByKey(scores, sorted_index, false);
+    batch_id = F<mshadow_op::floor>(sorted_index / ScalarExp<DType>(num_elem));
+    mxnet::op::SortByKey(batch_id, scores, true);
+    batch_id = F<mshadow_op::floor>(sorted_index / ScalarExp<DType>(num_elem));
+    mxnet::op::SortByKey(batch_id, sorted_index, true);
+
+    // pre-compute areas of candidates
+    areas = 0;
+    Kernel<compute_area, xpu>::Launch(s, num_batch * topk, areas.dptr_,
+     buffer.dptr_ + coord_start, sorted_index.dptr_, topk, num_elem, width_elem,
+     param.in_format);
+
+    // apply nms
+    // go through each box as reference, suppress if overlap > threshold
+    // sorted_index with -1 is marked as suppressed
+    for (int ref = 0; ref < topk; ++ref) {
+      int num_worker = topk - ref - 1;  // boxes ranked below the reference
+      if (num_worker < 1) continue;
+      Kernel<nms_impl, xpu>::Launch(s, num_batch * num_worker, sorted_index.dptr_,
+        buffer.dptr_, areas.dptr_, num_worker, ref, num_elem, width_elem,
+        coord_start, id_index, param.overlap_thresh, param.force_suppress, param.in_format);
+    }
+
+    // store the results to output, keep a record for backward
+    record = -1;
+    out = -1;
+    Kernel<nms_assign, xpu>::Launch(s, num_batch, out.dptr_, record.dptr_,
+      buffer.dptr_, sorted_index.dptr_, topk, num_elem, width_elem);
+
+    // convert encoding
+    if (param.in_format != param.out_format) {
+      if (box_common_enum::kCenter == param.out_format) {
+        Kernel<corner_to_center, xpu>::Launch(s, num_batch * num_elem,
+          out.dptr_ + coord_start, width_elem);
+      } else {
+        Kernel<center_to_corner, xpu>::Launch(s, num_batch * num_elem,
+          out.dptr_ + coord_start, width_elem);
+      }
+    }
+  });
+}
+
+/*!
+ * \brief Backward pass of box_nms: routes each output gradient row back to
+ * the input row it was copied from (via the forward `record`); suppressed
+ * rows get zero gradient.
+ */
+template<typename xpu>
+void BoxNMSBackward(const nnvm::NodeAttrs& attrs,
+                 const OpContext& ctx,
+                 const std::vector<TBlob>& inputs,
+                 const std::vector<OpReqType>& req,
+                 const std::vector<TBlob>& outputs) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  using namespace mxnet_op;
+  CHECK_EQ(inputs.size(), 4U);
+  CHECK_EQ(outputs.size(), 1U);
+  const BoxNMSParam& param = nnvm::get<BoxNMSParam>(attrs.parsed);
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  TShape in_shape = outputs[box_nms_enum::kData].shape_;
+  int indim = in_shape.ndim();
+  int num_batch = indim <= 2? 1 : in_shape.ProdShape(0, indim - 2);
+  int num_elem = in_shape[indim - 2];
+  int width_elem = in_shape[indim - 1];
+  MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, {
+    Tensor<xpu, 3, DType> out_grad = inputs[box_nms_enum::kOut]
+     .get_with_shape<xpu, 3, DType>(Shape3(num_batch, num_elem, width_elem), s);
+    Tensor<xpu, 3, DType> in_grad = outputs[box_nms_enum::kData]
+     .get_with_shape<xpu, 3, DType>(Shape3(num_batch, num_elem, width_elem), s);
+    // NOTE(review): kTemp + 2 (== slot 3) presumably skips entries packed
+    // before the forward `record` in the 4-element gradient input list —
+    // confirm against the op's gradient registration in bounding_box.cc.
+    Tensor<xpu, 3, DType> record = inputs[box_nms_enum::kTemp + 2]
+     .get_with_shape<xpu, 3, DType>(Shape3(num_batch, num_elem, 1), s);
+
+    in_grad = 0;  // suppressed slots keep zero gradient
+    Kernel<nms_backward, xpu>::Launch(s, num_batch * num_elem, in_grad.dptr_,
+      out_grad.dptr_, record.dptr_, num_elem, width_elem);
+  });
+}
+
+// Parameters of the box_iou operator: only the coordinate encoding of the
+// (shared) input format.
+struct BoxOverlapParam : public dmlc::Parameter<BoxOverlapParam> {
+  int format;  // box_common_enum::BoxType of both operands
+  DMLC_DECLARE_PARAMETER(BoxOverlapParam) {
+    DMLC_DECLARE_FIELD(format).set_default(box_common_enum::kCorner)
+    .add_enum("corner", box_common_enum::kCorner)
+    .add_enum("center", box_common_enum::kCenter)
+    .describe("The box encoding type. \n"
+        " \"corner\" means boxes are encoded as [xmin, ymin, xmax, ymax],"
+        " \"center\" means boxes are encodes as [x, y, width, height].");
+  }
+};  // BoxOverlapParam
+
+// Shape inference for box_iou: both inputs must end in a length-4 box dim;
+// the output shape is the concatenation of lhs dims (minus the last) and
+// rhs dims (minus the last), i.e. one IoU per (lhs box, rhs box) pair.
+inline bool BoxOverlapShape(const nnvm::NodeAttrs& attrs,
+                           std::vector<TShape> *in_attrs,
+                           std::vector<TShape> *out_attrs) {
+  // const BoxOverlapParam& param = nnvm::get<BoxOverlapParam>(attrs.parsed);
+  CHECK_EQ(in_attrs->size(), 2U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  TShape& lshape = (*in_attrs)[0];
+  TShape& rshape = (*in_attrs)[1];
+
+  CHECK_GE(lshape.ndim(), 2)
+    << "lhs must have dim >= 2 "
+    << lshape.ndim() << " provided";
+  int ldim = lshape[lshape.ndim() - 1];
+  CHECK_EQ(ldim, 4)
+    << "last dimension of lhs must be 4 "
+    << ldim << " provided";
+  CHECK_GE(rshape.ndim(), 2)
+    << "rhs must have dim >= 2 "
+    << rshape.ndim() << " provided";
+  int rdim = rshape[rshape.ndim() - 1];
+  CHECK_EQ(rdim, 4)
+    << "last dimension of rhs must be 4 "
+    << rdim << " provided";
+
+  // assign output shape
+  TShape oshape(lshape.ndim() + rshape.ndim() - 2);
+  int idx = 0;
+  for (index_t i = 0; i < lshape.ndim() - 1; ++i) {
+    oshape[idx++] = lshape[i];
+  }
+  for (index_t i = 0; i < rshape.ndim() - 1; ++i) {
+    oshape[idx++] = rshape[i];
+  }
+  SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape);
+  return true;
+}
+
+// Kernel: one thread per (lhs box, rhs box) pair; i = l * num + r where
+// num is the rhs box count. Writes IoU(lhs[l], rhs[r]) into out[i].
+struct compute_overlap {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType *out, const DType *lhs,
+                                  const DType *rhs, int num,
+                                  int begin, int stride, int encode) {
+    int l = i / num;  // lhs box index
+    int r = i % num;  // rhs box index
+    int l_index = l * stride + begin;
+    int r_index = r * stride + begin;
+    // intersection = x-overlap * y-overlap (`+ 1` shifts to the y pair)
+    DType intersect = Intersect(lhs + l_index, rhs + r_index, encode);
+    intersect *= Intersect(lhs + l_index + 1, rhs + r_index + 1, encode);
+    if (intersect <= 0) {
+      out[i] = DType(0);  // disjoint: skip the area computation
+      return;
+    }
+    DType l_area = BoxArea(lhs + l_index, encode);
+    DType r_area = BoxArea(rhs + r_index, encode);
+    out[i] = intersect / (l_area + r_area - intersect);
+  }
+};
+
+/*!
+ * \brief Forward pass of box_iou: flattens both inputs to lists of 4-element
+ * boxes and computes the full pairwise IoU matrix (lsize x rsize).
+ */
+template<typename xpu>
+void BoxOverlapForward(const nnvm::NodeAttrs& attrs,
+                const OpContext& ctx,
+                const std::vector<TBlob>& inputs,
+                const std::vector<OpReqType>& req,
+                const std::vector<TBlob>& outputs) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  using namespace mxnet_op;
+  CHECK_EQ(inputs.size(), 2U);
+  CHECK_EQ(outputs.size(), 1U);
+  const BoxOverlapParam& param = nnvm::get<BoxOverlapParam>(attrs.parsed);
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  TShape lshape = inputs[0].shape_;
+  TShape rshape = inputs[1].shape_;
+  // number of boxes on each side (all dims except the trailing 4)
+  int lsize = lshape.ProdShape(0, lshape.ndim() - 1);
+  int rsize = rshape.ProdShape(0, rshape.ndim() - 1);
+  MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, {
+    Tensor<xpu, 1, DType> lhs = inputs[0]
+     .get_with_shape<xpu, 1, DType>(Shape1(lsize * 4), s);
+    Tensor<xpu, 1, DType> rhs = inputs[1]
+     .get_with_shape<xpu, 1, DType>(Shape1(rsize * 4), s);
+    Tensor<xpu, 1, DType> out = outputs[0]
+     .get_with_shape<xpu, 1, DType>(Shape1(lsize * rsize), s);
+
+    // begin = 0 and stride = 4: boxes are packed as bare coordinates
+    Kernel<compute_overlap, xpu>::Launch(s, lsize * rsize, out.dptr_,
+     lhs.dptr_, rhs.dptr_, rsize, 0, 4, param.format);
+  });
+}
+
+/*!
+ * \brief Backward pass of box_iou: the gradient is defined as zero for both
+ * operands (IoU is treated as non-differentiable here).
+ */
+template<typename xpu>
+void BoxOverlapBackward(const nnvm::NodeAttrs& attrs,
+                        const OpContext& ctx,
+                        const std::vector<TBlob>& inputs,
+                        const std::vector<OpReqType>& req,
+                        const std::vector<TBlob>& outputs) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  using namespace mxnet_op;
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 2U);
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, {
+    Tensor<xpu, 2, DType> in_grad_lhs = outputs[0].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> in_grad_rhs = outputs[1].FlatTo2D<xpu, DType>(s);
+    // TODO(Joshua Zhang): allow backprop?
+    in_grad_lhs = 0;
+    in_grad_rhs = 0;
+  });
+}
+
+// Parameters of the bipartite_matching operator: score ordering direction,
+// the score cutoff, and an optional cap on the number of matches.
+struct BipartiteMatchingParam : public dmlc::Parameter<BipartiteMatchingParam> {
+  bool is_ascend;   // match lowest scores first instead of highest
+  float threshold;  // stop matching once scores cross this cutoff
+  int topk;         // cap on matches per batch (-1: unlimited)
+  DMLC_DECLARE_PARAMETER(BipartiteMatchingParam) {
+    DMLC_DECLARE_FIELD(is_ascend).set_default(false)
+    .describe("Use ascend order for scores instead of descending. "
+    "Please set threshold accordingly.");
+    DMLC_DECLARE_FIELD(threshold)
+    .describe("Ignore matching when score < thresh, if is_ascend=false, "
+    "or ignore score > thresh, if is_ascend=true.");
+    DMLC_DECLARE_FIELD(topk).set_default(-1)
+    .describe("Limit the number of matches to topk, set -1 for no limit");
+  }
+};  // BipartiteMatchingParam
+
+// Shape inference for bipartite_matching. For a (..., row, col) score
+// matrix: output 0 (row markers) is (..., row) — the input with the last
+// dim dropped — and output 1 (col markers) is (..., col) — the same but
+// with the trailing dim replaced by the column count.
+inline bool MatchingShape(const nnvm::NodeAttrs& attrs,
+                          std::vector<TShape> *in_attrs,
+                          std::vector<TShape> *out_attrs) {
+  // const BipartiteMatchingParam& param = nnvm::get<BipartiteMatchingParam>(attrs.parsed);
+  CHECK_EQ(in_attrs->size(), 1U);
+  CHECK_EQ(out_attrs->size(), 2U);
+  TShape& dshape = (*in_attrs)[0];
+
+  CHECK_GE(dshape.ndim(), 2)
+    << "score matrix must have dim >= 2 "
+    << dshape.ndim() << " provided";
+
+  // assign output shape
+  TShape oshape(dshape.ndim() - 1);
+  for (index_t i = 0; i < dshape.ndim() - 1; ++i) {
+    oshape[i] = dshape[i];
+  }
+  SHAPE_ASSIGN_CHECK(*out_attrs, 0, oshape);
+  // reuse oshape for output 1 by swapping in the column count
+  oshape[oshape.ndim() - 1] = dshape[dshape.ndim() - 1];
+  SHAPE_ASSIGN_CHECK(*out_attrs, 1, oshape);
+  return true;
+}
+
+// Kernel: one thread per batch. Greedy bipartite matching: walk the
+// globally sorted score indices and pair (row, col) whenever both sides are
+// still unmatched. Markers store the matched partner index, -1 when
+// unmatched (the caller pre-fills -1). Because indices are score-sorted,
+// the first below-threshold score terminates the whole batch.
+struct bipartite_matching {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType *row_marker, DType *col_marker,
+                                  const DType *scores, const DType *sorted_index,
+                                  int num_batch, int num_row, int num_col,
+                                  float threshold, bool is_ascend, int topk) {
+    int stride = num_row * num_col;
+    const DType *index = sorted_index + i * stride;
+    const DType *score = scores + i * stride;
+    DType *rmarker = row_marker + i * num_row;
+    DType *cmarker = col_marker + i * num_col;
+    int count = 0;
+    for (int j = 0; j < stride; ++j) {
+      // sorted_index holds global positions; % stride recovers the
+      // within-batch flat (row, col) index
+      int idx = static_cast<int>(index[j]) % stride;
+      int r = idx / num_col;
+      int c = idx % num_col;
+      if (rmarker[r] == -1 && cmarker[c] == -1) {
+        if ((!is_ascend && score[j] > threshold) ||
+            (is_ascend && score[j] < threshold)) {
+          rmarker[r] = c;
+          cmarker[c] = r;
+          ++count;
+          // NOTE(review): the match is recorded before this check and the
+          // comparison is `count > topk`, so up to topk + 1 matches can be
+          // produced — likely an off-by-one (`count >= topk` intended).
+          if (topk > 0 && count > topk) {
+            break;
+          }
+        } else {
+          // already encounter bad scores
+          break;
+        }
+      }
+    }
+  }
+};
+
+/*!
+ * \brief Forward pass of bipartite_matching: sort all scores (per batch,
+ * via a global sort followed by a stable re-sort on batch id), then run the
+ * greedy matching kernel with one thread per batch.
+ */
+template<typename xpu>
+void BipartiteMatchingForward(const nnvm::NodeAttrs& attrs,
+                             const OpContext& ctx,
+                             const std::vector<TBlob>& inputs,
+                             const std::vector<OpReqType>& req,
+                             const std::vector<TBlob>& outputs) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  using namespace mxnet_op;
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 2U);
+  const BipartiteMatchingParam& param = nnvm::get<BipartiteMatchingParam>(attrs.parsed);
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  TShape dshape = inputs[0].shape_;
+  int row = dshape[dshape.ndim() - 2];
+  int col = dshape[dshape.ndim() - 1];
+  int batch_size = dshape.Size() / row / col;  // all leading dims collapsed
+  MSHADOW_SGL_DBL_TYPE_SWITCH(outputs[0].type_flag_, DType, {
+    Tensor<xpu, 1, DType> scores = inputs[0]
+     .get_with_shape<xpu, 1, DType>(Shape1(dshape.Size()), s);
+    Tensor<xpu, 2, DType> row_marker = outputs[0]
+     .get_with_shape<xpu, 2, DType>(Shape2(batch_size, row), s);
+    Tensor<xpu, 2, DType> col_marker = outputs[1]
+     .get_with_shape<xpu, 2, DType>(Shape2(batch_size, col), s);
+    // workspace layout: [sorted_index | batch_id | scores_copy]
+    Shape<1> sort_index_shape = Shape1(dshape.Size());
+    index_t workspace_size = sort_index_shape.Size() * 3;
+    Tensor<xpu, 1, DType> workspace = ctx.requested[0]
+      .get_space_typed<xpu, 1, DType>(Shape1(workspace_size), s);
+    Tensor<xpu, 1, DType> sorted_index(workspace.dptr_,
+      sort_index_shape, s);
+    Tensor<xpu, 1, DType> batch_id(sorted_index.dptr_ + sorted_index.MSize(),
+      sort_index_shape, s);
+    Tensor<xpu, 1, DType> scores_copy(batch_id.dptr_ + batch_id.MSize(),
+      sort_index_shape, s);
+
+    // sort according to score
+    // (copy first: SortByKey reorders its key argument in place)
+    scores_copy = F<mshadow_op::identity>(scores);
+    sorted_index = range<DType>(0, dshape.Size());
+    mxnet::op::SortByKey(scores_copy, sorted_index, param.is_ascend);
+    // stable re-sort on batch id groups indices per batch while preserving
+    // the score ordering inside each batch; batch_id is recomputed because
+    // the first stable sort only reorders scores_copy
+    batch_id = F<mshadow_op::floor>(sorted_index / ScalarExp<DType>(row * col));
+    mxnet::op::SortByKey(batch_id, scores_copy, true);
+    batch_id = F<mshadow_op::floor>(sorted_index / ScalarExp<DType>(row * col));
+    mxnet::op::SortByKey(batch_id, sorted_index, true);
+
+    // bipartite matching, parallelization is limited to batch_size
+    row_marker = -1;
+    col_marker = -1;
+    Kernel<bipartite_matching, xpu>::Launch(s, batch_size, row_marker.dptr_,
+     col_marker.dptr_, scores_copy.dptr_, sorted_index.dptr_, batch_size, row, col,
+     param.threshold, param.is_ascend, param.topk);
+  });
+}
+
+/*!
+ * \brief Backward pass of bipartite_matching: the gradient w.r.t. the score
+ * matrix is defined as zero (matching is non-differentiable).
+ */
+template<typename xpu>
+void BipartiteMatchingBackward(const nnvm::NodeAttrs& attrs,
+                              const OpContext& ctx,
+                              const std::vector<TBlob>& inputs,
+                              const std::vector<OpReqType>& req,
+                              const std::vector<TBlob>& outputs) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  using namespace mxnet_op;
+  CHECK_EQ(inputs.size(), 2U);
+  CHECK_EQ(outputs.size(), 1U);
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, {
+    Tensor<xpu, 2, DType> in_grad = outputs[0].FlatTo2D<xpu, DType>(s);
+    // TODO(Joshua Zhang): allow backprop?
+    in_grad = 0;
+  });
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_CONTRIB_BOUNDING_BOX_INL_H_
diff --git a/src/operator/contrib/bounding_box.cc b/src/operator/contrib/bounding_box.cc
new file mode 100644
index 0000000..36dc3db
--- /dev/null
+++ b/src/operator/contrib/bounding_box.cc
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+ /*!
+  *  Copyright (c) 2017 by Contributors
+  * \file bounding_box.cc
+  * \brief Bounding box util functions and operators
+  * \author Joshua Zhang
+  */
+
+#include "./bounding_box-inl.h"
+#include "../elemwise_op_common.h"
+
+namespace mxnet {
+namespace op {
+DMLC_REGISTER_PARAMETER(BoxNMSParam);
+DMLC_REGISTER_PARAMETER(BoxOverlapParam);
+DMLC_REGISTER_PARAMETER(BipartiteMatchingParam);
+
+// NMS forward registration. Doc fixes: the example referenced a nonexistent
+// `out_typ` parameter (actual name: `out_format`), plus spelling/grammar.
+NNVM_REGISTER_OP(_contrib_box_nms)
+.add_alias("_contrib_box_non_maximum_suppression")
+.describe(R"code(Apply non-maximum suppression to input.
+
+The output will be sorted in descending order according to `score`. Boxes with
+overlaps larger than `overlap_thresh` and smaller scores will be removed and
+filled with -1, the corresponding position will be recorded for backward propagation.
+
+During back-propagation, the gradient will be copied to the original
+position according to the input index. For positions that have been suppressed,
+the in_grad will be assigned 0.
+In summary, gradients stick to their boxes and will either be moved or discarded
+according to their original index in the input.
+
+Input requirements:
+1. Input tensor must have at least 2 dimensions, (n, k), any higher dims will be regarded
+as batch, e.g. (a, b, c, d, n, k) == (a*b*c*d, n, k)
+2. n is the number of boxes in each batch
+3. k is the width of each box item.
+
+By default, a box is [id, score, xmin, ymin, xmax, ymax, ...],
+additional elements are allowed.
+- `id_index`: optional, use -1 to ignore, useful if `force_suppress=False`, which means
+we will skip highly overlapped boxes if one is `apple` while the other is `car`.
+- `coord_start`: required, default=2, the starting index of the 4 coordinates.
+Two formats are supported:
+  `corner`: [xmin, ymin, xmax, ymax]
+  `center`: [x, y, width, height]
+- `score_index`: required, default=1, box score/confidence.
+When two boxes overlap IOU > `overlap_thresh`, the one with smaller score will be suppressed.
+- `in_format` and `out_format`: default='corner', specify in/out box formats.
+
+Examples::
+
+  x = [[0, 0.5, 0.1, 0.1, 0.2, 0.2], [1, 0.4, 0.1, 0.1, 0.2, 0.2],
+       [0, 0.3, 0.1, 0.1, 0.14, 0.14], [2, 0.6, 0.5, 0.5, 0.7, 0.8]]
+  box_nms(x, overlap_thresh=0.1, coord_start=2, score_index=1, id_index=0,
+      force_suppress=True, in_format='corner', out_format='corner') =
+      [[2, 0.6, 0.5, 0.5, 0.7, 0.8], [0, 0.5, 0.1, 0.1, 0.2, 0.2],
+       [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]
+  out_grad = [[0.1, 0.1, 0.1, 0.1, 0.1, 0.1], [0.2, 0.2, 0.2, 0.2, 0.2, 0.2],
+              [0.3, 0.3, 0.3, 0.3, 0.3, 0.3], [0.4, 0.4, 0.4, 0.4, 0.4, 0.4]]
+  # exe.backward
+  in_grad = [[0.2, 0.2, 0.2, 0.2, 0.2, 0.2], [0, 0, 0, 0, 0, 0],
+             [0, 0, 0, 0, 0, 0], [0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]
+
+)code" ADD_FILELINE)
+.set_num_inputs(1)
+.set_num_outputs(2)
+.set_attr_parser(ParamParser<BoxNMSParam>)
+.set_attr<nnvm::FNumVisibleOutputs>("FNumVisibleOutputs", BoxNMSNumVisibleOutputs)
+.set_attr<nnvm::FInferShape>("FInferShape", BoxNMSShape)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 2>)
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+.set_attr<FCompute>("FCompute<cpu>", BoxNMSForward<cpu>)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseOut{"_backward_contrib_box_nms"})
+.add_argument("data", "NDArray-or-Symbol", "The input")
+.add_arguments(BoxNMSParam::__FIELDS__());
+
+// Backward pass of box_nms: per the forward op's doc, gradients are copied
+// back to the original box positions via the recorded index output, with
+// suppressed positions receiving zero (see BoxNMSBackward).
+NNVM_REGISTER_OP(_backward_contrib_box_nms)
+.set_num_inputs(3)
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<BoxNMSParam>)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FCompute>("FCompute<cpu>", BoxNMSBackward<cpu>)
+.add_arguments(BoxNMSParam::__FIELDS__());
+
+// Pairwise Intersection-over-Union between two sets of boxes.
+NNVM_REGISTER_OP(_contrib_box_iou)
+.describe(R"doc(Bounding box overlap of two arrays.
+  The overlap is defined as Intersection-over-Union, aka, IOU.
+  - lhs: (a_1, a_2, ..., a_n, 4) array
+  - rhs: (b_1, b_2, ..., b_n, 4) array
+  - output: (a_1, a_2, ..., a_n, b_1, b_2, ..., b_n) array
+
+  Note::
+
+    Zero gradients are back-propagated in this op for now.
+
+  Example::
+
+    x = [[0.5, 0.5, 1.0, 1.0], [0.0, 0.0, 0.5, 0.5]]
+    y = [0.25, 0.25, 0.75, 0.75]
+    box_iou(x, y, format='corner') = [[0.1428], [0.1428]]
+
+)doc" ADD_FILELINE)
+.set_num_inputs(2)
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<BoxOverlapParam>)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"lhs", "rhs"};
+  })
+.set_attr<nnvm::FInferShape>("FInferShape", BoxOverlapShape)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<2, 1>)
+.set_attr<FCompute>("FCompute<cpu>", BoxOverlapForward<cpu>)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_contrib_box_iou"})
+.add_argument("lhs", "NDArray-or-Symbol", "The first input")
+.add_argument("rhs", "NDArray-or-Symbol", "The second input")
+.add_arguments(BoxOverlapParam::__FIELDS__());
+
+// Backward pass: zero gradients, as stated in the forward op's doc note.
+NNVM_REGISTER_OP(_backward_contrib_box_iou)
+.set_num_inputs(2)
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<BoxOverlapParam>)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FCompute>("FCompute<cpu>", BoxOverlapBackward<cpu>)
+.add_arguments(BoxOverlapParam::__FIELDS__());
+
+// Bipartite matching registration. Doc fix: the example defined the score
+// matrix as `s` but then called `bipartite_matching(x, ...)`; now calls with `s`.
+NNVM_REGISTER_OP(_contrib_bipartite_matching)
+.describe(R"doc(Compute bipartite matching.
+  The matching is performed on score matrix with shape [B, N, M]
+  - B: batch_size
+  - N: number of rows to match
+  - M: number of columns as reference to be matched against.
+
+  Returns:
+  x : matched column indices. -1 indicating non-matched elements in rows.
+  y : matched row indices.
+
+  Note::
+
+    Zero gradients are back-propagated in this op for now.
+
+  Example::
+
+    s = [[0.5, 0.6], [0.1, 0.2], [0.3, 0.4]]
+    x, y = bipartite_matching(s, threshold=1e-12, is_ascend=False)
+    x = [1, -1, 0]
+    y = [2, 0]
+
+)doc" ADD_FILELINE)
+.set_num_inputs(1)
+.set_num_outputs(2)
+.set_attr_parser(ParamParser<BipartiteMatchingParam>)
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+  })
+.set_attr<nnvm::FInferShape>("FInferShape", MatchingShape)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 2>)
+.set_attr<FCompute>("FCompute<cpu>", BipartiteMatchingForward<cpu>)
+.set_attr<nnvm::FGradient>("FGradient",
+  ElemwiseGradUseNone{"_backward_contrib_bipartite_matching"})
+.add_argument("data", "NDArray-or-Symbol", "The input")
+.add_arguments(BipartiteMatchingParam::__FIELDS__());
+
+// Backward pass: zero gradients (matching indices are not differentiable;
+// see the forward op's doc note and BipartiteMatchingBackward).
+NNVM_REGISTER_OP(_backward_contrib_bipartite_matching)
+.set_num_inputs(2)
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<BipartiteMatchingParam>)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FCompute>("FCompute<cpu>", BipartiteMatchingBackward<cpu>)
+.add_arguments(BipartiteMatchingParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu
new file mode 100644
index 0000000..6662d932
--- /dev/null
+++ b/src/operator/contrib/bounding_box.cu
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+ /*!
+  *  Copyright (c) 2017 by Contributors
+  * \file bounding_box.cu
+  * \brief Bounding box util functions and operators
+  * \author Joshua Zhang
+  */
+
+#include "./bounding_box-inl.h"
+#include "../elemwise_op_common.h"
+
+namespace mxnet {
+namespace op {
+// GPU registrations only attach FCompute<gpu>; the kernels are the same
+// templated implementations included from bounding_box-inl.h.
+NNVM_REGISTER_OP(_contrib_box_nms)
+.set_attr<FCompute>("FCompute<gpu>", BoxNMSForward<gpu>);
+
+NNVM_REGISTER_OP(_backward_contrib_box_nms)
+.set_attr<FCompute>("FCompute<gpu>", BoxNMSBackward<gpu>);
+
+NNVM_REGISTER_OP(_contrib_box_iou)
+.set_attr<FCompute>("FCompute<gpu>", BoxOverlapForward<gpu>);
+
+NNVM_REGISTER_OP(_backward_contrib_box_iou)
+.set_attr<FCompute>("FCompute<gpu>", BoxOverlapBackward<gpu>);
+
+NNVM_REGISTER_OP(_contrib_bipartite_matching)
+.set_attr<FCompute>("FCompute<gpu>", BipartiteMatchingForward<gpu>);
+
+NNVM_REGISTER_OP(_backward_contrib_bipartite_matching)
+.set_attr<FCompute>("FCompute<gpu>", BipartiteMatchingBackward<gpu>);
+}  // namespace op
+}  // namespace mxnet
diff --git a/tests/python/unittest/test_contrib_operator.py b/tests/python/unittest/test_contrib_operator.py
new file mode 100644
index 0000000..800426c
--- /dev/null
+++ b/tests/python/unittest/test_contrib_operator.py
@@ -0,0 +1,231 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# pylint: skip-file
+from __future__ import print_function
+import numpy as np
+import mxnet as mx
+import random
+import itertools
+from numpy.testing import assert_allclose, assert_array_equal
+from mxnet.test_utils import *
+import unittest
+
+def test_box_nms_op():
+    """Test contrib box_nms forward and backward against hand-built cases."""
+    def test_box_nms_forward(data, expected, thresh=0.5, topk=-1, coord=2, score=1, cid=0,
+                         force=False, in_format='corner', out_format='corner'):
+        # Imperative (NDArray) forward check: output must match `expected`
+        # exactly; suppressed boxes appear as rows of -1.
+        data = mx.nd.array(data)
+        out = mx.contrib.nd.box_nms(data, overlap_thresh=thresh, topk=topk,
+                                coord_start=coord, score_index=score, id_index=cid,
+                                force_suppress=force, in_format=in_format, out_format=out_format)
+        assert_almost_equal(out.asnumpy(), expected)
+
+    def test_box_nms_backward(data, grad, expected, thresh=0.5, topk=-1, coord=2, score=1,
+                          cid=0, force=False, in_format='corner', out_format='corner'):
+        # Symbolic backward check: bind the op, forward with is_train=True,
+        # backprop `grad`, and compare the input gradient with `expected`.
+        in_var = mx.sym.Variable('data')
+        arr_data = mx.nd.array(data)
+        arr_grad = mx.nd.empty(arr_data.shape)
+        op = mx.contrib.sym.box_nms(in_var, overlap_thresh=thresh, topk=topk,
+                                coord_start=coord, score_index=score, id_index=cid,
+                                force_suppress=force, in_format=in_format, out_format=out_format)
+        exe = op.bind(ctx=mx.context.Context.default_ctx, args=[arr_data], args_grad=[arr_grad])
+        exe.forward(is_train=True)
+        exe.backward(mx.nd.array(grad))
+        assert_almost_equal(arr_grad.asnumpy(), expected)
+
+    def corner_to_center(data):
+        # Convert [id, score, xmin, ymin, xmax, ymax] rows to
+        # [id, score, x_center, y_center, width, height].
+        # NOTE(review): reads from `data` while only `out` is reshaped, so this
+        # assumes `data` is already 2-D of width 6 -- true for callers here.
+        out = np.reshape(data, (-1, 6)).copy()
+        out[:, 2] = (data[:, 2] + data[:, 4]) / 2.0
+        out[:, 3] = (data[:, 3] + data[:, 5]) / 2.0
+        out[:, 4] = data[:, 4] - data[:, 2]
+        out[:, 5] = data[:, 5] - data[:, 3]
+        # Rows with negative id are invalid/suppressed; keep them all -1.
+        invalid = np.where(data[:, 0] < 0)[0]
+        out[invalid, :] = -1
+        return out
+
+    def center_to_corner(data):
+        data = np.reshape(data, (-1, 6)).copy()
+        out[:, 2] = data[:, 2] - data[:, 4] / 2.0
+        out[:, 3] = data[:, 3] - data[:, 5] / 2.0
+        out[:, 4] = data[:, 2] + data[:, 4] / 2.0
+        out[:, 5] = data[:, 3] + data[:, 5] / 2.0
+        invalid = np.where(data[:, 0] < 0)[0]
+        out[invalid, :] = -1
+        return out
+
+    def swap_position(data, expected, coord=2, score=1, cid=0, new_col=0):
+        # Shuffle box columns (optionally widening rows by `new_col` filler
+        # columns of -1) so the coord/score/id fields land at random positions;
+        # returns the rearranged data/expected plus the new column indices.
+        data = np.reshape(data, (-1, 6))
+        expected = np.reshape(expected, (-1, 6))
+        new_coord = random.randint(0, 6 + new_col - 4)
+        others = list(range(new_coord)) + list(range(new_coord + 4, 6 + new_col))
+        random.shuffle(others)
+        new_score = others[0]
+        new_cid = others[1]
+        new_data = np.full((data.shape[0], data.shape[1] + new_col), -1.0)
+        new_expected = np.full((expected.shape[0], expected.shape[1] + new_col), -1.0)
+        new_data[:, new_coord:new_coord+4] = data[:, coord:coord+4]
+        new_data[:, new_score] = data[:, score]
+        new_data[:, new_cid] = data[:, cid]
+        new_expected[:, new_coord:new_coord+4] = expected[:, coord:coord+4]
+        new_expected[:, new_score] = expected[:, score]
+        new_expected[:, new_cid] = expected[:, cid]
+        return new_data, new_expected, new_coord, new_score, new_cid
+
+    # manually set up test cases
+    boxes = [[0, 0.5, 0.1, 0.1, 0.2, 0.2], [1, 0.4, 0.1, 0.1, 0.2, 0.2],
+             [0, 0.3, 0.1, 0.1, 0.14, 0.14], [2, 0.6, 0.5, 0.5, 0.7, 0.8]]
+
+    # case1: force suppression regardless of class id, overlap_thresh=0.5
+    force=True
+    thresh=0.5
+    expected = [[2, 0.6, 0.5, 0.5, 0.7, 0.8], [0, 0.5, 0.1, 0.1, 0.2, 0.2],
+                [0, 0.3, 0.1, 0.1, 0.14, 0.14], [-1, -1, -1, -1, -1, -1]]
+    # out_grad rows follow the score-sorted output; expected in_grad maps them
+    # back to the original input order, zeroing suppressed boxes.
+    grad = np.random.rand(4, 6)
+    expected_in_grad = grad[(1, 3, 2, 0), :]
+    expected_in_grad[1, :] = 0
+    test_box_nms_forward(np.array(boxes), np.array(expected), force=force, thresh=thresh)
+    test_box_nms_backward(np.array(boxes), grad, expected_in_grad, force=force, thresh=thresh)
+
+    # case2: multi batch (extra leading dims must be treated as batch)
+    boxes2 = [boxes] * 3
+    expected2 = [expected] * 3
+    grad2 = np.array([grad.tolist()] * 3)
+    expected_in_grad2 = np.array([expected_in_grad.tolist()] * 3)
+    test_box_nms_forward(np.array(boxes2), np.array(expected2), force=force, thresh=thresh)
+    test_box_nms_backward(np.array(boxes2), grad2, expected_in_grad2, force=force, thresh=thresh)
+    # another new dim
+    boxes2 = [boxes2] * 2
+    expected2 = [expected2] * 2
+    grad2 = np.array([grad2.tolist()] * 2)
+    expected_in_grad2 = np.array([expected_in_grad2.tolist()] * 2)
+    test_box_nms_forward(np.array(boxes2), np.array(expected2), force=force, thresh=thresh)
+    test_box_nms_backward(np.array(boxes2), grad2, expected_in_grad2, force=force, thresh=thresh)
+
+    # case3: stricter overlap_thresh suppresses more boxes
+    thresh = 0.1
+    boxes3 = boxes
+    expected3 = [[2, 0.6, 0.5, 0.5, 0.7, 0.8], [0, 0.5, 0.1, 0.1, 0.2, 0.2],
+                [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]
+    grad3 = np.random.rand(4, 6)
+    expected_in_grad3 = grad3[(1, 3, 2, 0), :]
+    expected_in_grad3[(1, 2), :] = 0
+    test_box_nms_forward(np.array(boxes3), np.array(expected3), force=force, thresh=thresh)
+    test_box_nms_backward(np.array(boxes3), grad3, expected_in_grad3, force=force, thresh=thresh)
+
+    # case4: non-force mode only suppresses overlapping boxes with the same id
+    boxes4 = boxes
+    force = False
+    expected4 = [[2, 0.6, 0.5, 0.5, 0.7, 0.8], [0, 0.5, 0.1, 0.1, 0.2, 0.2],
+                [1, 0.4, 0.1, 0.1, 0.2, 0.2], [-1, -1, -1, -1, -1, -1]]
+    grad4 = np.random.rand(4, 6)
+    expected_in_grad4 = grad4[(1, 2, 3, 0), :]
+    expected_in_grad4[2, :] = 0
+    test_box_nms_forward(np.array(boxes4), np.array(expected4), force=force, thresh=thresh)
+    test_box_nms_backward(np.array(boxes4), grad4, expected_in_grad4, force=force, thresh=thresh)
+
+    # case5: different coding (corner vs. center box formats)
+    boxes5 = corner_to_center(np.array(boxes4))
+    test_box_nms_forward(np.array(boxes5), np.array(expected4), force=force, thresh=thresh,
+        in_format='center')
+    expected5 = corner_to_center(np.array(expected4))
+    test_box_nms_forward(np.array(boxes4), np.array(expected5), force=force, thresh=thresh,
+        out_format='center')
+    test_box_nms_forward(np.array(boxes5), np.array(expected5), force=force, thresh=thresh,
+        in_format='center', out_format='center')
+
+    # case6: different position (coord/score/id columns shuffled at random)
+    boxes6, expected6, new_coord, new_score, new_id = swap_position(np.array(boxes4),
+        np.array(expected4), new_col=2)
+    test_box_nms_forward(np.array(boxes6), np.array(expected6), force=force, thresh=thresh,
+        coord=new_coord, score=new_score, cid=new_id)
+
+    # case7: no id, should be same with force=True
+    force = False
+    thresh = 0.5
+    test_box_nms_forward(np.array(boxes), np.array(expected), force=force, thresh=thresh, cid=-1)
+
+def test_box_iou_op():
+    """Cross-check contrib box_iou against a NumPy reference implementation."""
+    def numpy_box_iou(a, b, fmt='corner'):
+        # Reference IOU: pair every box in `a` with every box in `b`.
+        def area(left, top, right, bottom):
+            # Clamp at zero so degenerate (inverted) boxes get zero area.
+            return np.maximum(0, right - left) * np.maximum(0, bottom - top)
+
+        assert a.shape[-1] == 4
+        assert b.shape[-1] == 4
+        oshape = a.shape[:-1] + b.shape[:-1]
+        a = a.reshape((-1, 4))
+        ashape = a.shape
+        b = b.reshape((-1, 4))
+        # Tile so that flattened row i*len(b)+j pairs a[i] with b[j].
+        a = np.tile(a, reps=[1, b.shape[0]]).reshape((-1, 4))
+        b = np.tile(b, reps=[ashape[0], 1]).reshape((-1, 4))
+        if fmt == 'corner':
+            al, at, ar, ab = np.split(a, 4, axis=-1)
+            bl, bt, br, bb = np.split(b, 4, axis=-1)
+        elif fmt == 'center':
+            # Convert (x, y, w, h) to corner coordinates first.
+            ax, ay, aw, ah = np.split(a, 4, axis=-1)
+            bx, by, bw, bh = np.split(b, 4, axis=-1)
+            al, at, ar, ab = ax - aw / 2, ay - ah / 2, ax + aw / 2, ay + ah / 2
+            bl, bt, br, bb = bx - bw / 2, by - bh / 2, bx + bw / 2, by + bh / 2
+        else:
+            raise NotImplementedError("Fmt {} not supported".format(fmt))
+        width = np.maximum(0, np.minimum(ar, br) - np.maximum(al, bl))
+        height = np.maximum(0, np.minimum(ab, bb) - np.maximum(at, bt))
+        intersect = width * height
+        union = area(al, at, ar, ab) + area(bl, bt, br, bb) - intersect
+        # Avoid 0/0 for non-overlapping pairs: a tiny union keeps IOU at 0.
+        union[np.where(intersect <= 0)] = 1e-12
+        iou = intersect / union
+        return iou.reshape(oshape)
+
+    def generate_boxes(dims):
+        # Random centers/sizes with random scale and offset; returns the same
+        # boxes in both (x, y, w, h) and (l, t, r, b) encodings.
+        s1, off1, s2, off2 = np.random.rand(4) * 100
+        xy = np.random.rand(*(dims + [2])) * s1 + off1
+        wh = np.random.rand(*(dims + [2])) * s2 + off2
+        xywh = np.concatenate([xy, wh], axis=-1)
+        ltrb = np.concatenate([xy - wh / 2, xy + wh / 2], axis=-1)
+        return xywh, ltrb
+
+
+    # Exercise all batch-rank combinations from 1-D to 5-D on both sides.
+    for ndima in range(1, 6):
+        for ndimb in range(1, 6):
+            dims_a = np.random.randint(low=1, high=3, size=ndima).tolist()
+            dims_b = np.random.randint(low=1, high=3, size=ndimb).tolist()
+            # generate left, top, right, bottom
+            xywh_a, ltrb_a = generate_boxes(dims_a)
+            xywh_b, ltrb_b = generate_boxes(dims_b)
+
+            iou_np = numpy_box_iou(ltrb_a, ltrb_b, fmt='corner')
+            iou_np2 = numpy_box_iou(xywh_a, xywh_b, fmt='center')
+            iou_mx = mx.nd.contrib.box_iou(mx.nd.array(ltrb_a), mx.nd.array(ltrb_b), format='corner')
+            iou_mx2 = mx.nd.contrib.box_iou(mx.nd.array(xywh_a), mx.nd.array(xywh_b), format='center')
+            assert_allclose(iou_np, iou_np2, rtol=1e-5, atol=1e-5)
+            assert_allclose(iou_np, iou_mx.asnumpy(), rtol=1e-5, atol=1e-5)
+            assert_allclose(iou_np, iou_mx2.asnumpy(), rtol=1e-5, atol=1e-5)
+
+def test_bipartite_matching_op():
+    def assert_match(inputs, x, y, threshold, is_ascend=False):
+        inputs = mx.nd.array(inputs)
+        x = np.array(x)
+        y = np.array(y)
+        a, b = mx.nd.contrib.bipartite_matching(inputs, threshold=threshold, is_ascend=is_ascend)
+        print(a, b)
+        assert_array_equal(a.asnumpy().astype('int64'), x.astype('int64'))
+        assert_array_equal(b.asnumpy().astype('int64'), y.astype('int64'))
+    assert_match([[0.5, 0.6], [0.1, 0.2], [0.3, 0.4]], [1, -1, 0], [2, 0], 1e-12, False)
+    assert_match([[0.5, 0.6], [0.1, 0.2], [0.3, 0.4]], [-1, 0, 1], [1, 2], 100, True)
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule()

-- 
To stop receiving notification emails like this one, please contact
['"commits@mxnet.apache.org" <co...@mxnet.apache.org>'].