You are viewing a plain text version of this content; the hyperlink to the canonical (HTML) version was not preserved in this extraction.
Posted to commits@mxnet.apache.org by zh...@apache.org on 2018/05/24 18:06:52 UTC
[incubator-mxnet] branch master updated: SSD performance
optimization and benchmark script (#10483)
This is an automated email from the ASF dual-hosted git repository.
zhreshold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new 166944c SSD performance optimization and benchmark script (#10483)
166944c is described below
commit 166944cea4954a86fe0bf785f9191a54177256be
Author: Tao Lv <ta...@intel.com>
AuthorDate: Fri May 25 02:06:46 2018 +0800
SSD performance optimization and benchmark script (#10483)
* add ssd benchmark
* optimize MultiBoxDetectionForward
* update default batch sizes for ssd benchmark
* remove commented python code
---
example/ssd/benchmark_score.py | 100 +++++++++++++++++++++++++++++
src/operator/contrib/multibox_detection.cc | 41 +++++++++---
2 files changed, 131 insertions(+), 10 deletions(-)
diff --git a/example/ssd/benchmark_score.py b/example/ssd/benchmark_score.py
new file mode 100644
index 0000000..6af1b21
--- /dev/null
+++ b/example/ssd/benchmark_score.py
@@ -0,0 +1,100 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import print_function
+import os
+import sys
+import argparse
+import importlib
+import mxnet as mx
+import time
+import logging
+
+from symbol.symbol_factory import get_symbol
+from symbol.symbol_factory import get_symbol_train
+from symbol import symbol_builder
+
+
+parser = argparse.ArgumentParser(description='MxNet SSD benchmark')
+parser.add_argument('--network', '-n', type=str, default='vgg16_reduced')
+parser.add_argument('--batch_size', '-b', type=int, default=0)
+parser.add_argument('--shape', '-w', type=int, default=300)
+parser.add_argument('--class_num', '-class', type=int, default=20)
+
+
+def get_data_shapes(batch_size):
+ image_shape = (3, 300, 300)
+ return [('data', (batch_size,)+image_shape)]
+
+def get_data(batch_size):
+ data_shapes = get_data_shapes(batch_size)
+ data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.cpu()) for _, shape in data_shapes]
+ batch = mx.io.DataBatch(data, [])
+ return batch
+
+
+if __name__ == '__main__':
+ args = parser.parse_args()
+ network = args.network
+ image_shape = args.shape
+ num_classes = args.class_num
+ b = args.batch_size
+ supported_image_shapes = [300, 512]
+ supported_networks = ['vgg16_reduced', 'inceptionv3', 'resnet50']
+
+ if network not in supported_networks:
+ raise Exception(network + " is not supported")
+
+ if image_shape not in supported_image_shapes:
+ raise Exception("Image shape should be either 300*300 or 512*512!")
+
+ if b == 0:
+ batch_sizes = [1, 2, 4, 8, 16, 32]
+ else:
+ batch_sizes = [b]
+
+ data_shape = (3, image_shape, image_shape)
+ net = get_symbol(network, data_shape[1], num_classes=num_classes,
+ nms_thresh=0.4, force_suppress=True)
+
+ num_batches = 100
+ dry_run = 5 # use 5 iterations to warm up
+
+ for bs in batch_sizes:
+ batch = get_data(bs)
+ mod = mx.mod.Module(net, label_names=None, context=mx.cpu())
+ mod.bind(for_training = False,
+ inputs_need_grad = False,
+ data_shapes = get_data_shapes(bs))
+ mod.init_params(initializer=mx.init.Xavier(magnitude=2.))
+
+ # get data
+ data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.cpu()) for _, shape in mod.data_shapes]
+ batch = mx.io.DataBatch(data, [])
+
+ for i in range(dry_run + num_batches):
+ if i == dry_run:
+ tic = time.time()
+ mod.forward(batch, is_train=False)
+ for output in mod.get_outputs():
+ output.wait_to_read()
+
+ avg_time = (time.time() - tic) / num_batches
+ fps = bs / avg_time
+ print("SSD-" + network + " with " + str(num_classes) + " classes and shape " + str(data_shape))
+ print("batchsize=" + str(bs) + " " + str(1000*avg_time) + " ms")
+ print("batchsize=" + str(bs) + " " + str(fps) + " imgs/s")
diff --git a/src/operator/contrib/multibox_detection.cc b/src/operator/contrib/multibox_detection.cc
index 112c033..e5a7dd8 100644
--- a/src/operator/contrib/multibox_detection.cc
+++ b/src/operator/contrib/multibox_detection.cc
@@ -96,11 +96,16 @@ inline void MultiBoxDetectionForward(const Tensor<cpu, 3, DType> &out,
const int num_anchors = cls_prob.size(2);
const int num_batches = cls_prob.size(0);
const DType *p_anchor = anchors.dptr_;
+
+ const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+ std::vector<DType> outputs;
+ outputs.reserve(num_anchors * 6);
for (int nbatch = 0; nbatch < num_batches; ++nbatch) {
const DType *p_cls_prob = cls_prob.dptr_ + nbatch * num_classes * num_anchors;
const DType *p_loc_pred = loc_pred.dptr_ + nbatch * num_anchors * 4;
DType *p_out = out.dptr_ + nbatch * num_anchors * 6;
- int valid_count = 0;
+
+#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < num_anchors; ++i) {
// find the predicted class id and probability
DType score = -1;
@@ -112,20 +117,33 @@ inline void MultiBoxDetectionForward(const Tensor<cpu, 3, DType> &out,
id = j;
}
}
+
if (id > 0 && score < threshold) {
id = 0;
}
- if (id > 0) {
- // [id, prob, xmin, ymin, xmax, ymax]
- p_out[valid_count * 6] = id - 1; // remove background, restore original id
- p_out[valid_count * 6 + 1] = (id == 0 ? DType(-1) : score);
- int offset = i * 4;
- TransformLocations(p_out + valid_count * 6 + 2, p_anchor + offset,
- p_loc_pred + offset, clip, variances[0], variances[1],
- variances[2], variances[3]);
+
+ // [id, prob, xmin, ymin, xmax, ymax]
+ outputs[i * 6] = id - 1;
+ outputs[i * 6 + 1] = score;
+ int offset = i * 4;
+ TransformLocations(outputs.data() + i * 6 + 2, p_anchor + offset, p_loc_pred + offset, clip,
+ variances[0], variances[1], variances[2], variances[3]);
+ }
+
+ int valid_count = 0;
+ for (int i = 0; i < num_anchors; ++i) {
+ int offset1 = valid_count * 6;
+ int offset2 = i * 6;
+ if (outputs[offset2] >= 0) {
+ p_out[offset1] = outputs[offset2];
+ p_out[offset1 + 1] = outputs[offset2 + 1];
+ p_out[offset1 + 2] = outputs[offset2 + 2];
+ p_out[offset1 + 3] = outputs[offset2 + 3];
+ p_out[offset1 + 4] = outputs[offset2 + 4];
+ p_out[offset1 + 5] = outputs[offset2 + 5];
++valid_count;
}
- } // end iter num_anchors
+ }
if (valid_count < 1 || nms_threshold <= 0 || nms_threshold > 1) continue;
@@ -138,6 +156,7 @@ inline void MultiBoxDetectionForward(const Tensor<cpu, 3, DType> &out,
sorter.push_back(SortElemDescend<DType>(p_out[i * 6 + 1], i));
}
std::stable_sort(sorter.begin(), sorter.end());
+
// re-order output
DType *ptemp = temp_space.dptr_ + nbatch * num_anchors * 6;
int nkeep = static_cast<int>(sorter.size());
@@ -153,7 +172,9 @@ inline void MultiBoxDetectionForward(const Tensor<cpu, 3, DType> &out,
p_out[i * 6 + j] = ptemp[sorter[i].index * 6 + j];
}
}
+
// apply nms
+#pragma omp parallel for num_threads(omp_threads)
for (int i = 0; i < nkeep; ++i) {
int offset_i = i * 6;
if (p_out[offset_i] < 0) continue; // skip eliminated
--
To stop receiving notification emails like this one, please contact
zhreshold@apache.org.