You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by ha...@apache.org on 2019/03/03 04:37:35 UTC
[incubator-mxnet] branch master updated: fix memory-related issues
to enable ASAN tests (#14223)
This is an automated email from the ASF dual-hosted git repository.
haibin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new 053ffc7 fix memory-related issues to enable ASAN tests (#14223)
053ffc7 is described below
commit 053ffc7f8f7eb085bfefb66749b5938f944844df
Author: Wang Jiajun <wa...@gmail.com>
AuthorDate: Sun Mar 3 12:37:11 2019 +0800
fix memory-related issues to enable ASAN tests (#14223)
* fix heap overflow
* fix memory leak of optimizer and executer
* uncomment memory pool free
* run cleanup in engine shutdown phase
* make asan tests blocking
* fix abort in mxnet shutdown, use forked submodules temporally for tests
* trigger CI
* change submodule mshadow
* change submodule dmlc-core
---
3rdparty/dmlc-core | 2 +-
3rdparty/mshadow | 2 +-
ci/docker/runtime_functions.sh | 2 --
cpp-package/example/alexnet.cpp | 1 +
cpp-package/example/charRNN.cpp | 10 ++++++++++
cpp-package/example/googlenet.cpp | 1 +
cpp-package/example/inception_bn.cpp | 1 +
cpp-package/example/lenet.cpp | 1 +
cpp-package/example/lenet_with_mxdataiter.cpp | 1 +
cpp-package/example/mlp_cpu.cpp | 1 +
cpp-package/example/mlp_csv.cpp | 1 +
cpp-package/example/mlp_gpu.cpp | 1 +
cpp-package/example/resnet.cpp | 1 +
cpp-package/example/test_optimizer.cpp | 1 +
cpp-package/example/test_score.cpp | 1 +
.../predict-cpp/image-classification-predict.cc | 4 +++-
include/mxnet/ndarray.h | 19 +++++++++++++------
src/common/object_pool.h | 7 +++----
src/engine/threaded_engine.cc | 2 +-
src/engine/threaded_engine.h | 3 ++-
20 files changed, 45 insertions(+), 17 deletions(-)
diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core
index 0a0e8ad..55f3c7b 160000
--- a/3rdparty/dmlc-core
+++ b/3rdparty/dmlc-core
@@ -1 +1 @@
-Subproject commit 0a0e8addf92e1287fd7a25c6314016b8c0138dee
+Subproject commit 55f3c7bc1d875fbc7d34fc26651bb8c6818c8355
diff --git a/3rdparty/mshadow b/3rdparty/mshadow
index 3dc8081..95ebe0f 160000
--- a/3rdparty/mshadow
+++ b/3rdparty/mshadow
@@ -1 +1 @@
-Subproject commit 3dc80815d965b56b9a975dc27229361955bf66fe
+Subproject commit 95ebe0f109ae021d0d66e2a627ccfc55c3253b55
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 1b4caa0..de1b779 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -1033,8 +1033,6 @@ integrationtest_ubuntu_cpu_asan() {
set -ex
export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libasan.so.5
- # We do not want to fail the build on ASAN errors until memory leaks have been addressed.
- export ASAN_OPTIONS=exitcode=0
cd /work/mxnet/build/cpp-package/example/
/work/mxnet/cpp-package/example/get_data.sh
./mlp_cpu
diff --git a/cpp-package/example/alexnet.cpp b/cpp-package/example/alexnet.cpp
index 1c0f713..e2083a0 100644
--- a/cpp-package/example/alexnet.cpp
+++ b/cpp-package/example/alexnet.cpp
@@ -319,6 +319,7 @@ int main(int argc, char const *argv[]) {
}
/*don't foget to release the executor*/
delete exec;
+ delete opt;
MXNotifyShutdown();
return 0;
}
diff --git a/cpp-package/example/charRNN.cpp b/cpp-package/example/charRNN.cpp
index 54b8eea..8951580 100644
--- a/cpp-package/example/charRNN.cpp
+++ b/cpp-package/example/charRNN.cpp
@@ -500,6 +500,9 @@ void train(const std::string file, int batch_size, int max_epoch, int start_epoc
std::string filepath = prefix + "-" + std::to_string(epoch) + ".params";
SaveCheckpoint(filepath, RNN, exe);
}
+
+ delete exe;
+ delete opt;
}
/*The original example, rnn_cell_demo.py, uses default Xavier as initalizer, which relies on
@@ -577,6 +580,9 @@ void trainWithBuiltInRNNOp(const std::string file, int batch_size, int max_epoch
std::string filepath = prefix + "-" + std::to_string(epoch) + ".params";
SaveCheckpoint(filepath, RNN, exe);
}
+
+ delete exe;
+ delete opt;
}
void predict(std::wstring* ptext, int sequence_length, const std::string param_file,
@@ -639,6 +645,8 @@ void predict(std::wstring* ptext, int sequence_length, const std::string param_f
next = charIndices[n];
ptext->push_back(next);
}
+
+ delete exe;
}
void predictWithBuiltInRNNOp(std::wstring* ptext, int sequence_length, const std::string param_file,
@@ -693,6 +701,8 @@ void predictWithBuiltInRNNOp(std::wstring* ptext, int sequence_length, const std
next = charIndices[n];
ptext->push_back(next);
}
+
+ delete exe;
}
int main(int argc, char** argv) {
diff --git a/cpp-package/example/googlenet.cpp b/cpp-package/example/googlenet.cpp
index c8078af..26ba510 100644
--- a/cpp-package/example/googlenet.cpp
+++ b/cpp-package/example/googlenet.cpp
@@ -190,6 +190,7 @@ int main(int argc, char const *argv[]) {
}
delete exec;
+ delete opt;
MXNotifyShutdown();
return 0;
}
diff --git a/cpp-package/example/inception_bn.cpp b/cpp-package/example/inception_bn.cpp
index 456e0d9..2073ebe 100644
--- a/cpp-package/example/inception_bn.cpp
+++ b/cpp-package/example/inception_bn.cpp
@@ -232,6 +232,7 @@ int main(int argc, char const *argv[]) {
LG << "Validation Accuracy: " << val_acc.Get();
}
delete exec;
+ delete opt;
MXNotifyShutdown();
return 0;
}
diff --git a/cpp-package/example/lenet.cpp b/cpp-package/example/lenet.cpp
index 83c659c..4259454 100644
--- a/cpp-package/example/lenet.cpp
+++ b/cpp-package/example/lenet.cpp
@@ -173,6 +173,7 @@ class Lenet {
<< ", accuracy: " << ValAccuracy(batch_size * 10, lenet);
}
delete exe;
+ delete opt;
}
private:
diff --git a/cpp-package/example/lenet_with_mxdataiter.cpp b/cpp-package/example/lenet_with_mxdataiter.cpp
index 39550a3..33110fe 100644
--- a/cpp-package/example/lenet_with_mxdataiter.cpp
+++ b/cpp-package/example/lenet_with_mxdataiter.cpp
@@ -177,6 +177,7 @@ int main(int argc, char const *argv[]) {
}
delete exec;
+ delete opt;
MXNotifyShutdown();
return 0;
}
diff --git a/cpp-package/example/mlp_cpu.cpp b/cpp-package/example/mlp_cpu.cpp
index 93eaf05..5d46d40 100644
--- a/cpp-package/example/mlp_cpu.cpp
+++ b/cpp-package/example/mlp_cpu.cpp
@@ -139,6 +139,7 @@ int main(int argc, char** argv) {
}
delete exec;
+ delete opt;
MXNotifyShutdown();
return 0;
}
diff --git a/cpp-package/example/mlp_csv.cpp b/cpp-package/example/mlp_csv.cpp
index 43a14c8..f12b7c1 100644
--- a/cpp-package/example/mlp_csv.cpp
+++ b/cpp-package/example/mlp_csv.cpp
@@ -267,6 +267,7 @@ int main(int argc, char** argv) {
}
delete exec;
+ delete opt;
MXNotifyShutdown();
return 0;
}
diff --git a/cpp-package/example/mlp_gpu.cpp b/cpp-package/example/mlp_gpu.cpp
index 0befde8..f606020 100644
--- a/cpp-package/example/mlp_gpu.cpp
+++ b/cpp-package/example/mlp_gpu.cpp
@@ -155,6 +155,7 @@ int main(int argc, char** argv) {
}
delete exec;
+ delete opt;
MXNotifyShutdown();
return 0;
}
diff --git a/cpp-package/example/resnet.cpp b/cpp-package/example/resnet.cpp
index 7c9dd4d..7200bd4 100644
--- a/cpp-package/example/resnet.cpp
+++ b/cpp-package/example/resnet.cpp
@@ -242,6 +242,7 @@ int main(int argc, char const *argv[]) {
LG << "Validation Accuracy: " << val_acc.Get();
}
delete exec;
+ delete opt;
MXNotifyShutdown();
return 0;
}
diff --git a/cpp-package/example/test_optimizer.cpp b/cpp-package/example/test_optimizer.cpp
index 42547ea..70190ef 100644
--- a/cpp-package/example/test_optimizer.cpp
+++ b/cpp-package/example/test_optimizer.cpp
@@ -30,6 +30,7 @@ int main(int argc, char** argv) {
opt = OptimizerRegistry::Find("adam");
int ret = (opt == 0) ? 1 : 0;
+ delete opt;
MXNotifyShutdown();
return ret;
}
diff --git a/cpp-package/example/test_score.cpp b/cpp-package/example/test_score.cpp
index 7e5096a..687683f 100644
--- a/cpp-package/example/test_score.cpp
+++ b/cpp-package/example/test_score.cpp
@@ -156,6 +156,7 @@ int main(int argc, char** argv) {
}
delete exec;
+ delete opt;
MXNotifyShutdown();
return score >= MIN_SCORE ? 0 : 1;
}
diff --git a/example/image-classification/predict-cpp/image-classification-predict.cc b/example/image-classification/predict-cpp/image-classification-predict.cc
index 2a605b8..3c72589 100644
--- a/example/image-classification/predict-cpp/image-classification-predict.cc
+++ b/example/image-classification/predict-cpp/image-classification-predict.cc
@@ -76,7 +76,9 @@ class BufferFile {
ifs.seekg(0, std::ios::beg);
std::cout << file_path.c_str() << " ... " << length_ << " bytes\n";
- buffer_.reset(new char[length_]);
+ // Buffer as null terminated to be converted to string
+ buffer_.reset(new char[length_ + 1]);
+ buffer_[length_] = 0;
ifs.read(buffer_.get(), length_);
ifs.close();
}
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index feb562a..c55cb01 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -848,13 +848,17 @@ class NDArray {
// The shape of aux data. The default value for the shape depends on the type of storage.
// If aux_shapes[i].Size() is zero, aux data i is empty.
mxnet::ShapeVector aux_shapes;
+ /*! \brief Reference to the storage to ensure proper destruct order */
+ std::shared_ptr<Storage> storage_ref_;
/*! \brief default cosntructor */
- Chunk() : static_data(true), delay_alloc(false) {}
+ Chunk() : static_data(true), delay_alloc(false),
+ storage_ref_(Storage::_GetSharedRef()) {}
/*! \brief construct a new chunk */
Chunk(mxnet::TShape shape, Context ctx_, bool delay_alloc_, int dtype)
- : static_data(false), delay_alloc(true), ctx(ctx_) {
+ : static_data(false), delay_alloc(true), ctx(ctx_),
+ storage_ref_(Storage::_GetSharedRef()) {
auto size = shape.Size();
storage_shape = shape;
var = Engine::Get()->NewVariable();
@@ -864,7 +868,8 @@ class NDArray {
}
Chunk(const TBlob &data, int dev_id)
- : static_data(true), delay_alloc(false) {
+ : static_data(true), delay_alloc(false),
+ storage_ref_(Storage::_GetSharedRef()) {
CHECK(storage_type == kDefaultStorage);
var = Engine::Get()->NewVariable();
if (data.dev_mask() == cpu::kDevMask) {
@@ -881,7 +886,8 @@ class NDArray {
}
Chunk(int shared_pid, int shared_id, const mxnet::TShape& shape, int dtype)
- : static_data(false), delay_alloc(false) {
+ : static_data(false), delay_alloc(false),
+ storage_ref_(Storage::_GetSharedRef()) {
var = Engine::Get()->NewVariable();
ctx = Context::CPUShared(0);
shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);
@@ -897,7 +903,7 @@ class NDArray {
const mxnet::ShapeVector &aux_shapes_)
: static_data(false), delay_alloc(delay_alloc_), storage_type(storage_type_),
aux_types(aux_types_), ctx(ctx_), storage_shape(storage_shape_),
- aux_shapes(aux_shapes_) {
+ aux_shapes(aux_shapes_), storage_ref_(Storage::_GetSharedRef()) {
shandle.ctx = ctx;
var = Engine::Get()->NewVariable();
// aux_handles always reflect the correct number of aux data
@@ -914,7 +920,8 @@ class NDArray {
Chunk(const NDArrayStorageType storage_type_, const TBlob &data,
const std::vector<TBlob> &aux_data, int dev_id)
- : static_data(true), delay_alloc(false), storage_type(storage_type_) {
+ : static_data(true), delay_alloc(false), storage_type(storage_type_),
+ storage_ref_(Storage::_GetSharedRef()) {
using namespace mshadow;
CHECK_NE(storage_type, kDefaultStorage);
// init var
diff --git a/src/common/object_pool.h b/src/common/object_pool.h
index 576ff9a..3625533 100644
--- a/src/common/object_pool.h
+++ b/src/common/object_pool.h
@@ -132,10 +132,9 @@ struct ObjectPoolAllocatable {
template <typename T>
ObjectPool<T>::~ObjectPool() {
- // TODO(hotpxl): mind destruction order
- // for (auto i : allocated_) {
- // free(i);
- // }
+ for (auto i : allocated_) {
+ free(i);
+ }
}
template <typename T>
diff --git a/src/engine/threaded_engine.cc b/src/engine/threaded_engine.cc
index 6a60040..b5897a1 100644
--- a/src/engine/threaded_engine.cc
+++ b/src/engine/threaded_engine.cc
@@ -281,7 +281,7 @@ void ThreadedEngine::DeleteOperator(OprHandle op) {
this->PushAsync([threaded_opr](RunContext, CallbackOnComplete on_complete) {
ThreadedOpr::Delete(threaded_opr);
on_complete();
- }, Context::CPU(), {}, deps, FnProperty::kAsync, 0,
+ }, Context::CPU(), {}, deps, FnProperty::kDeleteVar, 0,
"DeleteOperator");
}
diff --git a/src/engine/threaded_engine.h b/src/engine/threaded_engine.h
index 4a2d419..ab06ca1 100644
--- a/src/engine/threaded_engine.h
+++ b/src/engine/threaded_engine.h
@@ -352,7 +352,8 @@ class ThreadedEngine : public Engine {
LOG(INFO) << "ExecuteOprBlock " << opr_block
<< "shutdown_phase=" << shutdown_phase_;
}
- if (!shutdown_phase_) {
+ // still run cleanup in shutdown_phase
+ if (!shutdown_phase_ || threaded_opr->prop == FnProperty::kDeleteVar) {
try {
OnStart(threaded_opr);
if (debug_info) {