You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by ha...@apache.org on 2019/03/03 04:37:35 UTC

[incubator-mxnet] branch master updated: fix memory-related issues to enable ASAN tests (#14223)

This is an automated email from the ASF dual-hosted git repository.

haibin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new 053ffc7  fix memory-related issues to enable ASAN tests (#14223)
053ffc7 is described below

commit 053ffc7f8f7eb085bfefb66749b5938f944844df
Author: Wang Jiajun <wa...@gmail.com>
AuthorDate: Sun Mar 3 12:37:11 2019 +0800

    fix memory-related issues to enable ASAN tests (#14223)
    
    * fix heap overflow
    
    * fix memory leak of optimizer and executer
    
    * uncomment memory pool free
    
    * run cleanup in engine shutdown phase
    
    * make asan tests blocking
    
    * fix abort in mxnet shutdown, use forked submodules temporally for tests
    
    * trigger CI
    
    * change submodule mshadow
    
    * change submodule dmlc-core
---
 3rdparty/dmlc-core                                    |  2 +-
 3rdparty/mshadow                                      |  2 +-
 ci/docker/runtime_functions.sh                        |  2 --
 cpp-package/example/alexnet.cpp                       |  1 +
 cpp-package/example/charRNN.cpp                       | 10 ++++++++++
 cpp-package/example/googlenet.cpp                     |  1 +
 cpp-package/example/inception_bn.cpp                  |  1 +
 cpp-package/example/lenet.cpp                         |  1 +
 cpp-package/example/lenet_with_mxdataiter.cpp         |  1 +
 cpp-package/example/mlp_cpu.cpp                       |  1 +
 cpp-package/example/mlp_csv.cpp                       |  1 +
 cpp-package/example/mlp_gpu.cpp                       |  1 +
 cpp-package/example/resnet.cpp                        |  1 +
 cpp-package/example/test_optimizer.cpp                |  1 +
 cpp-package/example/test_score.cpp                    |  1 +
 .../predict-cpp/image-classification-predict.cc       |  4 +++-
 include/mxnet/ndarray.h                               | 19 +++++++++++++------
 src/common/object_pool.h                              |  7 +++----
 src/engine/threaded_engine.cc                         |  2 +-
 src/engine/threaded_engine.h                          |  3 ++-
 20 files changed, 45 insertions(+), 17 deletions(-)

diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core
index 0a0e8ad..55f3c7b 160000
--- a/3rdparty/dmlc-core
+++ b/3rdparty/dmlc-core
@@ -1 +1 @@
-Subproject commit 0a0e8addf92e1287fd7a25c6314016b8c0138dee
+Subproject commit 55f3c7bc1d875fbc7d34fc26651bb8c6818c8355
diff --git a/3rdparty/mshadow b/3rdparty/mshadow
index 3dc8081..95ebe0f 160000
--- a/3rdparty/mshadow
+++ b/3rdparty/mshadow
@@ -1 +1 @@
-Subproject commit 3dc80815d965b56b9a975dc27229361955bf66fe
+Subproject commit 95ebe0f109ae021d0d66e2a627ccfc55c3253b55
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 1b4caa0..de1b779 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -1033,8 +1033,6 @@ integrationtest_ubuntu_cpu_asan() {
     set -ex
     export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libasan.so.5
 
-    # We do not want to fail the build on ASAN errors until memory leaks have been addressed.
-    export ASAN_OPTIONS=exitcode=0
     cd /work/mxnet/build/cpp-package/example/
     /work/mxnet/cpp-package/example/get_data.sh
     ./mlp_cpu
diff --git a/cpp-package/example/alexnet.cpp b/cpp-package/example/alexnet.cpp
index 1c0f713..e2083a0 100644
--- a/cpp-package/example/alexnet.cpp
+++ b/cpp-package/example/alexnet.cpp
@@ -319,6 +319,7 @@ int main(int argc, char const *argv[]) {
   }
   /*don't foget to release the executor*/
   delete exec;
+  delete opt;
   MXNotifyShutdown();
   return 0;
 }
diff --git a/cpp-package/example/charRNN.cpp b/cpp-package/example/charRNN.cpp
index 54b8eea..8951580 100644
--- a/cpp-package/example/charRNN.cpp
+++ b/cpp-package/example/charRNN.cpp
@@ -500,6 +500,9 @@ void train(const std::string file, int batch_size, int max_epoch, int start_epoc
     std::string filepath = prefix + "-" + std::to_string(epoch) + ".params";
     SaveCheckpoint(filepath, RNN, exe);
   }
+
+  delete exe;
+  delete opt;
 }
 
 /*The original example, rnn_cell_demo.py, uses default Xavier as initalizer, which relies on
@@ -577,6 +580,9 @@ void trainWithBuiltInRNNOp(const std::string file, int batch_size, int max_epoch
     std::string filepath = prefix + "-" + std::to_string(epoch) + ".params";
     SaveCheckpoint(filepath, RNN, exe);
   }
+
+  delete exe;
+  delete opt;
 }
 
 void predict(std::wstring* ptext, int sequence_length, const std::string param_file,
@@ -639,6 +645,8 @@ void predict(std::wstring* ptext, int sequence_length, const std::string param_f
     next = charIndices[n];
     ptext->push_back(next);
   }
+
+  delete exe;
 }
 
 void predictWithBuiltInRNNOp(std::wstring* ptext, int sequence_length, const std::string param_file,
@@ -693,6 +701,8 @@ void predictWithBuiltInRNNOp(std::wstring* ptext, int sequence_length, const std
     next = charIndices[n];
     ptext->push_back(next);
   }
+
+  delete exe;
 }
 
 int main(int argc, char** argv) {
diff --git a/cpp-package/example/googlenet.cpp b/cpp-package/example/googlenet.cpp
index c8078af..26ba510 100644
--- a/cpp-package/example/googlenet.cpp
+++ b/cpp-package/example/googlenet.cpp
@@ -190,6 +190,7 @@ int main(int argc, char const *argv[]) {
   }
 
   delete exec;
+  delete opt;
   MXNotifyShutdown();
   return 0;
 }
diff --git a/cpp-package/example/inception_bn.cpp b/cpp-package/example/inception_bn.cpp
index 456e0d9..2073ebe 100644
--- a/cpp-package/example/inception_bn.cpp
+++ b/cpp-package/example/inception_bn.cpp
@@ -232,6 +232,7 @@ int main(int argc, char const *argv[]) {
     LG << "Validation Accuracy: " << val_acc.Get();
   }
   delete exec;
+  delete opt;
   MXNotifyShutdown();
   return 0;
 }
diff --git a/cpp-package/example/lenet.cpp b/cpp-package/example/lenet.cpp
index 83c659c..4259454 100644
--- a/cpp-package/example/lenet.cpp
+++ b/cpp-package/example/lenet.cpp
@@ -173,6 +173,7 @@ class Lenet {
          << ", accuracy: " << ValAccuracy(batch_size * 10, lenet);
     }
     delete exe;
+    delete opt;
   }
 
  private:
diff --git a/cpp-package/example/lenet_with_mxdataiter.cpp b/cpp-package/example/lenet_with_mxdataiter.cpp
index 39550a3..33110fe 100644
--- a/cpp-package/example/lenet_with_mxdataiter.cpp
+++ b/cpp-package/example/lenet_with_mxdataiter.cpp
@@ -177,6 +177,7 @@ int main(int argc, char const *argv[]) {
   }
 
   delete exec;
+  delete opt;
   MXNotifyShutdown();
   return 0;
 }
diff --git a/cpp-package/example/mlp_cpu.cpp b/cpp-package/example/mlp_cpu.cpp
index 93eaf05..5d46d40 100644
--- a/cpp-package/example/mlp_cpu.cpp
+++ b/cpp-package/example/mlp_cpu.cpp
@@ -139,6 +139,7 @@ int main(int argc, char** argv) {
   }
 
   delete exec;
+  delete opt;
   MXNotifyShutdown();
   return 0;
 }
diff --git a/cpp-package/example/mlp_csv.cpp b/cpp-package/example/mlp_csv.cpp
index 43a14c8..f12b7c1 100644
--- a/cpp-package/example/mlp_csv.cpp
+++ b/cpp-package/example/mlp_csv.cpp
@@ -267,6 +267,7 @@ int main(int argc, char** argv) {
     }
 
     delete exec;
+    delete opt;
     MXNotifyShutdown();
     return 0;
 }
diff --git a/cpp-package/example/mlp_gpu.cpp b/cpp-package/example/mlp_gpu.cpp
index 0befde8..f606020 100644
--- a/cpp-package/example/mlp_gpu.cpp
+++ b/cpp-package/example/mlp_gpu.cpp
@@ -155,6 +155,7 @@ int main(int argc, char** argv) {
   }
 
   delete exec;
+  delete opt;
   MXNotifyShutdown();
   return 0;
 }
diff --git a/cpp-package/example/resnet.cpp b/cpp-package/example/resnet.cpp
index 7c9dd4d..7200bd4 100644
--- a/cpp-package/example/resnet.cpp
+++ b/cpp-package/example/resnet.cpp
@@ -242,6 +242,7 @@ int main(int argc, char const *argv[]) {
     LG << "Validation Accuracy: " << val_acc.Get();
   }
   delete exec;
+  delete opt;
   MXNotifyShutdown();
   return 0;
 }
diff --git a/cpp-package/example/test_optimizer.cpp b/cpp-package/example/test_optimizer.cpp
index 42547ea..70190ef 100644
--- a/cpp-package/example/test_optimizer.cpp
+++ b/cpp-package/example/test_optimizer.cpp
@@ -30,6 +30,7 @@ int main(int argc, char** argv) {
   opt = OptimizerRegistry::Find("adam");
   int ret = (opt == 0) ? 1 : 0;
 
+  delete opt;
   MXNotifyShutdown();
   return ret;
 }
diff --git a/cpp-package/example/test_score.cpp b/cpp-package/example/test_score.cpp
index 7e5096a..687683f 100644
--- a/cpp-package/example/test_score.cpp
+++ b/cpp-package/example/test_score.cpp
@@ -156,6 +156,7 @@ int main(int argc, char** argv) {
   }
 
   delete exec;
+  delete opt;
   MXNotifyShutdown();
   return score >= MIN_SCORE ? 0 : 1;
 }
diff --git a/example/image-classification/predict-cpp/image-classification-predict.cc b/example/image-classification/predict-cpp/image-classification-predict.cc
index 2a605b8..3c72589 100644
--- a/example/image-classification/predict-cpp/image-classification-predict.cc
+++ b/example/image-classification/predict-cpp/image-classification-predict.cc
@@ -76,7 +76,9 @@ class BufferFile {
     ifs.seekg(0, std::ios::beg);
     std::cout << file_path.c_str() << " ... " << length_ << " bytes\n";
 
-    buffer_.reset(new char[length_]);
+    // Buffer as null terminated to be converted to string
+    buffer_.reset(new char[length_ + 1]);
+    buffer_[length_] = 0;
     ifs.read(buffer_.get(), length_);
     ifs.close();
   }
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index feb562a..c55cb01 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -848,13 +848,17 @@ class NDArray {
     // The shape of aux data. The default value for the shape depends on the type of storage.
     // If aux_shapes[i].Size() is zero, aux data i is empty.
     mxnet::ShapeVector aux_shapes;
+    /*! \brief Reference to the storage to ensure proper destruct order */
+    std::shared_ptr<Storage> storage_ref_;
 
     /*! \brief default cosntructor */
-    Chunk() : static_data(true), delay_alloc(false) {}
+    Chunk() : static_data(true), delay_alloc(false),
+              storage_ref_(Storage::_GetSharedRef()) {}
 
     /*! \brief construct a new chunk */
     Chunk(mxnet::TShape shape, Context ctx_, bool delay_alloc_, int dtype)
-        : static_data(false), delay_alloc(true), ctx(ctx_) {
+        : static_data(false), delay_alloc(true), ctx(ctx_),
+          storage_ref_(Storage::_GetSharedRef()) {
       auto size = shape.Size();
       storage_shape = shape;
       var = Engine::Get()->NewVariable();
@@ -864,7 +868,8 @@ class NDArray {
     }
 
     Chunk(const TBlob &data, int dev_id)
-        : static_data(true), delay_alloc(false) {
+        : static_data(true), delay_alloc(false),
+          storage_ref_(Storage::_GetSharedRef()) {
       CHECK(storage_type == kDefaultStorage);
       var = Engine::Get()->NewVariable();
       if (data.dev_mask() == cpu::kDevMask) {
@@ -881,7 +886,8 @@ class NDArray {
     }
 
     Chunk(int shared_pid, int shared_id, const mxnet::TShape& shape, int dtype)
-        : static_data(false), delay_alloc(false) {
+        : static_data(false), delay_alloc(false),
+          storage_ref_(Storage::_GetSharedRef()) {
       var = Engine::Get()->NewVariable();
       ctx = Context::CPUShared(0);
       shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);
@@ -897,7 +903,7 @@ class NDArray {
           const mxnet::ShapeVector &aux_shapes_)
         : static_data(false), delay_alloc(delay_alloc_), storage_type(storage_type_),
           aux_types(aux_types_), ctx(ctx_), storage_shape(storage_shape_),
-          aux_shapes(aux_shapes_) {
+          aux_shapes(aux_shapes_), storage_ref_(Storage::_GetSharedRef()) {
       shandle.ctx = ctx;
       var = Engine::Get()->NewVariable();
       // aux_handles always reflect the correct number of aux data
@@ -914,7 +920,8 @@ class NDArray {
 
     Chunk(const NDArrayStorageType storage_type_, const TBlob &data,
           const std::vector<TBlob> &aux_data, int dev_id)
-        : static_data(true), delay_alloc(false), storage_type(storage_type_) {
+        : static_data(true), delay_alloc(false), storage_type(storage_type_),
+          storage_ref_(Storage::_GetSharedRef()) {
       using namespace mshadow;
       CHECK_NE(storage_type, kDefaultStorage);
       // init var
diff --git a/src/common/object_pool.h b/src/common/object_pool.h
index 576ff9a..3625533 100644
--- a/src/common/object_pool.h
+++ b/src/common/object_pool.h
@@ -132,10 +132,9 @@ struct ObjectPoolAllocatable {
 
 template <typename T>
 ObjectPool<T>::~ObjectPool() {
-  // TODO(hotpxl): mind destruction order
-  // for (auto i : allocated_) {
-  //   free(i);
-  // }
+  for (auto i : allocated_) {
+    free(i);
+  }
 }
 
 template <typename T>
diff --git a/src/engine/threaded_engine.cc b/src/engine/threaded_engine.cc
index 6a60040..b5897a1 100644
--- a/src/engine/threaded_engine.cc
+++ b/src/engine/threaded_engine.cc
@@ -281,7 +281,7 @@ void ThreadedEngine::DeleteOperator(OprHandle op) {
   this->PushAsync([threaded_opr](RunContext, CallbackOnComplete on_complete) {
       ThreadedOpr::Delete(threaded_opr);
       on_complete();
-    }, Context::CPU(), {}, deps, FnProperty::kAsync, 0,
+    }, Context::CPU(), {}, deps, FnProperty::kDeleteVar, 0,
     "DeleteOperator");
 }
 
diff --git a/src/engine/threaded_engine.h b/src/engine/threaded_engine.h
index 4a2d419..ab06ca1 100644
--- a/src/engine/threaded_engine.h
+++ b/src/engine/threaded_engine.h
@@ -352,7 +352,8 @@ class ThreadedEngine : public Engine {
       LOG(INFO) << "ExecuteOprBlock " << opr_block
                 << "shutdown_phase=" << shutdown_phase_;
     }
-    if (!shutdown_phase_) {
+    // still run cleanup in shutdown_phase
+    if (!shutdown_phase_ || threaded_opr->prop == FnProperty::kDeleteVar) {
       try {
         OnStart(threaded_opr);
         if (debug_info) {