You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/12/24 18:23:45 UTC

[GitHub] arcadiaphy edited a comment on issue #13710: Error in `python': malloc(): memory corruption: 0x00007f540c0a6190

arcadiaphy edited a comment on issue #13710: Error in `python': malloc(): memory corruption: 0x00007f540c0a6190
URL: https://github.com/apache/incubator-mxnet/issues/13710#issuecomment-449760975
 
 
   I have run into the same problem and tried to find the exact place where the bug happens.
   
   ENV:
   ```
   Centos 7.0
   Python 2.7.5
   mxnet 1.3.0
   gluon-cv 0.3.0
   ```
   
   Minimal script to reproduce the bug:
   ```
   import numpy as np
   import mxnet as mx
   from mxnet import autograd
   from gluoncv.model_zoo import get_model
   from gluoncv import data as gdata
   from gluoncv.data.transforms.presets.ssd import SSDDefaultTrainTransform
   
   net_name = 'ssd_300_vgg16_atrous_voc'
   net = get_model(net_name, pretrained_base=False)
   net.initialize()
   
   width = 300
   height = 300
   with autograd.train_mode():
       _, _, anchors = net(mx.nd.zeros((1, 3, height, width)))
   trans = SSDDefaultTrainTransform(height, width, anchors)
   
   # the bug happens when transforming VOC 2007 data; here I only create random data
   image = mx.nd.uniform(low=0, high=255, shape=(500, 400, 3)).astype('uint8')
   box = np.array([[ 47., 239., 194., 370.,  11.,   0.],
                   [  7.,  11., 351., 497.,  14.,   0.]])
   for _ in range(10):
       trans(image, box)
   ```
   
   The gdb backtrace on a debug build of mxnet:
   ```
   #0  0x00007ffff6d53277 in raise () at /lib64/libc.so.6
   #1  0x00007ffff6d54968 in abort () at /lib64/libc.so.6
   #2  0x00007ffff6d95d37 in __libc_message () at /lib64/libc.so.6
   #3  0x00007ffff6d9fc86 in _int_malloc () at /lib64/libc.so.6
   #4  0x00007ffff6da284c in malloc () at /lib64/libc.so.6
   #5  0x00007fffd8b84ecd in operator new(unsigned long) () at /lib64/libstdc++.so.6
   #6  0x00007fff9697f27c in __gnu_cxx::new_allocator<unsigned long>::allocate(unsigned long, void const*) (this=0x7fff6b45b760, __n=17464)
       at /usr/include/c++/4.8.2/ext/new_allocator.h:104
   #7  0x00007fff96aa2835 in std::_Vector_base<unsigned long, std::allocator<unsigned long> >::_M_allocate(unsigned long) (this=0x7fff6b45b760, __n=17464)
       at /usr/include/c++/4.8.2/bits/stl_vector.h:168
   #8  0x00007fff96aa807f in std::_Vector_base<unsigned long, std::allocator<unsigned long> >::_M_create_storage(unsigned long) (this=0x7fff6b45b760, __n=17464)
       at /usr/include/c++/4.8.2/bits/stl_vector.h:181
   #9  0x00007fff96aa3999 in std::_Vector_base<unsigned long, std::allocator<unsigned long> >::_Vector_base(unsigned long, std::allocator<unsigned long> const&) (this=0x7fff6b45b760, __n=17464, __a=...) at /usr/include/c++/4.8.2/bits/stl_vector.h:136
   #10 0x00007fff96aa041c in std::vector<unsigned long, std::allocator<unsigned long> >::vector(unsigned long, std::allocator<unsigned long> const&) (this=0x7fff6b45b760, __n=17464, __a=...) at /usr/include/c++/4.8.2/bits/stl_vector.h:270
   #11 0x00007fff96a9ca55 in mxnet::op::SortByKey<int, float>(mshadow::Tensor<mshadow::cpu, 1, int>, mshadow::Tensor<mshadow::cpu, 1, float>, bool, mshadow::Tensor<mshadow::cpu, 1, char>*, int, int) (keys=..., values=..., is_ascend=true, workspace=0x0, begin_bit=0, end_bit=32) at src/operator/contrib/./../tensor/sort_op.h:50
   #12 0x00007fff96a8a08d in mxnet::op::BipartiteMatchingForward<mshadow::cpu>(nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&) (attrs=..., ctx=..., inputs=std::vector of length 1, capacity 1 = {...}, req=std::vector of length 2, capacity 2 = {...}, outputs=std::vector of length 2, capacity 2 = {...})
       at src/operator/contrib/./bounding_box-inl.h:779
   #13 0x00007fff96861e8d in std::_Function_handler<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&), void (*)(nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)>::_M_invoke(std::_Any_data const&, nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&) (__functor=..., __args#0=..., __args#1=..., __args#2=std::vector of length 1, capacity 1 = {...}, __args#3=std::vector of length 2, capacity 2 = {...}, __args#4=std::vector of length 2, capacity 2 = {...}) at /usr/include/c++/4.8.2/functional:2071
   #14 0x00007fff98c896b8 in std::function<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)>::operator()(nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&) const (this=0x1eb15d8, __args#0=..., __args#1=..., __args#2=std::vector of length 1, capacity 1 = {...}, __args#3=std::vector of length 2, capacity 2 = {...}, __args#4=std::vector of length 2, capacity 2 = {...})
       at /usr/include/c++/4.8.2/functional:2471
   #15 0x00007fff98dae979 in mxnet::imperative::PushFCompute(std::function<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&)::{lambda(mxnet::RunContext)#1}::operator()(mxnet::RunContext) const (__closure=0x1eb1550, rctx=...)
       at src/imperative/./imperative_utils.h:401
   #16 0x00007fff98db4649 in std::_Function_handler<void (mxnet::RunContext), mxnet::imperative::PushFCompute(std::function<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&)::{lambda(mxnet::RunContext)#1}>::_M_invoke(std::_Any_data const&, mxnet::RunContext) (__functor=..., __args#0=...) at /usr/include/c++/4.8.2/functional:2071
   #17 0x00007fff99481517 in std::function<void (mxnet::RunContext)>::operator()(mxnet::RunContext) const (this=0x1e5fd20, __args#0=...)
       at /usr/include/c++/4.8.2/functional:2471
   #18 0x00007fff9949245a in mxnet::engine::ThreadedEngine::__lambda26::operator()(mxnet::RunContext, mxnet::Engine::CallbackOnComplete) const (__closure=0x1e5fd20, ctx=..., on_complete=...) at src/engine/threaded_engine.cc:342
   #19 0x00007fff99493523 in std::_Function_handler<void(mxnet::RunContext, mxnet::engine::CallbackOnComplete), mxnet::engine::ThreadedEngine::PushSync(mxnet::Engine::SyncFn, mxnet::Context, const std::vector<mxnet::engine::Var*>&, const std::vector<mxnet::engine::Var*>&, mxnet::FnProperty, int, char const*)::__lambda26>::_M_invoke(const std::_Any_data &, mxnet::RunContext, mxnet::engine::CallbackOnComplete) (__functor=..., __args#0=..., __args#1=...) at /usr/include/c++/4.8.2/functional:2071
   #20 0x00007fff9948218f in std::function<void (mxnet::RunContext, mxnet::engine::CallbackOnComplete)>::operator()(mxnet::RunContext, mxnet::engine::CallbackOnComplete) const (this=0x1d294b0, __args#0=..., __args#1=...) at /usr/include/c++/4.8.2/functional:2471
   #21 0x00007fff99487980 in mxnet::engine::ThreadedEngine::ExecuteOprBlock(mxnet::RunContext, mxnet::engine::OprBlock*) (this=0x1121140, run_ctx=..., opr_block=0x112a6b8)
       at src/engine/./threaded_engine.h:363
   #22 0x00007fff9949b103 in mxnet::engine::ThreadedEnginePerDevice::CPUWorker<(dmlc::ConcurrentQueueType)0>(mxnet::Context, mxnet::engine::ThreadedEnginePerDevice::ThreadWorkerBlock<(dmlc::ConcurrentQueueType)0>*, std::shared_ptr<dmlc::ManualEvent> const&) (this=0x1121140, ctx=..., block=0x18ac100, ready_event=std::shared_ptr (count 2, weak 0) 0x1b886c8) at src/engine/threaded_engine_perdevice.cc:296
   #23 0x00007fff99499491 in mxnet::engine::ThreadedEnginePerDevice::PushToExecute(mxnet::engine::OprBlock*, bool)::{lambda()#1}::operator()() const::{lambda(std::shared_ptr<dmlc::ManualEvent>)#1}::operator()(dmlc::ManualEvent) const (__closure=0x1b95f20, ready_event=std::shared_ptr (count 2, weak 0) 0x1b886c8)
       at src/engine/threaded_engine_perdevice.cc:116
   ```
   
   The bug is related to malloc; it disappears when you change the malloc implementation. I'm not sure whether there is still some hidden bug.
   ```
   LD_PRELOAD=/usr/lib64/libtcmalloc.so python reproduce_bug.py
   ```

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services