You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/05/18 05:30:46 UTC

[GitHub] zheng-da opened a new issue #10994: MKLDNN fails in the backward computation when forward runs with is_train=False

zheng-da opened a new issue #10994: MKLDNN fails in the backward computation when forward runs with is_train=False
URL: https://github.com/apache/incubator-mxnet/issues/10994
 
 
   This is a fairly special case: when the forward pass runs with is_train=False (as in `mx.autograd.record(False)` below) and MKLDNN is enabled, the backward pass fails with a memory-corruption error. @ashokei @pengzhao-intel @TaoLv 
   
   ```python
   def test_hybrid_static_memory():
       """Reproduction script for the backward failure reported in this issue.

       Builds two networks with ``get_resnet(1, 18, pretrained=True)``, copies
       net1's parameters into net2 through a params file, runs forward inside
       ``mx.autograd.record(False)`` followed by ``backward()`` on each, and
       then compares the outputs and the parameter gradients of the two nets.
       """
       # Random input batch of shape (2, 3, 32, 32); a gradient buffer is attached to it.
       x = mx.nd.random.uniform(shape=(2, 3, 32, 32))
       x.attach_grad()
   
       # Two networks built with identical arguments (same prefix, same context).
       # NOTE(review): the positional args 1, 18 are presumably (version, num_layers)
       # -- confirm against the Gluon model zoo API.
       net1 = gluon.model_zoo.vision.get_resnet(
           1, 18, pretrained=True, prefix='net_', ctx=mx.context.current_context())
       net2 = gluon.model_zoo.vision.get_resnet(
           1, 18, pretrained=True, prefix='net_', ctx=mx.context.current_context())
       # One forward pass on each network before parameters are saved/loaded.
       net1(x)
       net2(x)
   
       # Round-trip net1's parameters through a file so net2 holds the same values.
       net1.save_params('test.params')
       net2.load_params('test.params')
   
       def test(net, x):
           """Run forward twice and backward once; return (output, gradients dict)."""
           # record(False): per the issue description, forward running with
           # is_train=False is the condition under which backward then fails.
           with mx.autograd.record(False):
               y = net(x) + net(x)
               y.backward()
   
           # Collect gradients only for parameters whose grad_req is not 'null'.
           grads = {k: v.grad() for k, v in net.collect_params().items() if v.grad_req != 'null'}
   
           return y, grads
   
       y1, grads1 = test(net1, x)
       y2, grads2 = test(net2, x)
   
       assert_almost_equal(y1.asnumpy(), y2.asnumpy(), rtol=1e-3, atol=1e-5)
       for key in grads1:
           print(key)
           # Mismatches are printed rather than raised, so the loop continues
           # through every gradient key instead of stopping at the first failure.
           try:
               assert_almost_equal(grads1[key].asnumpy(), grads2[key].asnumpy(), rtol=1e-3, atol=1e-5)
           except Exception as e:
               print(e)
   ```
   
   The memory error is something like this:
   ```
   *** Error in `/usr/bin/python': corrupted double-linked list: 0x00007f426ee97880 ***
   ======= Backtrace: =========
   /lib/x86_64-linux-gnu/libc.so.6(+0x777e5)[0x7f4314aa77e5]
   /lib/x86_64-linux-gnu/libc.so.6(+0x80baf)[0x7f4314ab0baf]
   /lib/x86_64-linux-gnu/libc.so.6(cfree+0x4c)[0x7f4314ab453c]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmkldnn.so.0(mkldnn_primitive_desc_destroy+0xf)[0x7f4308f39bcf]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt19_Sp_counted_deleterIP21mkldnn_primitive_descPF15mkldnn_status_tS1_ESaIvELN9__gnu_cxx12_Lock_policyE2EE10_M_disposeEv+0x2c)[0x7f42db233e4c]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt16_Sp_counted_baseILN9__gnu_cxx12_Lock_policyE2EE10_M_releaseEv+0x42)[0x7f42db22c9a2]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt14__shared_countILN9__gnu_cxx12_Lock_policyE2EED1Ev+0x27)[0x7f42db22a8ad]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt12__shared_ptrI21mkldnn_primitive_descLN9__gnu_cxx12_Lock_policyE2EED1Ev+0x1c)[0x7f42db2241ba]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt10shared_ptrI21mkldnn_primitive_descED1Ev+0x18)[0x7f42db2241d6]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZN6mkldnn6handleIP21mkldnn_primitive_descNS_13handle_traitsIS2_EEED1Ev+0x18)[0x7f42db2241f2]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZN6mkldnn16pooling_backward14primitive_descD1Ev+0x18)[0x7f42db266438]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZN5mxnet2op24MKLDNNPoolingGradComputeERKNS_9OpContextERKNS0_12PoolingParamERKNS_7NDArrayES9_PS8_NS_9OpReqTypeES9_+0xa07)[0x7f42db2643f2]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZN5mxnet2op23PoolingGradComputeExCPUERKN4nnvm9NodeAttrsERKNS_9OpContextERKSt6vectorINS_7NDArrayESaIS9_EERKS8_INS_9OpReqTypeESaISE_EESD_+0x401)[0x7f42dd262e2d]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt17_Function_handlerIFvRKN4nnvm9NodeAttrsERKN5mxnet9OpContextERKSt6vectorINS4_7NDArrayESaIS9_EERKS8_INS4_9OpReqTypeESaISE_EESD_EPSJ_E9_M_invokeERKSt9_Any_dataS3_S7_SD_SI_SD_+0x91)[0x7f42db3727e4]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNKSt8functionIFvRKN4nnvm9NodeAttrsERKN5mxnet9OpContextERKSt6vectorINS4_7NDArrayESaIS9_EERKS8_INS4_9OpReqTypeESaISE_EESD_EEclES3_S7_SD_SI_SD_+0xa6)[0x7f42dd5a5940]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZZN5mxnet10imperative14PushFComputeExERKSt8functionIFvRKN4nnvm9NodeAttrsERKNS_9OpContextERKSt6vectorINS_7NDArrayESaISA_EERKS9_INS_9OpReqTypeESaISF_EESE_EEPKNS2_2OpES5_RKNS_7ContextERKS9_IPNS_6engine3VarESaISW_EES10_RKS9_INS_8ResourceESaIS11_EERKS9_IPSA_SaIS16_EES1A_SJ_ENKUlNS_10RunContextEE_clES1B_+0xf7)[0x7f42dd59f493]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt17_Function_handlerIFvN5mxnet10RunContextEEZNS0_10imperative14PushFComputeExERKSt8functionIFvRKN4nnvm9NodeAttrsERKNS0_9OpContextERKSt6vectorINS0_7NDArrayESaISD_EERKSC_INS0_9OpReqTypeESaISI_EESH_EEPKNS5_2OpES8_RKNS0_7ContextERKSC_IPNS0_6engine3VarESaISZ_EES13_RKSC_INS0_8ResourceESaIS14_EERKSC_IPSD_SaIS19_EES1D_SM_EUlS1_E_E9_M_invokeERKSt9_Any_dataOS1_+0x44)[0x7f42dd5aa81f]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNKSt8functionIFvN5mxnet10RunContextEEEclES1_+0x56)[0x7f42ddc2e03c]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZZN5mxnet6engine14ThreadedEngine10BulkAppendESt8functionIFvNS_10RunContextEEENS_7ContextERKSt6vectorIPNS0_3VarESaIS9_EESD_ENKUlS3_E_clES3_+0x61)[0x7f42ddc4124b]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt17_Function_handlerIFvN5mxnet10RunContextEEZNS0_6engine14ThreadedEngine10BulkAppendESt8functionIS2_ENS0_7ContextERKSt6vectorIPNS3_3VarESaISA_EESE_EUlS1_E_E9_M_invokeERKSt9_Any_dataOS1_+0x44)[0x7f42ddc44329]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNKSt8functionIFvN5mxnet10RunContextEEEclES1_+0x56)[0x7f42ddc2e03c]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZZN5mxnet6engine14ThreadedEngine9BulkFlushEvENKUlNS_10RunContextENS0_18CallbackOnCompleteEE_clES2_S3_+0x43)[0x7f42ddc35651]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt17_Function_handlerIFvN5mxnet10RunContextENS0_6engine18CallbackOnCompleteEEZNS2_14ThreadedEngine9BulkFlushEvEUlS1_S3_E_E9_M_invokeERKSt9_Any_dataOS1_OS3_+0x67)[0x7f42ddc388fa]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNKSt8functionIFvN5mxnet10RunContextENS0_6engine18CallbackOnCompleteEEEclES1_S3_+0x67)[0x7f42ddc2ef89]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZN5mxnet6engine14ThreadedEngine15ExecuteOprBlockENS_10RunContextEPNS0_8OprBlockE+0x39f)[0x7f42ddc34f6f]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZN5mxnet6engine23ThreadedEnginePerDevice9CPUWorkerILN4dmlc19ConcurrentQueueTypeE0EEEvNS_7ContextEPNS1_17ThreadWorkerBlockIXT_EEERKSt10shared_ptrINS3_11ManualEventEE+0xaf)[0x7f42ddc495d1]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZZZN5mxnet6engine23ThreadedEnginePerDevice13PushToExecuteEPNS0_8OprBlockEbENKUlvE_clEvENKUlSt10shared_ptrIN4dmlc11ManualEventEEE_clES8_+0x42)[0x7f42ddc47868]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt17_Function_handlerIFvSt10shared_ptrIN4dmlc11ManualEventEEEZZN5mxnet6engine23ThreadedEnginePerDevice13PushToExecuteEPNS6_8OprBlockEbENKUlvE_clEvEUlS3_E_E9_M_invokeERKSt9_Any_dataOS3_+0x5c)[0x7f42ddc4bf74]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNKSt8functionIFvSt10shared_ptrIN4dmlc11ManualEventEEEEclES3_+0x49)[0x7f42ddc5134b]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt12_Bind_simpleIFSt8functionIFvSt10shared_ptrIN4dmlc11ManualEventEEEES4_EE9_M_invokeIILm0EEEEvSt12_Index_tupleIIXspT_EEE+0x68)[0x7f42ddc512be]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt12_Bind_simpleIFSt8functionIFvSt10shared_ptrIN4dmlc11ManualEventEEEES4_EEclEv+0x2c)[0x7f42ddc511a0]
   /home/ubuntu/incubator-mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt6thread5_ImplISt12_Bind_simpleIFSt8functionIFvSt10shared_ptrIN4dmlc11ManualEventEEEES6_EEE6_M_runEv+0x1c)[0x7f42ddc51130]
   /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xb8c80)[0x7f42ff8abc80]
   /lib/x86_64-linux-gnu/libpthread.so.0(+0x76ba)[0x7f4314e016ba]
   /lib/x86_64-linux-gnu/libc.so.6(clone+0x6d)[0x7f4314b3741d]
   ```

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services