You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2020/09/11 01:01:28 UTC

[GitHub] [incubator-mxnet] samskalicky commented on pull request #19112: [1.x] Backport Fix for duplicate subgraph inputs/outputs (#16131)

samskalicky commented on pull request #19112:
URL: https://github.com/apache/incubator-mxnet/pull/19112#issuecomment-690810445


   @ZhennanQin @pengzhao-intel I'm debugging a test failure with this PR:
   ```
   [2020-09-10T19:49:11.883Z] ======================================================================
   [2020-09-10T19:49:11.883Z] ERROR: test_subgraph.test_mobilenetv2_struct
   [2020-09-10T19:49:11.883Z] ----------------------------------------------------------------------
   [2020-09-10T19:49:11.883Z] Traceback (most recent call last):
   [2020-09-10T19:49:11.883Z]   File "/usr/local/lib/python3.5/dist-packages/nose/case.py", line 198, in runTest
   [2020-09-10T19:49:11.883Z]     self.test(*self.arg)
   [2020-09-10T19:49:11.883Z]   File "/work/mxnet/tests/python/mkl/../unittest/common.py", line 215, in test_new
   [2020-09-10T19:49:11.883Z]     orig_test(*args, **kwargs)
   [2020-09-10T19:49:11.883Z]   File "/work/mxnet/tests/python/mkl/test_subgraph.py", line 815, in test_mobilenetv2_struct
   [2020-09-10T19:49:11.883Z]     check_fusion(net, data_shape, attrs, out_types=['int8', 'auto'])
   [2020-09-10T19:49:11.883Z]   File "/work/mxnet/tests/python/mkl/../unittest/common.py", line 215, in test_new
   [2020-09-10T19:49:11.883Z]     orig_test(*args, **kwargs)
   [2020-09-10T19:49:11.883Z]   File "/work/mxnet/tests/python/mkl/test_subgraph.py", line 271, in check_fusion
   [2020-09-10T19:49:11.883Z]     exe = sym.bind(ctx=mx.current_context(), args=arg_array, aux_states=aux_array, grad_req='null')
   [2020-09-10T19:49:11.883Z]   File "/work/mxnet/python/mxnet/symbol/symbol.py", line 2119, in bind
   [2020-09-10T19:49:11.883Z]     ctypes.byref(handle)))
   [2020-09-10T19:49:11.883Z]   File "/work/mxnet/python/mxnet/base.py", line 246, in check_call
   [2020-09-10T19:49:11.883Z]     raise get_last_ffi_error()
   [2020-09-10T19:49:11.883Z] mxnet.base.MXNetError: Traceback (most recent call last):
   [2020-09-10T19:49:11.883Z]   [bt] (9) /usr/bin/python3(PyEval_EvalFrameEx+0x4eff) [0x53fe5f]
   [2020-09-10T19:49:11.883Z]   [bt] (8) /usr/bin/python3(PyObject_Call+0x47) [0x5c59d7]
   [2020-09-10T19:49:11.883Z]   [bt] (7) /usr/lib/python3.5/lib-dynload/_ctypes.cpython-35m-x86_64-linux-gnu.so(+0x9fcb) [0x7f7bac29efcb]
   [2020-09-10T19:49:11.883Z]   [bt] (6) /usr/lib/python3.5/lib-dynload/_ctypes.cpython-35m-x86_64-linux-gnu.so(_ctypes_callproc+0x49a) [0x7f7bac2ab01a]
   [2020-09-10T19:49:11.883Z]   [bt] (5) /usr/lib/python3.5/lib-dynload/_ctypes.cpython-35m-x86_64-linux-gnu.so(ffi_call+0x2eb) [0x7f7bac2b088b]
   [2020-09-10T19:49:11.883Z]   [bt] (4) /usr/lib/python3.5/lib-dynload/_ctypes.cpython-35m-x86_64-linux-gnu.so(ffi_call_unix64+0x4c) [0x7f7bac2b0e20]
   [2020-09-10T19:49:11.883Z]   [bt] (3) /work/mxnet/python/mxnet/../../lib/libmxnet.so(MXExecutorBindEX+0xdcb) [0x7f7b0675858b]
   [2020-09-10T19:49:11.883Z]   [bt] (2) /work/mxnet/python/mxnet/../../lib/libmxnet.so(mxnet::Executor::Bind(nnvm::Symbol, mxnet::Context const&, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, mxnet::Context, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, mxnet::Context> > > const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, mxnet::Executor*)+0x39b) [0x7f7b05d9d67b]
   [2020-09-10T19:49:11.883Z]   [bt] (1) /work/mxnet/python/mxnet/../../lib/libmxnet.so(+0x58f27f1) [0x7f7b05d9b7f1]
   [2020-09-10T19:49:11.883Z]   [bt] (0) /work/mxnet/python/mxnet/../../lib/libmxnet.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x61) [0x7f7b01151441]
   [2020-09-10T19:49:11.883Z]   File "src/executor/graph_executor.cc", line 1892
   [2020-09-10T19:49:11.883Z] MXNetError: Check failed: arg_names.size() == in_args_map.size() (8 vs. 7) : 
   ```
   https://jenkins.mxnet-ci.amazon-ml.com/blue/rest/organizations/jenkins/pipelines/mxnet-validation/pipelines/unix-cpu/branches/PR-19112/runs/1/nodes/296/steps/781/log/?start=0
   And I think I narrowed it down to this part of the mkldnn conv subgraph:
   https://github.com/apache/incubator-mxnet/blame/v1.x/src/operator/subgraph/mkldnn/mkldnn_conv_property.h#L273-L277
   Commenting out the rotation seems to resolve the issue. 
   
   But now I'm getting a segfault:
   ```
   Thread 1 "python" received signal SIGSEGV, Segmentation fault.
   0x00007fffe60e4bc9 in nnvm::pass::(anonymous namespace)::MXAllocMemory(nnvm::Graph const&, nnvm::IndexedGraph const&, std::pair<unsigned int, unsigned int> const&, std::vector<int, std::allocator<int> >*, std::vector<int, std::allocator<int> >*, std::vector<unsigned int, std::allocator<unsigned int> > const&, nnvm::pass::(anonymous namespace)::MXGraphAllocator*) () from /home/ubuntu/subgraph_fixv18/python/mxnet/../../lib/libmxnet.so
   (gdb) bt
   #0  0x00007fffe60e4bc9 in nnvm::pass::(anonymous namespace)::MXAllocMemory(nnvm::Graph const&, nnvm::IndexedGraph const&, std::pair<unsigned int, unsigned int> const&, std::vector<int, std::allocator<int> >*, std::vector<int, std::allocator<int> >*, std::vector<unsigned int, std::allocator<unsigned int> > const&, nnvm::pass::(anonymous namespace)::MXGraphAllocator*) ()
      from /home/ubuntu/subgraph_fixv18/python/mxnet/../../lib/libmxnet.so
   #1  0x00007fffe60e6884 in nnvm::pass::(anonymous namespace)::MXPlanMemory(nnvm::Graph) ()
      from /home/ubuntu/subgraph_fixv18/python/mxnet/../../lib/libmxnet.so
   #2  0x00007fffe60ac9bc in std::_Function_handler<nnvm::Graph (nnvm::Graph), nnvm::Graph (*)(nnvm::Graph)>::_M_invoke(std::_Any_data const&, nnvm::Graph&&) () from /home/ubuntu/subgraph_fixv18/python/mxnet/../../lib/libmxnet.so
   #3  0x00007fffe7f8a4ce in nnvm::ApplyPasses(nnvm::Graph, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&) ()
      from /home/ubuntu/subgraph_fixv18/python/mxnet/../../lib/libmxnet.so
   #4  0x00007fffe61318ad in nnvm::ApplyPass(nnvm::Graph, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) ()
      from /home/ubuntu/subgraph_fixv18/python/mxnet/../../lib/libmxnet.so
   #5  0x00007fffe69e97c9 in mxnet::exec::GraphExecutor::FinishInitGraph(nnvm::Symbol, nnvm::Graph, mxnet::Executor*, std::unordered_map<nnvm::NodeEntry, mxnet::NDArray, nnvm::NodeEntryHash, nnvm::NodeEntryEqual, std::allocator<std::pair<nnvm::NodeEntry const, mxnet::NDArray> > > const&) ()
      from /home/ubuntu/subgraph_fixv18/python/mxnet/../../lib/libmxnet.so
   #6  0x00007fffe69eb1c6 in mxnet::exec::GraphExecutor::Init(nnvm::Symbol, mxnet::Context const&, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, mxnet::Context, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, mxnet::Context> > > const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, mxnet::Executor*, std::unordered_map<nnvm::NodeEntry, mxnet::NDArray, nnvm::NodeEntryHash, nnvm::NodeEntryEqual, std::allocator<std::pair<nnvm::NodeEntry const, mxnet::NDArray> > > const&) () from /home/ubuntu/subgraph_fixv18/python/mxnet/../../lib/libmxnet.so
   #7  0x00007fffe69f7f86 in mxnet::Executor::Bind(nnvm::Symbol, mxnet::Context const&, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, mxnet::Context, std::less<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, mxnet::Context> > > const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, mxnet::Executor*) ()
      from /home/ubuntu/subgraph_fixv18/python/mxnet/../../lib/libmxnet.so
   ```
   This vaguely reminds me of @DickJC123's issue: https://github.com/apache/incubator-mxnet/issues/16685


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org