You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by GitBox <gi...@apache.org> on 2018/11/15 01:03:56 UTC

[GitHub] wegel removed a comment on issue #9974: DataLoader with workers not compatible with ImageRecordDataset

wegel removed a comment on issue #9974: DataLoader with workers not compatible with ImageRecordDataset
URL: https://github.com/apache/incubator-mxnet/issues/9974#issuecomment-438876850
 
 
   I seem to have the same problem using the latest nightly (installed yesterday using `pip install mxnet-cu92 gluoncv`).
   
   ```
   INFO:root:[Epoch 0][Batch 1309], Speed: 3.994 samples/sec, CrossEntropy=4.666, SmoothL1=1.513
   INFO:root:[Epoch 0][Batch 1319], Speed: 4.015 samples/sec, CrossEntropy=4.663, SmoothL1=1.508
   INFO:root:[Epoch 0] Training cost: 1343.465, CrossEntropy=4.658, SmoothL1=1.506
   Process Process-7:
   Process Process-6:
   Traceback (most recent call last):
   Traceback (most recent call last):
     File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
       self.run()
     File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
       self.run()
     File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
       self._target(*self._args, **self._kwargs)
     File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
       self._target(*self._args, **self._kwargs)
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/data/dataloader.py", line 169, in worker_loop
       batch = batchify_fn([dataset[i] for i in samples])
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/data/dataloader.py", line 169, in worker_loop
       batch = batchify_fn([dataset[i] for i in samples])
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/data/dataloader.py", line 169, in <listcomp>
       batch = batchify_fn([dataset[i] for i in samples])
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/data/dataloader.py", line 169, in <listcomp>
       batch = batchify_fn([dataset[i] for i in samples])
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/data/dataset.py", line 131, in __getitem__
       item = self._data[idx]
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/data/dataset.py", line 131, in __getitem__
       item = self._data[idx]
     File "/usr/local/lib/python3.6/dist-packages/gluoncv/data/recordio/detection.py", line 70, in __getitem__
       img, label = super(RecordFileDetection, self).__getitem__(idx)
     File "/usr/local/lib/python3.6/dist-packages/gluoncv/data/recordio/detection.py", line 70, in __getitem__
       img, label = super(RecordFileDetection, self).__getitem__(idx)
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/data/vision/datasets.py", line 257, in __getitem__
       record = super(ImageRecordDataset, self).__getitem__(idx)
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/data/vision/datasets.py", line 257, in __getitem__
       record = super(ImageRecordDataset, self).__getitem__(idx)
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/data/dataset.py", line 189, in __getitem__
       return self._record.read_idx(self._record.keys[idx])
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/data/dataset.py", line 189, in __getitem__
       return self._record.read_idx(self._record.keys[idx])
     File "/usr/local/lib/python3.6/dist-packages/mxnet/recordio.py", line 265, in read_idx
       return self.read()
     File "/usr/local/lib/python3.6/dist-packages/mxnet/recordio.py", line 265, in read_idx
       return self.read()
     File "/usr/local/lib/python3.6/dist-packages/mxnet/recordio.py", line 163, in read
       ctypes.byref(size)))
     File "/usr/local/lib/python3.6/dist-packages/mxnet/recordio.py", line 163, in read
       ctypes.byref(size)))
     File "/usr/local/lib/python3.6/dist-packages/mxnet/base.py", line 252, in check_call
       raise MXNetError(py_str(_LIB.MXGetLastError()))
     File "/usr/local/lib/python3.6/dist-packages/mxnet/base.py", line 252, in check_call
       raise MXNetError(py_str(_LIB.MXGetLastError()))
   mxnet.base.MXNetError: [18:31:51] src/recordio.cc:65: Check failed: header[0] == RecordIOWriter::kMagic Invalid RecordIO File
   
   Stack trace returned 10 entries:
   [bt] (0) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x382eea) [0x7f41fce44eea]
   [bt] (1) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x31f99c3) [0x7f41ffcbb9c3]
   [bt] (2) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(MXRecordIOReaderReadRecord+0x2a) [0x7f41ff536caa]
   [bt] (3) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call_unix64+0x4c) [0x7f4225775dae]
   [bt] (4) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call+0x22f) [0x7f422577571f]
   [bt] (5) /usr/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2b4) [0x7f4225989ba4]
   [bt] (6) /usr/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(+0x12223) [0x7f422598a223]
   [bt] (7) python(_PyObject_FastCallKeywords+0x19c) [0x59f9fc]
   [bt] (8) python() [0x511b0a]
   [bt] (9) python(_PyEval_EvalFrameDefault+0x467) [0x4f5277]
   
   
   mxnet.base.MXNetError: [18:31:51] src/recordio.cc:65: Check failed: header[0] == RecordIOWriter::kMagic Invalid RecordIO File
   
   Stack trace returned 10 entries:
   [bt] (0) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x382eea) [0x7f41fce44eea]
   [bt] (1) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x31f99c3) [0x7f41ffcbb9c3]
   [bt] (2) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(MXRecordIOReaderReadRecord+0x2a) [0x7f41ff536caa]
   [bt] (3) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call_unix64+0x4c) [0x7f4225775dae]
   [bt] (4) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call+0x22f) [0x7f422577571f]
   [bt] (5) /usr/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2b4) [0x7f4225989ba4]
   [bt] (6) /usr/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(+0x12223) [0x7f422598a223]
   [bt] (7) python(_PyObject_FastCallKeywords+0x19c) [0x59f9fc]
   [bt] (8) python() [0x511b0a]
   [bt] (9) python(_PyEval_EvalFrameDefault+0x467) [0x4f5277]
   
   
   Process Process-5:
   Corrupt JPEG data: premature end of data segment
   Traceback (most recent call last):
     File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
       self.run()
     File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
       self._target(*self._args, **self._kwargs)
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/data/dataloader.py", line 169, in worker_loop
       batch = batchify_fn([dataset[i] for i in samples])
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/data/dataloader.py", line 169, in <listcomp>
       batch = batchify_fn([dataset[i] for i in samples])
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/data/dataset.py", line 131, in __getitem__
       item = self._data[idx]
     File "/usr/local/lib/python3.6/dist-packages/gluoncv/data/recordio/detection.py", line 70, in __getitem__
       img, label = super(RecordFileDetection, self).__getitem__(idx)
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/data/vision/datasets.py", line 257, in __getitem__
       record = super(ImageRecordDataset, self).__getitem__(idx)
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/data/dataset.py", line 189, in __getitem__
       return self._record.read_idx(self._record.keys[idx])
     File "/usr/local/lib/python3.6/dist-packages/mxnet/recordio.py", line 265, in read_idx
       return self.read()
     File "/usr/local/lib/python3.6/dist-packages/mxnet/recordio.py", line 163, in read
       ctypes.byref(size)))
     File "/usr/local/lib/python3.6/dist-packages/mxnet/base.py", line 252, in check_call
       raise MXNetError(py_str(_LIB.MXGetLastError()))
   mxnet.base.MXNetError: [18:31:51] src/recordio.cc:65: Check failed: header[0] == RecordIOWriter::kMagic Invalid RecordIO File
   
   Stack trace returned 10 entries:
   [bt] (0) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x382eea) [0x7f41fce44eea]
   [bt] (1) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x31f99c3) [0x7f41ffcbb9c3]
   [bt] (2) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(MXRecordIOReaderReadRecord+0x2a) [0x7f41ff536caa]
   [bt] (3) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call_unix64+0x4c) [0x7f4225775dae]
   [bt] (4) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call+0x22f) [0x7f422577571f]
   [bt] (5) /usr/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2b4) [0x7f4225989ba4]
   [bt] (6) /usr/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(+0x12223) [0x7f422598a223]
   [bt] (7) python(_PyObject_FastCallKeywords+0x19c) [0x59f9fc]
   [bt] (8) python() [0x511b0a]
   [bt] (9) python(_PyEval_EvalFrameDefault+0x467) [0x4f5277]
   
   
   ^CTraceback (most recent call last):
     File "train_ssd.py", line 259, in <module>
   Process Process-8:
       train(net, train_data, val_data, eval_metric, ctx, args)
     File "train_ssd.py", line 219, in train
       map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
     File "train_ssd.py", line 116, in validate
       for batch in val_data:
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/data/dataloader.py", line 242, in __next__
       if self._rcvd_idx in self._data_buffer:
   KeyboardInterrupt
   Traceback (most recent call last):
     File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
       self.run()
     File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
       self._target(*self._args, **self._kwargs)
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/data/dataloader.py", line 166, in worker_loop
       idx, samples = key_queue.get()
     File "/usr/lib/python3.6/multiprocessing/queues.py", line 94, in get
       res = self._recv_bytes()
     File "/usr/lib/python3.6/multiprocessing/connection.py", line 216, in recv_bytes
       buf = self._recv_bytes(maxlength)
     File "/usr/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes
       buf = self._recv(4)
     File "/usr/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
       chunk = read(handle, remaining)
   root@f58bc533d3e6:/data/mxnet# ^C
   root@f58bc533d3e6:/data/mxnet# MXNET_CUDNN_AUTOTUNE_DEFAULT=0 python train_ssd.py --num-worker=1
   INFO:root:Namespace(batch_size=4, data_shape=512, dataset='voc', epochs=240, gpus='0', log_interval=10, lr=0.001, lr_decay=0.1, lr_decay_epoch='160,200', momentum=0.9, network='vgg16_atrous', num_workers=1, resume='', save_interval=10, save_prefix='ssd_512_resnet50_v1_coco', seed=233, start_epoch=0, val_interval=1, wd=0.0005)
   INFO:root:Start training from [Epoch 0]
   ^CProcess Process-1:
   Traceback (most recent call last):
     File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
       self.run()
     File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
       self._target(*self._args, **self._kwargs)
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/data/dataloader.py", line 169, in worker_loop
       batch = batchify_fn([dataset[i] for i in samples])
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/data/dataloader.py", line 169, in <listcomp>
       batch = batchify_fn([dataset[i] for i in samples])
     File "/usr/local/lib/python3.6/dist-packages/mxnet/gluon/data/dataset.py", line 133, in __getitem__
       return self._fn(*item)
     File "/usr/local/lib/python3.6/dist-packages/gluoncv/data/transforms/presets/ssd.py", line 176, in __call__
       gt_bboxes = mx.nd.array(bbox[np.newaxis, :, :4])
     File "/usr/local/lib/python3.6/dist-packages/mxnet/ndarray/utils.py", line 146, in array
       return _array(source_array, ctx=ctx, dtype=dtype)
     File "/usr/local/lib/python3.6/dist-packages/mxnet/ndarray/ndarray.py", line 2435, in array
       arr[:] = source_array
     File "/usr/local/lib/python3.6/dist-packages/mxnet/ndarray/ndarray.py", line 444, in __setitem__
       self._set_nd_basic_indexing(key, value)
     File "/usr/local/lib/python3.6/dist-packages/mxnet/ndarray/ndarray.py", line 710, in _set_nd_basic_indexing
       self._sync_copyfrom(value)
     File "/usr/local/lib/python3.6/dist-packages/mxnet/ndarray/ndarray.py", line 876, in _sync_copyfrom
       ctypes.c_size_t(source_array.size)))
   ```

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services