You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by ma...@apache.org on 2018/08/10 20:26:25 UTC
[incubator-mxnet] branch master updated: [MXAPPS-805] Notebook
execution failures in CI. (#12068)
This is an automated email from the ASF dual-hosted git repository.
marcoabreu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git
The following commit(s) were added to refs/heads/master by this push:
new 89717d4 [MXAPPS-805] Notebook execution failures in CI. (#12068)
89717d4 is described below
commit 89717d4fff3bca37796f54e4c7d324cee90c60fb
Author: Vishaal Kapoor <40...@users.noreply.github.com>
AuthorDate: Fri Aug 10 13:26:17 2018 -0700
[MXAPPS-805] Notebook execution failures in CI. (#12068)
* [MXAPPS-805] Notebook execution failures in CI.
* Add a retry policy when starting a notebook executor to handle startup
failures (due to a port collision, the kernel taking too
long to start, etc.).
* Change logging level for tests to INFO so that we have more
informative test output.
* Make retry logic for Jupyter notebook execution specific to the error
message we are looking for to prevent false positives in the retry logic.
---
.../straight_dope/test_notebooks_multi_gpu.py | 2 ++
.../straight_dope/test_notebooks_single_gpu.py | 3 ++-
tests/utils/notebook_test/__init__.py | 26 +++++++++++++++++-----
3 files changed, 25 insertions(+), 6 deletions(-)
diff --git a/tests/nightly/straight_dope/test_notebooks_multi_gpu.py b/tests/nightly/straight_dope/test_notebooks_multi_gpu.py
index 2038ada..ef07550 100644
--- a/tests/nightly/straight_dope/test_notebooks_multi_gpu.py
+++ b/tests/nightly/straight_dope/test_notebooks_multi_gpu.py
@@ -20,6 +20,7 @@
This file tests that the notebooks requiring multi GPUs run without
warning or exception.
"""
+import logging
import unittest
from straight_dope_test_utils import _test_notebook
from straight_dope_test_utils import _download_straight_dope_notebooks
@@ -27,6 +28,7 @@ from straight_dope_test_utils import _download_straight_dope_notebooks
class StraightDopeMultiGpuTests(unittest.TestCase):
@classmethod
def setUpClass(self):
+ logging.basicConfig(level=logging.INFO)
assert _download_straight_dope_notebooks()
# Chapter 7
diff --git a/tests/nightly/straight_dope/test_notebooks_single_gpu.py b/tests/nightly/straight_dope/test_notebooks_single_gpu.py
index 06ced96..fca49f4 100644
--- a/tests/nightly/straight_dope/test_notebooks_single_gpu.py
+++ b/tests/nightly/straight_dope/test_notebooks_single_gpu.py
@@ -21,6 +21,7 @@
warning or exception.
"""
import glob
+import logging
import re
import os
import unittest
@@ -51,9 +52,9 @@ NOTEBOOKS_WHITELIST = [
class StraightDopeSingleGpuTests(unittest.TestCase):
@classmethod
def setUpClass(self):
+ logging.basicConfig(level=logging.INFO)
assert _download_straight_dope_notebooks()
-
def test_completeness(self):
"""
Make sure that every tutorial that isn't in the whitelist is considered for testing by this
diff --git a/tests/utils/notebook_test/__init__.py b/tests/utils/notebook_test/__init__.py
index 2cdb613..25e96ab 100644
--- a/tests/utils/notebook_test/__init__.py
+++ b/tests/utils/notebook_test/__init__.py
@@ -32,6 +32,9 @@ import nbformat
IPYTHON_VERSION = 4 # Pin to ipython version 4.
TIME_OUT = 10*60 # Maximum 10 mins/test. Reaching timeout causes test failure.
+RETRIES = 8
+KERNEL_ERROR_MSG = 'Kernel died before replying to kernel_info'
+
def run_notebook(notebook, notebook_dir, kernel=None, no_cache=False, temp_dir='tmp_notebook'):
"""Run tutorial Jupyter notebook to catch any execution error.
@@ -72,15 +75,28 @@ def run_notebook(notebook, notebook_dir, kernel=None, no_cache=False, temp_dir='
os.makedirs(working_dir)
try:
notebook = nbformat.read(notebook_path + '.ipynb', as_version=IPYTHON_VERSION)
- # Adding a small delay to allow time for sockets to be freed
- # stop-gap measure to battle the 1000ms linger of socket hard coded
- # in the kernel API code
- time.sleep(1.1)
if kernel is not None:
eprocessor = ExecutePreprocessor(timeout=TIME_OUT, kernel_name=kernel)
else:
eprocessor = ExecutePreprocessor(timeout=TIME_OUT)
- nb, _ = eprocessor.preprocess(notebook, {'metadata': {'path': working_dir}})
+
+ # There is a low (< 1%) chance that starting a notebook executor will fail due to the kernel
+ # taking too long to start, or a port collision, etc.
+ for i in range(RETRIES):
+ try:
+ nb, _ = eprocessor.preprocess(notebook, {'metadata': {'path': working_dir}})
+ except RuntimeError as rte:
+ # We check if the exception has to do with the Jupyter kernel failing to start. If
+ # not, we rethrow to prevent the notebook from erring RETRIES times. It is not ideal
+ # to inspect the exception message, but necessary for retry logic, as Jupyter client
+ # throws the generic RuntimeError that can be confused with other Runtime errors.
+ if str(rte) != KERNEL_ERROR_MSG:
+ raise rte
+
+ logging.info("Error starting preprocessor: {}. Attempt {}/{}".format(str(rte), i+1, RETRIES))
+ time.sleep(1)
+ continue
+ break
except Exception as err:
err_msg = str(err)
errors.append(err_msg)