You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by ma...@apache.org on 2018/08/10 20:26:25 UTC
[incubator-mxnet] branch master updated: [MXAPPS-805] Notebook execution failures in CI. (#12068)

This is an automated email from the ASF dual-hosted git repository.

marcoabreu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new 89717d4  [MXAPPS-805] Notebook execution failures in CI. (#12068)
89717d4 is described below

commit 89717d4fff3bca37796f54e4c7d324cee90c60fb
Author: Vishaal Kapoor <40...@users.noreply.github.com>
AuthorDate: Fri Aug 10 13:26:17 2018 -0700

    [MXAPPS-805] Notebook execution failures in CI. (#12068)
    
    * [MXAPPS-805] Notebook execution failures in CI.
    
    * Add a retry policy to handle failures when starting a notebook executor
     (due to a port collision, the kernel taking too long to start, etc.).
    
    * Change logging level for tests to INFO so that we have more
     informative test output.
    
    * Make retry logic for Jupyter notebook execution specific to the error
    message we are looking for to prevent false positives in the retry logic.
---
 .../straight_dope/test_notebooks_multi_gpu.py      |  2 ++
 .../straight_dope/test_notebooks_single_gpu.py     |  3 ++-
 tests/utils/notebook_test/__init__.py              | 26 +++++++++++++++++-----
 3 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/tests/nightly/straight_dope/test_notebooks_multi_gpu.py b/tests/nightly/straight_dope/test_notebooks_multi_gpu.py
index 2038ada..ef07550 100644
--- a/tests/nightly/straight_dope/test_notebooks_multi_gpu.py
+++ b/tests/nightly/straight_dope/test_notebooks_multi_gpu.py
@@ -20,6 +20,7 @@
     This file tests that the notebooks requiring multi GPUs run without
     warning or exception.
 """
+import logging
 import unittest
 from straight_dope_test_utils import _test_notebook
 from straight_dope_test_utils import _download_straight_dope_notebooks
@@ -27,6 +28,7 @@ from straight_dope_test_utils import _download_straight_dope_notebooks
 class StraightDopeMultiGpuTests(unittest.TestCase):
     @classmethod
     def setUpClass(self):
+        logging.basicConfig(level=logging.INFO)
         assert _download_straight_dope_notebooks()
 
     # Chapter 7
diff --git a/tests/nightly/straight_dope/test_notebooks_single_gpu.py b/tests/nightly/straight_dope/test_notebooks_single_gpu.py
index 06ced96..fca49f4 100644
--- a/tests/nightly/straight_dope/test_notebooks_single_gpu.py
+++ b/tests/nightly/straight_dope/test_notebooks_single_gpu.py
@@ -21,6 +21,7 @@
     warning or exception.
 """
 import glob
+import logging
 import re
 import os
 import unittest
@@ -51,9 +52,9 @@ NOTEBOOKS_WHITELIST = [
 class StraightDopeSingleGpuTests(unittest.TestCase):
     @classmethod
     def setUpClass(self):
+        logging.basicConfig(level=logging.INFO)
         assert _download_straight_dope_notebooks()
 
-
     def test_completeness(self):
         """
         Make sure that every tutorial that isn't in the whitelist is considered for testing by this
diff --git a/tests/utils/notebook_test/__init__.py b/tests/utils/notebook_test/__init__.py
index 2cdb613..25e96ab 100644
--- a/tests/utils/notebook_test/__init__.py
+++ b/tests/utils/notebook_test/__init__.py
@@ -32,6 +32,9 @@ import nbformat
 
 IPYTHON_VERSION = 4  # Pin to ipython version 4.
 TIME_OUT = 10*60  # Maximum 10 mins/test. Reaching timeout causes test failure.
+RETRIES = 8
+KERNEL_ERROR_MSG = 'Kernel died before replying to kernel_info'
+
 
 def run_notebook(notebook, notebook_dir, kernel=None, no_cache=False, temp_dir='tmp_notebook'):
     """Run tutorial Jupyter notebook to catch any execution error.
@@ -72,15 +75,28 @@ def run_notebook(notebook, notebook_dir, kernel=None, no_cache=False, temp_dir='
         os.makedirs(working_dir)
     try:
         notebook = nbformat.read(notebook_path + '.ipynb', as_version=IPYTHON_VERSION)
-        # Adding a small delay to allow time for sockets to be freed
-        # stop-gap measure to battle the 1000ms linger of socket hard coded
-        # in the kernel API code
-        time.sleep(1.1)
         if kernel is not None:
             eprocessor = ExecutePreprocessor(timeout=TIME_OUT, kernel_name=kernel)
         else:
             eprocessor = ExecutePreprocessor(timeout=TIME_OUT)
-        nb, _ = eprocessor.preprocess(notebook, {'metadata': {'path': working_dir}})
+
+        # There is a low (< 1%) chance that starting a notebook executor will fail due to the kernel
+        # taking too long to start, or a port collision, etc.
+        for i in range(RETRIES):
+            try:
+                nb, _ = eprocessor.preprocess(notebook, {'metadata': {'path': working_dir}})
+            except RuntimeError as rte:
+                # We check if the exception has to do with the Jupyter kernel failing to start. If
+                # not, we rethrow to prevent the notebook from erring RETRIES times. It is not ideal
+                # to inspect the exception message, but necessary for retry logic, as Jupyter client
+                # throws the generic RuntimeError that can be confused with other Runtime errors.
+                if str(rte) != KERNEL_ERROR_MSG:
+                    raise rte
+
+                logging.info("Error starting preprocessor: {}. Attempt {}/{}".format(str(rte), i+1, RETRIES))
+                time.sleep(1)
+                continue
+            break
     except Exception as err:
         err_msg = str(err)
         errors.append(err_msg)