Posted to commits@mxnet.apache.org by zh...@apache.org on 2020/07/19 21:13:50 UTC

[incubator-mxnet] branch master updated: Unittest tolerance handling improvements (#18694)

This is an automated email from the ASF dual-hosted git repository.

zhasheng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new 146b49e  Unittest tolerance handling improvements (#18694)
146b49e is described below

commit 146b49ead32b941f74db694f2d453cb25650d252
Author: Dick Carter <dc...@nvidia.com>
AuthorDate: Sun Jul 19 14:12:50 2020 -0700

    Unittest tolerance handling improvements (#18694)
    
    * Add sm arch 80 to Makefile
    
    * Add TF32 to cuBLAS GEMMs
    
    Signed-off-by: Serge Panev <sp...@nvidia.com>
    
    * Add CUDA version guards
    
    Signed-off-by: Serge Panev <sp...@nvidia.com>
    
    * Remove useless TF32 for double and old CUDA version
    
    Signed-off-by: Serge Panev <sp...@nvidia.com>
    
    * Factorize VERSION_ADJUSTED_TF32_MATH
    
    Signed-off-by: Serge Panev <sp...@nvidia.com>
    
    * Add TF32 considerations to test_util.py:check_consistency()
    
    * Bypass test_gluon_gpu.py:test_large_models if gmem >32GB
    
    * Default tols in assert_almost_equal() now a function of dtype and ctx
    
    * Expand types listed by default_tols()
    
    * Fix pylint
    
    * All with_seed() tests to waitall in teardown
    
    * Elevate MXNET_TEST_SEED logging to WARNING
    
    * Revert test_gluon_gpu.py:test_rnn_layer to default tols
    
    * Fix test_gluon_model_zoo_gpu.py::test_inference and test_operator_gpu.py::test_np_linalg_{solve,tensorinv}
    
    * test_numpy_interoperability.py to not fix seed for rest of CI
    
    * Further fix to test_np_linalg_tensorinv
    
    * Fix test_gluon_data.py:test_dataloader_context when run on 1-GPU system.
    
    * Fix test_operator_gpu.py::test_embedding_with_type
    
    * Fix test_operator_gpu.py::{test_*convolution_large_c,test_np_linalg_tensorsolve}
    
    * Remove unneeded print() from test_numpy_interoperability.py
    
    * Unify tol handling of check_consistency() and assert_almost_equal().  Test tweaks.
    
    * Add tol handling of assert_almost_equal() with number args
    
    * Add tol handling of bool comparisons
    
    * Fix test_numpy_op.py::test_np_random_rayleigh
    
    * Fix test_operator_gpu.py::test_batchnorm_with_type
    
    * Fix test_gluon.py::test_sync_batchnorm in cpu selftest
    
    * Improve unittest failure reporting
    
    * Add to robustness of test_operator_gpu.py::test_embedding_with_type
    
    * Check_consistency() to use equal backward gradients for increased test robustness
    
    * Fix test_operator_gpu.py::test_{fully_connected,gemm}.  Add default_numeric_eps().
    
    * test_utils.py fix for numeric gradient calc
    
    * Reinstate rtol=1e-2 for test_operator.py::test_order
    
    * Remove auto-cast of check_consistency() input data to least precise dtype (not needed)
    
    * Fix test_operator.py::test_{reciprocal,cbrt,rcbrt}_op
    
    * Expand default float64 numeric_eps for test_operator_gpu.py::test_softmin
    
    * Fix segfault-on-error of @retry decorator. Add test isolation.
    
    * assert_almost_equal() to handle a,b scalars
    
    * Fix test_operator_gpu.py::test_gluon_{mvn,mvn_v1} race
    
    * Fix test_operator_gpu.py::test_flatten_slice_after_conv via scale
    
    * Remove test_utils.py:almost_equal_ignore_nan()
    
    * Fix sample vs. pop variance issue with test_numpy_op.py::test_npx_batch_norm
    
    * Expose test_utils.py:effective_dtype() and use to fix test_operator_gpu.py::test_np_linalg_svd
    
    * Fix true_divide int_array / int_scalar -> float_array to honor np_default_dtype
    
    * Try test_elemwise_binary_ops serial to avoid pytest worker crash
    
    * Fix (log_)softmax backward on empty ndarray
    
    * Temporarily log all CI seeds to troubleshoot seed non-determinism
    
    * Revert "Temporarily log all CI seeds to troubleshoot seed non-determinism"
    
    This reverts commit f60eff20785b812ac4fcd70d51359ee0cbfb3e47.
    
    * Temp log all CI seeds to troubleshoot unwanted seed determinism
    
    * Revert "Add sm arch 80 to Makefile"
    
    This reverts commit f9306cecc53b0633ef5f5b7b000802fbf0d73fe9.
    
    * Same fix of sample vs. pop variance issue, now with test_operator_gpu.py::test_batchnorm
    
    * Revert "Temp log all CI seeds to troubleshoot unwanted seed determinism"
    
    This reverts commit ff328efb0be3445690669d5437a6af575ff12b49.
    
    * Marking test_sparse_dot_grad with garbage_expected after teardown error
    
    * Fix flakiness of test_gluon_probability{_v1,_v2}.py::test_gluon_kl{_v1,}
    
    * Temp skip of test_aggregate_duplication on gpu
    
    * Add seeding to test_{numpy,}_contrib_gluon_data_vision.py.  Make created files unique.
    
    * Add ndarray module isolation to help debug test_bbox_augmenters worker crash
    
    * Marking test_sparse_square_sum serial after pytest worker crash
    
    * Fix flakiness of test_gluon_probability{_v1,_v2}.py::test_half_cauchy{_v1,}
    
    Co-authored-by: Serge Panev <sp...@nvidia.com>
    Co-authored-by: Bart Gawrych <ga...@intel.com>
---
 python/mxnet/test_utils.py                         | 377 +++++++++++++--------
 src/operator/linalg.h                              |   8 +
 src/operator/linalg_impl.h                         |  34 +-
 src/operator/nn/log_softmax.cc                     |   1 +
 src/operator/nn/softmax.cc                         |   1 +
 src/operator/numpy/np_true_divide-inl.h            |  14 +-
 tests/python/gpu/test_gluon_gpu.py                 |  16 +-
 tests/python/gpu/test_gluon_model_zoo_gpu.py       |   2 +-
 tests/python/gpu/test_operator_gpu.py              |  95 +++---
 tests/python/gpu/test_profiler_gpu.py              |   2 +
 tests/python/unittest/common.py                    |  22 +-
 tests/python/unittest/test_autograd.py             |   1 +
 .../unittest/test_contrib_gluon_data_vision.py     |   5 +-
 tests/python/unittest/test_gluon.py                |   4 +-
 tests/python/unittest/test_gluon_probability_v1.py |  20 +-
 tests/python/unittest/test_gluon_probability_v2.py |  21 +-
 tests/python/unittest/test_ndarray.py              |   2 +-
 .../test_numpy_contrib_gluon_data_vision.py        |   9 +-
 .../python/unittest/test_numpy_interoperability.py |   7 +-
 tests/python/unittest/test_numpy_op.py             |  46 ++-
 tests/python/unittest/test_operator.py             | 126 ++++---
 tests/python/unittest/test_sparse_operator.py      |   1 +
 22 files changed, 487 insertions(+), 327 deletions(-)
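
The patch replaces fixed tolerance constants with per-dtype defaults derived from the
effective dtype and context of the data being compared. A minimal sketch of the
resulting behavior (assuming the test_utils.py changes below are applied; not part of
the committed patch): leaving rtol/atol unset picks the defaults for the least precise
operand, while a dtype-keyed dict can still override individual entries.

    import numpy as np
    from mxnet.test_utils import assert_almost_equal

    a = np.full((2, 2), 1.0, dtype=np.float16)
    b = a.astype(np.float32)
    # rtol/atol default to the float16 entries of default_rtols()/default_atols(),
    # i.e. 1e-2 and 1e-1, since float16 is the least precise effective dtype here.
    assert_almost_equal(a, b)
    # A per-dtype dict overrides only the dtypes it names; others keep their defaults.
    assert_almost_equal(a, b, rtol={np.dtype(np.float16): 5e-2}, atol=1e-1)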

diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py
index dd02783..9ec0c6c 100644
--- a/python/mxnet/test_utils.py
+++ b/python/mxnet/test_utils.py
@@ -70,19 +70,110 @@ def default_dtype():
     # _TODO: get default dtype from environment variable
     return np.float32
 
+def default_rtols():
+    """Get default relative tolerances for data comparisons involving each data type."""
+    return {np.dtype(np.float16): 1e-2,
+            np.dtype(np.float32): 1e-4,
+            np.dtype(np.float64): 1e-5,
+            np.dtype(np.bool): 0,
+            np.dtype(np.int8): 0,
+            np.dtype(np.uint8): 0,
+            np.dtype(np.int32): 0,
+            np.dtype(np.uint32): 0,
+            np.dtype(np.int64): 0,
+            np.dtype(np.uint64): 0}
+
+def default_atols():
+    """Get default absolute tolerances for data comparisons involving each data type."""
+    return {np.dtype(np.float16): 1e-1,
+            np.dtype(np.float32): 1e-3,
+            np.dtype(np.float64): 1e-20,
+            np.dtype(np.bool): 0,
+            np.dtype(np.int8): 0,
+            np.dtype(np.uint8): 0,
+            np.dtype(np.int32): 0,
+            np.dtype(np.uint32): 0,
+            np.dtype(np.int64): 0,
+            np.dtype(np.uint64): 0}
+
+def default_numeric_eps():
+    """Get default epsilon for finite difference gradient calculations with data type."""
+    # prefer a power-of-two eps, since no bits are dropped when serving as an input delta
+    return {np.dtype(np.float16): 1.0 / 2**6,
+            np.dtype(np.float32): 1.0 / 2**9,
+            np.dtype(np.float64): 1.0 / 2**14}
+
+
+def effective_dtype(dat):
+    """ Return the most appropriate dtype for determining the tolerance used in dat comparisons
+    Parameters
+    ----------
+    dat : np.ndarray or mx.nd.array or mx.np.ndarray
+    """
+    # On arch 80 gpus, a float32-io gemm or conv op will trim the mantissa of data
+    # inputs to be of comparable precision to a float16, so float16 becomes the
+    # 'effective dtype' for tolerance tests involving such op outputs.
+
+    # Is TF32 enabled in the ctx (the default on arch 80 GPUs)
+    def is_TF32_enabled(ctx):
+        try:
+            return (ctx.device_type == 'gpu' and
+                    get_cuda_compute_capability(ctx) == 80 and
+                    os.environ.get('NVIDIA_TF32_OVERRIDE') != '0')
+        except:  # pylint: disable=bare-except
+            return False
+
+    ctx = dat.ctx if hasattr(dat, 'ctx') else None
+    dtype = np.dtype(dat.dtype)
+    if dtype == np.dtype(np.float32) and is_TF32_enabled(ctx):
+        return np.dtype(np.float16)
+    else:
+        return dtype
 
-def get_atol(atol=None):
-    """Get default numerical threshold for regression test."""
-    # _TODO: get from env variable, different threshold might
-    # be needed for different device and dtype
-    return 1e-20 if atol is None else atol
+
+def get_tolerance(dat, tol, default_tol):
+    """ Return the tolerance to be used for dat comparisons based on the given tol, datatype and context.
+    Parameters
+    ----------
+    dat : np.ndarray or mx.nd.array or mx.np.ndarray
+    tol : float, or a dict of dtype->float
+    default_tol : default dict of dtype->float for all types
+    """
+
+    if isinstance(tol, numbers.Number):
+        return tol
+
+    # If the caller has supplied a tol dict, use that if it has an entry for dtype,
+    # else use the supplied default tol dict.
+    dtype = effective_dtype(dat)
+    tol = {} if tol is None else tol
+    return tol.get(dtype, default_tol[dtype])
 
 
-def get_rtol(rtol=None):
+def get_tols(x, y, rtol, atol):
+    """For comparing two datasets 'x' and 'y', what relative and absolute tolerances should be used."""
+    # Tolerance analysis needs 'dtype' of 'x' and 'y', so convert numbers to numpy scalars as needed
+    if isinstance(x, numbers.Number):
+        x = np.array(x)
+    if isinstance(y, numbers.Number):
+        y = np.array(y)
+
+    # If tols are not specified, use the largest default tol for 'x' and 'y' based on their ctx and dtype.
+    rtol = max(get_tolerance(x, rtol, default_rtols()),
+               get_tolerance(y, rtol, default_rtols()))
+    atol = max(get_tolerance(x, atol, default_atols()),
+               get_tolerance(y, atol, default_atols()))
+
+    return rtol, atol
+
+
+def get_atol(atol=None, dtype=np.dtype(np.float64)):
     """Get default numerical threshold for regression test."""
-    # _TODO: get from env variable, different threshold might
-    # be needed for different device and dtype
-    return 1e-5 if rtol is None else rtol
+    return default_atols()[dtype] if atol is None else atol
+
+def get_rtol(rtol=None, dtype=np.dtype(np.float64)):
+    """Get default numerical threshold for regression test."""
+    return default_rtols()[dtype] if rtol is None else rtol
 
 def get_etol(etol=None):
     """Get default numerical threshold for regression test."""
@@ -499,10 +590,8 @@ def np_reduce(dat, axis, keepdims, numpy_reduce_func):
     return ret
 
 
-def find_max_violation(a, b, rtol=None, atol=None):
+def _find_max_violation(a, b, rtol, atol):
     """Finds and returns the location of maximum violation."""
-    rtol = get_rtol(rtol)
-    atol = get_atol(atol)
     # 'smart' absdiff that considers inf's as equals (to match np.allclose)
     absdiff = np.where(np.equal(a, b), 0, np.abs(a-b))
     tol = atol + rtol*np.abs(b)
@@ -565,9 +654,9 @@ def assert_almost_equal(a, b, rtol=None, atol=None, names=('a', 'b'), equal_nan=
     ----------
     a : np.ndarray or mx.nd.array
     b : np.ndarray or mx.nd.array
-    rtol : None or float
+    rtol : None or float or dict of dtype -> float
         The relative threshold. Default threshold will be used if set to ``None``.
-    atol : None or float
+    atol : None or float or dict of dtype -> float
         The absolute threshold. Default threshold will be used if set to ``None``.
     names : tuple of names, optional
         The names used in error message when an exception occurs
@@ -579,8 +668,12 @@ def assert_almost_equal(a, b, rtol=None, atol=None, names=('a', 'b'), equal_nan=
     if not use_broadcast:
         checkShapes(a, b)
 
-    rtol = get_rtol(rtol)
-    atol = get_atol(atol)
+    rtol, atol = get_tols(a, b, rtol, atol)
+
+    if isinstance(a, mx.numpy.ndarray):
+        a = a.asnumpy()
+    if isinstance(b, mx.numpy.ndarray):
+        b = b.asnumpy()
     use_np_allclose = isinstance(a, np.ndarray) and isinstance(b, np.ndarray)
     if not use_np_allclose:
         if not (hasattr(a, 'ctx') and hasattr(b, 'ctx') and a.ctx == b.ctx and a.dtype == b.dtype):
@@ -604,32 +697,37 @@ def assert_almost_equal(a, b, rtol=None, atol=None, names=('a', 'b'), equal_nan=
         a = a.asnumpy()
         b = b.asnumpy()
 
-    index, rel = find_max_violation(a, b, rtol, atol)
-    indexErr = index
-    relErr = rel
-
-    print('\n*** Maximum errors for vector of size {}:  rtol={}, atol={}\n'.format(a.size, rtol, atol))
-    aTmp = a.copy()
-    bTmp = b.copy()
-    i = 1
-    while i <= a.size:
-        if i <= mismatches[0]:
-            print("%3d: Error %f  %s" %(i, rel, locationError(a, b, index, names)))
+    index, rel = _find_max_violation(a, b, rtol, atol)
+    if index != ():
+        # a, b are the numpy arrays
+        indexErr = index
+        relErr = rel
+
+        print('\n*** Maximum errors for vector of size {}:  rtol={}, atol={}\n'.format(a.size, rtol, atol))
+        aTmp = a.copy()
+        bTmp = b.copy()
+        i = 1
+        while i <= a.size:
+            if i <= mismatches[0]:
+                print("%3d: Error %f  %s" %(i, rel, locationError(a, b, index, names)))
+
+            aTmp[index] = bTmp[index] = 0
+            if almost_equal(aTmp, bTmp, rtol, atol, equal_nan=equal_nan):
+                break
 
-        aTmp[index] = bTmp[index] = 0
-        if almost_equal(aTmp, bTmp, rtol, atol, equal_nan=equal_nan):
-            break
+            i += 1
+            if i <= mismatches[1] or mismatches[1] <= 0:
+                index, rel = _find_max_violation(aTmp, bTmp, rtol, atol)
+            else:
+                break
 
-        i += 1
-        if i <= mismatches[1] or mismatches[1] <= 0:
-            index, rel = find_max_violation(aTmp, bTmp, rtol, atol)
-        else:
-            break
+        mismatchDegree = "at least " if mismatches[1] > 0 and i > mismatches[1] else ""
+        errMsg = "Error %f exceeds tolerance rtol=%e, atol=%e (mismatch %s%f%%).\n%s" % \
+                 (relErr, rtol, atol, mismatchDegree, 100*i/a.size, \
+                  locationError(a, b, indexErr, names, maxError=True))
+    else:
+        errMsg = "Error %f exceeds tolerance rtol=%e, atol=%e.\n" % (rel, rtol, atol)
 
-    mismatchDegree = "at least " if mismatches[1] > 0 and i > mismatches[1] else ""
-    errMsg = "Error %f exceeds tolerance rtol=%e, atol=%e (mismatch %s%f%%).\n%s" % \
-             (relErr, rtol, atol, mismatchDegree, 100*i/a.size, \
-             locationError(a, b, indexErr, names, maxError=True))
     np.set_printoptions(threshold=4, suppress=True)
     msg = npt.build_err_msg([a, b], err_msg=errMsg)
 
@@ -648,16 +746,25 @@ def assert_almost_equal_with_err(a, b, rtol=None, atol=None, etol=None,
     ----------
     a : np.ndarray
     b : np.ndarray
+    rtol : None or float or dict of dtype -> float
+        The relative threshold. Default threshold will be used if set to ``None``.
+    atol : None or float or dict of dtype -> float
+        The absolute threshold. Default threshold will be used if set to ``None``.
     threshold : None or float
         The checking threshold. Default threshold will be used if set to ``None``.
     etol : None or float
         The error rate threshold. If etol is float, return true if error_rate < etol even if
         any error is found.
+    names : tuple of names, optional
+        The names used in error message when an exception occurs
+    equal_nan : boolean, optional
+        The flag determining how to treat NAN values in comparison
+    mismatches : tuple of mismatches
+        Maximum number of mismatches to be printed (mismatches[0]) and determine (mismatches[1])
     """
     etol = get_etol(etol)
     if etol > 0:
-        rtol = get_rtol(rtol)
-        atol = get_atol(atol)
+        rtol, atol = get_tols(a, b, rtol, atol)
         if isinstance(a, mx.nd.NDArray):
             a = a.asnumpy()
         if isinstance(b, mx.nd.NDArray):
@@ -665,7 +772,7 @@ def assert_almost_equal_with_err(a, b, rtol=None, atol=None, etol=None,
         equals = np.isclose(a, b, rtol=rtol, atol=atol)
         err = 1 - np.count_nonzero(equals) / equals.size
         if err > etol:
-            index, rel = find_max_violation(a, b, rtol, atol)
+            index, rel = _find_max_violation(a, b, rtol, atol)
             indexErr = index
             relErr = rel
 
@@ -683,7 +790,7 @@ def assert_almost_equal_with_err(a, b, rtol=None, atol=None, etol=None,
 
                 i += 1
                 if i <= mismatches[1] or mismatches[1] <= 0:
-                    index, rel = find_max_violation(aTmp, bTmp, rtol, atol)
+                    index, rel = _find_max_violation(aTmp, bTmp, rtol, atol)
                 else:
                     break
 
@@ -698,31 +805,6 @@ def assert_almost_equal_with_err(a, b, rtol=None, atol=None, etol=None,
         assert_almost_equal(a, b, rtol=rtol, atol=atol, equal_nan=equal_nan)
 
 
-def almost_equal_ignore_nan(a, b, rtol=None, atol=None):
-    """Test that two NumPy arrays are almost equal (ignoring NaN in either array).
-    Combines a relative and absolute measure of approximate eqality.
-    If either the relative or absolute check passes, the arrays are considered equal.
-    Including an absolute check resolves issues with the relative check where all
-    array values are close to zero.
-
-    Parameters
-    ----------
-    a : np.ndarray
-    b : np.ndarray
-    rtol : None or float
-        The relative threshold. Default threshold will be used if set to ``None``.
-    atol : None or float
-        The absolute threshold. Default threshold will be used if set to ``None``.
-    """
-    a = np.copy(a)
-    b = np.copy(b)
-    nan_mask = np.logical_or(np.isnan(a), np.isnan(b))
-    a[nan_mask] = 0
-    b[nan_mask] = 0
-
-    return almost_equal(a, b, rtol, atol)
-
-
 def assert_almost_equal_ignore_nan(a, b, rtol=None, atol=None, names=('a', 'b')):
     """Test that two NumPy arrays are almost equal (ignoring NaN in either array).
     Combines a relative and absolute measure of approximate eqality.
@@ -954,7 +1036,7 @@ def numeric_grad(executor, location, aux_states=None, eps=1e-4,
 
     return approx_grads
 
-def check_numeric_gradient(sym, location, aux_states=None, numeric_eps=1e-3, rtol=1e-2,
+def check_numeric_gradient(sym, location, aux_states=None, numeric_eps=None, rtol=None,
                            atol=None, grad_nodes=None, use_forward_train=True, ctx=None,
                            grad_stype_dict=None, dtype=default_dtype()):
     """Verify an operation by checking backward pass via finite difference method.
@@ -979,8 +1061,10 @@ def check_numeric_gradient(sym, location, aux_states=None, numeric_eps=1e-3, rto
         The auxiliary states required when generating the executor for the symbol.
     numeric_eps : float, optional
         Delta for the finite difference method that approximates the gradient.
-    check_eps : float, optional
-        relative error eps used when comparing numeric grad to symbolic grad.
+    rtol : None or float
+        The relative threshold. Default threshold will be used if set to ``None``.
+    atol : None or float
+        The absolute threshold. Default threshold will be used if set to ``None``.
     grad_nodes : None or list or tuple or dict, optional
         Names of the nodes to check gradient on
     use_forward_train : bool
@@ -997,9 +1081,6 @@ def check_numeric_gradient(sym, location, aux_states=None, numeric_eps=1e-3, rto
     [1] https://github.com/Theano/Theano/blob/master/theano/gradient.py
     """
     assert dtype in (np.float16, np.float32, np.float64)
-    # cannot use finite differences with small eps without high precision
-    if dtype in (np.float32, np.float16):
-        assert numeric_eps >= 1e-5
     if ctx is None:
         ctx = default_context()
 
@@ -1074,18 +1155,18 @@ def check_numeric_gradient(sym, location, aux_states=None, numeric_eps=1e-3, rto
 
     executor.forward(is_train=True)
     assert len(executor.outputs) == 1
+
+    eps = get_tolerance(executor.outputs[0], numeric_eps, default_numeric_eps())
+    # cannot use finite differences with small eps without high precision
+    if dtype in (np.float32, np.float16):
+        assert eps >= 1e-5
+
     executor.backward()
-    symbolic_grads = {}
-    for k in grad_nodes:
-        grad_k = executor.grad_dict[k]
-        if grad_k is not None:
-            symbolic_grads[k] = grad_k.asnumpy()
-        else:
-            symbolic_grads[k] = None
+    symbolic_grads = executor.grad_dict
 
     numeric_gradients = numeric_grad(
         executor, location_npy, aux_states_npy,
-        eps=numeric_eps, use_forward_train=use_forward_train, dtype=dtype)
+        eps=eps, use_forward_train=use_forward_train, dtype=dtype)
 
     for name in grad_nodes:
         fd_grad = numeric_gradients[name]
@@ -1095,6 +1176,8 @@ def check_numeric_gradient(sym, location, aux_states=None, numeric_eps=1e-3, rto
             assert_almost_equal(fd_grad, sym_grad, rtol, atol,
                                 ("NUMERICAL_%s"%name, "BACKWARD_%s"%name))
         elif grad_req[name] == 'add':
+            if isinstance(sym_grad, mx.nd.NDArray):
+                sym_grad = sym_grad.asnumpy()
             assert_almost_equal(fd_grad, sym_grad - orig_grad, rtol, atol,
                                 ("NUMERICAL_%s"%name, "BACKWARD_%s"%name))
         elif grad_req[name] == 'null':
@@ -1103,7 +1186,7 @@ def check_numeric_gradient(sym, location, aux_states=None, numeric_eps=1e-3, rto
             raise ValueError("Invalid grad_req %s for argument %s"%(grad_req[name], name))
 
 
-def check_symbolic_forward(sym, location, expected, rtol=1E-4, atol=None,
+def check_symbolic_forward(sym, location, expected, rtol=None, atol=None,
                            aux_states=None, ctx=None, equal_nan=False,
                            dtype=default_dtype()):
     """Compares a symbol's forward results with the expected ones.
@@ -1127,8 +1210,10 @@ def check_symbolic_forward(sym, location, expected, rtol=1E-4, atol=None,
             Contains arrays corresponding to exe.outputs.
         - if type is dict of str to np.ndarray
             Contains mapping between sym.list_output() and exe.outputs.
-    check_eps : float, optional
-        Relative error to check to.
+    rtol : None or float
+        The relative threshold. Default threshold will be used if set to ``None``.
+    atol : None or float
+        The absolute threshold. Default threshold will be used if set to ``None``.
     aux_states : list of np.ndarray of dict, optional
         - if type is list of np.ndarray
             Contains all the NumPy arrays corresponding to sym.list_auxiliary_states
@@ -1177,14 +1262,14 @@ def check_symbolic_forward(sym, location, expected, rtol=1E-4, atol=None,
 
     executor.forward(is_train=False)
 
-    outputs = [x.asnumpy() for x in executor.outputs]
+    outputs = executor.outputs
     for output_name, expect, output in zip(sym.list_outputs(), expected, outputs):
         assert_almost_equal(expect, output, rtol, atol,
                             ("EXPECTED_%s"%output_name, "FORWARD_%s"%output_name),
                             equal_nan=equal_nan)
     return executor.outputs
 
-def check_symbolic_backward(sym, location, out_grads, expected, rtol=1e-5, atol=None,
+def check_symbolic_backward(sym, location, out_grads, expected, rtol=None, atol=None,
                             aux_states=None, grad_req='write', ctx=None, grad_stypes=None,
                             equal_nan=False, dtype=default_dtype()):
     """Compares a symbol's backward results with the expected ones.
@@ -1215,8 +1300,10 @@ def check_symbolic_backward(sym, location, out_grads, expected, rtol=1e-5, atol=
             Contains arrays corresponding to exe.grad_arrays
         - if type is dict of str to np.ndarray
             Contains mapping between ``sym.list_arguments()`` and exe.outputs.
-    check_eps: float, optional
-        Relative error to check to.
+    rtol : None or float
+        The relative threshold. Default threshold will be used if set to ``None``.
+    atol : None or float
+        The absolute threshold. Default threshold will be used if set to ``None``.
     aux_states : list of np.ndarray or dict of str to np.ndarray
     grad_req : str or list of str or dict of str to str, optional
         Gradient requirements. 'write', 'add' or 'null'.
@@ -1302,7 +1389,7 @@ def check_symbolic_backward(sym, location, out_grads, expected, rtol=1e-5, atol=
         assert out_grads is None
     executor.backward(out_grads)
 
-    grads = {k: v.asnumpy() for k, v in args_grad_data.items()}
+    grads = args_grad_data
 
     for name in expected:
         if grad_req[name] == 'write':
@@ -1310,7 +1397,8 @@ def check_symbolic_backward(sym, location, out_grads, expected, rtol=1e-5, atol=
                                 ("EXPECTED_%s"%name, "BACKWARD_%s"%name),
                                 equal_nan=equal_nan)
         elif grad_req[name] == 'add':
-            assert_almost_equal(expected[name], grads[name] - args_grad_npy[name],
+            grad = grads[name].asnumpy() if isinstance(grads[name], mx.nd.NDArray) else grads[name]
+            assert_almost_equal(expected[name], grad - args_grad_npy[name],
                                 rtol, atol, ("EXPECTED_%s"%name, "BACKWARD_%s"%name),
                                 equal_nan=equal_nan)
         elif grad_req[name] == 'null':
@@ -1395,16 +1483,8 @@ def check_speed(sym, location=None, ctx=None, N=20, grad_req=None, typ="whole",
         raise ValueError('typ can only be "whole" or "forward".')
 
 
-def get_tolerance(rtol, ctx):
-    if 'atol' in ctx:
-        return ctx['atol']
-    if 'atol_mult' in ctx:
-        return ctx['atol_mult'] * rtol
-    return rtol
-
-
 def check_consistency(sym, ctx_list, scale=1.0, grad_req='write',
-                      arg_params=None, aux_params=None, tol=None,
+                      arg_params=None, aux_params=None, rtol=None, atol=None,
                       raise_on_err=True, ground_truth=None, equal_nan=False,
                       use_uniform=False, rand_type=np.float64):
     """Check symbol gives the same output for different running context
@@ -1419,6 +1499,20 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write',
         Standard deviation of the inner normal distribution. Used in initialization.
     grad_req : str or list of str or dict of str to str
         Gradient requirement.
+    arg_params : dict of input name -> input data
+        data to use for non-aux inputs
+    aux_params : dict of input name -> input data
+        data to use for aux inputs
+    rtol : float or dictionary dtype->float, optional
+        The relative error tolerance.
+    atol : float or dictionary dtype->float, optional
+        The absolute error tolerance.
+    raise_on_err : bool, optional, defaults to True
+        Should an error raise an exception (or just output exception message)
+    ground_truth : dict of output name -> data, optional
+        Provided ideal result to be compared against
+    equal_nan : bool, optional, defaults to False
+        Should nans be treated as equal in the comparison
     use_unifrom: bool
         Optional, When flag set to true,
         random input data generated follows uniform distribution,
@@ -1454,20 +1548,6 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write',
   'type_dict': {'concat_arg0': np.float32, 'concat_arg1': np.float32}}]
     >>> check_consistency(sym, ctx_list)
     """
-    if tol is None:
-        tol = {np.dtype(np.float16): 1e-1,
-               np.dtype(np.float32): 1e-3,
-               np.dtype(np.float64): 1e-5,
-               np.dtype(np.uint8): 0,
-               np.dtype(np.int32): 0,
-               np.dtype(np.int64): 0}
-    elif isinstance(tol, numbers.Number):
-        tol = {np.dtype(np.float16): tol,
-               np.dtype(np.float32): tol,
-               np.dtype(np.float64): tol,
-               np.dtype(np.uint8): tol,
-               np.dtype(np.int32): tol,
-               np.dtype(np.int64): tol}
 
     assert len(ctx_list) > 1
     if isinstance(sym, Symbol):
@@ -1485,10 +1565,16 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write',
 
     arg_params = {} if arg_params is None else arg_params
     aux_params = {} if aux_params is None else aux_params
-    for n, arr in exe_list[0].arg_dict.items():
+
+    # returns the least precise of two dtypes
+    def smaller_dtype(dt1, dt2):
+        return dt1 if dt2 is None or np.dtype(dt1).itemsize < np.dtype(dt2).itemsize else dt2
+
+    # It's important to assign random inputs in a deterministic order, for reproducibility.
+    for n, arr in _sorted_items(exe_list[0].arg_dict):
         if n not in arg_params:
             if use_uniform:
-                arg_params[n] = np.random.uniform(low=-0.92, high=0.92,
+                arg_params[n] = np.random.uniform(low=-0.92 * scale, high=0.92 * scale,
                                                   size=arr.shape).astype(rand_type)
             else:
                 arg_params[n] = np.random.normal(size=arr.shape,
@@ -1511,25 +1597,22 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write',
         exe.forward(is_train=False)
 
     dtypes = [np.dtype(exe.outputs[0].dtype) for exe in exe_list]
-    max_idx = np.argmax(dtypes)
+    # Select the ground truth as the first model having the highest precision output[0]
+    gt_idx = np.argmax(dtypes)
     gt = ground_truth
     if gt is None:
-        gt = exe_list[max_idx].output_dict.copy()
+        gt = exe_list[gt_idx].output_dict.copy()
 
     for i, exe in enumerate(exe_list):
-        if i == max_idx:
+        if i == gt_idx:
             continue
 
-        rtol = tol[dtypes[i]]
-        atol = get_tolerance(rtol, ctx_list[i])
         for name, arr in zip(output_names, exe.outputs):
-            # Previously, the cast was to dtypes[i], but symbol may be mixed-precision,
-            # so casting the ground truth to the actual output type seems more correct.
-            gtarr = gt[name].astype(arr.dtype)
+            gtarr = gt[name]
             try:
                 assert_almost_equal(arr, gtarr, rtol=rtol, atol=atol, equal_nan=equal_nan)
             except AssertionError as e:
-                print('Predict Err: ctx %d vs ctx %d at %s'%(i, max_idx, name))
+                print('Predict Err: ctx %d vs ctx %d at %s'%(i, gt_idx, name))
                 traceback.print_exc()
                 if raise_on_err:
                     raise e
@@ -1538,33 +1621,55 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write',
 
     # train
     if grad_req != 'null':
+        # Perform forward()
         for exe in exe_list:
             exe.forward(is_train=True)
-            exe.backward(exe.outputs)
+        # Use the first executor's output data, cast to the least precise dtype,
+        # as the gradient data to pass to all executor's backward() call.
+        least_precise_dtype = [out.dtype for out in exe_list[0].outputs]
+        for exe in exe_list:
+            least_precise_dtype = [smaller_dtype(out1.dtype, dt) \
+                                    for (out1, dt) in zip(exe.outputs, least_precise_dtype)]
+        golden_data_np = [out.astype(dt).asnumpy() \
+                          for (out, dt) in zip(exe_list[0].outputs, least_precise_dtype)]
+        # Perform backward()
+        for exe in exe_list:
+            out_grads = [mx.nd.array(golden_np, ctx=exe._ctx,
+                                     dtype=out.dtype).tostype(out.stype)
+                         for (golden_np, out) in zip(golden_data_np, exe.outputs)]
+            exe.backward(out_grads)
+
         gt = ground_truth
         if gt is None:
-            gt = exe_list[max_idx].output_dict.copy()
+            gt = exe_list[gt_idx].output_dict.copy()
             if grad_req != 'null':
-                gt.update(exe_list[max_idx].grad_dict)
+                gt.update(exe_list[gt_idx].grad_dict)
         for i, exe in enumerate(exe_list):
-            if i == max_idx:
+            if i == gt_idx:
                 continue
 
-            rtol = tol[dtypes[i]]
-            atol = get_tolerance(rtol, ctx_list[i])
             curr = zip(output_names + arg_names, exe.outputs + exe.grad_arrays)
             for name, arr in curr:
                 if gt[name] is None:
                     assert arr is None, name
                     continue
 
-                # Previous cast was to dtypes[i], but symbol may be mixed-precision,
-                # so casting the ground truth to the actual output type seems more correct.
-                gtarr = gt[name].astype(arr.dtype)
+                gtarr = gt[name]
                 try:
-                    assert_almost_equal(arr, gtarr, rtol=rtol, atol=atol, equal_nan=equal_nan)
+                    rt, at = rtol, atol
+                    # If the primary data i/o type is float16, then the tolerance used when
+                    # comparing a float32 input gradient (e.g. batchnorm gamma) should be float16.
+                    smaller_arr_dtype = smaller_dtype(arr.dtype, dtypes[i])
+                    smaller_gt_dtype = smaller_dtype(gtarr.dtype, dtypes[gt_idx])
+                    if smaller_arr_dtype != arr.dtype or \
+                       smaller_gt_dtype != gtarr.dtype:
+                        rt, at = get_tols(arr.astype(smaller_arr_dtype),
+                                          gtarr.astype(smaller_gt_dtype), rtol, atol)
+                    assert_almost_equal(arr, gtarr, rtol=rt, atol=at, equal_nan=equal_nan)
                 except AssertionError as e:
-                    print('Train Err: ctx %d vs ctx %d at %s'%(i, max_idx, name))
+                    print('Train Err: {} {} ctx {} vs {} {} ctx {} at {}'.format(
+                        np.dtype(arr.dtype).name, arr.ctx, i,
+                        np.dtype(gtarr.dtype).name, gtarr.ctx, gt_idx, name))
                     traceback.print_exc()
                     if raise_on_err:
                         raise e
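
On compute-capability-8.0 GPUs, float32 GEMM and convolution outputs are computed with
TF32 by default, so effective_dtype() above reports float16-class precision for them
and the tolerance lookup widens accordingly; exporting NVIDIA_TF32_OVERRIDE=0 makes
both the tests and the CUDA libraries that honor that variable treat float32 strictly.
A minimal sketch of the lookup (assuming the functions added above; on a CPU context
effective_dtype() simply returns the stored dtype):

    import numpy as np
    from mxnet.test_utils import effective_dtype, get_tols

    x = np.ones((2, 2), dtype=np.float32)
    y = np.ones((2, 2), dtype=np.float16)
    print(effective_dtype(x))          # float32 here; float16 for a float32 mx.nd
                                       # array on an arch-80 GPU with TF32 enabled
    print(get_tols(x, y, None, None))  # (0.01, 0.1): the wider float16 defaults win
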
diff --git a/src/operator/linalg.h b/src/operator/linalg.h
index 291e251..3e82c6a 100644
--- a/src/operator/linalg.h
+++ b/src/operator/linalg.h
@@ -280,6 +280,14 @@ void linalg_batch_det_backward_helper(const Tensor<xpu, 3, DType>& LU,
                                       const DType zero_det,
                                       const mxnet::OpContext& ctx);
 
+#ifdef __CUDACC__
+#if CUDA_VERSION < 11000
+#define VERSION_ADJUSTED_TF32_MATH CUBLAS_DEFAULT_MATH
+#else
+#define VERSION_ADJUSTED_TF32_MATH CUBLAS_TF32_TENSOR_OP_MATH
+#endif
+#endif  // __CUDACC__
+
 #include "linalg_impl.h"
 
 #endif  // MXNET_OPERATOR_LINALG_H_
diff --git a/src/operator/linalg_impl.h b/src/operator/linalg_impl.h
index 104acd5..6d94f33 100644
--- a/src/operator/linalg_impl.h
+++ b/src/operator/linalg_impl.h
@@ -212,12 +212,15 @@ inline void linalg_gemm<gpu, float>(const Tensor<gpu, 2, float>& A,
 #else
   cublasDataType_t full_datatype = CUBLAS_DATA_FULL;
 #endif
+  auto handle = Stream<gpu>::GetBlasHandle(s);
+  cublasMath_t saved_math_mode = SetCublasMathMode(handle, VERSION_ADJUSTED_TF32_MATH);
   CUBLAS_CALL(cublasSgemmEx(
-      Stream<gpu>::GetBlasHandle(s), (tB ? CUBLAS_OP_T : CUBLAS_OP_N),
+      handle, (tB ? CUBLAS_OP_T : CUBLAS_OP_N),
       (tA ? CUBLAS_OP_T : CUBLAS_OP_N), C.size(1), C.size(0),
       (tB ? B.size(1) : B.size(0)), &alpha, B.dptr_, full_datatype, B.stride_,
       A.dptr_, full_datatype, A.stride_, &beta, C.dptr_, full_datatype,
-      C.stride_))
+      C.stride_));
+  CUBLAS_CALL(cublasSetMathMode(handle, saved_math_mode));
 }
 
 #else
@@ -235,13 +238,16 @@ void linalg_gemm_axis<gpu, DType>(const Tensor<gpu, 3, DType>& A, const Tensor<g
   using mshadow::gpu; \
   CHECK_NOTNULL(s); \
   linalg_check_batch_size(A.size(1), B.size(1), C.size(1)); \
-  CUBLAS_CALL(cublas##fname(Stream<gpu>::GetBlasHandle(s), \
+  auto handle = Stream<gpu>::GetBlasHandle(s); \
+  cublasMath_t saved_math_mode = SetCublasMathMode(handle, VERSION_ADJUSTED_TF32_MATH); \
+  CUBLAS_CALL(cublas##fname(handle, \
                             (tB ? CUBLAS_OP_T : CUBLAS_OP_N), \
                             (tA ? CUBLAS_OP_T : CUBLAS_OP_N), \
                             C.size(2), C.size(0), (tB ? B.size(2) : B.size(0)), &alpha, \
                             B.dptr_, B.size(1)*B.stride_, B.stride_, \
                             A.dptr_, A.size(1)*A.stride_, A.stride_, &beta, \
                             C.dptr_, C.size(1)*C.stride_, C.stride_, A.size(1))) \
+  CUBLAS_CALL(cublasSetMathMode(handle, saved_math_mode)); \
 }
 LINALG_GPU_GEMM_AXIS(SgemmStridedBatched, float)
 LINALG_GPU_GEMM_AXIS(DgemmStridedBatched, double)
@@ -349,13 +355,22 @@ void linalg_gemm<gpu, mshadow::half::half_t>(const Tensor<gpu, 2, mshadow::half:
     linalg_check_batch_size(A.size(0), B.size(0), C.size(0)); \
     check_gemm(A[0], B[0], C[0], alpha, beta, tA, tB); \
     using namespace mshadow::cuda; \
-    CUBLAS_CALL(cublas##fname(Stream<gpu>::GetBlasHandle(s), \
+    auto handle = Stream<gpu>::GetBlasHandle(s); \
+    cublasMath_t saved_math_mode = SetCublasMathMode(handle, VERSION_ADJUSTED_TF32_MATH); \
+    CUBLAS_CALL(cublas##fname(handle, \
                               (tB ? CUBLAS_OP_T : CUBLAS_OP_N), \
                               (tA ? CUBLAS_OP_T : CUBLAS_OP_N), \
                               C.size(2), C.size(1), (tB ? B.size(2) : B.size(1)), \
-                              &alpha, B.dptr_, B.stride_, B.size(1) * B.stride_, \
-                              A.dptr_,  A.stride_, A.size(1) * A.stride_, \
-                              &beta, C.dptr_, C.stride_, C.size(1) * C.stride_, A.size(0))) \
+                              &alpha, \
+                              B.dptr_, B.stride_, \
+                              static_cast<int64_t>(B.size(1) * B.stride_), \
+                              A.dptr_,  A.stride_, \
+                              static_cast<int64_t>(A.size(1) * A.stride_), \
+                              &beta, \
+                              C.dptr_, C.stride_, \
+                              static_cast<int64_t>(C.size(1) * C.stride_), \
+                              A.size(0))) \
+    CUBLAS_CALL(cublasSetMathMode(handle, saved_math_mode)); \
   }
 
   LINALG_GPU_BATCH_GEMM(DgemmStridedBatched, double)
@@ -380,7 +395,7 @@ void linalg_gemm<gpu, mshadow::half::half_t>(const Tensor<gpu, 2, mshadow::half:
 
       using namespace mshadow::cuda;
       auto cublas_math_mode =
-          use_tensor_ops ? CUBLAS_TENSOR_OP_MATH : CUBLAS_DEFAULT_MATH;
+          use_tensor_ops ? CUBLAS_TENSOR_OP_MATH : VERSION_ADJUSTED_TF32_MATH;
       auto previous_math_mode = SetCublasMathMode(blas_handle, cublas_math_mode);
 
       // cublasGemmStridedBatchedEx is only supported for GPU with architecture
@@ -421,6 +436,8 @@ void linalg_gemm<gpu, mshadow::half::half_t>(const Tensor<gpu, 2, mshadow::half:
     CHECK_NOTNULL(s); \
     linalg_check_batch_size(A.size(0), B.size(0), C.size(0)); \
     linalg_check_batch_size(A.size(2), B.size(2), C.size(2)); \
+    auto handle = Stream<gpu>::GetBlasHandle(s);                                           \
+    cublasMath_t saved_math_mode = SetCublasMathMode(handle, VERSION_ADJUSTED_TF32_MATH); \
     for (index_t i = 0; i < A.size(2); ++i) { \
       CUBLAS_CALL(cublas##fname(Stream<gpu>::GetBlasHandle(s), \
           (tB ? CUBLAS_OP_T : CUBLAS_OP_N), \
@@ -430,6 +447,7 @@ void linalg_gemm<gpu, mshadow::half::half_t>(const Tensor<gpu, 2, mshadow::half:
           A.dptr_+i*A.stride_, A.size(2) * A.stride_, A.size(1)*A.size(2)*A.stride_, &beta, \
           C.dptr_+i*C.stride_, C.size(2) * C.stride_, C.size(1)*C.size(2)*C.stride_, A.size(0))) \
     }\
+    SetCublasMathMode(handle, saved_math_mode); \
   }
 
   LINALG_GPU_BATCH_GEMM_AXIS(SgemmStridedBatched, float)
diff --git a/src/operator/nn/log_softmax.cc b/src/operator/nn/log_softmax.cc
index f3ef4ab..28ae8cf 100644
--- a/src/operator/nn/log_softmax.cc
+++ b/src/operator/nn/log_softmax.cc
@@ -58,6 +58,7 @@ static void LogSoftmaxGradComputeExCPU(const nnvm::NodeAttrs& attrs,
                                        const std::vector<NDArray>& inputs,
                                        const std::vector<OpReqType>& req,
                                        const std::vector<NDArray>& outputs) {
+  if (inputs[0].shape().Size() == 0U) return;
   const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
   if (SupportMKLDNNLogSoftmax(param, inputs[1], outputs[0])) {
     MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc
index b95e159..9b28b71 100644
--- a/src/operator/nn/softmax.cc
+++ b/src/operator/nn/softmax.cc
@@ -59,6 +59,7 @@ static void SoftmaxGradComputeExCPU(const nnvm::NodeAttrs& attrs,
                                     const std::vector<NDArray>& inputs,
                                     const std::vector<OpReqType>& req,
                                     const std::vector<NDArray>& outputs) {
+  if (inputs[0].shape().Size() == 0U) return;
   const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
   if (SupportMKLDNNSoftmax(param, inputs[1], outputs[0])) {
     MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
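
Both CPU (MKL-DNN) gradient paths above now return early on zero-size inputs. A
minimal sketch of the case they guard (assuming numpy semantics are enabled via
npx.set_np(); per the commit message, this backward pass could previously fail on an
empty ndarray):

    import mxnet as mx
    from mxnet import np, npx, autograd
    npx.set_np()

    x = np.zeros((0, 4))
    x.attach_grad()
    with autograd.record():
        y = npx.log_softmax(x)
    y.backward()
    print(x.grad.shape)   # (0, 4) -- empty gradient, no error
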
diff --git a/src/operator/numpy/np_true_divide-inl.h b/src/operator/numpy/np_true_divide-inl.h
index 6e97511..ea0057b 100644
--- a/src/operator/numpy/np_true_divide-inl.h
+++ b/src/operator/numpy/np_true_divide-inl.h
@@ -59,15 +59,17 @@ void TrueDivideScalarCompute(const nnvm::NodeAttrs &attrs,
       });
     });
   } else {
-    CHECK_EQ(outputs[0].type_flag_, mxnet::common::GetDefaultDtype())
+    CHECK(out.type_flag_ == mshadow::kFloat32 || out.type_flag_ == mshadow::kFloat64)
       << "true_divide only supports float32 and float64"
          " output when input's dtype is "
       << type_string(inputs[0].type_flag_);
-    MXNET_INT_TYPE_SWITCH(inputs[0].type_flag_, DType, {
-      MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
-        Kernel<op_with_req<OP, Req>, xpu>::Launch(
-          s, data.Size(), out.dptr<float>(), data.dptr<DType>(),
-          static_cast<float>(alpha));
+    MSHADOW_REAL_TYPE_SWITCH(out.type_flag_, ODType, {
+      MXNET_INT_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+        MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
+          Kernel<op_with_req<OP, Req>, xpu>::Launch(
+            s, data.Size(), out.dptr<ODType>(), data.dptr<DType>(),
+            static_cast<ODType>(alpha));
+        });
       });
     });
   }
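
With the hunk above, integer-array / integer-scalar true division produces output of
the configured default float dtype instead of always launching a float32 kernel. A
minimal sketch (assuming numpy semantics are enabled; which float type appears depends
on the session's default-dtype setting):

    import mxnet as mx
    from mxnet import np, npx
    npx.set_np()

    a = np.array([1, 2, 3], dtype='int64')
    print((a / 2).dtype)   # float32 under the default setting, float64 when the
                           # numpy default dtype is switched to 64-bit
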
diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py
index ba7deae..e259b74 100644
--- a/tests/python/gpu/test_gluon_gpu.py
+++ b/tests/python/gpu/test_gluon_gpu.py
@@ -50,10 +50,9 @@ def check_rnn_layer(layer):
         states = layer.begin_state(16)
         co, cs = layer(x, states)
 
-    # atol of 1e-6 required, as exposed by seed 2124685726
-    assert_almost_equal(go, co, rtol=1e-2, atol=1e-6)
+    assert_almost_equal(go, co)
     for g, c in zip(gs, cs):
-        assert_almost_equal(g, c, rtol=1e-2, atol=1e-6)
+        assert_almost_equal(g, c)
 
 
 @with_seed()
@@ -70,9 +69,9 @@ def check_rnn_layer_w_rand_inputs(layer):
         states = layer.begin_state(16)
         co, cs = layer(x, states)
 
-    assert_almost_equal(go, co, rtol=1e-2, atol=1e-6)
+    assert_almost_equal(go, co)
     for g, c in zip(gs, cs):
-        assert_almost_equal(g, c, rtol=1e-2, atol=1e-6)
+        assert_almost_equal(g, c)
 
 
 @with_seed()
@@ -485,6 +484,13 @@ def test_large_models():
     # This in the past has given cudnnFind() trouble when it needed to allocate similar I/O's
     # from the area carved out by the MXNET_GPU_MEM_POOL_RESERVE setting (by default 5%).
     (free_mem_bytes, total_mem_bytes) = mx.context.gpu_memory_info(ctx.device_id)
+    # This test needs to be 'qualified' for use with each new larger memory size
+    largest_supported_total_mem_GB = 32
+    if (total_mem_bytes > largest_supported_total_mem_GB * 1024 * 1024 * 1024):
+        sys.stderr.write(
+        ' bypassing test due to too-large global memory of size {} ... '.format(total_mem_bytes))
+        return
+
     start_size = tensor_size(0.20 * total_mem_bytes)
     num_trials = 10
     sys.stderr.write(
diff --git a/tests/python/gpu/test_gluon_model_zoo_gpu.py b/tests/python/gpu/test_gluon_model_zoo_gpu.py
index 4272971..1d0d3f4 100644
--- a/tests/python/gpu/test_gluon_model_zoo_gpu.py
+++ b/tests/python/gpu/test_gluon_model_zoo_gpu.py
@@ -89,7 +89,7 @@ def test_inference(model_name):
         max_val = np.max(np.abs(cpu_out.asnumpy()))
         gpu_max_val = np.max(np.abs(gpu_out.asnumpy()))
         eprint(model_name + ": CPU " + str(max_val) + ", GPU " + str(gpu_max_val))
-        assert_almost_equal(cpu_out / max_val, gpu_out / gpu_max_val, rtol=1e-3, atol=1e-3)
+        assert_almost_equal(cpu_out / max_val, gpu_out / gpu_max_val)
 
 def get_nn_model(name):
     if "densenet" in name:
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 9b96960..d088b44 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -23,6 +23,7 @@ import multiprocessing as mp
 import mxnet as mx
 import numpy as np
 import pytest
+import itertools
 from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal, assert_allclose
 from mxnet.test_utils import check_symbolic_forward, check_symbolic_backward, discard_stderr
 from mxnet.test_utils import default_context, rand_shape_2d, rand_ndarray, same
@@ -404,30 +405,20 @@ def test_batchnorm_with_type():
   ]
 
   # V2, 2D
-  sym = mx.sym.BatchNorm(name='norm', fix_gamma=False, cudnn_off=True)
-  check_consistency(sym, ctx_list_v2_2D)
-  sym = mx.sym.BatchNorm(name='norm', fix_gamma=False, cudnn_off=True)
-  check_consistency(sym, ctx_list_v2_2D)
-  sym = mx.sym.BatchNorm(name='norm', fix_gamma=True, cudnn_off=True)
-  check_consistency(sym, ctx_list_v2_2D)
-  sym = mx.sym.BatchNorm(name='norm', fix_gamma=True, cudnn_off=True)
-  check_consistency(sym, ctx_list_v2_2D)
+  bools = [False, True]
+  for fix_gamma, cudnn_off in itertools.product(bools, bools):
+      sym = mx.sym.BatchNorm(name='norm', fix_gamma=fix_gamma, cudnn_off=cudnn_off)
+      check_consistency(sym, ctx_list_v2_2D)
 
   # V2, 1D
-  sym = mx.sym.BatchNorm(name='norm', fix_gamma=False, cudnn_off=True)
-  check_consistency(sym, ctx_list_v2_1D)
-  sym = mx.sym.BatchNorm(name='norm', fix_gamma=False, cudnn_off=True)
-  check_consistency(sym, ctx_list_v2_1D)
-  sym = mx.sym.BatchNorm(name='norm', fix_gamma=True, cudnn_off=True)
-  check_consistency(sym, ctx_list_v2_1D)
-  sym = mx.sym.BatchNorm(name='norm', fix_gamma=True, cudnn_off=True)
-  check_consistency(sym, ctx_list_v2_1D)
-  #
-  # # V2, 3D
-  sym = mx.sym.BatchNorm(name='norm', fix_gamma=False, cudnn_off=True)
-  check_consistency(sym, ctx_list_v2_3D)
-  sym = mx.sym.BatchNorm(name='norm', fix_gamma=True, cudnn_off=True)
-  check_consistency(sym, ctx_list_v2_3D)
+  for fix_gamma, cudnn_off in itertools.product(bools, bools):
+      sym = mx.sym.BatchNorm(name='norm', fix_gamma=fix_gamma, cudnn_off=cudnn_off)
+      check_consistency(sym, ctx_list_v2_1D)
+
+  # V2, 3D
+  for fix_gamma, cudnn_off in itertools.product(bools, [True,]):
+      sym = mx.sym.BatchNorm(name='norm', fix_gamma=fix_gamma, cudnn_off=cudnn_off)
+      check_consistency(sym, ctx_list_v2_3D)
 
 
 @with_seed()
@@ -529,9 +520,9 @@ def test_convolution_with_type():
                np.dtype(np.float64): 1e-5,
                np.dtype(np.uint8): 0,
                np.dtype(np.int32): 0}
-    check_consistency(sym, ctx_list, tol=tol)
+    check_consistency(sym, ctx_list, rtol=tol, atol=tol)
     # test ability to turn off training on bias
-    check_consistency(sym, ctx_list, grad_req={'conv_data': 'write', 'conv_weight': 'write', 'conv_bias': 'null'}, tol=tol)
+    check_consistency(sym, ctx_list, grad_req={'conv_data': 'write', 'conv_weight': 'write', 'conv_bias': 'null'}, rtol=tol, atol=tol)
 
 
 # Apply N symbols against each of M contexts, checking that all NxM combinations match.
@@ -616,7 +607,6 @@ def test_conv_deconv_guards():
     # Test cases for convolution and deconvolution via strided fft.  Ensure that the framework
     # guards against problematic CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING in cuDNN [7.3.1,7.5)
     # see https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_750.html#rel_750
-    tol = 1e-1
     for (op, opname) in [(mx.sym.Convolution, 'conv'), (mx.sym.Deconvolution, 'deconv')]:
         dataname = opname + '_data'
         ctx = {'ctx': mx.gpu(0), dataname: (32, 32, 64, 64), 'type_dict': {dataname: np.float32}}
@@ -631,7 +621,7 @@ def test_conv_deconv_guards():
             try:
                 sym = op(**test_case_args)
                 sym_no_cudnn = op(cudnn_off=True, **test_case_args)
-                check_consistency([sym, sym_no_cudnn], [ctx, ctx], tol=tol)
+                check_consistency([sym, sym_no_cudnn], [ctx, ctx], scale=0.1)
             except:
                 print('Test failure of mx.sym.{} with args: {}'.format(op.__name__, test_case_args))
                 raise
@@ -655,7 +645,7 @@ def _conv_with_num_streams(seed):
                                               cudnn_off=True, name='conv')
             try:
                 # tol can be pretty high- we're looking for a large diff due to garbaged workspace
-                check_consistency([sym, sym_no_cudnn], [ctx, ctx], tol=1e-2)
+                check_consistency([sym, sym_no_cudnn], [ctx, ctx], rtol=1e-2, atol=1e-2)
             except:
                 print('Failing conv size = {}'.format(size))
                 raise
@@ -678,20 +668,19 @@ def test_convolution_multiple_streams():
 @pytest.mark.serial
 def test_convolution_large_c():
     problematic_c = 64 * 1024
-    # The convolution accumulates many values, so set large tolerances.
-    tol = {np.dtype(np.float32): 1,
-           np.dtype(np.float64): 1}
+    # The convolution accumulates many values, so scale the input magnitude.
+    scale = 0.1
     def test_1D_with_width(width, grad_req):
         ctx_list = [{'ctx': mx.gpu(0), 'conv_data': (1, problematic_c, width), 'type_dict': {'conv_data': np.float32}},
                     {'ctx': mx.gpu(0), 'conv_data': (1, problematic_c, width), 'type_dict': {'conv_data': np.float64}}]
         sym = mx.sym.Convolution(layout='NCW', num_filter=8, kernel=(2,), name='conv')
-        check_consistency([sym, sym], ctx_list, tol=tol, grad_req=grad_req)
+        check_consistency([sym, sym], ctx_list, grad_req=grad_req, scale=scale)
 
     def test_2D_with_width(width, grad_req):
         ctx_list = [{'ctx': mx.gpu(0), 'conv_data': (1, problematic_c, 2, width), 'type_dict': {'conv_data': np.float32}},
                     {'ctx': mx.gpu(0), 'conv_data': (1, problematic_c, 2, width), 'type_dict': {'conv_data': np.float64}}]
         sym = mx.sym.Convolution(layout='NCHW', num_filter=4, kernel=(2,2), name='conv')
-        check_consistency([sym, sym], ctx_list, tol=tol, grad_req=grad_req)
+        check_consistency([sym, sym], ctx_list, grad_req=grad_req, scale=scale)
 
     # Run with different data tensor shapes to run cudnnFind() multiple times.
     # First, populate algo and op caches with models that always use cudnnFind() (req == 'write').
@@ -709,20 +698,19 @@ def test_convolution_large_c():
 @pytest.mark.serial
 def test_deconvolution_large_c():
     problematic_c = 64 * 1024
-    # The deconvolution accumulates many values, so set large tolerances.
-    tol = {np.dtype(np.float32): 1,
-           np.dtype(np.float64): 1}
+    # The deconvolution accumulates many values, so scale the input magnitude.
+    scale = 0.1
     def test_1D_with_width(width, grad_req):
         ctx_list = [{'ctx': mx.gpu(0), 'deconv_data': (1, 8, width), 'type_dict': {'deconv_data': np.float32}},
                     {'ctx': mx.gpu(0), 'deconv_data': (1, 8, width), 'type_dict': {'deconv_data': np.float64}}]
         sym = mx.sym.Deconvolution(layout='NCW', num_filter=problematic_c, kernel=(2,), name='deconv')
-        check_consistency([sym, sym], ctx_list, tol=tol, grad_req=grad_req)
+        check_consistency([sym, sym], ctx_list, grad_req=grad_req, scale=scale)
 
     def test_2D_with_width(width, grad_req):
         ctx_list = [{'ctx': mx.gpu(0), 'deconv_data': (1, 8, 2, width), 'type_dict': {'deconv_data': np.float32}},
                     {'ctx': mx.gpu(0), 'deconv_data': (1, 8, 2, width), 'type_dict': {'deconv_data': np.float64}}]
         sym = mx.sym.Deconvolution(layout='NCHW', num_filter=problematic_c, kernel=(2,2), name='deconv')
-        check_consistency([sym, sym], ctx_list, tol=tol, grad_req=grad_req)
+        check_consistency([sym, sym], ctx_list, grad_req=grad_req, scale=scale)
 
     # Run with different data tensor shapes to run cudnnFind() multiple times.
     # First, populate algo and op caches with models that always use cudnnFind() (req == 'write').
@@ -831,8 +819,8 @@ def test_deconvolution_with_type():
                np.dtype(np.float64): 1e-5,
                np.dtype(np.uint8): 0,
                np.dtype(np.int32): 0}
-    check_consistency(sym, ctx_list, tol=tol)
-    check_consistency(sym, ctx_list, tol=tol, grad_req="add")
+    check_consistency(sym, ctx_list, rtol=tol, atol=tol)
+    check_consistency(sym, ctx_list, rtol=tol, atol=tol, grad_req="add")
 
     # 2D deconvolution
     sym = mx.sym.Deconvolution(num_filter=2, kernel=(3,3), name='deconv')
@@ -847,8 +835,8 @@ def test_deconvolution_with_type():
                np.dtype(np.float64): 1e-5,
                np.dtype(np.uint8): 0,
                np.dtype(np.int32): 0}
-    check_consistency(sym, ctx_list, tol=tol)
-    check_consistency(sym, ctx_list, tol=tol, grad_req="add")
+    check_consistency(sym, ctx_list, rtol=tol, atol=tol)
+    check_consistency(sym, ctx_list, rtol=tol, atol=tol, grad_req="add")
 
 
 @with_seed()
@@ -931,10 +919,11 @@ def test_bilinear_sampler_with_type():
 def test_grid_generator_with_type():
     data = mx.sym.Variable('data')
     sym = mx.sym.GridGenerator(data=data, transform_type='affine', target_shape=(20, 20))
+    scale = 1
     ctx_list = [{'ctx': mx.gpu(0), 'data': (3, 6), 'type_dict': {'data': np.float32}},
                 {'ctx': mx.cpu(0), 'data': (3, 6), 'type_dict': {'data': np.float32}}]
-    check_consistency(sym, ctx_list)
-    check_consistency(sym, ctx_list, grad_req="add")
+    check_consistency(sym, ctx_list, scale=scale)
+    check_consistency(sym, ctx_list, scale=scale, grad_req="add")
     sym = mx.sym.GridGenerator(data=data, transform_type='warp', target_shape=(20, 20))
     ctx_list = [{'ctx': mx.gpu(0), 'data': (3, 2, 20, 20), 'type_dict': {'data': np.float32}},
                 {'ctx': mx.cpu(0), 'data': (3, 2, 20, 20), 'type_dict': {'data': np.float32}}]
@@ -1080,7 +1069,7 @@ def test_pooling_versions():
                                                                             pool_op))
             sym_list.append(sym)
 
-        check_consistency(sym_list, ctx_list, equal_nan=(not count_include_pad), tol=tol)
+        check_consistency(sym_list, ctx_list, equal_nan=(not count_include_pad), rtol=tol, atol=tol)
 
     def test_pooling_dim(dim, pool_type, dtype, pool_op_list, p_value=2, count_include_pad=True,
                          tol=None):
@@ -1239,7 +1228,7 @@ def test_flatten_slice_after_conv():
 
     ctx_list = [{'ctx': mx.gpu(0), 'conv_data': (2, 16, 16, 16), 'type_dict': {'conv_data': np.float32}},
                 {'ctx': mx.cpu(0), 'conv_data': (2, 16, 16, 16), 'type_dict': {'conv_data': np.float32}}]
-    check_consistency(slice_sym, ctx_list)
+    check_consistency(slice_sym, ctx_list, scale=0.5)
 
 
 @with_seed()
@@ -1545,7 +1534,7 @@ def test_embedding_with_type():
                         'type_dict': {'embedding_data': data_type, 'embedding_weight': weight_type}})
             arg_params = {'embedding_data': np.random.randint(low=-low_pad, high=V+high_pad, size=(N,))}
             check_consistency(sym, ctx_list, grad_req={'embedding_data': 'null','embedding_weight': 'write'},
-                              arg_params=arg_params)
+                              arg_params=arg_params, scale=0.1)
 
     data_types = [np.float16, np.float32, np.float64, np.int32]
     weight_types = [np.float16, np.float32, np.float64]
@@ -1678,7 +1667,7 @@ def test_deformable_psroipooling_with_type():
                                'deformable_psroipool_trans': np.float16}},
                 ]
 
-    check_consistency(sym, ctx_list, scale=0.1, tol=tol,
+    check_consistency(sym, ctx_list, scale=0.1, rtol=tol, atol=tol,
                       grad_req={'deformable_psroipool_data': 'write',
                                 'deformable_psroipool_rois': 'null',
                                 'deformable_psroipool_trans': 'write'}, arg_params=arg_params)
@@ -1710,9 +1699,9 @@ def test_deformable_convolution_with_type():
                  'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}},
                 ]
 
-    check_consistency(sym, ctx_list, scale=0.1, tol=tol)
+    check_consistency(sym, ctx_list, scale=0.1, rtol=tol, atol=tol)
     # test ability to turn off training on bias
-    check_consistency(sym, ctx_list, scale=0.1, tol=tol,
+    check_consistency(sym, ctx_list, scale=0.1, rtol=tol, atol=tol,
                       grad_req={'deformable_conv_data': 'write',
                                 'deformable_conv_offset': 'write',
                                 'deformable_conv_weight': 'write',
@@ -1745,7 +1734,7 @@ def test_deformable_convolution_options():
                  'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}},
                 ]
     sym = mx.sym.contrib.DeformableConvolution(num_filter=3, kernel=(3,3), pad=(1,1), name='deformable_conv')
-    check_consistency(sym, ctx_list, scale=0.1, tol=tol)
+    check_consistency(sym, ctx_list, scale=0.1, rtol=tol, atol=tol)
 
     # Stride > 1
     ctx_list = [{'ctx': mx.gpu(0),
@@ -1766,7 +1755,7 @@ def test_deformable_convolution_options():
                  'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}},
                 ]
     sym = mx.sym.contrib.DeformableConvolution(num_filter=3, kernel=(3,3), stride=(2,2), name='deformable_conv')
-    check_consistency(sym, ctx_list, scale=0.1, tol=tol)
+    check_consistency(sym, ctx_list, scale=0.1, rtol=tol, atol=tol)
 
     # Dilate > 1
     ctx_list = [{'ctx': mx.gpu(0),
@@ -1787,7 +1776,7 @@ def test_deformable_convolution_options():
                  'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}},
                 ]
     sym = mx.sym.contrib.DeformableConvolution(num_filter=3, kernel=(3,3), dilate=(2,2), name='deformable_conv')
-    check_consistency(sym, ctx_list, scale=0.1, tol=tol)
+    check_consistency(sym, ctx_list, scale=0.1, rtol=tol, atol=tol)
 
     # Deformable group > 1
     ctx_list = [{'ctx': mx.gpu(0),
@@ -1808,7 +1797,7 @@ def test_deformable_convolution_options():
                  'type_dict': {'deformable_conv_data': np.float32, 'deformable_conv_offset': np.float32}},
                 ]
     sym = mx.sym.contrib.DeformableConvolution(num_filter=4, kernel=(3,3), num_deformable_group=2, name='deformable_conv')
-    check_consistency(sym, ctx_list, scale=0.1, tol=tol)
+    check_consistency(sym, ctx_list, scale=0.1, rtol=tol, atol=tol)
 
 
 def check_rnn_layer(layer):
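
Note on the calls above: throughout this file the single check_consistency(..., tol=tol)
argument is replaced by explicit rtol/atol, and several call sites also pass scale to
shrink the magnitude of the random inputs so the comparison stays within the default
tolerances. A minimal sketch of the updated call shape, using the names from the hunks
above:

    # scale shrinks the random test inputs; rtol/atol replace the old single tol
    check_consistency([sym, sym], ctx_list, scale=0.1,
                      rtol=tol, atol=tol, grad_req=grad_req)
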
diff --git a/tests/python/gpu/test_profiler_gpu.py b/tests/python/gpu/test_profiler_gpu.py
index 11a0b7d..89eb425 100644
--- a/tests/python/gpu/test_profiler_gpu.py
+++ b/tests/python/gpu/test_profiler_gpu.py
@@ -27,6 +27,8 @@ sys.path.insert(0, os.path.join(curr_path, '../unittest'))
 # They will be detected by test framework, as long as the current file has a different filename
 from test_profiler import *
 
+# This test was seen to crash the pytest worker during development of https://github.com/apache/incubator-mxnet/pull/18694
+del test_aggregate_duplication
 
 def test_gpu_memory_profiler_symbolic():
     iter_num = 5
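
Because this file star-imports the CPU profiler tests, pytest collects every test_* name
present in the module namespace; deleting the imported name is enough to keep that one
test out of the GPU run. The opt-out pattern, in short:

    from test_profiler import *        # GPU run re-exports all CPU profiler tests
    del test_aggregate_duplication     # ...except names explicitly removed here
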
diff --git a/tests/python/unittest/common.py b/tests/python/unittest/common.py
index 0219616..331f32d 100644
--- a/tests/python/unittest/common.py
+++ b/tests/python/unittest/common.py
@@ -222,11 +222,13 @@ def with_seed(seed=None):
                 try:
                     orig_test(*args, **kwargs)
                 except:
-                    # With exceptions, repeat test_msg at INFO level to be sure it's seen.
-                    if log_level < logging.INFO:
-                        logger.info(test_msg)
+                    # With exceptions, repeat test_msg at WARNING level to be sure it's seen.
+                    if log_level < logging.WARNING:
+                        logger.warning(test_msg)
                     raise
                 finally:
+                    # Provide test-isolation for any test having this decorator
+                    mx.nd.waitall()
                     np.random.set_state(post_test_state)
         return test_new
     return test_helper
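
MXNet dispatches operators asynchronously, so without the mx.nd.waitall() added above an
engine error triggered by one test could surface while a later test is running and be
blamed on the wrong test. A minimal sketch of the barrier pattern, outside the decorator
machinery and purely illustrative:

    import mxnet as mx

    def run_isolated(test_fn):
        try:
            test_fn()
        finally:
            # Block until all queued work has finished, so any asynchronous
            # operator failure is raised inside the test that caused it.
            mx.nd.waitall()
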
@@ -285,7 +287,7 @@ def setup_module():
         seed = np.random.randint(0, np.iinfo(np.int32).max)
     else:
         seed = int(module_seed_str)
-        logger.warn('*** module-level seed is set: all tests running deterministically ***')
+        logger.warning('*** module-level seed is set: all tests running deterministically ***')
     logger.info('Setting module np/mx/python random seeds, use MXNET_MODULE_SEED=%s to reproduce.', seed)
     np.random.seed(seed)
     mx.random.seed(seed)
@@ -293,7 +295,7 @@ def setup_module():
     # The MXNET_TEST_SEED environment variable will override MXNET_MODULE_SEED for tests with
     #  the 'with_seed()' decoration.  Inform the user of this once here at the module level.
     if os.getenv('MXNET_TEST_SEED') is not None:
-        logger.warn('*** test-level seed set: all "@with_seed()" tests run deterministically ***')
+        logger.warning('*** test-level seed set: all "@with_seed()" tests run deterministically ***')
 
 
 def teardown_module():
@@ -359,13 +361,13 @@ def retry(n):
         @functools.wraps(orig_test)
         def test_new(*args, **kwargs):
             """Wrapper for tests function."""
-            for _ in range(n):
+            for i in range(n):
                 try:
                     orig_test(*args, **kwargs)
+                    return
                 except AssertionError as e:
-                    err = e
-                    continue
-                return
-            raise err
+                    if i == n-1:
+                        raise e
+                    mx.nd.waitall()
         return test_new
     return test_helper
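
With this rewrite, retry(n) returns as soon as the wrapped test passes, re-raises the
AssertionError only on the final attempt, and drains pending asynchronous work between
attempts. A small usage sketch; the flaky assertion is illustrative only:

    import numpy as np
    from common import retry, with_seed   # test scaffolding from this module

    @with_seed()
    @retry(3)   # at most 3 attempts; only the last AssertionError propagates
    def test_sometimes_flaky():
        assert np.random.uniform() < 0.99
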
diff --git a/tests/python/unittest/test_autograd.py b/tests/python/unittest/test_autograd.py
index cc1e87a..ad0601c 100644
--- a/tests/python/unittest/test_autograd.py
+++ b/tests/python/unittest/test_autograd.py
@@ -440,6 +440,7 @@ def test_grad_with_stype():
             check_grad_with_stype(stype, grad_stype, grad_stype)
 
 @with_seed()
+@pytest.mark.garbage_expected
 def test_sparse_dot_grad():
     def check_sparse_dot_grad(rhs):
         lhs = rand_ndarray((2, 8), 'csr')
diff --git a/tests/python/unittest/test_contrib_gluon_data_vision.py b/tests/python/unittest/test_contrib_gluon_data_vision.py
index d2e38d6..166b07f 100644
--- a/tests/python/unittest/test_contrib_gluon_data_vision.py
+++ b/tests/python/unittest/test_contrib_gluon_data_vision.py
@@ -19,7 +19,7 @@ import mxnet as mx
 import numpy as np
 import scipy.ndimage
 from mxnet.test_utils import *
-from common import assertRaises, with_seed
+from common import assertRaises, with_seed, setup_module, teardown_module
 import shutil
 import tempfile
 import unittest
@@ -63,6 +63,7 @@ class TestImage(unittest.TestCase):
             print("cleanup {}".format(self.IMAGES_DIR))
             shutil.rmtree(self.IMAGES_DIR)
 
+    @with_seed()
     def test_imageiter(self):
         im_list = [[np.random.randint(0, 5), x] for x in self.IMAGES]
         os.makedirs('./data', exist_ok=True)
@@ -95,6 +96,7 @@ class TestImage(unittest.TestCase):
                     for batch in it:
                         pass
 
+    @with_seed()
     def test_image_bbox_iter(self):
         im_list = [_generate_objects() + [x] for x in self.IMAGES]
         det_iter = mx.gluon.contrib.data.vision.ImageBboxDataLoader(2, (3, 300, 300), imglist=im_list, path_root='')
@@ -131,6 +133,7 @@ class TestImage(unittest.TestCase):
         ]
 
 
+    @with_seed()
     def test_bbox_augmenters(self):
         # only test if all augmenters will work
         # TODO(Joshua Zhang): verify the augmenter outputs
diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py
index 52bab2b..588f5c1 100644
--- a/tests/python/unittest/test_gluon.py
+++ b/tests/python/unittest/test_gluon.py
@@ -22,7 +22,7 @@ import mxnet as mx
 from mxnet import gluon
 from mxnet.gluon import nn
 from mxnet.base import py_str, MXNetError
-from mxnet.test_utils import assert_almost_equal
+from mxnet.test_utils import assert_almost_equal, default_context
 from mxnet.util import is_np_array
 from mxnet.ndarray.ndarray import _STORAGE_TYPE_STR_TO_ID
 from mxnet.test_utils import use_np
@@ -810,7 +810,7 @@ def test_sync_batchnorm():
                             input2grad.asnumpy(), atol=atol, rtol=rtol)
 
     cfgs = [(1, False)]
-    num_gpus = mx.context.num_gpus()
+    num_gpus = 0 if default_context().device_type != 'gpu' else mx.context.num_gpus()
     batch_size = 24
     for i in range(1, num_gpus + 1):
         if batch_size % i == 0:
diff --git a/tests/python/unittest/test_gluon_probability_v1.py b/tests/python/unittest/test_gluon_probability_v1.py
index 92721f6..b5b1644 100644
--- a/tests/python/unittest/test_gluon_probability_v1.py
+++ b/tests/python/unittest/test_gluon_probability_v1.py
@@ -417,7 +417,7 @@ def test_gluon_half_cauchy_v1():
     # Test icdf
     for shape, hybridize in itertools.product(shapes, [True, False]):
         scale = np.random.uniform(0.5, 1.5, shape)
-        samples = np.random.uniform(size=shape)
+        samples = np.random.uniform(size=shape, high=1.0-1e-4)
         net = TestHalfCauchy("icdf")
         if hybridize:
             net.hybridize()
@@ -1727,15 +1727,15 @@ def test_gluon_mvn_v1():
                     net.hybridize()
                 mx_out = net(loc, cov_param, samples)
                 assert mx_out.shape == samples.shape[:-1]
-                # Select the first element in the batch, because scipy does not support batching.
-                loc_t = loc.reshape(-1, event_shape)[0].asnumpy()
-                sigma_t = sigma.reshape(-1, event_shape,
-                                        event_shape)[0].asnumpy()
                 if mx_out.shape == ():
                     mx_out_t = mx_out.asnumpy()
                 else:
                     mx_out_t = mx_out.flatten()[0].asnumpy()
                 samples_t = samples.reshape(-1, event_shape).asnumpy()[0]
+                # Select the first element in the batch, because scipy does not support batching.
+                loc_t = loc.reshape(-1, event_shape)[0].asnumpy()
+                sigma_t = sigma.reshape(-1, event_shape,
+                                        event_shape)[0].asnumpy()
                 scipy_mvn = ss.multivariate_normal(loc_t, sigma_t)
                 ss_out = scipy_mvn.logpdf(samples_t)
                 assert_almost_equal(mx_out_t, ss_out, atol=1e-4,
@@ -1758,14 +1758,14 @@ def test_gluon_mvn_v1():
                     net.hybridize()
                 mx_out = net(loc, cov_param)
                 assert mx_out.shape == sigma.shape[:-2]
-                # Select the first element in the batch, because scipy does not support batching.
-                loc_t = loc.reshape(-1, event_shape)[0].asnumpy()
-                sigma_t = sigma.reshape(-1, event_shape,
-                                        event_shape)[0].asnumpy()
                 if mx_out.shape == ():
                     mx_out_t = mx_out.asnumpy()
                 else:
                     mx_out_t = mx_out.flatten()[0].asnumpy()
+                # Select the first element in the batch, because scipy does not support batching.
+                loc_t = loc.reshape(-1, event_shape)[0].asnumpy()
+                sigma_t = sigma.reshape(-1, event_shape,
+                                        event_shape)[0].asnumpy()
                 scipy_mvn = ss.multivariate_normal(loc_t, sigma_t)
                 ss_out = scipy_mvn.entropy()
                 assert_almost_equal(mx_out_t, ss_out, atol=1e-4,
@@ -2084,7 +2084,7 @@ def test_gluon_kl_v1():
     # exponential, geometric
     for dist in [mgp.Exponential, mgp.Geometric]:
         for shape in shapes:
-            def s(): return np.random.uniform(size=shape)
+            def s(): return np.random.uniform(size=shape, low=1e-3)
             _test_zero_kl(_dist_factory(dist, s), shape)
             if monte_carlo_test:
                 _test_monte_carlo(_dist_factory(dist, s),
diff --git a/tests/python/unittest/test_gluon_probability_v2.py b/tests/python/unittest/test_gluon_probability_v2.py
index 9a36b4f..d75aa69 100644
--- a/tests/python/unittest/test_gluon_probability_v2.py
+++ b/tests/python/unittest/test_gluon_probability_v2.py
@@ -417,7 +417,7 @@ def test_gluon_half_cauchy():
     # Test icdf
     for shape, hybridize in itertools.product(shapes, [True, False]):
         scale = np.random.uniform(0.5, 1.5, shape)
-        samples = np.random.uniform(size=shape)
+        samples = np.random.uniform(size=shape, high=1.0-1e-4)
         net = TestHalfCauchy("icdf")
         if hybridize:
             net.hybridize()
@@ -1727,14 +1727,14 @@ def test_gluon_mvn():
                     net.hybridize()
                 mx_out = net(loc, cov_param, samples)
                 assert mx_out.shape == samples.shape[:-1]
-                # Select the first element in the batch, because scipy does not support batching.
-                loc_t = loc.reshape(-1, event_shape)[0].asnumpy()
-                sigma_t = sigma.reshape(-1, event_shape,
-                                        event_shape)[0].asnumpy()
                 if mx_out.shape == ():
                     mx_out_t = mx_out.asnumpy()
                 else:
                     mx_out_t = mx_out.asnumpy().flatten()[0]
+                # Select the first element in the batch, because scipy does not support batching.
+                loc_t = loc.reshape(-1, event_shape)[0].asnumpy()
+                sigma_t = sigma.reshape(-1, event_shape,
+                                        event_shape)[0].asnumpy()
                 samples_t = samples.reshape(-1, event_shape).asnumpy()[0]
                 scipy_mvn = ss.multivariate_normal(loc_t, sigma_t)
                 ss_out = scipy_mvn.logpdf(samples_t)
@@ -1758,14 +1758,14 @@ def test_gluon_mvn():
                     net.hybridize()
                 mx_out = net(loc, cov_param)
                 assert mx_out.shape == sigma.shape[:-2]
-                # Select the first element in the batch, because scipy does not support batching.
-                loc_t = loc.reshape(-1, event_shape)[0].asnumpy()
-                sigma_t = sigma.reshape(-1, event_shape,
-                                        event_shape)[0].asnumpy()
                 if mx_out.shape == ():
                     mx_out_t = mx_out.asnumpy()
                 else:
                     mx_out_t = mx_out.asnumpy().flatten()[0]
+                # Select the first element in the batch, because scipy does not support batching.
+                loc_t = loc.reshape(-1, event_shape)[0].asnumpy()
+                sigma_t = sigma.reshape(-1, event_shape,
+                                        event_shape)[0].asnumpy()
                 scipy_mvn = ss.multivariate_normal(loc_t, sigma_t)
                 ss_out = scipy_mvn.entropy()
                 assert_almost_equal(mx_out_t, ss_out, atol=1e-4,
@@ -2081,10 +2081,11 @@ def test_gluon_kl():
                               _dist_factory(dist, rate),
                               repeated_times)
 
+
     # exponential, geometric
     for dist in [mgp.Exponential, mgp.Geometric]:
         for shape in shapes:
-            def s(): return np.random.uniform(size=shape)
+            def s(): return np.random.uniform(size=shape, low=1e-3)
             _test_zero_kl(_dist_factory(dist, s), shape)
             if monte_carlo_test:
                 _test_monte_carlo(_dist_factory(dist, s),
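
Keeping the uniform draws strictly inside the unit interval keeps these checks finite:
the half-Cauchy inverse CDF diverges as its argument approaches 1, and the low=1e-3 floor
keeps the Exponential/Geometric parameters away from 0, where log terms in the KL checks
would blow up. A quick illustration of the icdf growth (unit scale, standard formula
tan(pi*u/2)):

    import numpy as np

    # the half-Cauchy icdf with scale 1 grows without bound as u -> 1,
    # hence the tests now draw u <= 1 - 1e-4
    for u in (0.5, 0.999, 1.0 - 1e-4):
        print(u, np.tan(np.pi * u / 2))
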
diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py
index 2aab5c0..a01746e 100644
--- a/tests/python/unittest/test_ndarray.py
+++ b/tests/python/unittest/test_ndarray.py
@@ -24,7 +24,7 @@ import pickle as pkl
 import random
 import functools
 import pytest
-from common import with_seed, assertRaises, TemporaryDirectory
+from common import with_seed, assertRaises, TemporaryDirectory, setup_module, teardown_module
 from mxnet.test_utils import almost_equal
 from mxnet.test_utils import assert_almost_equal, assert_exception
 from mxnet.test_utils import default_context
diff --git a/tests/python/unittest/test_numpy_contrib_gluon_data_vision.py b/tests/python/unittest/test_numpy_contrib_gluon_data_vision.py
index 9c9c7fd..8c3e76a 100644
--- a/tests/python/unittest/test_numpy_contrib_gluon_data_vision.py
+++ b/tests/python/unittest/test_numpy_contrib_gluon_data_vision.py
@@ -19,7 +19,7 @@ import mxnet as mx
 import numpy as np
 import scipy.ndimage
 from mxnet.test_utils import *
-from common import assertRaises, with_seed
+from common import assertRaises, with_seed, setup_module, teardown_module
 import shutil
 import tempfile
 import unittest
@@ -63,10 +63,11 @@ class TestImage(unittest.TestCase):
             print("cleanup {}".format(self.IMAGES_DIR))
             shutil.rmtree(self.IMAGES_DIR)
 
+    @with_seed()
     @use_np
     def test_imageiter(self):
         im_list = [[np.random.randint(0, 5), x] for x in self.IMAGES]
-        fname = './data/test_imageiter.lst'
+        fname = './data/test_numpy_imageiter.lst'
         file_list = ['\t'.join([str(k), str(np.random.randint(0, 5)), x])
                         for k, x in enumerate(self.IMAGES)]
         with open(fname, 'w') as f:
@@ -95,6 +96,7 @@ class TestImage(unittest.TestCase):
                     for batch in it:
                         pass
 
+    @with_seed()
     @use_np
     def test_image_bbox_iter(self):
         im_list = [_generate_objects() + [x] for x in self.IMAGES]
@@ -110,7 +112,7 @@ class TestImage(unittest.TestCase):
             pass
 
         # test file list with last batch handle
-        fname = './data/test_imagedetiter.lst'
+        fname = './data/test_numpy_imagedetiter.lst'
         im_list = [[k] + _generate_objects() + [x] for k, x in enumerate(self.IMAGES)]
         with open(fname, 'w') as f:
             for line in im_list:
@@ -130,6 +132,7 @@ class TestImage(unittest.TestCase):
                 path_imglist=fname, path_root='', last_batch='keep')
         ]
 
+    @with_seed()
     @use_np
     def test_bbox_augmenters(self):
         # only test if all augmenters will work
diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py
index 8b50fc4..3b97864 100644
--- a/tests/python/unittest/test_numpy_interoperability.py
+++ b/tests/python/unittest/test_numpy_interoperability.py
@@ -29,7 +29,7 @@ from mxnet import np
 from mxnet.test_utils import assert_almost_equal
 from mxnet.test_utils import use_np
 from mxnet.test_utils import is_op_runnable
-from common import assertRaises, with_seed
+from common import assertRaises, with_seed, random_seed
 from mxnet.numpy_dispatch_protocol import with_array_function_protocol, with_array_ufunc_protocol
 from mxnet.numpy_dispatch_protocol import _NUMPY_ARRAY_FUNCTION_LIST, _NUMPY_ARRAY_UFUNC_LIST
 
@@ -517,8 +517,8 @@ def _add_workload_linalg_cholesky():
     dtypes = (np.float32, np.float64)
 
     for shape, dtype in itertools.product(shapes, dtypes):
-        _np.random.seed(1)
-        a = _np.random.randn(*shape)
+        with random_seed(1):
+            a = _np.random.randn(*shape)
 
         t = list(range(len(shape)))
         t[-2:] = -1, -2
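
Using the random_seed context manager instead of _np.random.seed(1) keeps the
deterministic seed scoped to this workload, so the module-level RNG state for the rest
of the run is not pinned as a side effect. An illustrative stand-in for such a context
manager (not necessarily the repo's exact implementation):

    import contextlib
    import numpy as onp

    @contextlib.contextmanager
    def scoped_numpy_seed(seed):
        # Seed numpy only inside the with-block, then restore the prior RNG state.
        state = onp.random.get_state()
        onp.random.seed(seed)
        try:
            yield
        finally:
            onp.random.set_state(state)
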
@@ -2898,7 +2898,6 @@ def _add_workload_unwrap():
     phase[3:] += np.pi
     phase_s = np.vstack((phase,phase))
     OpArgMngr.add_workload('unwrap', phase)
-    print(phase_s.shape)
     OpArgMngr.add_workload('unwrap', phase_s, axis=1)
 
 
diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py
index 88ad77f..edc41f9 100644
--- a/tests/python/unittest/test_numpy_op.py
+++ b/tests/python/unittest/test_numpy_op.py
@@ -30,7 +30,7 @@ from mxnet import np, npx
 from mxnet.gluon import HybridBlock
 from mxnet.base import MXNetError
 from mxnet.test_utils import same, assert_almost_equal, rand_shape_nd, rand_ndarray
-from mxnet.test_utils import check_numeric_gradient, use_np, collapse_sum_like
+from mxnet.test_utils import check_numeric_gradient, use_np, collapse_sum_like, effective_dtype
 from mxnet.test_utils import new_matrix_with_real_eigvals_nd
 from mxnet.test_utils import new_sym_matrix_with_real_eigvals_nd
 from common import assertRaises, with_seed, retry, xfail_when_nonstandard_decimal_separator
@@ -1849,15 +1849,18 @@ def test_npx_batch_norm(shape, fix_gamma, cudnn_off, output_mean_var):
 
             running_mean = running_mean * momentum + \
                 data_mean_flat * (1 - momentum)
+
+            m = _np.prod(shape) / shape[axis]
+            # cudnn uses m-1 in the denominator of its sample variance calculation, not m
+            sample_var_adjust = 1.0 if cudnn_off or fix_gamma else m / (m-1)
             running_var = running_var * momentum + \
-                data_var_flat * (1 - momentum)
+                data_var_flat * sample_var_adjust * (1 - momentum)
 
             W = bn_gamma.reshape(expand_shape)
             dnx = ograd * W
             xsm = data - data_mean
             nd = 1.0 / np.sqrt(data_var + epsilon)
             nx = xsm * nd
-            m = _np.prod(shape) / shape[axis]
             dvar = (dnx * xsm).sum(axis=reduce_axis, keepdims=True,
                                   ) * (-0.5) * np.power(nd, 3)
             dmean = -nd * dnx.sum(axis=reduce_axis, keepdims=True) - \
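
On the sample_var_adjust factor above (the same adjustment appears in test_operator.py's
test_batchnorm further down): multiplying the population variance by m/(m-1) turns it
into the unbiased sample variance that, per the comment, cuDNN accumulates into
running_var. A quick numeric check of that equivalence:

    import numpy as np

    x = np.array([1.0, 2.0, 3.0, 4.0])
    m = x.size
    pop_var = ((x - x.mean()) ** 2).mean()              # divides by m
    sample_var = ((x - x.mean()) ** 2).sum() / (m - 1)  # divides by m - 1
    assert np.isclose(sample_var, pop_var * m / (m - 1))
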
@@ -1966,6 +1969,7 @@ def test_npx_softmax():
                     assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5, equal_nan=True)
 
                     mx_out.backward()
+                    mx_a.grad.wait_to_read()
                     assert_almost_equal(mx_a.grad.asnumpy(), _np.zeros(shape), rtol=1e-3, atol=1e-5)
 
 
@@ -4608,7 +4612,7 @@ def test_np_random_grad():
                 scale = np.ones(scale)
             mx_out = getattr(np.random, op_name)(loc, scale)
             np_out = getattr(_np.random, op_name)(loc, scale)
-            assert_almost_equal(mx_out.asnumpy().shape, np_out.shape)
+            assert mx_out.asnumpy().shape == np_out.shape
 
 
 @with_seed()
@@ -4654,7 +4658,7 @@ def test_np_lognormal_grad():
     for ((shape1, shape2), out_shape) in zip(param_shape, output_shapes):
         mx_out = np.random.lognormal(np.zeros(shape1), np.ones(shape2), out_shape)
         np_out = _np.random.lognormal(np.zeros(shape1).asnumpy(), np.ones(shape2).asnumpy(), out_shape)
-        assert_almost_equal(mx_out.asnumpy().shape, np_out.shape)
+        assert mx_out.asnumpy().shape == np_out.shape
 
     def _test_lognormal_exception(sigma):
         output = np.random.lognormal(sigma=sigma).asnumpy()
@@ -4913,7 +4917,7 @@ def test_np_random_rayleigh():
             with mx.autograd.record():
                 mx_out = test_rayleigh(scale)
             np_out = _np.random.rayleigh(scale = scale.asnumpy(), size = shape)
-            assert_almost_equal(np_out.shape, mx_out.shape)
+            assert np_out.shape == mx_out.shape
             mx_out.backward()
             assert scale.grad.shape == shape
             assert_almost_equal(scale.grad.asnumpy().sum(), mx_out.asnumpy().sum(), rtol=1e-3, atol=1e-5)
@@ -4921,7 +4925,7 @@ def test_np_random_rayleigh():
     for shape in shapes:
         mx_out = np.random.rayleigh(np.array([1]), shape)
         np_out = _np.random.rayleigh(np.array([1]).asnumpy(), shape)
-        assert_almost_equal(mx_out.asnumpy().shape, np_out.shape)
+        assert mx_out.asnumpy().shape == np_out.shape
 
     def _test_rayleigh_exception(scale):
         output = np.random.rayleigh(scale=scale).asnumpy()
@@ -4954,7 +4958,7 @@ def test_np_exponential():
             with mx.autograd.record():
                 mx_out = test_exponential_grad(scale)
             np_out = _np.random.exponential(scale = scale.asnumpy(), size = out_shape)
-            assert_almost_equal(np_out.shape, mx_out.shape)
+            assert np_out.shape == mx_out.shape
             mx_out.backward()
             assert scale.grad.shape == out_shape
             assert_almost_equal(scale.grad.asnumpy().sum(), mx_out.asnumpy().sum(), rtol=1e-3, atol=1e-5)
@@ -5724,6 +5728,8 @@ def test_np_linalg_svd():
                 data_np = _np.random.uniform(-10.0, 10.0, shape)
                 data_np = _np.array(data_np, dtype=dtype)
                 data = np.array(data_np, dtype=dtype)
+                if effective_dtype(data) == np.dtype(np.float16):
+                    continue
                 data.attach_grad()
                 with mx.autograd.record():
                     ret = test_svd(data)
@@ -6115,7 +6121,7 @@ def test_np_linalg_solve():
             print(e)
         else:
             assert x.shape == x_expected.shape
-            assert_almost_equal(x.asnumpy(), x_expected, rtol=rtol, atol=atol)
+            assert_almost_equal(x, x_expected)
 
     def newInvertibleMatrix_2D(shape, max_cond=4):
         while 1:
@@ -6155,7 +6161,6 @@ def test_np_linalg_solve():
     nrhs = (-1, 0, 1, 2, 3)
     dtypes = ['float32', 'float64']
     for hybridize, shape, dtype, nrh in itertools.product([False, True], shapes, dtypes, nrhs):
-        rtol, atol =1e-2, 1e-4
         test_solve = TestSolve()
         if hybridize:
             test_solve.hybridize()
@@ -6189,8 +6194,8 @@ def test_np_linalg_solve():
                 mx.autograd.backward(mx_out)
                 b_backward_expected = get_grad_b(a.asnumpy(), mx_out.asnumpy())
                 a_backward_expected = -_np.matmul(b_backward_expected, _np.swapaxes(mx_out, -1, -2).asnumpy())
-                assert_almost_equal(a.grad.asnumpy(), a_backward_expected, rtol=rtol, atol=atol)
-                assert_almost_equal(b.grad.asnumpy(), b_backward_expected, rtol=rtol, atol=atol)
+                assert_almost_equal(a.grad, a_backward_expected)
+                assert_almost_equal(b.grad, b_backward_expected)
 
         # check imperative once again
         mx_out = np.linalg.solve(a, b)
@@ -6215,7 +6220,7 @@ def test_np_linalg_tensorinv():
             print(e)
         else:
             assert inv_a.shape == inv_a_expected.shape
-            assert_almost_equal(inv_a.asnumpy(), inv_a_expected, rtol=rtol, atol=atol)
+            assert_almost_equal(inv_a, inv_a_expected)
 
     def newInvertibleMatrix_2D(shape, max_cond=4):
         while 1:
@@ -6258,11 +6263,6 @@ def test_np_linalg_tensorinv():
     ]
     dtypes = ['float32', 'float64']
     for hybridize, shape, dtype, in itertools.product([False, True], shapes, dtypes):
-        rtol = 1e-3
-        atol = 1e-5
-        if dtype == 'float32':
-            rtol = 1e-2
-            atol = 1e-4
         ind = shape[0]
         test_tensorinv = TestTensorinv(ind=ind)
         if hybridize:
@@ -6290,7 +6290,7 @@ def test_np_linalg_tensorinv():
         if 0 not in mx_out.shape:
             mx.autograd.backward(mx_out)
             grad_A_expected = get_grad_A(a.asnumpy(), ind)
-            assert_almost_equal(a.grad.asnumpy(), grad_A_expected, rtol=rtol, atol=atol)
+            assert_almost_equal(a.grad, grad_A_expected)
 
     # check imperative once again
     mx_out = np.linalg.tensorinv(a, ind)
@@ -6343,7 +6343,7 @@ def test_np_linalg_tensorsolve():
             print(e)
         else:
             assert x.shape == x_expected.shape
-            assert_almost_equal(x.asnumpy(), x_expected, rtol=rtol, atol=atol)
+            assert_almost_equal(x, x_expected)
 
     def shapeInfer(a_shape, b_shape, axes=None):
         # b_shape - Right-hand tensor shape, which can be of any shape.
@@ -6405,8 +6405,6 @@ def test_np_linalg_tensorsolve():
     for hybridize in [True, False]:
         for dtype in dtypes:
             for a_shape, b_shape, axes in shapes:
-                rtol = 1e-2 if dtype == 'float32' else 1e-3
-                atol = 1e-4 if dtype == 'float32' else 1e-5
                 test_tensorsolve = TestTensorsolve(axes)
                 if hybridize:
                     test_tensorsolve.hybridize()
@@ -6443,8 +6441,8 @@ def test_np_linalg_tensorsolve():
                     mx.autograd.backward(mx_out)
                     grad_a_expected, grad_b_expected = get_tensorsolve_backward(
                         a.asnumpy(), b.asnumpy(), mx_out.asnumpy(), a_axes, a_origin_axes, a_trans_shape)
-                    assert_almost_equal(a.grad.asnumpy(), grad_a_expected, rtol=rtol, atol=atol)
-                    assert_almost_equal(b.grad.asnumpy(), grad_b_expected, rtol=rtol, atol=atol)
+                    assert_almost_equal(a.grad, grad_a_expected)
+                    assert_almost_equal(b.grad, grad_b_expected)
 
                 # check imperative once again
                 mx_out = test_tensorsolve(a, b)
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 1578e14..344cb72 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -458,21 +458,26 @@ def test_symbol_pow():
 
 @with_seed()
 def test_fully_connected():
+    # Create data of given shape as a uniform distribution centered on 0.0
+    def random_data(shape, dtype=np.float32):
+        return mx.nd.random.uniform(low=-0.5,
+                                    high=0.5, shape=shape, dtype=dtype)
     data = mx.sym.var("data")
     fc_weight = mx.sym.var("weight")
     fc_bias = mx.sym.var("bias")
     fc = mx.sym.FullyConnected(data=data, weight=fc_weight, bias=fc_bias, num_hidden=10, no_bias=False, name='fc')
-    data = mx.nd.random.uniform(shape=(5, 5, 5, 13), dtype=np.float32)
-    fc_weight = mx.nd.random.uniform(shape=(10, 325), dtype=np.float32)
-    fc_bias = mx.nd.random.uniform(shape=(10), dtype=np.float32)
-    fc_bias2 = mx.nd.random.uniform(shape=(10, 1), dtype=np.float32)
+
+    data = random_data(shape=(5, 5, 5, 13))
+    fc_weight = random_data(shape=(10, 325))
+    fc_bias = random_data(shape=(10))
+    fc_bias2 = random_data(shape=(10, 1))
+
     data_np = data.asnumpy().reshape(5, 325)
     fc_weight_np = np.transpose(fc_weight.asnumpy())
     fc_bias_np = fc_bias.asnumpy()
     res = np.dot(data_np, fc_weight_np) + fc_bias.asnumpy()
     check_symbolic_forward(fc, {'data': data_np, 'weight': fc_weight.asnumpy(), 'bias': fc_bias_np}, {'fc_output': res})
-    check_numeric_gradient(fc, {'data': data_np, 'weight': fc_weight.asnumpy(), 'bias': fc_bias_np},
-                           numeric_eps=1e-2, rtol=1e-4, atol=1e-2)
+    check_numeric_gradient(fc, {'data': data_np, 'weight': fc_weight.asnumpy(), 'bias': fc_bias_np})
     # TODO: Fix Bug #15032 when bias has ndim > 1
     #check_symbolic_forward(fc, {'data': data_np, 'weight': fc_weight.asnumpy(), 'bias': fc_bias2.asnumpy()}, {'fc_output': res})
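
The random_data helper keeps the FullyConnected inputs in [-0.5, 0.5] and centered on 0,
so the 325-term dot products stay O(1) instead of O(100) and the default tolerances of
check_numeric_gradient are sufficient. A rough illustration of the difference in output
magnitude:

    import numpy as np

    rng = np.random.RandomState(0)
    k = 325                                     # fan-in of the fc layer above (5*5*13)
    pos = rng.uniform(0.0, 1.0, size=(2, k))    # old-style positive inputs
    ctr = rng.uniform(-0.5, 0.5, size=(2, k))   # new centered inputs
    print(np.dot(pos[0], pos[1]))               # about k/4, i.e. roughly 80
    print(np.dot(ctr[0], ctr[1]))               # near zero, typically a few units
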
 
@@ -1544,15 +1549,18 @@ def test_batchnorm(op_name, shape, fix_gamma, cudnn_off, output_mean_var):
 
             running_mean = running_mean * momentum + \
                 data_mean_flat * (1 - momentum)
+
+            m = np.prod(shape) / shape[axis]
+            # cudnn uses m-1 in the denominator of its sample variance calculation, not m
+            sample_var_adjust = 1.0 if cudnn_off or fix_gamma else m / (m-1)
             running_var = running_var * momentum + \
-                data_var_flat * (1 - momentum)
+                data_var_flat * sample_var_adjust * (1 - momentum)
 
             W = bn_gamma.reshape(expand_shape)
             dnx = ograd * W
             xsm = data - data_mean
             nd = 1.0 / mx.nd.sqrt(data_var + epsilon)
             nx = xsm * nd
-            m = np.prod(shape) / shape[axis]
             dvar = (dnx * xsm).sum(axis=axis, keepdims=True,
                                    exclude=True) * (-0.5) * mx.nd.power(nd, 3)
             dmean = -nd * dnx.sum(axis=axis, keepdims=True, exclude=True) - \
@@ -2478,13 +2486,13 @@ def test_reduce():
                          args_grad={'a': grad_nd})
             net.forward(is_train=True)
 
-            equal_forward = almost_equal_ignore_nan(net.outputs[0].asnumpy(), sum_groundtruth, 1E-4, 1E-4)
-            assert equal_forward
+            # check forward
+            assert_almost_equal_ignore_nan(net.outputs[0].asnumpy(), sum_groundtruth, rtol=1e-4, atol=1e-4)
 
             net.backward(out_grads=mx.nd.array(outgrad_npy))
             bc_grad_groundtruth = np.broadcast_to(grad_groundtruth, grad_nd.shape)
-            equal_backward = almost_equal_ignore_nan(grad_nd.asnumpy(), bc_grad_groundtruth, 1E-4, 1E-4)
-            assert equal_backward
+            # check backward
+            assert_almost_equal_ignore_nan(grad_nd.asnumpy(), bc_grad_groundtruth, rtol=1e-4, atol=1e-4)
 
     test_none_axis = [True, False]
     for test_none in test_none_axis:
@@ -4074,7 +4082,7 @@ def test_order():
                 out_npy = gt_topk(dat=a_npy, axis=axis, ret_typ="value", k=a_npy.size, is_ascend=is_ascend)
             else:
                 out_npy = gt_topk(dat=a_npy, axis=axis, ret_typ="value", k=5, is_ascend=is_ascend)
-            check_numeric_gradient(b, location={'a': a_npy}, numeric_eps=1e-2, ctx=ctx)
+            check_numeric_gradient(b, location={'a': a_npy}, numeric_eps=1e-2, rtol=1e-2, ctx=ctx)
             check_symbolic_forward(b, location={'a': a_npy}, expected=[out_npy])
 
     b = mx.sym.topk(a, axis=1, is_ascend=is_ascend, ret_typ="indices", k=5)
@@ -4122,7 +4130,7 @@ def test_order():
                 for is_ascend in [True, False]:
                     b = mx.sym.topk(a, axis=axis, is_ascend=is_ascend, ret_typ="value", k=k)
                     out_npy = gt_topk(dat=a_npy, axis=axis, ret_typ="value", k=k, is_ascend=is_ascend)
-                    check_numeric_gradient(b, location={'a': a_npy}, numeric_eps=1e-2, ctx=ctx)
+                    check_numeric_gradient(b, location={'a': a_npy}, numeric_eps=1e-2, rtol=1e-2, ctx=ctx)
                     check_symbolic_forward(b, location={'a': a_npy}, expected=[out_npy])
 
         b = mx.sym.topk(a, axis=1, is_ascend=is_ascend, ret_typ="indices", k=5)
@@ -4285,7 +4293,7 @@ def test_grid_generator():
         # check forward
         exe.arg_dict['affine'][:] = np.array([[1.0,0,0,0,1.0,0]])
         exe.forward(is_train=True)
-        output = exe.outputs[0].asnumpy()
+        output = exe.outputs[0]
         output[0,0,:,:] = (output[0,0,:,:] + 1) * (target_shape[1] - 1) / 2.0
         output[0,1,:,:] = (output[0,1,:,:] + 1) * (target_shape[0] - 1) / 2.0
         xv, yv = np.meshgrid(np.arange(target_shape[0]), np.arange(target_shape[1]))
@@ -4300,7 +4308,7 @@ def test_grid_generator():
         tmp[1] = -1.0 + (np.arange(target_shape[0]*target_shape[1]) // target_shape[1]) * (2.0 / (target_shape[0]-1))
         tmp[2] = 1
         grad_est = np.dot(out_grad[0].reshape(2,target_shape[0]*target_shape[1]),tmp.T).reshape(1,6)
-        assert_almost_equal(exe.grad_dict['affine'], grad_est, rtol=1e-3, atol=1e-5)
+        assert_almost_equal(exe.grad_dict['affine'], grad_est)
         # check addto
         exe = grid._simple_bind(ctx=default_context(), affine=(1,6), grad_req='add')
         grid_grad_npy = np.random.normal(size=exe.grad_dict['affine'].shape)
@@ -4308,7 +4316,7 @@ def test_grid_generator():
         exe.arg_dict['affine'][:] = np.array([[1.0, 0, 0, 0, 1.0, 0]])
         exe.forward(is_train=True)
         exe.backward(mx.nd.array(out_grad))
-        assert_almost_equal(exe.grad_dict['affine'], grad_est + grid_grad_npy, rtol=1e-2, atol=1e-5)
+        assert_almost_equal(exe.grad_dict['affine'], grad_est + grid_grad_npy)
 
     # transform_type = warp
     test_case = [(12,21),(4,3),(6,12)]
@@ -5354,51 +5362,62 @@ def test_div_sqrt_dim():
     check_symbolic_forward(test, [data_tmp], [data_tmp / np.sqrt(data_tmp.shape[-1])])
 
 
+# helper function to identify inputs likely to fail the check_numeric_gradient tol test
+# due to finite difference method inaccuracies or function discontinuities at the origin
+def bad_input_finder(f, f_grad, dtype):
+    eps = default_numeric_eps()[np.dtype(dtype)]
+    rtol = default_rtols()[np.dtype(dtype)]
+    def expected_relative_error(x):
+        fd_gradient = (f(x+eps/2) - f(x-eps/2)) / eps
+        return abs(fd_gradient/f_grad(x) - 1)
+    def is_fd_problem_input(x):
+        return abs(x) < eps/2 or expected_relative_error(x) > rtol
+    return np.vectorize(is_fd_problem_input)
+
 @with_seed()
 def test_reciprocal_op():
-    eps = 2**(-11)
-    data_tmp = np.random.rand(3, 4) * 10 - 5
-    # Avoid possible division by 0 errors and finite difference method inaccuracies.
-    # Factor of 6 below set empirically, depends on eps.
-    # Issue exposed by seed 879579887.
-    # Replace problematic inputs with 1.0.
-    data_tmp[abs(data_tmp) < 6*eps] = 1.0
+    data_tmp = np.random.rand(3, 4).astype(np.float32) * 10 - 5
+
+    # Avoid possible division by 0 errors and finite difference method
+    # inaccuracies by replacing problem inputs with 1.0.
+    is_bad_input = bad_input_finder(np.reciprocal,
+                                    lambda x: -np.reciprocal(x)**2, np.float32)
+    data_tmp[is_bad_input(data_tmp)] = 1.0
     data = mx.symbol.Variable('data')
     test = mx.sym.reciprocal(data)
 
-    check_numeric_gradient(test, [data_tmp], numeric_eps = eps)
+    check_numeric_gradient(test, [data_tmp])
     check_symbolic_forward(test, [data_tmp], [np.reciprocal(data_tmp)])
 
 
 @with_seed()
 def test_cbrt_op():
-    eps = 2**(-11)
-    data_tmp = np.random.rand(3, 4) * 10 - 5
-    # Avoid finite difference method inaccuracies due to infinite gradient at the origin.
-    # Factor of 4 below set empirically, depends on eps.
-    # Issue exposed by seed 553872106.
-    # Replace problematic inputs with 1.0.
-    data_tmp[abs(data_tmp) < 4*eps] = 1.0
+    data_tmp = np.random.rand(3, 4).astype(np.float32) * 10 - 5
+
+    # Avoid the infinite gradient at the origin and finite difference method
+    # inaccuracies by replacing problem inputs with 1.0.
+    is_bad_input = bad_input_finder(np.cbrt,
+                                    lambda x: 1./(3 * np.cbrt(x)**2), np.float32)
+    data_tmp[is_bad_input(data_tmp)] = 1.0
     data = mx.symbol.Variable('data')
     test = mx.sym.cbrt(data)
-
-    check_numeric_gradient(test, [data_tmp], numeric_eps=eps)
+    check_numeric_gradient(test, [data_tmp])
     check_symbolic_forward(test, [data_tmp], [np.cbrt(data_tmp)])
 
 
 @with_seed()
 def test_rcbrt_op():
-    eps = 2**(-11)
-    data_tmp = np.random.rand(3, 4) * 10 - 5
-    # Avoid possible division by 0 errors and finite difference method inaccuracies.
-    # Factor of 4 below set empirically, depends on eps.
-    # Issue exposed by seed 788174893.
-    # Replace problematic inputs with 1.0.
-    data_tmp[abs(data_tmp) < 4*eps] = 1.0
+    data_tmp = np.random.rand(3, 4).astype(np.float32) * 10 - 5
+
+    # Avoid possible division by 0 errors and finite difference method
+    # inaccuracies by replacing problem inputs with 1.0.
+    is_bad_input = bad_input_finder(lambda x: 1./np.cbrt(x),
+                                    lambda x: -1./(3 * np.cbrt(x)**4), np.float32)
+    data_tmp[is_bad_input(data_tmp)] = 1.0
     data = mx.symbol.Variable('data')
     test = mx.sym.rcbrt(data)
 
-    check_numeric_gradient(test, [data_tmp], numeric_eps = eps)
+    check_numeric_gradient(test, [data_tmp])
     check_symbolic_forward(test, [data_tmp], [1/np.cbrt(data_tmp)])
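
bad_input_finder compares the central finite difference against the analytic gradient
and flags inputs that sit within eps/2 of zero or whose expected relative error exceeds
the dtype's default rtol; for 1/x and the cbrt gradients that error grows quickly near
the origin, where the derivative changes rapidly over the step eps. A small numeric
illustration (eps here is illustrative, standing in for default_numeric_eps()):

    import numpy as np

    def central_diff(f, x, eps):
        return (f(x + eps / 2) - f(x - eps / 2)) / eps

    f, fgrad = np.reciprocal, lambda x: -1.0 / x**2
    eps = 1e-2
    for x in (2.0, 0.1, 0.02):
        fd = central_diff(f, x, eps)
        print(x, abs(fd / fgrad(x) - 1))   # relative error climbs as x -> 0
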
 
 
@@ -5807,7 +5826,7 @@ def test_deformable_convolution():
                         # By now we only have gpu implementation
                         if default_context().device_type == 'gpu':
                             check_numeric_gradient(op, [im_data, offset_data, weight, bias], rtol=rtol, atol=atol,
-                                                   grad_nodes=grad_nodes, ctx=mx.gpu(0))
+                                                   grad_nodes=grad_nodes, ctx=mx.gpu(0), numeric_eps=1.0/64)
 
 
 def _validate_sample_location(input_rois, input_offset, spatial_scale, pooled_w, pooled_h, sample_per_part, part_size, output_dim, num_classes, trans_std, feat_h, feat_w):
@@ -5900,10 +5919,11 @@ def test_deformable_psroipooling():
                                                grad_nodes=grad_nodes, ctx=mx.gpu(0))
 
 
-def _gemm_test_helper(dtype, grad_check, rtol_fw = 1e-7, atol_fw = 1e-9):
-    num_eps = 1e-6
-    rtol_bw = 1e-5
-    atol_bw = 1e-6
+def _gemm_test_helper(dtype, grad_check, rtol_fw = None, atol_fw = None,
+                                         rtol_bw = None, atol_bw = None, num_eps = None):
+    def np_random_data(shape, dtype=np.float32):
+        return np.random.uniform(low=-0.5,
+                                 high=0.5, size=shape).astype(dtype)
 
     data1 = mx.symbol.Variable('data1')
     data2 = mx.symbol.Variable('data2')
@@ -5922,10 +5942,10 @@ def _gemm_test_helper(dtype, grad_check, rtol_fw = 1e-7, atol_fw = 1e-9):
     shape2 = (3, 2)
     shape3 = (3, 3)
     shape4 = (2, 2)
-    data_in1 = np.random.uniform(1, 10, shape1).astype(dtype)
-    data_in2 = np.random.uniform(1, 10, shape2).astype(dtype)
-    data_in3 = np.random.uniform(1, 10, shape3).astype(dtype)
-    data_in4 = np.random.uniform(1, 10, shape4).astype(dtype)
+    data_in1 = np_random_data(shape1, dtype)
+    data_in2 = np_random_data(shape2, dtype)
+    data_in3 = np_random_data(shape3, dtype)
+    data_in4 = np_random_data(shape4, dtype)
     # Check all transpositions of gemm operator.
     data_in1_t = np.transpose(data_in1)
     data_in2_t = np.transpose(data_in2)
@@ -6032,10 +6052,10 @@ def _gemm_test_helper(dtype, grad_check, rtol_fw = 1e-7, atol_fw = 1e-9):
 def test_gemm():
     _gemm_test_helper(np.float64, True)
     os.environ["MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION"] = "0"
-    _gemm_test_helper(np.float32, False, rtol_fw = 1e-5, atol_fw = 1e-7)
+    _gemm_test_helper(np.float32, True)
     if default_context().device_type == 'gpu':
         os.environ["MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION"] = "1"
-        _gemm_test_helper(np.float32, False, rtol_fw = 2e-5, atol_fw = 2e-7)
+        _gemm_test_helper(np.float32, True)
         os.environ["MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION"] = "0"
 
 # Helper functions for test_laop
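
test_gemm now runs the float32 path with gradient checking enabled under both TensorOp
math settings, while _gemm_test_helper defaults its tolerances and numeric_eps to None,
presumably so suitable defaults are chosen downstream. A sketch of the env-var toggle
used above (the "1" setting is exercised on GPU only):

    import os
    import numpy as np

    os.environ["MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION"] = "1"
    try:
        _gemm_test_helper(np.float32, True)   # grad_check=True, default tolerances
    finally:
        os.environ["MXNET_CUDA_TENSOR_OP_MATH_ALLOW_CONVERSION"] = "0"
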
diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py
index dc07201..8bc086e 100644
--- a/tests/python/unittest/test_sparse_operator.py
+++ b/tests/python/unittest/test_sparse_operator.py
@@ -1598,6 +1598,7 @@ def test_sparse_axis_operations():
 
 
 @with_seed()
+@pytest.mark.serial
 def test_sparse_square_sum():
     dim0 = 30
     dim1 = 30