You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mxnet.apache.org by jx...@apache.org on 2017/12/28 20:27:30 UTC
[incubator-mxnet] branch master updated: fix random generator: do not gen seed each time (#9119)

This is an automated email from the ASF dual-hosted git repository.

jxie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new 34a5195  fix random generator: do not gen seed each time (#9119)
34a5195 is described below

commit 34a51959bd2bc21c6cfa93f5fe0e079ef5268261
Author: Yizhi Liu <ja...@gmail.com>
AuthorDate: Thu Dec 28 12:27:25 2017 -0800

    fix random generator: do not gen seed each time (#9119)
    
    * add tests for distribution generators
    
    fix lint
    
    fix lint
    
    fix typo
    
    fix docstring
    
    fix docstring
    
    * [Bugfix] fix random generator: do not gen seed each time
    
    * gen samplers on gpu for test_softmax
    
    * fix test cases
    
    * remove unnecessary prints
    
    * refactor RandGenerator
    
    * get_native_random -> get_parallel_random
    
    * revise test cases + remove dependency of scipy
    
    * raise warning
---
 amalgamation/amalgamation.py            |   2 +-
 include/mxnet/resource.h                |  18 +-
 include/mxnet/storage.h                 |   2 +-
 perl-package/AI-MXNet/t/test_random.t   |   2 +-
 python/mxnet/optimizer.py               |   3 +-
 python/mxnet/test_utils.py              | 226 ++++++++++++++++++++++++
 src/common/random_generator.cu          |  56 ++++++
 src/common/random_generator.h           | 219 ++++++++++++++++++++++++
 src/common/utils.h                      |   1 -
 src/executor/attach_op_resource_pass.cc |   2 +
 src/imperative/imperative_utils.h       |   4 +
 src/operator/random/multisample_op.cc   |   3 +-
 src/operator/random/multisample_op.h    |  23 ++-
 src/operator/random/sample_op.h         |  63 +++----
 src/operator/random/sampler.h           | 293 ++++++++++++++------------------
 src/resource.cc                         |  96 ++++++++++-
 tests/python/unittest/test_module.py    |   2 +-
 tests/python/unittest/test_operator.py  |   4 +-
 tests/python/unittest/test_optimizer.py |   2 +-
 tests/python/unittest/test_random.py    | 122 +++++++++++++
 20 files changed, 919 insertions(+), 224 deletions(-)

diff --git a/amalgamation/amalgamation.py b/amalgamation/amalgamation.py
index 9419898..f1e1e02 100644
--- a/amalgamation/amalgamation.py
+++ b/amalgamation/amalgamation.py
@@ -21,7 +21,7 @@ import platform
 
 blacklist = [
     'Windows.h', 'cublas_v2.h', 'cuda/tensor_gpu-inl.cuh',
-    'cuda_runtime.h', 'cudnn.h', 'cudnn_lrn-inl.h', 'curand.h',
+    'cuda_runtime.h', 'cudnn.h', 'cudnn_lrn-inl.h', 'curand.h', 'curand_kernel.h',
     'glog/logging.h', 'io/azure_filesys.h', 'io/hdfs_filesys.h', 'io/s3_filesys.h',
     'kvstore_dist.h', 'mach/clock.h', 'mach/mach.h',
     'malloc.h', 'mkl.h', 'mkl_cblas.h', 'mkl_vsl.h', 'mkl_vsl_functions.h',
diff --git a/include/mxnet/resource.h b/include/mxnet/resource.h
index 7d2e6ca..773baf0 100644
--- a/include/mxnet/resource.h
+++ b/include/mxnet/resource.h
@@ -28,6 +28,7 @@
 #include <dmlc/logging.h>
 #include "./base.h"
 #include "./engine.h"
+#include "../../src/common/random_generator.h"
 
 namespace mxnet {
 
@@ -40,7 +41,9 @@ struct ResourceRequest {
     /*! \brief mshadow::Random<xpu> object */
     kRandom,
     /*! \brief A dynamic temp space that can be arbitrary size */
-    kTempSpace
+    kTempSpace,
+    /*! \brief common::RandGenerator<xpu> object, which can be used in GPU kernel functions */
+    kParallelRandom
   };
   /*! \brief type of resources */
   Type type;
@@ -89,6 +92,19 @@ struct Resource {
     ret->set_stream(stream);
     return ret;
   }
+
+  /*!
+   * \brief Get parallel random number generator.
+   * \tparam xpu the device type of random number generator.
+   * \tparam DType the return type.
+   * \return the native random number generator. for gpu, it is allocated on global memory.
+   */
+  template<typename xpu, typename DType>
+  inline common::random::RandGenerator<xpu, DType>* get_parallel_random() const {
+    CHECK_EQ(req.type, ResourceRequest::kParallelRandom);
+    return static_cast<common::random::RandGenerator<xpu, DType>*>(ptr_);
+  }
+
   /*!
    * \brief Get space requested as mshadow Tensor.
    *  The caller can request arbitrary size.
diff --git a/include/mxnet/storage.h b/include/mxnet/storage.h
index d19f98b..a8481c1 100644
--- a/include/mxnet/storage.h
+++ b/include/mxnet/storage.h
@@ -82,7 +82,7 @@ class Storage {
   virtual void SharedIncrementRefCount(Handle handle) = 0;
   /*!
    * \brief Free storage.
-   * \param handle Handle struect.
+   * \param handle Handle struct.
    */
   virtual void Free(Handle handle) = 0;
   /*!
diff --git a/perl-package/AI-MXNet/t/test_random.t b/perl-package/AI-MXNet/t/test_random.t
index c95a199..60cebcf 100644
--- a/perl-package/AI-MXNet/t/test_random.t
+++ b/perl-package/AI-MXNet/t/test_random.t
@@ -87,7 +87,7 @@ sub check_with_device
             ]
         },
     );
-    my $shape = [100, 100];
+    my $shape = [1000, 1000];
     for my $symbdic (@symbols)
     {
         my $name = $symbdic->{name};
diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index 7e8e7c2..aebb52e 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -648,7 +648,8 @@ class SGLD(Optimizer):
         if self.clip_gradient is not None:
             grad = clip(grad, -self.clip_gradient, self.clip_gradient)
         weight[:] += - lr/2 * (grad + wd * weight) + normal(0, math.sqrt(lr),
-                                                            weight.shape, weight.context)
+                                                            shape=weight.shape,
+                                                            ctx=weight.context)
 
 
 @register  # pylint: disable=invalid-name
diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py
index 53814b7..58bc8d3 100644
--- a/python/mxnet/test_utils.py
+++ b/python/mxnet/test_utils.py
@@ -35,6 +35,10 @@ import numpy as np
 import numpy.testing as npt
 import numpy.random as rnd
 try:
+    import scipy.stats as ss
+except ImportError:
+    ss = None
+try:
     import requests
 except ImportError:
     # in rare cases requests may be not installed
@@ -1593,3 +1597,225 @@ class DummyIter(mx.io.DataIter):
             The data of next batch.
         """
         return self.the_batch
+
+def gen_buckets_probs_with_ppf(ppf, nbuckets):
+    """Generate the buckets and probabilities for chi_square test when the ppf (Quantile function)
+     is specified.
+
+    Parameters
+    ----------
+    ppf : function
+        The Quantile function that takes a probability and maps it back to a value.
+        It's the inverse of the cdf function
+    nbuckets : int
+        size of the buckets
+
+    Returns
+    -------
+    buckets : list of tuple
+        The generated buckets
+    probs : list
+        The generate probabilities
+    """
+    assert nbuckets > 0
+    probs = [1.0 / nbuckets for _ in range(nbuckets)]
+    buckets = [(ppf(i / float(nbuckets)), ppf((i + 1) / float(nbuckets))) for i in range(nbuckets)]
+    return buckets, probs
+
+def mean_check(generator, mu, sigma, nsamples=1000000):
+    """Test the generator by matching the mean.
+
+    We test the sample mean by checking if it falls inside the range
+        (mu - 3 * sigma / sqrt(n), mu + 3 * sigma / sqrt(n))
+
+    References::
+
+        @incollection{goucher2009beautiful,
+              title={Beautiful Testing: Leading Professionals Reveal How They Improve Software},
+              author={Goucher, Adam and Riley, Tim},
+              year={2009},
+              chapter=10
+        }
+
+    Examples::
+
+        generator = lambda x: np.random.normal(0, 1.0, size=x)
+        mean_check_ret = mean_check(generator, 0, 1.0)
+
+    Parameters
+    ----------
+    generator : function
+        The generator function. It's expected to generate N i.i.d samples by calling generator(N).
+    mu : float
+    sigma : float
+    nsamples : int
+
+    Returns
+    -------
+    ret : bool
+        Whether the mean test succeeds
+    """
+    samples = np.array(generator(nsamples))
+    sample_mean = samples.mean()
+    ret = (sample_mean > mu - 3 * sigma / np.sqrt(nsamples)) and\
+          (sample_mean < mu + 3 * sigma / np.sqrt(nsamples))
+    return ret
+
+def var_check(generator, sigma, nsamples=1000000):
+    """Test the generator by matching the variance.
+    It will need a large number of samples and is not recommended to use
+
+    We test the sample variance by checking if it falls inside the range
+        (sigma^2 - 3 * sqrt(2 * sigma^4 / (n-1)), sigma^2 + 3 * sqrt(2 * sigma^4 / (n-1)))
+
+    References::
+
+        @incollection{goucher2009beautiful,
+              title={Beautiful Testing: Leading Professionals Reveal How They Improve Software},
+              author={Goucher, Adam and Riley, Tim},
+              year={2009},
+              chapter=10
+        }
+
+    Examples::
+
+        generator = lambda x: np.random.normal(0, 1.0, size=x)
+        var_check_ret = var_check(generator, 0, 1.0)
+
+    Parameters
+    ----------
+    generator : function
+        The generator function. It's expected to generate N i.i.d samples by calling generator(N).
+    sigma : float
+    nsamples : int
+
+    Returns
+    -------
+    ret : bool
+        Whether the variance test succeeds
+    """
+    samples = np.array(generator(nsamples))
+    sample_var = samples.var(ddof=1)
+    ret = (sample_var > sigma ** 2 - 3 * np.sqrt(2 * sigma ** 4 / (nsamples - 1))) and\
+          (sample_var < sigma ** 2 + 3 * np.sqrt(2 * sigma ** 4 / (nsamples - 1)))
+    return ret
+
+def chi_square_check(generator, buckets, probs, nsamples=1000000):
+    """Run the chi-square test for the generator. The generator can be both continuous and discrete.
+    If the generator is continuous, the buckets should contain tuples of (range_min, range_max) and
+     the probs should be the corresponding ideal probability within the specific ranges.
+    Otherwise, the buckets should be the possible output of the discrete distribution and the probs
+     should be groud-truth probability.
+
+    Usually the user is required to specify the probs parameter.
+
+    After obtatining the p value, we could further use the standard p > 0.05 threshold to get
+     the final result.
+
+    Examples::
+        buckets, probs = gen_buckets_probs_with_ppf(lambda x: ss.norm.ppf(x, 0, 1), 5)
+        generator = lambda x: np.random.normal(0, 1.0, size=x)
+        p = chi_square_check(generator=generator, buckets=buckets, probs=probs)
+        assert(p > 0.05)
+
+    Parameters
+    ----------
+    generator: function
+        A function that is assumed to generate i.i.d samples from a specific distribution.
+        generator(N) should generate N random samples.
+    buckets: list of tuple or list of number
+        The buckets to run the chi-square the test. Make sure that the buckets cover
+         the whole range of the distribution. Also, the buckets must be in ascending order and have
+         no intersection
+    probs: list or tuple
+        The ground-truth probability of the random value fall in a specific bucket.
+    nsamples:int
+        The number of samples to generate for the testing
+
+    Returns
+    -------
+    p : float
+        p value that the generator has the expected distribution.
+        A higher value indicates a larger confidence
+    obs_freq : list
+        Observed frequency of buckets
+    expected_freq : list
+        The expected (ground-truth) frequency of the buckets
+    """
+    if not ss:
+        raise ImportError("scipy is not available."
+                          " Please check if the scipy python bindings are installed.")
+    assert isinstance(buckets, list)
+    samples = generator(nsamples)
+    assert len(probs) == len(buckets)
+    if isinstance(buckets[0], (list, tuple)):
+        # Check whether the buckets are valid and fill them into a npy array
+        continuous_dist = True
+        buckets_npy = np.zeros((len(buckets) * 2, ), dtype=np.float32)
+        for i, _ in enumerate(buckets):
+            assert(buckets[i][0] <= buckets[i][1])
+            if i < len(buckets) - 1:
+                assert(buckets[i][1] <= buckets[i + 1][0])
+            buckets_npy[i * 2] = buckets[i][0]
+            buckets_npy[i * 2 + 1] = buckets[i][1]
+    else:
+        continuous_dist = False
+        buckets_npy = np.array(buckets)
+    expected_freq = (nsamples * np.array(probs, dtype=np.float32)).astype(np.int32)
+    if continuous_dist:
+        sample_bucket_ids = np.searchsorted(buckets_npy, samples, side='right')
+    else:
+        sample_bucket_ids = samples
+    if continuous_dist:
+        sample_bucket_ids = sample_bucket_ids // 2
+    obs_freq = np.zeros(shape=len(buckets), dtype=np.int)
+    for i in range(len(buckets)):
+        obs_freq[i] = (sample_bucket_ids == i).sum()
+    _, p = ss.chisquare(f_obs=obs_freq, f_exp=expected_freq)
+    return p, obs_freq, expected_freq
+
+def verify_generator(generator, buckets, probs, nsamples=1000000, nrepeat=5, success_rate=0.25):
+    """Verify whether the generator is correct using chi-square testing.
+
+    The test is repeated for "nrepeat" times and we check if the success rate is
+     above the threshold (25% by default).
+
+    Parameters
+    ----------
+    generator: function
+        A function that is assumed to generate i.i.d samples from a specific distribution.
+            generator(N) should generate N random samples.
+    buckets: list of tuple or list of number
+        The buckets to run the chi-square the test. Make sure that the buckets cover
+         the whole range of the distribution. Also, the buckets must be in ascending order and
+         have no intersection
+    probs: list or tuple
+        The ground-truth probability of the random value fall in a specific bucket.
+    nsamples: int
+        The number of samples to generate for the testing
+    nrepeat: int
+        The times to repeat the test
+    success_rate: float
+        The desired success rate
+
+    Returns
+    -------
+    cs_ret_l: list
+        The p values of the chi-square test.
+    """
+    cs_ret_l = []
+    obs_freq_l = []
+    expected_freq_l = []
+    for _ in range(nrepeat):
+        cs_ret, obs_freq, expected_freq = chi_square_check(generator=generator, buckets=buckets,
+                                                           probs=probs, nsamples=nsamples)
+        cs_ret_l.append(cs_ret)
+        obs_freq_l.append(obs_freq)
+        expected_freq_l.append(expected_freq)
+    success_num = (np.array(cs_ret_l) > 0.05).sum()
+    if success_num < nrepeat * success_rate:
+        raise AssertionError("Generator test fails, Chi-square p=%s, obs_freq=%s, expected_freq=%s."
+                             "\nbuckets=%s, probs=%s"
+                             % (str(cs_ret_l), str(obs_freq_l), str(expected_freq_l),
+                                str(buckets), str(probs)))
+    return cs_ret_l
diff --git a/src/common/random_generator.cu b/src/common/random_generator.cu
new file mode 100644
index 0000000..5f6ac44
--- /dev/null
+++ b/src/common/random_generator.cu
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2017 by Contributors
+ * \file random_generator.cu
+ * \brief gpu implements for parallel random number generator.
+ */
+
+#include <algorithm>
+#include "./random_generator.h"
+#include "../operator/mxnet_op.h"
+
+namespace mxnet {
+namespace common {
+namespace random {
+
+__global__ void rand_generator_seed_kernel(curandStatePhilox4_32_10_t *states_,
+                                           const int size,
+                                           uint32_t seed) {
+  int id = blockIdx.x * blockDim.x + threadIdx.x;
+  if (id < size) curand_init(seed, id, 0, states_ + id);
+}
+
+template<>
+void RandGenerator<gpu, float>::Seed(Stream<gpu> *s, uint32_t seed) {
+  using namespace mshadow::cuda;
+  int ngrid = std::min(kMaxGridNum,
+                       (RandGenerator<gpu, float>::kNumRandomStates + kBaseThreadNum - 1) /
+                         kBaseThreadNum);
+  rand_generator_seed_kernel
+      <<<ngrid, kBaseThreadNum, 0, Stream<gpu>::GetStream(s)>>>(
+          states_,
+          RandGenerator<gpu, float>::kNumRandomStates,
+          seed);
+}
+
+}  // namespace random
+}  // namespace common
+}  // namespace mxnet
diff --git a/src/common/random_generator.h b/src/common/random_generator.h
new file mode 100644
index 0000000..21db9d7
--- /dev/null
+++ b/src/common/random_generator.h
@@ -0,0 +1,219 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2017 by Contributors
+ * \file random_generator.h
+ * \brief Parallel random number generator.
+ */
+#ifndef MXNET_COMMON_RANDOM_GENERATOR_H_
+#define MXNET_COMMON_RANDOM_GENERATOR_H_
+
+#include <mxnet/base.h>
+#include <random>
+#include <new>
+
+#if MXNET_USE_CUDA
+#include <curand_kernel.h>
+#include "../common/cuda_utils.h"
+#endif  // MXNET_USE_CUDA
+
+using namespace mshadow;
+
+namespace mxnet {
+namespace common {
+namespace random {
+
+template<typename Device, typename DType MSHADOW_DEFAULT_DTYPE>
+class RandGenerator;
+
+template<typename DType>
+class RandGenerator<cpu, DType> {
+ public:
+  // at least how many random numbers should be generated by one CPU thread.
+  static const int kMinNumRandomPerThread = 64;
+  // store how many global random states for CPU.
+  static const int kNumRandomStates = 1024;
+
+  // implementation class for random number generator
+  class Impl {
+   public:
+    typedef typename std::conditional<std::is_floating_point<DType>::value,
+                                      DType, double>::type FType;
+
+    explicit Impl(RandGenerator<cpu, DType> *gen, int state_idx)
+        : engine_(gen->states_ + state_idx) {}
+
+    Impl(const Impl &) = delete;
+    Impl &operator=(const Impl &) = delete;
+
+    MSHADOW_XINLINE int rand() { return engine_->operator()(); }
+
+    MSHADOW_XINLINE FType uniform() {
+      typedef typename std::conditional<std::is_integral<DType>::value,
+      std::uniform_int_distribution<DType>,
+      std::uniform_real_distribution<FType>>::type GType;
+      GType dist_uniform;
+      return dist_uniform(*engine_);
+    }
+
+    MSHADOW_XINLINE FType normal() {
+      std::normal_distribution<FType> dist_normal;
+      return dist_normal(*engine_);
+    }
+
+   private:
+    std::mt19937 *engine_;
+  };
+
+  static void AllocState(RandGenerator<cpu, DType> *inst) {
+    inst->states_ = new std::mt19937[kNumRandomStates];
+  }
+
+  static void FreeState(RandGenerator<cpu, DType> *inst) {
+    delete[] inst->states_;
+  }
+
+  MSHADOW_XINLINE void Seed(Stream<cpu> *, uint32_t seed) {
+    for (int i = 0; i < kNumRandomStates; ++i) (states_ + i)->seed(seed + i);
+  }
+
+ private:
+  std::mt19937 *states_;
+};
+
+#if MXNET_USE_CUDA
+
+template<typename DType>
+class RandGenerator<gpu, DType> {
+ public:
+  // at least how many random numbers should be generated by one GPU thread.
+  static const int kMinNumRandomPerThread = 64;
+  // store how many global random states for GPU.
+  static const int kNumRandomStates = 32768;
+
+  // uniform number generation in Cuda made consistent with stl (include 0 but exclude 1)
+  // by using 1.0-curand_uniform().
+  // Needed as some samplers in sampler.h won't be able to deal with
+  // one of the boundary cases.
+  class Impl {
+   public:
+    Impl &operator=(const Impl &) = delete;
+    Impl(const Impl &) = delete;
+
+    // Copy state to local memory for efficiency.
+    __device__ explicit Impl(RandGenerator<gpu, DType> *gen, int state_idx)
+        : global_gen_(gen),
+          global_state_idx_(state_idx),
+          state_(*(gen->states_ + state_idx)) {}
+
+    __device__ ~Impl() {
+      // store the curand state back into global memory
+      global_gen_->states_[global_state_idx_] = state_;
+    }
+
+    MSHADOW_FORCE_INLINE __device__ int rand() {
+      return curand(&state_);
+    }
+
+    MSHADOW_FORCE_INLINE __device__ float uniform() {
+      return static_cast<float>(1.0) - curand_uniform(&state_);
+    }
+
+    MSHADOW_FORCE_INLINE __device__ float normal() {
+      return curand_normal(&state_);
+    }
+
+   private:
+    RandGenerator<gpu, DType> *global_gen_;
+    int global_state_idx_;
+    curandStatePhilox4_32_10_t state_;
+  };  // class RandGenerator<gpu, DType>::Impl
+
+  static void AllocState(RandGenerator<gpu, DType> *inst) {
+    CUDA_CALL(cudaMalloc(&inst->states_,
+                         kNumRandomStates * sizeof(curandStatePhilox4_32_10_t)));
+  }
+
+  static void FreeState(RandGenerator<gpu, DType> *inst) {
+    CUDA_CALL(cudaFree(inst->states_));
+  }
+
+  void Seed(Stream<gpu> *s, uint32_t seed);
+
+ private:
+  curandStatePhilox4_32_10_t *states_;
+};
+
+template<>
+class RandGenerator<gpu, double> {
+ public:
+  // at least how many random numbers should be generated by one GPU thread.
+  static const int kMinNumRandomPerThread = 64;
+  // store how many global random states for GPU.
+  static const int kNumRandomStates = 32768;
+
+  // uniform number generation in Cuda made consistent with stl (include 0 but exclude 1)
+  // by using 1.0-curand_uniform().
+  // Needed as some samplers in sampler.h won't be able to deal with
+  // one of the boundary cases.
+  class Impl {
+   public:
+    Impl &operator=(const Impl &) = delete;
+    Impl(const Impl &) = delete;
+
+    // Copy state to local memory for efficiency.
+    __device__ explicit Impl(RandGenerator<gpu, double> *gen, int state_idx)
+        : global_gen_(gen),
+          global_state_idx_(state_idx),
+          state_(*(gen->states_ + state_idx)) {}
+
+    __device__ ~Impl() {
+      // store the curand state back into global memory
+      global_gen_->states_[global_state_idx_] = state_;
+    }
+
+    MSHADOW_FORCE_INLINE __device__ int rand() {
+      return curand(&state_);
+    }
+
+    MSHADOW_FORCE_INLINE __device__ double uniform() {
+      return static_cast<float>(1.0) - curand_uniform_double(&state_);
+    }
+
+    MSHADOW_FORCE_INLINE __device__ double normal() {
+      return curand_normal_double(&state_);
+    }
+
+   private:
+    RandGenerator<gpu, double> *global_gen_;
+    int global_state_idx_;
+    curandStatePhilox4_32_10_t state_;
+  };  // class RandGenerator<gpu, double>::Impl
+
+ private:
+  curandStatePhilox4_32_10_t *states_;
+};
+
+#endif  // MXNET_USE_CUDA
+
+}  // namespace random
+}  // namespace common
+}  // namespace mxnet
+#endif  // MXNET_COMMON_RANDOM_GENERATOR_H_
diff --git a/src/common/utils.h b/src/common/utils.h
index 038ab2a..ede218b 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -49,7 +49,6 @@
 namespace mxnet {
 namespace common {
 
-
 /*!
  * \brief IndPtr should be non-negative, in non-decreasing order, start with 0
  *           and end with value equal with size of indices.
diff --git a/src/executor/attach_op_resource_pass.cc b/src/executor/attach_op_resource_pass.cc
index 18feec7..9a7ed09 100644
--- a/src/executor/attach_op_resource_pass.cc
+++ b/src/executor/attach_op_resource_pass.cc
@@ -61,6 +61,8 @@ Graph AttachOpResources(Graph g) {
           }
         } else if (req.type == ResourceRequest::kRandom) {
           requested.push_back(ResourceManager::Get()->Request(ctx, req));
+        } else if (req.type == ResourceRequest::kParallelRandom) {
+          requested.push_back(ResourceManager::Get()->Request(ctx, req));
         } else {
           LOG(FATAL) << "resource type not yet supported";
         }
diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h
index e265cce..8be1eb4 100644
--- a/src/imperative/imperative_utils.h
+++ b/src/imperative/imperative_utils.h
@@ -203,6 +203,10 @@ inline void SetDependency(const nnvm::NodeAttrs& attrs,
         requested.push_back(ResourceManager::Get()->Request(ctx, req));
         write_vars.push_back(requested.back().var);
         break;
+       case ResourceRequest::kParallelRandom:
+        requested.push_back(ResourceManager::Get()->Request(ctx, req));
+        write_vars.push_back(requested.back().var);
+        break;
        default:
         LOG(FATAL) << "resource type not yet supported";
       }
diff --git a/src/operator/random/multisample_op.cc b/src/operator/random/multisample_op.cc
index 5f2af61..a88db09 100644
--- a/src/operator/random/multisample_op.cc
+++ b/src/operator/random/multisample_op.cc
@@ -47,7 +47,8 @@ DMLC_REGISTER_PARAMETER(MultiSampleParam);
   .set_attr<nnvm::FInferShape>("FInferShape", MultiSampleOpShape) \
   .set_attr<nnvm::FInferType>("FInferType", MultiSampleOpType) \
   .set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& attrs) { \
-      return std::vector<ResourceRequest>{ResourceRequest::kRandom, ResourceRequest::kTempSpace}; \
+      return std::vector<ResourceRequest>{ResourceRequest::kParallelRandom, \
+                                          ResourceRequest::kTempSpace}; \
     }) \
   .set_attr<FCompute>("FCompute<cpu>", MultiSampleOpForward<cpu, sampler, num_inputs>) \
   .set_attr<nnvm::FGradient>("FGradient", MakeZeroGradNodes) \
diff --git a/src/operator/random/multisample_op.h b/src/operator/random/multisample_op.h
index 38ccbb6..e93e453 100644
--- a/src/operator/random/multisample_op.h
+++ b/src/operator/random/multisample_op.h
@@ -135,6 +135,8 @@ inline bool MultiSampleOpType(const nnvm::NodeAttrs& attrs,
   return true;
 }
 
+using namespace mxnet::common::random;
+
 template<typename xpu, typename IType, typename OType, typename Sampler, int inum>
 struct SamplerCaller;
 
@@ -142,12 +144,12 @@ template<typename xpu, typename IType, typename OType, typename Sampler>
 struct SamplerCaller<xpu, IType, OType, Sampler, 1> {
   static void op(const std::vector<TBlob>& inputs,
                  const std::vector<TBlob>& outputs,
-                 const Tensor<xpu, 1, unsigned int>& seeds,
-                       mshadow::Stream<xpu> *s) {
+                 RandGenerator<xpu, OType> *pgen,
+                 mshadow::Stream<xpu> *s) {
     Sampler sampler;
     sampler.Sample(inputs[0].FlatTo1D<xpu, IType>(s),
                    outputs[0].FlatTo1D<xpu, OType>(s),
-                   seeds, s);
+                   pgen, s);
   }
 };
 
@@ -155,13 +157,13 @@ template<typename xpu, typename IType, typename OType, typename Sampler>
 struct SamplerCaller<xpu, IType, OType, Sampler, 2> {
   static void op(const std::vector<TBlob>& inputs,
                  const std::vector<TBlob>& outputs,
-                 const Tensor<xpu, 1, unsigned int>& seeds,
-                       mshadow::Stream<xpu> *s) {
+                 RandGenerator<xpu, OType> *pgen,
+                 mshadow::Stream<xpu> *s) {
     Sampler sampler;
     sampler.Sample(inputs[0].FlatTo1D<xpu, IType>(s),
                    inputs[1].FlatTo1D<xpu, IType>(s),
                    outputs[0].FlatTo1D<xpu, OType>(s),
-                   seeds, s);
+                   pgen, s);
   }
 };
 
@@ -177,15 +179,10 @@ void MultiSampleOpForward(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(outputs.size(), 1);
   CHECK_GT(inputs[0].Size(), 0);
   mshadow::Stream<xpu> *s = ctx.get_stream<xpu>();
-  // Generate multiple seeds for the different threads.
-  const int nSeeds(OptSampleSeedNum<xpu>(outputs[0].Size()));
-  Tensor<xpu, 1, unsigned> seeds
-    = ctx.requested[1].get_space_typed<xpu, 1, unsigned> (Shape1(nSeeds), ctx.get_stream<xpu>());
-  ctx.requested[0].get_random<xpu, float>(s)->GetRandInt(seeds);
   MSHADOW_TYPE_SWITCH(inputs[0].type_flag_, IType, {
     MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, OType, {
-        SamplerCaller<xpu, IType, OType, Sampler, inum>
-            ::op(inputs, outputs, seeds, s);
+      RandGenerator<xpu, OType> *pgen = ctx.requested[0].get_parallel_random<xpu, OType>();
+      SamplerCaller<xpu, IType, OType, Sampler, inum>::op(inputs, outputs, pgen, s);
     });
   });
 }
diff --git a/src/operator/random/sample_op.h b/src/operator/random/sample_op.h
index 9fdff03..a81b41a 100644
--- a/src/operator/random/sample_op.h
+++ b/src/operator/random/sample_op.h
@@ -241,31 +241,27 @@ using FSampleCompute = std::function<void (const nnvm::NodeAttrs& attrs,
                                            TBlob* outputs)>;
 
 using mxnet::TBlob;
+using namespace mxnet::common::random;
 
 // Allocates a single chunk of workspace memory and partitions it into three
 // workspace tensors that hold the seeds as well as the distribution parameters.
 template<typename xpu, typename DType>
-MSHADOW_FORCE_INLINE void GetSamplingTempData(index_t N, DType p1, DType p2, const OpContext& ctx,
-                                              Tensor<xpu, 1, unsigned int>* seeds,
+MSHADOW_FORCE_INLINE void GetSamplingTempData(DType p1, DType p2, const OpContext& ctx,
                                               Tensor<xpu, 1, DType>* parm1,
                                               Tensor<xpu, 1, DType>* parm2) {
   Stream<xpu> *s = ctx.get_stream<xpu>();
-  const index_t nSeeds(OptSampleSeedNum<xpu>(N));
   // Combined memory requirement for the workspace data.
-  const index_t nInt(nSeeds + (2 * sizeof(DType) + sizeof(unsigned) - 1) / sizeof(unsigned));
+  const index_t nInt((2 * sizeof(DType) + sizeof(unsigned) - 1) / sizeof(unsigned));
   Tensor<xpu, 1, unsigned> wspace
     = ctx.requested[1].get_space_typed<xpu, 1, unsigned>(Shape1(nInt), s);
-  // Partition workspace into three chunks and initialize them.
-  *seeds = Tensor<xpu, 1, unsigned>(wspace.dptr_, Shape1(nSeeds), s);
-  ctx.requested[0].get_random<xpu, float>(s)->GetRandInt(*seeds);
-  DType *pspace = static_cast<DType*>(static_cast<void*>(wspace.dptr_+nSeeds));
+  // Partition workspace into two chunks and initialize them.
+  DType *pspace = static_cast<DType*>(static_cast<void*>(wspace.dptr_));
   *parm1 = Tensor<xpu, 1, DType>(pspace, Shape1(1), s);
   Copy(*parm1, Tensor<cpu, 1, DType>(&p1, Shape1(1)), s);
   *parm2 = Tensor<xpu, 1, DType>(pspace+1, Shape1(1), s);
   Copy(*parm2, Tensor<cpu, 1, DType>(&p2, Shape1(1)), s);
 }
 
-
 template<typename xpu, typename Sampler>
 struct SampleMaster;
 
@@ -278,14 +274,14 @@ struct SampleMaster<xpu, UniformSampler<xpu>> {
     Stream<xpu> *s = ctx.get_stream<xpu>();
     const SampleUniformParam& param = nnvm::get<SampleUniformParam>(attrs.parsed);
     CHECK_GE(param.high, param.low) << "low must be less or equal to high in uniform distribution";
-    Tensor<xpu, 1, unsigned int> seeds;
     Tensor<xpu, 1, float> low, high;
-    GetSamplingTempData<xpu, float>(outputs->Size(), param.low, param.high, ctx,
-                                    &seeds, &low, &high);
+    GetSamplingTempData<xpu, float>(param.low, param.high, ctx,
+                                    &low, &high);
     UniformSampler<xpu> sampler;
     MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, OType, {
+      RandGenerator<xpu, OType> *pgen = ctx.requested[0].get_parallel_random<xpu, OType>();
       Tensor<xpu, 1, OType> out = outputs->FlatTo1D<xpu, OType>(s);
-      sampler.Sample(low, high, out, seeds, s);
+      sampler.Sample(low, high, out, pgen, s);
     });
   }
 };
@@ -299,14 +295,13 @@ struct SampleMaster<xpu, NormalSampler<xpu>> {
     Stream<xpu> *s = ctx.get_stream<xpu>();
     const SampleNormalParam& param = nnvm::get<SampleNormalParam>(attrs.parsed);
     CHECK_GT(param.scale, 0) << "scale parameter in gaussian has to be positive";
-    Tensor<xpu, 1, unsigned int> seeds;
     Tensor<xpu, 1, float> loc, scale;
-    GetSamplingTempData<xpu, float>(outputs->Size(), param.loc, param.scale, ctx,
-                                    &seeds, &loc, &scale);
+    GetSamplingTempData<xpu, float>(param.loc, param.scale, ctx, &loc, &scale);
     NormalSampler<xpu> sampler;
     MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, OType, {
+      RandGenerator<xpu, OType> *pgen = ctx.requested[0].get_parallel_random<xpu, OType>();
       Tensor<xpu, 1, OType> out = outputs->FlatTo1D<xpu, OType>(s);
-      sampler.Sample(loc, scale, out, seeds, s);
+      sampler.Sample(loc, scale, out, pgen, s);
     });
   }
 };
@@ -321,14 +316,13 @@ struct SampleMaster<xpu, GammaSampler<xpu>> {
     const SampleGammaParam& param = nnvm::get<SampleGammaParam>(attrs.parsed);
     CHECK_GT(param.alpha, 0) << "alpha parameter in gamma distribution has to be positive";
     CHECK_GT(param.beta, 0) << "beta parameter in gamma distribution has to be positive";
-    Tensor<xpu, 1, unsigned int> seeds;
     Tensor<xpu, 1, float> alpha, beta;
-    GetSamplingTempData<xpu, float>(outputs->Size(), param.alpha, param.beta, ctx,
-                                    &seeds, &alpha, &beta);
+    GetSamplingTempData<xpu, float>(param.alpha, param.beta, ctx, &alpha, &beta);
     GammaSampler<xpu> sampler;
     MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, OType, {
+      RandGenerator<xpu, OType> *pgen = ctx.requested[0].get_parallel_random<xpu, OType>();
       Tensor<xpu, 1, OType> out = outputs->FlatTo1D<xpu, OType>(s);
-      sampler.Sample(alpha, beta, out, seeds, s);
+      sampler.Sample(alpha, beta, out, pgen, s);
     });
   }
 };
@@ -342,13 +336,13 @@ struct SampleMaster<xpu, ExponentialSampler<xpu>> {
     Stream<xpu> *s = ctx.get_stream<xpu>();
     const SampleExponentialParam& param = nnvm::get<SampleExponentialParam>(attrs.parsed);
     CHECK_GT(param.lam, 0) << "lambda parameter in exponential distribution has to be positive";
-    Tensor<xpu, 1, unsigned int> seeds;
     Tensor<xpu, 1, float> lam, dummy;
-    GetSamplingTempData<xpu, float>(outputs->Size(), param.lam, 0, ctx, &seeds, &lam, &dummy);
+    GetSamplingTempData<xpu, float>(param.lam, 0, ctx, &lam, &dummy);
     ExponentialSampler<xpu> sampler;
     MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, OType, {
+      RandGenerator<xpu, OType> *pgen = ctx.requested[0].get_parallel_random<xpu, OType>();
       Tensor<xpu, 1, OType> out = outputs->FlatTo1D<xpu, OType>(s);
-      sampler.Sample(lam, out, seeds, s);
+      sampler.Sample(lam, out, pgen, s);
     });
   }
 };
@@ -362,13 +356,13 @@ struct SampleMaster<xpu, PoissonSampler<xpu>> {
     Stream<xpu> *s = ctx.get_stream<xpu>();
     const SamplePoissonParam& param = nnvm::get<SamplePoissonParam>(attrs.parsed);
     CHECK_GE(param.lam, 0) << "lambda parameter in poisson distribution has to be non-negative";
-    Tensor<xpu, 1, unsigned int> seeds;
     Tensor<xpu, 1, float> lam, dummy;
-    GetSamplingTempData<xpu, float>(outputs->Size(), param.lam, 0, ctx, &seeds, &lam, &dummy);
+    GetSamplingTempData<xpu, float>(param.lam, 0, ctx, &lam, &dummy);
     PoissonSampler<xpu> sampler;
     MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, OType, {
+      RandGenerator<xpu, OType> *pgen = ctx.requested[0].get_parallel_random<xpu, OType>();
       Tensor<xpu, 1, OType> out = outputs->FlatTo1D<xpu, OType>(s);
-      sampler.Sample(lam, out, seeds, s);
+      sampler.Sample(lam, out, pgen, s);
     });
   }
 };
@@ -383,13 +377,13 @@ struct SampleMaster<xpu, NegativeBinomialSampler<xpu>> {
     const SampleNegBinomialParam& param = nnvm::get<SampleNegBinomialParam>(attrs.parsed);
     CHECK_GE(param.k, 0) << "k parameter in negative binomial distribution has to be non-negative";
     CHECK_GE(param.p, 0) << "p parameter in negative binomial distribution has to be non-negative";
-    Tensor<xpu, 1, unsigned int> seeds;
     Tensor<xpu, 1, float> k, p;
-    GetSamplingTempData<xpu, float>(outputs->Size(), param.k, param.p, ctx, &seeds, &k, &p);
+    GetSamplingTempData<xpu, float>(param.k, param.p, ctx, &k, &p);
     NegativeBinomialSampler<xpu> sampler;
     MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, OType, {
+      RandGenerator<xpu, OType> *pgen = ctx.requested[0].get_parallel_random<xpu, OType>();
       Tensor<xpu, 1, OType> out = outputs->FlatTo1D<xpu, OType>(s);
-      sampler.Sample(k, p, out, seeds, s);
+      sampler.Sample(k, p, out, pgen, s);
     });
   }
 };
@@ -406,14 +400,13 @@ struct SampleMaster<xpu, GeneralizedNegativeBinomialSampler<xpu>> {
       << "mu parameter in generalized negative binomial distribution has to be non-negative";
     CHECK_GE(param.alpha, 0)
       << "alpha parameter in generalized negative binomial distribution has to be non-negative";
-    Tensor<xpu, 1, unsigned int> seeds;
     Tensor<xpu, 1, float> mu, alpha;
-    GetSamplingTempData<xpu, float>(outputs->Size(), param.mu, param.alpha, ctx,
-                                    &seeds, &mu, &alpha);
+    GetSamplingTempData<xpu, float>(param.mu, param.alpha, ctx, &mu, &alpha);
     GeneralizedNegativeBinomialSampler<xpu> sampler;
     MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, OType, {
+      RandGenerator<xpu, OType> *pgen = ctx.requested[0].get_parallel_random<xpu, OType>();
       Tensor<xpu, 1, OType> out = outputs->FlatTo1D<xpu, OType>(s);
-      sampler.Sample(mu, alpha, out, seeds, s);
+      sampler.Sample(mu, alpha, out, pgen, s);
     });
   }
 };
@@ -502,7 +495,7 @@ inline bool SampleOpType(const nnvm::NodeAttrs& attrs,
 }
 
 inline std::vector<ResourceRequest> SampleResource(const NodeAttrs& attrs) {
-  return { ResourceRequest::kRandom, ResourceRequest::kTempSpace };
+  return { ResourceRequest::kParallelRandom, ResourceRequest::kTempSpace };
 }
 
 }  // namespace op
diff --git a/src/operator/random/sampler.h b/src/operator/random/sampler.h
index d544aec..8eace1e 100644
--- a/src/operator/random/sampler.h
+++ b/src/operator/random/sampler.h
@@ -25,89 +25,52 @@
 #ifndef MXNET_OPERATOR_RANDOM_SAMPLER_H_
 #define MXNET_OPERATOR_RANDOM_SAMPLER_H_
 
-#ifdef __CUDACC__
-#include <curand.h>
-#include <curand_kernel.h>
-#endif  // __CUDACC__
+#include <algorithm>
 
 using namespace mshadow;
 using namespace mxnet::op::mxnet_op;
+using namespace mxnet::common::random;
 
 namespace mxnet {
 namespace op {
 
-// Elementary random number generation for int/uniform/gaussian in CPU and GPU.
-// Will use float data type whenever instantiated for half_t or any other non
-// standard real type.
-template<typename xpu, typename DType>
-class RandGenerator;
-
-template<typename DType>
-class RandGenerator<cpu, DType> {
- public:
-  typedef typename std::conditional<std::is_floating_point<DType>::value,
-                                    DType, float>::type FType;
-  std::mt19937 engine;
-  std::uniform_real_distribution<FType> uniformNum;
-  std::normal_distribution<FType> normalNum;
-  explicit RandGenerator(unsigned int seed): engine(seed) {}
-  MSHADOW_XINLINE int rand() { return engine(); }
-  MSHADOW_XINLINE FType uniform() { return uniformNum(engine); }
-  MSHADOW_XINLINE FType normal() { return normalNum(engine); }
-};
-
-#ifdef __CUDACC__
-
-// uniform number generation in Cuda made consistent with stl (include 0 but exclude 1)
-// by using 1.0-curand_uniform(). Needed as some samplers below won't be able to deal with
-// one of the boundary cases.
-template<typename DType>
-class RandGenerator<gpu, DType> {
- public:
-  curandState_t state;
-  __device__ RandGenerator(unsigned int seed) { curand_init(seed, 0, 0, &state); }
-  MSHADOW_FORCE_INLINE __device__ int rand() { return curand(&state); }
-  MSHADOW_FORCE_INLINE __device__ float uniform()
-                              { return static_cast<float>(1.0) - curand_uniform(&state); }
-  MSHADOW_FORCE_INLINE __device__ float normal() { return curand_normal(&state); }
-};
-
-template<>
-class RandGenerator<gpu, double> {
- public:
-  curandState_t state;
-  __device__ RandGenerator(unsigned int seed) { curand_init(seed, 0, 0, &state); }
-  MSHADOW_FORCE_INLINE __device__ int rand() { return curand(&state); }
-  MSHADOW_FORCE_INLINE __device__ double uniform()
-                            { return static_cast<double>(1.0) - curand_uniform_double(&state); }
-  MSHADOW_FORCE_INLINE __device__ double normal() { return curand_normal_double(&state); }
-};
-
-#endif  // __CUDACC__
-
-// Number of seeds/threads when sampling on cpu/gpu.
-template<typename xpu>
-MSHADOW_XINLINE index_t OptSampleSeedNum(index_t N);
-template<>
-MSHADOW_XINLINE index_t OptSampleSeedNum<cpu>(index_t N) {
-  return omp_get_num_threads();
-}
-template<>
-MSHADOW_XINLINE index_t OptSampleSeedNum<gpu>(index_t N) {
-  return N;
+/*!
+ * \brief Launch a generic kernel with parallel random generator.
+ * \tparam gen random generator
+ * \tparam N Number of iterations
+ * \tparam Args Varargs type to eventually pass to the OP::Map() functoion
+ */
+template<typename OP, typename xpu, typename GType, typename ...Args>
+inline static void LaunchRNG(mshadow::Stream<xpu> *s,
+                             common::random::RandGenerator<xpu, GType> *gen,
+                             const int N, Args... args) {
+  const int nloop = (N + RandGenerator<xpu, GType>::kMinNumRandomPerThread - 1) /
+                    RandGenerator<xpu, GType>::kMinNumRandomPerThread;
+  const int nthread = std::min(nloop, RandGenerator<xpu, GType>::kNumRandomStates);
+  const int step = (N + nthread - 1) / nthread;
+  Kernel<OP, xpu>::Launch(s, nthread, *gen, N, step, args...);
 }
 
+#define RNG_KERNEL_LOOP(xpu, GType, thread_id, gen, N, step, ...)        \
+  const int start = thread_id * step;                                    \
+  const int end = start + step;                                          \
+  typename RandGenerator<xpu, GType>::Impl genImpl(&gen, thread_id);     \
+  for (int i = start; i < end && i < N; ++i) {                           \
+    {__VA_ARGS__}                                                        \
+  }
+
 template<typename xpu>
 struct SampleUniformKernel {
   template<typename IType, typename OType>
-  MSHADOW_XINLINE static void Map(int i, index_t nParm, index_t nSample, index_t nSeed,
-                     const IType *lower, const IType *upper, OType *out, const unsigned *seed) {
-    index_t nBatch(nSample/nParm), nChunk((nSample+nSeed-1)/nSeed),
-            start(i*nChunk), end((i+1)*nChunk < nSample ? (i+1)*nChunk : nSample);
-    RandGenerator<xpu, OType> gen(seed[i]);
-    for ( index_t j = start; j < end; ++j ) {
-      out[j] = OType(lower[j/nBatch] + (upper[j/nBatch] - lower[j/nBatch]) * gen.uniform());
-    }
+  MSHADOW_XINLINE static void Map(int id, RandGenerator<xpu, OType> gen,
+                                  const int N, const int step,
+                                  index_t nParm, index_t nSample,
+                                  const IType *lower, const IType *upper, OType *out) {
+    RNG_KERNEL_LOOP(xpu, OType, id, gen, N, step, {
+      index_t nBatch(1 + (nSample - 1) / nParm);
+      out[i] = OType(lower[i / nBatch] +
+                     (upper[i / nBatch] - lower[i / nBatch]) * genImpl.uniform());
+    });
   }
 };
 
@@ -117,25 +80,24 @@ struct UniformSampler {
   MSHADOW_FORCE_INLINE void Sample(const Tensor<xpu, 1, IType>& lower,
                                    const Tensor<xpu, 1, IType>& upper,
                                    const Tensor<xpu, 1, OType>& out,
-                                   const Tensor<xpu, 1, unsigned>& seed,
-                                         Stream<xpu> *s) {
-    Kernel<SampleUniformKernel<xpu>, xpu>
-      ::Launch(s, seed.size(0), lower.size(0), out.size(0), seed.size(0),
-               lower.dptr_, upper.dptr_, out.dptr_, seed.dptr_);
+                                   RandGenerator<xpu, OType> *pgen,
+                                   Stream<xpu> *s) {
+    LaunchRNG<SampleUniformKernel<xpu>, xpu>(s, pgen, out.size(0), lower.size(0), out.size(0),
+                                             lower.dptr_, upper.dptr_, out.dptr_);
   }
 };
 
 template<typename xpu>
 struct SampleNormalKernel {
   template<typename IType, typename OType>
-  MSHADOW_XINLINE static void Map(int i, index_t nParm, index_t nSample, index_t nSeed,
-                            const IType *mean, const IType *std, OType *out, const unsigned *seed) {
-    index_t nBatch(nSample/nParm), nChunk((nSample+nSeed-1)/nSeed),
-            start(i*nChunk), end((i+1)*nChunk < nSample ? (i+1)*nChunk : nSample);
-    RandGenerator<xpu, OType> gen(seed[i]);
-    for ( index_t j = start; j < end; ++j ) {
-      out[j] = OType(gen.normal() * std[j/nBatch] + mean[j/nBatch]);
-    }
+  MSHADOW_XINLINE static void Map(int id, RandGenerator<xpu, OType> gen,
+                                  const int N, const int step,
+                                  index_t nParm, index_t nSample,
+                                  const IType *mean, const IType *std, OType *out) {
+    RNG_KERNEL_LOOP(xpu, OType, id, gen, N, step, {
+      index_t nBatch(1 + (nSample - 1) / nParm);
+      out[i] = OType(genImpl.normal() * std[i / nBatch] + mean[i / nBatch]);
+    });
   }
 };
 
@@ -145,25 +107,24 @@ struct NormalSampler {
   MSHADOW_FORCE_INLINE void Sample(const Tensor<xpu, 1, IType>& mean,
                                    const Tensor<xpu, 1, IType>& std,
                                    const Tensor<xpu, 1, OType>& out,
-                                   const Tensor<xpu, 1, unsigned>& seed,
-                                         Stream<xpu> *s) {
-    Kernel<SampleNormalKernel<xpu>, xpu>
-      ::Launch(s, seed.size(0), mean.size(0), out.size(0), seed.size(0),
-               mean.dptr_, std.dptr_, out.dptr_, seed.dptr_);
+                                   RandGenerator<xpu, OType> *pgen,
+                                   Stream<xpu> *s) {
+    LaunchRNG<SampleNormalKernel<xpu>, xpu>(s, pgen, out.size(0), mean.size(0), out.size(0),
+                                            mean.dptr_, std.dptr_, out.dptr_);
   }
 };
 
 template<typename xpu>
 struct SampleExponentialKernel {
   template<typename IType, typename OType>
-  MSHADOW_XINLINE static void Map(int i, index_t nParm, index_t nSample, index_t nSeed,
-                                  const IType *lambda, OType *out, const unsigned *seed) {
-    index_t nBatch(nSample/nParm), nChunk((nSample+nSeed-1)/nSeed),
-            start(i*nChunk), end((i+1)*nChunk < nSample ? (i+1)*nChunk : nSample);
-    RandGenerator<xpu, OType> gen(seed[i]);
-    for ( index_t j = start; j < end; ++j ) {
-      out[j] = OType(-log(1.0-gen.uniform()) / lambda[j/nBatch]);
-    }
+  MSHADOW_XINLINE static void Map(int id, RandGenerator<xpu, OType> gen,
+                                  const int N, const int step,
+                                  index_t nParm, index_t nSample,
+                                  const IType *lambda, OType *out) {
+    RNG_KERNEL_LOOP(xpu, OType, id, gen, N, step, {
+      index_t nBatch(1 + (nSample - 1) / nParm);
+      out[i] = OType(-log(1.0 - genImpl.uniform()) / lambda[i / nBatch]);
+    });
   }
 };
 
@@ -172,16 +133,16 @@ struct ExponentialSampler {
   template<typename IType, typename OType>
   MSHADOW_FORCE_INLINE void Sample(const Tensor<xpu, 1, IType>& lambda,
                                    const Tensor<xpu, 1, OType>& out,
-                                   const Tensor<xpu, 1, unsigned>& seed,
-                                         Stream<xpu> *s) {
-    Kernel<SampleExponentialKernel<xpu>, xpu>
-      ::Launch(s, seed.size(0), lambda.size(0), out.size(0), seed.size(0),
-               lambda.dptr_, out.dptr_, seed.dptr_);
+                                   RandGenerator<xpu, OType> *pgen,
+                                   Stream<xpu> *s) {
+    LaunchRNG<SampleExponentialKernel<xpu>, xpu>(s, pgen, out.size(0),
+                                                 lambda.size(0), out.size(0),
+                                                 lambda.dptr_, out.dptr_);
   }
 };
 
 template<typename xpu, typename IType, typename OType>
-MSHADOW_XINLINE OType SampleGamma(IType a, IType b, RandGenerator<xpu, OType> *gen) {
+MSHADOW_XINLINE OType SampleGamma(IType a, IType b, typename RandGenerator<xpu, OType>::Impl *gen) {
   // Generate one sample of the gamma distribution
   OType sample;
   OType d = a < 1 ? a + 2.0 / 3.0 : a - 1.0 / 3.0;
@@ -203,17 +164,16 @@ MSHADOW_XINLINE OType SampleGamma(IType a, IType b, RandGenerator<xpu, OType> *g
 
 template<typename xpu>
 struct SampleGammaKernel {
-  template<typename IType, typename OType>
-  MSHADOW_XINLINE static void Map(int i, index_t nParm, index_t nSample, index_t nSeed,
-                      const IType *alpha, const IType *beta, OType *out, const unsigned *seed) {
-    index_t nBatch(nSample/nParm), nChunk((nSample+nSeed-1)/nSeed),
-            start(i*nChunk), end((i+1)*nChunk < nSample ? (i+1)*nChunk : nSample);
-    typedef typename std::conditional<std::is_floating_point<OType>::value,
-                                     OType, float>::type FType;
-    RandGenerator<xpu, FType> gen(seed[i]);
-    for ( index_t j = start; j < end; ++j ) {
-      out[j] = OType(SampleGamma(alpha[j/nBatch], beta[j/nBatch], &gen));
-    }
+  template<typename IType, typename OType, typename FType>
+  MSHADOW_XINLINE static void Map(int id, RandGenerator<xpu, FType> gen,
+                                  const int N, const int step,
+                                  index_t nParm, index_t nSample,
+                                  const IType *alpha, const IType *beta, OType *out) {
+    RNG_KERNEL_LOOP(xpu, FType, id, gen, N, step, {
+      index_t nBatch(1 + (nSample - 1) / nParm);
+      out[i] = OType(SampleGamma<xpu, IType, FType>(alpha[i / nBatch],
+                                                    beta[i / nBatch], &genImpl));
+    });
   }
 };
 
@@ -223,16 +183,18 @@ struct GammaSampler {
   MSHADOW_FORCE_INLINE void Sample(const Tensor<xpu, 1, IType>& alpha,
                                    const Tensor<xpu, 1, IType>& beta,
                                    const Tensor<xpu, 1, OType>& out,
-                                   const Tensor<xpu, 1, unsigned>& seed,
-                                         Stream<xpu> *s) {
-    Kernel<SampleGammaKernel<xpu>, xpu>
-      ::Launch(s, seed.size(0), alpha.size(0), out.size(0), seed.size(0),
-               alpha.dptr_, beta.dptr_, out.dptr_, seed.dptr_);
+                                   RandGenerator<xpu, OType> *pgen,
+                                   Stream<xpu> *s) {
+    typedef typename std::conditional<std::is_floating_point<OType>::value,
+                                      OType, float>::type FType;
+    RandGenerator<xpu, FType> *gen = reinterpret_cast<RandGenerator<xpu, FType> *>(pgen);
+    LaunchRNG<SampleGammaKernel<xpu>, xpu>(s, gen, out.size(0), alpha.size(0), out.size(0),
+                                           alpha.dptr_, beta.dptr_, out.dptr_);
   }
 };
 
 template<typename xpu>
-MSHADOW_XINLINE int SamplePoisson(float lambda, RandGenerator<xpu, float> *gen) {
+MSHADOW_XINLINE int SamplePoisson(float lambda, typename RandGenerator<xpu, float>::Impl *gen) {
   // Generate one sample of the poisson distribution. Intentionally written
   // towards a specific type (float) for internal computation which is sufficient
   // for accurate enough computation.
@@ -265,14 +227,14 @@ MSHADOW_XINLINE int SamplePoisson(float lambda, RandGenerator<xpu, float> *gen)
 template<typename xpu>
 struct SamplePoissonKernel {
   template<typename IType, typename OType>
-  MSHADOW_XINLINE static void Map(int i, index_t nParm, index_t nSample, index_t nSeed,
-                                  const IType *lambda, OType *out, const unsigned *seed) {
-    index_t nBatch(nSample/nParm), nChunk((nSample+nSeed-1)/nSeed),
-            start(i*nChunk), end((i+1)*nChunk < nSample ? (i+1)*nChunk : nSample);
-    RandGenerator<xpu, float> gen(seed[i]);
-    for ( index_t j = start; j < end; ++j ) {
-      out[j] = OType(SamplePoisson(lambda[j/nBatch], &gen));
-    }
+  MSHADOW_XINLINE static void Map(int id, RandGenerator<xpu, float> gen,
+                                  const int N, const int step,
+                                  index_t nParm, index_t nSample,
+                                  const IType *lambda, OType *out) {
+    RNG_KERNEL_LOOP(xpu, float, id, gen, N, step, {
+      index_t nBatch(1 + (nSample - 1) / nParm);
+      out[i] = OType(SamplePoisson<xpu>(lambda[i / nBatch], &genImpl));
+    });
   }
 };
 
@@ -281,29 +243,29 @@ struct PoissonSampler {
   template<typename IType, typename OType>
   MSHADOW_FORCE_INLINE void Sample(const Tensor<xpu, 1, IType>& lambda,
                                    const Tensor<xpu, 1, OType>& out,
-                                   const Tensor<xpu, 1, unsigned>& seed,
-                                         Stream<xpu> *s) {
-    Kernel<SamplePoissonKernel<xpu>, xpu>
-      ::Launch(s, seed.size(0), lambda.size(0), out.size(0), seed.size(0),
-               lambda.dptr_, out.dptr_, seed.dptr_);
+                                   RandGenerator<xpu, OType> *pgen,
+                                   Stream<xpu> *s) {
+    RandGenerator<xpu, float> *gen = reinterpret_cast<RandGenerator<xpu, float> *>(pgen);
+    LaunchRNG<SamplePoissonKernel<xpu>, xpu>(s, gen, out.size(0), lambda.size(0), out.size(0),
+                                             lambda.dptr_, out.dptr_);
   }
 };
 
 template<typename xpu>
 struct SampleNegativeBinomialKernel {
   template<typename IType, typename OType>
-  MSHADOW_XINLINE static void Map(int i, index_t nParm, index_t nSample, index_t nSeed,
-                             const IType *k, const IType *p, OType *out, const unsigned *seed) {
-    index_t nBatch(nSample/nParm), nChunk((nSample+nSeed-1)/nSeed),
-            start(i*nChunk), end((i+1)*nChunk < nSample ? (i+1)*nChunk : nSample);
-    RandGenerator<xpu, float> gen(seed[i]);
-    for ( index_t j = start; j < end; ++j ) {
-      float alpha = k[j/nBatch];
-      float prob = p[j/nBatch];
+  MSHADOW_XINLINE static void Map(int id, RandGenerator<xpu, float> gen,
+                                  const int N, const int step,
+                                  index_t nParm, index_t nSample,
+                                  const IType *k, const IType *p, OType *out) {
+    RNG_KERNEL_LOOP(xpu, float, id, gen, N, step, {
+      index_t nBatch(1 + (nSample - 1) / nParm);
+      float alpha = k[i / nBatch];
+      float prob = p[i / nBatch];
       float beta = (1.0 - prob) / prob;
-      float lambda = SampleGamma(alpha, beta, &gen);
-      out[j] = OType(SamplePoisson(lambda, &gen));
-    }
+      float lambda = SampleGamma<xpu, IType, float>(alpha, beta, &genImpl);
+      out[i] = OType(SamplePoisson<xpu>(lambda, &genImpl));
+    });
   }
 };
 
@@ -313,27 +275,29 @@ struct NegativeBinomialSampler {
   MSHADOW_FORCE_INLINE void Sample(const Tensor<xpu, 1, IType>& k,
                                    const Tensor<xpu, 1, IType>& p,
                                    const Tensor<xpu, 1, OType>& out,
-                                   const Tensor<xpu, 1, unsigned>& seed,
-                                         Stream<xpu> *s) {
-    Kernel<SampleNegativeBinomialKernel<xpu>, xpu>
-      ::Launch(s, seed.size(0), k.size(0), out.size(0), seed.size(0),
-               k.dptr_, p.dptr_, out.dptr_, seed.dptr_);
+                                   RandGenerator<xpu, OType> *pgen,
+                                   Stream<xpu> *s) {
+    RandGenerator<xpu, float> *gen = reinterpret_cast<RandGenerator<xpu, float> *>(pgen);
+    LaunchRNG<SampleNegativeBinomialKernel<xpu>, xpu>(s, gen, out.size(0), k.size(0), out.size(0),
+                                                      k.dptr_, p.dptr_, out.dptr_);
   }
 };
 
 template<typename xpu>
 struct SampleGeneralizedNegativeBinomialKernel {
   template<typename IType, typename OType>
-  MSHADOW_XINLINE static void Map(int i, index_t nParm, index_t nSample, index_t nSeed,
-                        const IType *mu, const IType *alpha, OType *out, const unsigned *seed) {
-    index_t nBatch(nSample/nParm), nChunk((nSample+nSeed-1)/nSeed),
-            start(i*nChunk), end((i+1)*nChunk < nSample ? (i+1)*nChunk : nSample);
-    RandGenerator<xpu, float> gen(seed[i]);
-    for ( index_t j = start; j < end; ++j ) {
-      float lambda = alpha[j/nBatch] == 0 ? static_cast<float>(mu[j/nBatch])
-              : SampleGamma(IType(1) / alpha[j/nBatch], alpha[j/nBatch] * mu[j/nBatch], &gen);
-      out[j] = OType(SamplePoisson(lambda, &gen));
-    }
+  MSHADOW_XINLINE static void Map(int id, RandGenerator<xpu, float> gen,
+                                  const int N, const int step,
+                                  index_t nParm, index_t nSample,
+                                  const IType *mu, const IType *alpha, OType *out) {
+    RNG_KERNEL_LOOP(xpu, float, id, gen, N, step, {
+      index_t nBatch(1 + (nSample - 1) / nParm);
+      float lambda = alpha[i / nBatch] == 0 ?
+                     static_cast<float>(mu[i / nBatch]) :
+                     SampleGamma<xpu, IType, float>(IType(1) / alpha[i / nBatch],
+                                                    alpha[i / nBatch] * mu[i / nBatch], &genImpl);
+      out[i] = OType(SamplePoisson<xpu>(lambda, &genImpl));
+    });
   }
 };
 
@@ -343,11 +307,12 @@ struct GeneralizedNegativeBinomialSampler {
   MSHADOW_FORCE_INLINE void Sample(const Tensor<xpu, 1, IType>& mu,
                                    const Tensor<xpu, 1, IType>& alpha,
                                    const Tensor<xpu, 1, OType>& out,
-                                   const Tensor<xpu, 1, unsigned>& seed,
-                                         Stream<xpu> *s) {
-    Kernel<SampleGeneralizedNegativeBinomialKernel<xpu>, xpu>
-      ::Launch(s, seed.size(0), mu.size(0), out.size(0), seed.size(0),
-               mu.dptr_, alpha.dptr_, out.dptr_, seed.dptr_);
+                                   RandGenerator<xpu, OType> *pgen,
+                                   Stream<xpu> *s) {
+    RandGenerator<xpu, float> *gen = reinterpret_cast<RandGenerator<xpu, float> *>(pgen);
+    LaunchRNG<SampleGeneralizedNegativeBinomialKernel<xpu>, xpu>(s, gen, out.size(0),
+                                                                 mu.size(0), out.size(0),
+                                                                 mu.dptr_, alpha.dptr_, out.dptr_);
   }
 };
 
diff --git a/src/resource.cc b/src/resource.cc
index d1038dc..e195006 100644
--- a/src/resource.cc
+++ b/src/resource.cc
@@ -32,6 +32,8 @@
 #include <limits>
 #include <atomic>
 #include "./common/lazy_alloc_array.h"
+#include "./common/random_generator.h"
+#include "./common/utils.h"
 
 namespace mxnet {
 namespace resource {
@@ -88,20 +90,26 @@ class ResourceManagerImpl : public ResourceManager {
       : global_seed_(0) {
     cpu_temp_space_copy_ = dmlc::GetEnv("MXNET_CPU_TEMP_COPY", 4);
     gpu_temp_space_copy_ = dmlc::GetEnv("MXNET_GPU_TEMP_COPY", 1);
+    cpu_native_rand_copy_ = dmlc::GetEnv("MXNET_CPU_NATIVE_RAND_COPY", 1);
+    gpu_native_rand_copy_ = dmlc::GetEnv("MXNET_GPU_NATIVE_RAND_COPY", 4);
     engine_ref_ = Engine::_GetSharedRef();
     storage_ref_ = Storage::_GetSharedRef();
     cpu_rand_.reset(new ResourceRandom<cpu>(
         Context::CPU(), global_seed_));
     cpu_space_.reset(new ResourceTempSpace(
         Context::CPU(), cpu_temp_space_copy_));
+    cpu_native_rand_.reset(new ResourceNativeRandom<cpu>(
+        Context::CPU(), cpu_native_rand_copy_, global_seed_));
   }
   ~ResourceManagerImpl() {
     // need explicit delete, before engine get killed
     cpu_rand_.reset(nullptr);
     cpu_space_.reset(nullptr);
+    cpu_native_rand_.reset(nullptr);
 #if MXNET_USE_CUDA
     gpu_rand_.Clear();
     gpu_space_.Clear();
+    gpu_native_rand_.Clear();
 #endif
     if (engine_ref_ != nullptr) {
       engine_ref_ = nullptr;
@@ -117,6 +125,7 @@ class ResourceManagerImpl : public ResourceManager {
       switch (req.type) {
         case ResourceRequest::kRandom: return cpu_rand_->resource;
         case ResourceRequest::kTempSpace: return cpu_space_->GetNext();
+        case ResourceRequest::kParallelRandom: return cpu_native_rand_->GetNext();
         default: LOG(FATAL) << "Unknown supported type " << req.type;
       }
     } else {
@@ -133,6 +142,11 @@ class ResourceManagerImpl : public ResourceManager {
               return new ResourceTempSpace(ctx, gpu_temp_space_copy_);
             })->GetNext();
         }
+        case ResourceRequest::kParallelRandom: {
+          return gpu_native_rand_.Get(ctx.dev_id, [ctx, this]() {
+            return new ResourceNativeRandom<gpu>(ctx, gpu_native_rand_copy_, global_seed_);
+          })->GetNext();
+        }
         default: LOG(FATAL) << "Unknown supported type " << req.type;
       }
 #else
@@ -146,10 +160,14 @@ class ResourceManagerImpl : public ResourceManager {
   void SeedRandom(uint32_t seed) override {
     global_seed_ = seed;
     cpu_rand_->Seed(global_seed_);
+    cpu_native_rand_->Seed(global_seed_);
 #if MXNET_USE_CUDA
     gpu_rand_.ForEach([seed](size_t i, ResourceRandom<gpu> *p) {
         p->Seed(seed);
       });
+    gpu_native_rand_.ForEach([seed](size_t i, ResourceNativeRandom<gpu> *p) {
+      p->Seed(seed);
+    });
 #endif
   }
 
@@ -205,7 +223,7 @@ class ResourceManagerImpl : public ResourceManager {
     std::vector<SpaceAllocator> space;
     /*! \brief resource representation */
     std::vector<Resource> resource;
-    /*! \brief current pointer to the round roubin alloator */
+    /*! \brief current pointer to the round roubin allocator */
     std::atomic<size_t> curr_ptr;
     /*! \brief constructor */
     explicit ResourceTempSpace(Context ctx, size_t ncopy)
@@ -241,10 +259,82 @@ class ResourceManagerImpl : public ResourceManager {
       return resource[ptr % space.size()];
     }
   };
+
+  // the native random sampler resources
+  template<typename xpu>
+  struct ResourceNativeRandom {
+    /*! \brief the context of the PRNG */
+    Context ctx;
+    /*! \brief pointers to sampler */
+    std::vector<common::random::RandGenerator<xpu> *> sampler;
+    /*! \brief resource representation */
+    std::vector<Resource> resource;
+    /*! \brief current pointer to the round roubin allocator */
+    std::atomic<size_t> curr_ptr;
+    /*! \brief constructor */
+    explicit ResourceNativeRandom(Context ctx, size_t ncopy, uint32_t global_seed)
+        : ctx(ctx), sampler(ncopy), resource(ncopy), curr_ptr(0) {
+      for (size_t i = 0; i < sampler.size(); ++i) {
+        const uint32_t seed = ctx.dev_id + i * kMaxNumGPUs + global_seed * kRandMagic;
+        resource[i].var = Engine::Get()->NewVariable();
+        common::random::RandGenerator<xpu> *r = new common::random::RandGenerator<xpu>();
+        common::random::RandGenerator<xpu>::AllocState(r);
+        Engine::Get()->PushSync(
+        [r, seed](RunContext rctx) {
+          r->Seed(rctx.get_stream<xpu>(), seed);
+        }, ctx, {}, {resource[i].var},
+        FnProperty::kNormal, 0, PROFILER_MESSAGE("ResourceNativeRandomSetSeed"));
+        sampler[i] = r;
+        resource[i].ptr_ = sampler[i];
+        resource[i].req = ResourceRequest(ResourceRequest::kParallelRandom);
+      }
+    }
+    ~ResourceNativeRandom() {
+      for (size_t i = 0; i < sampler.size(); ++i) {
+        common::random::RandGenerator<xpu> *r = sampler[i];
+        Engine::Get()->DeleteVariable(
+        [r](RunContext rctx) {
+          MSHADOW_CATCH_ERROR(common::random::RandGenerator<xpu>::FreeState(r));
+          MSHADOW_CATCH_ERROR(delete r);
+        }, ctx, resource[i].var);
+      }
+    }
+    // set seed to a sampler
+    inline void Seed(uint32_t global_seed) {
+      for (size_t i = 0; i < sampler.size(); ++i) {
+        const uint32_t seed = ctx.dev_id + i * kMaxNumGPUs + global_seed * kRandMagic;
+        common::random::RandGenerator<xpu> *r = sampler[i];
+        Engine::Get()->PushAsync(
+        [r, seed](RunContext rctx, Engine::CallbackOnComplete on_complete) {
+          r->Seed(rctx.get_stream<xpu>(), seed);
+          on_complete();
+        }, ctx, {}, {resource[i].var},
+        FnProperty::kNormal, 0, PROFILER_MESSAGE("ResourceNativeRandomSetSeed"));
+      }
+      // reset pointer to ensure the same result with the same seed.
+      curr_ptr.store(0);
+    }
+    // get next resource in round roubin matter
+    inline Resource GetNext() {
+      const size_t kMaxDigit = std::numeric_limits<size_t>::max() / 2;
+      size_t ptr = ++curr_ptr;
+      // reset ptr to avoid undefined behavior during overflow
+      // usually this won't happen
+      if (ptr > kMaxDigit) {
+        curr_ptr.store((ptr + 1) % sampler.size());
+      }
+      return resource[ptr % sampler.size()];
+    }
+  };
+
   /*! \brief number of copies in CPU temp space */
   int cpu_temp_space_copy_;
   /*! \brief number of copies in GPU temp space */
   int gpu_temp_space_copy_;
+  /*! \brief number of copies in CPU native random sampler */
+  int cpu_native_rand_copy_;
+  /*! \brief number of copies in GPU native random sampler */
+  int gpu_native_rand_copy_;
   /*! \brief Reference to the engine */
   std::shared_ptr<Engine> engine_ref_;
   /*! \brief Reference to the storage */
@@ -255,11 +345,15 @@ class ResourceManagerImpl : public ResourceManager {
   std::unique_ptr<ResourceRandom<cpu> > cpu_rand_;
   /*! \brief CPU temp space resources */
   std::unique_ptr<ResourceTempSpace> cpu_space_;
+  /*! \brief CPU native random number resources */
+  std::unique_ptr<ResourceNativeRandom<cpu> > cpu_native_rand_;
 #if MXNET_USE_CUDA
   /*! \brief random number generator for GPU */
   common::LazyAllocArray<ResourceRandom<gpu> > gpu_rand_;
   /*! \brief temp space for GPU */
   common::LazyAllocArray<ResourceTempSpace> gpu_space_;
+  /*! \brief GPU native (on device) random number resources */
+  common::LazyAllocArray<ResourceNativeRandom<gpu> > gpu_native_rand_;
 #endif
 };
 }  // namespace resource
diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py
index 08302b8..db5508d 100644
--- a/tests/python/unittest/test_module.py
+++ b/tests/python/unittest/test_module.py
@@ -613,7 +613,7 @@ def test_factorization_machine_module(verbose=False):
             expected_accuracy = 0.02
         elif optimizer == 'adam':
             # use Sparse Adam to train
-            adam = mx.optimizer.Adam(clip_gradient=5.0, learning_rate=0.001,
+            adam = mx.optimizer.Adam(clip_gradient=5.0, learning_rate=0.0005,
                                      rescale_grad=1.0/batch_size)
             mod.init_optimizer(optimizer=adam)
             if num_epochs is None:
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index d05e325..0230d5f 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -330,8 +330,8 @@ def check_softmax_with_shape(shape, xpu, preserve_shape=False):
     X = mx.symbol.Variable('X')
     L = mx.symbol.Variable('L')
     Y = mx.symbol.SoftmaxOutput(data=X, label=L, preserve_shape=preserve_shape)
-    x = mx.random.uniform(-1, 1, shape, ctx=mx.cpu()).copyto(xpu)
-    l = mx.random.uniform(-1, 1, shape, ctx=mx.cpu()).copyto(xpu)
+    x = mx.random.uniform(-1, 1, shape, ctx=xpu)
+    l = mx.random.uniform(-1, 1, shape, ctx=xpu)
     l[:] = np_softmax(l.asnumpy())
     grad = mx.nd.empty(shape, ctx = xpu)
     exec1 = Y.bind(xpu, args = [x, l], args_grad = {'X': grad})
diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index 6178cbe..1aa2e22 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -666,7 +666,7 @@ def test_nadam():
     loss = Loss(output, l)
     loss = mx.sym.make_loss(loss)
     mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',))
-    mod.fit(data_iter, num_epoch=30, optimizer_params={'learning_rate': 0.005, 'wd': 0.0005},
+    mod.fit(data_iter, num_epoch=60, optimizer_params={'learning_rate': 0.0005, 'wd': 0.0005},
             initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss(),
             optimizer='nadam')
     assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.1
diff --git a/tests/python/unittest/test_random.py b/tests/python/unittest/test_random.py
index a67e2d1..0efe8e6 100644
--- a/tests/python/unittest/test_random.py
+++ b/tests/python/unittest/test_random.py
@@ -17,12 +17,16 @@
 
 import os
 import mxnet as mx
+from mxnet.test_utils import verify_generator, gen_buckets_probs_with_ppf
 import numpy as np
+import scipy.stats as ss
 
 def same(a, b):
     return np.sum(a != b) == 0
 
 def check_with_device(device, dtype):
+    # The thresholds chosen for the tests are too loose. We will rely on the other tests to test the samples from the
+    #  generators.
     tol = 0.1
     symbols = [
         {
@@ -216,6 +220,124 @@ def test_sample_multinomial():
             real_dx[y[i][j]] += 5.0 / rprob[j]
         mx.test_utils.assert_almost_equal(real_dx, dx.asnumpy()[i])
 
+# Test the generators with the chi-square testing
+def test_normal_generator():
+    ctx = mx.context.current_context()
+    for dtype in ['float16', 'float32', 'float64']:
+        for mu, sigma in [(0.0, 1.0), (1.0, 5.0)]:
+            print("ctx=%s, dtype=%s, Mu=%g, Sigma=%g:" % (ctx, dtype, mu, sigma))
+            buckets, probs = gen_buckets_probs_with_ppf(lambda x: ss.norm.ppf(x, mu, sigma), 5)
+            generator_mx = lambda x: mx.nd.random.normal(mu, sigma, shape=x, ctx=ctx, dtype=dtype).asnumpy()
+            verify_generator(generator=generator_mx, buckets=buckets, probs=probs)
+            generator_mx_same_seed =\
+                lambda x: np.concatenate(
+                    [mx.nd.random.normal(mu, sigma, shape=x // 10, ctx=ctx, dtype=dtype).asnumpy()
+                     for _ in range(10)])
+            verify_generator(generator=generator_mx_same_seed, buckets=buckets, probs=probs)
+
+def test_uniform_generator():
+    ctx = mx.context.current_context()
+    for dtype in ['float16', 'float32', 'float64']:
+        for low, high in [(-1.0, 1.0), (1.0, 3.0)]:
+            print("ctx=%s, dtype=%s, Low=%g, High=%g:" % (ctx, dtype, low, high))
+            buckets, probs = gen_buckets_probs_with_ppf(lambda x: ss.uniform.ppf(x, loc=low, scale=high - low), 5)
+            generator_mx = lambda x: mx.nd.random.uniform(low, high, shape=x, ctx=ctx, dtype=dtype).asnumpy()
+            verify_generator(generator=generator_mx, buckets=buckets, probs=probs)
+            generator_mx_same_seed = \
+                lambda x: np.concatenate(
+                    [mx.nd.random.uniform(low, high, shape=x // 10, ctx=ctx, dtype=dtype).asnumpy()
+                     for _ in range(10)])
+            verify_generator(generator=generator_mx_same_seed, buckets=buckets, probs=probs)
+
+def test_gamma_generator():
+    ctx = mx.context.current_context()
+    for dtype in ['float16', 'float32', 'float64']:
+        for kappa, theta in [(0.5, 1.0), (1.0, 5.0)]:
+            print("ctx=%s, dtype=%s, Shape=%g, Scale=%g:" % (ctx, dtype, kappa, theta))
+            buckets, probs = gen_buckets_probs_with_ppf(lambda x: ss.gamma.ppf(x, a=kappa, loc=0, scale=theta), 5)
+            generator_mx = lambda x: mx.nd.random.gamma(kappa, theta, shape=x, ctx=ctx, dtype=dtype).asnumpy()
+            verify_generator(generator=generator_mx, buckets=buckets, probs=probs)
+            generator_mx_same_seed = \
+                lambda x: np.concatenate(
+                    [mx.nd.random.gamma(kappa, theta, shape=x // 10, ctx=ctx, dtype=dtype).asnumpy()
+                     for _ in range(10)])
+            verify_generator(generator=generator_mx_same_seed, buckets=buckets, probs=probs)
+
+def test_exponential_generator():
+    ctx = mx.context.current_context()
+    for dtype in ['float16', 'float32', 'float64']:
+        for scale in [0.1, 1.0]:
+            print("ctx=%s, dtype=%s, Scale=%g:" % (ctx, dtype, scale))
+            buckets, probs = gen_buckets_probs_with_ppf(lambda x: ss.expon.ppf(x, loc=0, scale=scale), 5)
+            generator_mx = lambda x: mx.nd.random.exponential(scale, shape=x, ctx=ctx, dtype=dtype).asnumpy()
+            verify_generator(generator=generator_mx, buckets=buckets, probs=probs)
+            generator_mx_same_seed = \
+                lambda x: np.concatenate(
+                    [mx.nd.random.exponential(scale, shape=x // 10, ctx=ctx, dtype=dtype).asnumpy()
+                     for _ in range(10)])
+            verify_generator(generator=generator_mx_same_seed, buckets=buckets, probs=probs)
+
+def test_poisson_generator():
+    ctx = mx.context.current_context()
+    for dtype in ['float16', 'float32', 'float64']:
+        for lam in [1, 10]:
+            print("ctx=%s, dtype=%s, Lambda=%d:" % (ctx, dtype, lam))
+            buckets = [(-1.0, lam - 0.5), (lam - 0.5, 2 * lam + 0.5), (2 * lam + 0.5, np.inf)]
+            probs = [ss.poisson.cdf(bucket[1], lam) - ss.poisson.cdf(bucket[0], lam) for bucket in buckets]
+            generator_mx = lambda x: mx.nd.random.poisson(lam, shape=x, ctx=ctx, dtype=dtype).asnumpy()
+            verify_generator(generator=generator_mx, buckets=buckets, probs=probs)
+            generator_mx_same_seed = \
+                lambda x: np.concatenate(
+                    [mx.nd.random.poisson(lam, shape=x // 10, ctx=ctx, dtype=dtype).asnumpy()
+                     for _ in range(10)])
+            verify_generator(generator=generator_mx_same_seed, buckets=buckets, probs=probs)
+
+def test_negative_binomial_generator():
+    ctx = mx.context.current_context()
+    for dtype in ['float16', 'float32', 'float64']:
+        success_num = 2
+        success_prob = 0.2
+        print("ctx=%s, dtype=%s, Success Num=%d:, Success Prob=%g" % (ctx, dtype, success_num, success_prob))
+        buckets = [(-1.0, 2.5), (2.5, 5.5), (5.5, 8.5), (8.5, np.inf)]
+        probs = [ss.nbinom.cdf(bucket[1], success_num, success_prob) -
+                 ss.nbinom.cdf(bucket[0], success_num, success_prob) for bucket in buckets]
+        generator_mx = lambda x: mx.nd.random.negative_binomial(success_num, success_prob,
+                                                                shape=x, ctx=ctx, dtype=dtype).asnumpy()
+        verify_generator(generator=generator_mx, buckets=buckets, probs=probs)
+        generator_mx_same_seed = \
+            lambda x: np.concatenate(
+                [mx.nd.random.negative_binomial(success_num, success_prob, shape=x // 10, ctx=ctx, dtype=dtype).asnumpy()
+                 for _ in range(10)])
+        verify_generator(generator=generator_mx_same_seed, buckets=buckets, probs=probs)
+        # Also test the Gamm-Poisson Mixture
+        print('Gamm-Poisson Mixture Test:')
+        alpha = 1.0 / success_num
+        mu = (1.0 - success_prob) / success_prob / alpha
+        generator_mx = lambda x: mx.nd.random.generalized_negative_binomial(mu, alpha,
+                                                                            shape=x, ctx=ctx, dtype=dtype).asnumpy()
+        verify_generator(generator=generator_mx, buckets=buckets, probs=probs)
+        generator_mx_same_seed = \
+            lambda x: np.concatenate(
+                [mx.nd.random.generalized_negative_binomial(mu, alpha, shape=x // 10, ctx=ctx, dtype=dtype).asnumpy()
+                 for _ in range(10)])
+        verify_generator(generator=generator_mx_same_seed, buckets=buckets, probs=probs)
+
+def test_multinomial_generator():
+    ctx = mx.context.current_context()
+    probs = [0.1, 0.2, 0.3, 0.05, 0.15, 0.2]
+    buckets = list(range(6))
+    for dtype in ['float16', 'float32', 'float64']:
+        print("ctx=%s, dtype=%s" %(ctx, dtype))
+        generator_mx = lambda x: mx.nd.random.multinomial(data=mx.nd.array(np.array(probs), ctx=ctx, dtype=dtype),
+                                                          shape=x).asnumpy()
+        verify_generator(generator_mx, buckets, probs)
+        generator_mx_same_seed = \
+            lambda x: np.concatenate(
+                [mx.nd.random.multinomial(data=mx.nd.array(np.array(probs), ctx=ctx, dtype=dtype),
+                                                          shape=x // 10).asnumpy()
+                 for _ in range(10)])
+        verify_generator(generator=generator_mx_same_seed, buckets=buckets, probs=probs)
+
 
 if __name__ == '__main__':
     import nose

-- 
To stop receiving notification emails like this one, please contact
['"commits@mxnet.apache.org" <co...@mxnet.apache.org>'].