Posted to commits@mxnet.apache.org by zh...@apache.org on 2020/03/10 03:37:29 UTC

[incubator-mxnet] branch master updated: Implement storage tagging, the first half of the memory profiler (#17656)

This is an automated email from the ASF dual-hosted git repository.

zhasheng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet.git


The following commit(s) were added to refs/heads/master by this push:
     new 4dddb08  Implement storage tagging, the first half of the memory profiler (#17656)
4dddb08 is described below

commit 4dddb087293965af7e7af8d0eaead60a9051f0fc
Author: Bojian Zheng <bo...@mail.utoronto.ca>
AuthorDate: Mon Mar 9 23:36:30 2020 -0400

    Implement storage tagging, the first half of the memory profiler (#17656)
---
 CMakeLists.txt                                  |  14 +++
 Makefile                                        |   7 ++
 ci/docker/runtime_functions.sh                  |   8 +-
 cmake/Modules/FindNVML.cmake                    |  84 +++++++++++++
 include/mxnet/c_api.h                           |   7 ++
 include/mxnet/libinfo.h                         |   4 +
 include/mxnet/ndarray.h                         |  11 +-
 include/mxnet/resource.h                        |  70 ++++++++---
 include/mxnet/storage.h                         |  11 +-
 python/mxnet/gluon/block.py                     |  24 +++-
 python/mxnet/operator.py                        |   8 +-
 python/mxnet/optimizer/updater.py               |   4 +-
 python/mxnet/profiler.py                        |  52 ++++++++
 python/mxnet/symbol/register.py                 |  11 ++
 python/mxnet/symbol/symbol.py                   |  11 +-
 src/c_api/c_api.cc                              |  34 ++++--
 src/c_api/c_api_ndarray.cc                      |   5 +
 src/c_api/c_api_profile.cc                      |  19 +++
 src/c_api/c_api_symbolic.cc                     |  14 ++-
 src/common/cuda_utils.h                         |  19 +++
 src/common/utils.h                              |  20 ++-
 src/executor/graph_executor.cc                  |  36 +++++-
 src/imperative/cached_op.cc                     |  40 ++++++
 src/imperative/cached_op.h                      |   8 +-
 src/imperative/imperative_utils.h               |  23 +++-
 src/io/iter_image_recordio_2.cc                 |   7 ++
 src/kvstore/comm.h                              |  19 ++-
 src/kvstore/comm_tree.h                         |  12 +-
 src/kvstore/kvstore_local.h                     |  22 ++++
 src/ndarray/ndarray.cc                          |  27 +++-
 src/operator/linalg_impl.h                      |  47 ++++---
 src/operator/nn/cudnn/cudnn_convolution-inl.h   |   5 +-
 src/operator/nn/cudnn/cudnn_deconvolution-inl.h |   5 +-
 src/operator/rnn-inl.h                          |   4 +
 src/profiler/profiler.cc                        |  18 +++
 src/profiler/profiler.h                         |  14 ++-
 src/profiler/storage_profiler.cc                | 130 ++++++++++++++++++++
 src/profiler/storage_profiler.h                 |  95 ++++++++++++++-
 src/resource.cc                                 |  30 +++--
 src/storage/gpu_device_storage.h                |   8 +-
 src/storage/pinned_memory_storage.h             |   5 +
 src/storage/pooled_storage_manager.h            |  11 ++
 src/storage/storage.cc                          |   2 +-
 tests/python/unittest/test_profiler.py          | 156 +++++++++++++++++++++++-
 tests/python/unittest/test_symbol.py            |   3 +-
 tests/python/unittest/test_thread_local.py      |   1 +
 46 files changed, 1066 insertions(+), 99 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 037a692..5806883 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,6 +33,7 @@ Format: Auto | Common | All | LIST(ARCH_AND_PTX ...)
 option(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
 option(USE_OPENCV "Build with OpenCV support" ON)
 option(USE_OPENMP "Build with Openmp support" ON)
+cmake_dependent_option(USE_NVML "Build with nvml support if found" ON "USE_CUDA" OFF)
 cmake_dependent_option(USE_CUDNN "Build with cudnn support" ON "USE_CUDA" OFF) # one could set CUDNN_ROOT for search path
 cmake_dependent_option(USE_NVTX "Build with nvtx support if found" ON "USE_CUDA" OFF)
 cmake_dependent_option(USE_SSE "Build with x86 SSE instruction support" ON "NOT ARM" OFF)
@@ -630,6 +631,19 @@ if(USE_CUDA)
   list(APPEND SOURCE ${CUDA})
   add_definitions(-DMXNET_USE_CUDA=1)
 
+  if(UNIX)
+    if(USE_NVML)
+      find_package(NVML)
+      if(NVML_FOUND)
+        include_directories(${NVML_INCLUDE_DIRS})
+        list(APPEND mxnet_LINKER_LIBS ${NVML_LIBRARIES})
+        add_definitions(-DMXNET_USE_NVML=1)
+      else()
+        add_definitions(-DMXNET_USE_NVML=0)
+        message(WARNING "Could not find NVML libraries")
+      endif()
+    endif()
+  endif()
   if(USE_NCCL)
     find_package(NCCL)
     if(NCCL_FOUND)
diff --git a/Makefile b/Makefile
index 90303ae..49ba8fe 100644
--- a/Makefile
+++ b/Makefile
@@ -526,6 +526,12 @@ ifeq ($(USE_CUDA), 1)
 	# Make sure to add stubs as fallback in order to be able to build
 	# without full CUDA install (especially if run without nvidia-docker)
 	LDFLAGS += -L/usr/local/cuda/lib64/stubs
+	ifeq ($(USE_NVML), 1)
+		LDFLAGS += -lnvidia-ml
+		CFLAGS += -DMXNET_USE_NVML=1
+	else
+		CFLAGS += -DMXNET_USE_NVML=0
+	endif
 	ifeq ($(USE_NCCL), 1)
 		ifneq ($(USE_NCCL_PATH), NONE)
 			CFLAGS += -I$(USE_NCCL_PATH)/include
@@ -537,6 +543,7 @@ ifeq ($(USE_CUDA), 1)
 		CFLAGS += -DMXNET_USE_NCCL=0
 	endif
 else
+	CFLAGS += -DMXNET_USE_NVML=0
 	CFLAGS += -DMXNET_USE_NCCL=0
 endif
 
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index c5ecdd7..ffd871a 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -701,7 +701,7 @@ build_ubuntu_gpu_mkldnn() {
     set -ex
     cd /work/build
     cmake \
-        -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
+        -DCMAKE_BUILD_TYPE=Release \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_TVM_OP=ON \
         -DUSE_CUDA=ON \
@@ -715,7 +715,7 @@ build_ubuntu_gpu_mkldnn_nocudnn() {
     set -ex
     cd /work/build
     cmake \
-        -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
+        -DCMAKE_BUILD_TYPE=Release \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_TVM_OP=ON \
         -DUSE_CUDA=ON \
@@ -730,7 +730,7 @@ build_ubuntu_gpu_cuda101_cudnn7() {
     set -ex
     cd /work/build
     cmake \
-        -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
+        -DCMAKE_BUILD_TYPE=Release \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_TVM_OP=ON \
         -DUSE_CUDA=ON \
@@ -786,7 +786,7 @@ build_ubuntu_gpu_cuda101_cudnn7_no_tvm_op() {
     set -ex
     cd /work/build
     cmake \
-        -DCMAKE_BUILD_TYPE="RelWithDebInfo" \
+        -DCMAKE_BUILD_TYPE=Release \
         -DUSE_MKL_IF_AVAILABLE=OFF \
         -DUSE_TVM_OP=OFF \
         -DUSE_CUDA=ON \
diff --git a/cmake/Modules/FindNVML.cmake b/cmake/Modules/FindNVML.cmake
new file mode 100644
index 0000000..e098132
--- /dev/null
+++ b/cmake/Modules/FindNVML.cmake
@@ -0,0 +1,84 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Find the nvml libraries
+#
+# The following variables are optionally searched for defaults
+#  NVML_ROOT_DIR: Base directory where all NVML components are found
+#  NVML_INCLUDE_DIR: Directory where NVML header is found
+#  NVML_LIB_DIR: Directory where NVML library is found
+#
+# The following are set after configuration is done:
+#  NVML_FOUND
+#  NVML_INCLUDE_DIRS
+#  NVML_LIBRARIES
+#
+# The path hints include CUDA_TOOLKIT_ROOT_DIR seeing as some folks
+# install NVML in the same location as the CUDA toolkit.
+# See https://github.com/caffe2/caffe2/issues/1601
+
+if ($ENV{NVML_ROOT_DIR})
+  message(WARNING "NVML_ROOT_DIR is deprecated. Please set NVML_ROOT instead.")
+endif()
+
+find_path(NVML_INCLUDE_DIRS
+  NAMES nvml.h
+  HINTS
+  ${NVML_INCLUDE_DIR}
+  ${NVML_ROOT_DIR}
+  ${NVML_ROOT_DIR}/include
+  ${CUDA_TOOLKIT_ROOT_DIR}/include
+  $ENV{NVML_DIR}/include
+  )
+
+find_library(NVML_LIBRARIES
+  NAMES nvidia-ml
+  HINTS
+  ${NVML_LIB_DIR}
+  ${NVML_ROOT_DIR}
+  ${NVML_ROOT_DIR}/lib
+  ${NVML_ROOT_DIR}/lib/x86_64-linux-gnu
+  ${NVML_ROOT_DIR}/lib64
+  ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs
+  $ENV{NVML_DIR}/lib
+  )
+
+# if not found in any of the above paths, finally, check in the /usr/local/cuda for UNIX systems
+if (UNIX)
+  set (search_paths "/usr/local/cuda")
+
+  find_path(NVML_INCLUDE_DIRS
+    NAMES nvml.h
+    PATHS ${search_paths}
+    PATH_SUFFIXES include
+  )
+
+  find_library(NVML_LIBRARIES
+    NAMES nvidia-ml
+    PATHS ${search_paths}
+    PATH_SUFFIXES lib64/stubs
+  )
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(NVML DEFAULT_MSG NVML_INCLUDE_DIRS NVML_LIBRARIES)
+
+if(NVML_FOUND)
+  message(STATUS "Found NVML (include: ${NVML_INCLUDE_DIRS}, library: ${NVML_LIBRARIES})")
+  mark_as_advanced(NVML_ROOT_DIR NVML_INCLUDE_DIRS NVML_LIBRARIES)
+endif()
+
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index bb2a568..efa0033 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -314,6 +314,13 @@ MXNET_DLL int MXSetProcessProfilerState(int state, int profile_process,
 MXNET_DLL int MXSetProfilerState(int state);
 
 /*!
+ * \brief Set the scope of profiler for current process
+ * \param scope indicate the working scope of profiler
+ * \return 0 when success, -1 when failure happens.
+ */
+MXNET_DLL int MXSetProfilerScope(const char* scope);
+
+/*!
  * \brief Save profile and stop profiler
  * \param finished true if stat output should stop after this point
  * \param profile_process an int,
diff --git a/include/mxnet/libinfo.h b/include/mxnet/libinfo.h
index 1972688..8b31b2d 100644
--- a/include/mxnet/libinfo.h
+++ b/include/mxnet/libinfo.h
@@ -55,6 +55,10 @@
 #define MXNET_USE_CUDNN MSHADOW_USE_CUDNN
 #endif
 
+#ifndef MXNET_USE_NVML
+#define MXNET_USE_NVML 0
+#endif
+
 #ifndef MXNET_USE_NCCL
 #define MXNET_USE_NCCL 0
 #endif
diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h
index c55e49e..fd7cc38 100644
--- a/include/mxnet/ndarray.h
+++ b/include/mxnet/ndarray.h
@@ -361,6 +361,9 @@ class NDArray {
     CheckAndAlloc();
     return ptr_->shandle;
   }
+  /*! \brief assign profiler scope and name to the storage handles */
+  void AssignStorageInfo(const std::string& profiler_scope,
+                         const std::string& name);
   /*!
    * \brief Block until all the pending write operations with respect
    *    to current NDArray are finished, and read can be performed.
@@ -989,7 +992,7 @@ class NDArray {
     /*! \brief check if delay alloc is on, do alloc if not yet done */
     inline void CheckAndAlloc(void) {
       if (delay_alloc) {
-        shandle = Storage::Get()->Alloc(shandle.size, shandle.ctx);
+        Storage::Get()->Alloc(&shandle);
 #if MXNET_USE_MKLDNN == 1
         mkl_mem_ = nullptr;
 #endif
@@ -1004,7 +1007,8 @@ class NDArray {
           << "CheckAndAlloc(dbytes) is only intended for kDefaultStorage";
       dbytes = std::max(dbytes, static_cast<uint64_t>(shandle.size));
       if (delay_alloc) {
-        shandle = Storage::Get()->Alloc(dbytes, shandle.ctx);
+        shandle.size = dbytes;
+        Storage::Get()->Alloc(&shandle);
 #if MXNET_USE_MKLDNN == 1
         mkl_mem_ = nullptr;
 #endif
@@ -1013,7 +1017,8 @@ class NDArray {
         // free storage
         Storage::Get()->Free(shandle);
         // init storage
-        shandle = Storage::Get()->Alloc(dbytes, shandle.ctx);
+        shandle.size = dbytes;
+        Storage::Get()->Alloc(&shandle);
 #if MXNET_USE_MKLDNN == 1
         mkl_mem_ = nullptr;
 #endif
diff --git a/include/mxnet/resource.h b/include/mxnet/resource.h
index 0114c3b..d766faa 100644
--- a/include/mxnet/resource.h
+++ b/include/mxnet/resource.h
@@ -26,6 +26,7 @@
 #define MXNET_RESOURCE_H_
 
 #include <dmlc/logging.h>
+#include <string>
 #include "./base.h"
 #include "./engine.h"
 #include "./random_generator.h"
@@ -62,6 +63,28 @@ struct ResourceRequest {
       : type(type) {}
 };
 
+namespace {
+/// \brief Given a path, extract the filename.
+inline std::string __extract_fname(const std::string& path) {
+  std::size_t last_dir_pos = path.find_last_of("/\\");
+  if (last_dir_pos == std::string::npos) {
+    return path;
+  }
+  return path.substr(last_dir_pos + 1);
+}
+}  // anonymous namespace
+
+#if (defined(__GNUC__) || defined(__GNUG__)) && !defined(__clang__)
+#define MXNET_RESOURCE_DEFAULT_NAME_FARG(tag) \
+    std::string(tag) \
+    + " (" + __extract_fname(__builtin_FILE()) \
+    + " +" +  std::to_string(__builtin_LINE()) + ")"
+#else  // !__GNUC__ || __clang__
+#define MXNET_RESOURCE_DEFAULT_NAME_FARG(tag) \
+    std::string(tag) \
+    + " (" + __extract_fname(__FILE__) \
+    + " +" +  std::to_string(__LINE__) + ")"
+#endif  // __GNUC__ && !__clang__
 
 /*!
  * \brief Resources used by mxnet operations.
@@ -120,16 +143,18 @@ struct Resource {
    *  when running on device, so the launched kernels that depend on the temp space
    *  can finish correctly.
    *
-   * \param shape the Shape of returning tensor.
-   * \param stream the stream of retruning tensor.
+   * \param shape   the shape of returning tensor.
+   * \param stream  the stream of returning tensor.
+   * \param name    the name of the operator requesting the resource.
    * \return the mshadow tensor requested.
-   * \tparam xpu the device type of random number generator.
-   * \tparam ndim the number of dimension of the tensor requested.
+   * \tparam xpu   the device type of random number generator.
+   * \tparam ndim  the number of dimension of the tensor requested.
    */
   template<typename xpu, int ndim>
   inline mshadow::Tensor<xpu, ndim, real_t> get_space(
-      mshadow::Shape<ndim> shape, mshadow::Stream<xpu> *stream) const {
-    return get_space_typed<xpu, ndim, real_t>(shape, stream);
+      mshadow::Shape<ndim> shape, mshadow::Stream<xpu> *stream,
+      const std::string &name = MXNET_RESOURCE_DEFAULT_NAME_FARG("temp_space")) const {
+    return get_space_typed<xpu, ndim, real_t>(shape, stream, name);
   }
   /*!
    * \brief Get cpu space requested as mshadow Tensor.
@@ -148,33 +173,37 @@ struct Resource {
    * \brief Get space requested as mshadow Tensor in specified type.
    *  The caller can request arbitrary size.
    *
-   * \param shape the Shape of returning tensor.
-   * \param stream the stream of retruning tensor.
+   * \param shape   the shape of returning tensor.
+   * \param stream  the stream of returning tensor.
+   * \param name    the name of the operator requesting the resource.
    * \return the mshadow tensor requested.
-   * \tparam xpu the device type of random number generator.
-   * \tparam ndim the number of dimension of the tensor requested.
+   * \tparam xpu   the device type of random number generator.
+   * \tparam ndim  the number of dimension of the tensor requested.
    */
   template<typename xpu, int ndim, typename DType>
   inline mshadow::Tensor<xpu, ndim, DType> get_space_typed(
-      mshadow::Shape<ndim> shape, mshadow::Stream<xpu> *stream) const {
+      mshadow::Shape<ndim> shape, mshadow::Stream<xpu> *stream,
+      const std::string &name = MXNET_RESOURCE_DEFAULT_NAME_FARG("temp_space")) const {
     CHECK_EQ(req.type, ResourceRequest::kTempSpace);
     return mshadow::Tensor<xpu, ndim, DType>(
-        reinterpret_cast<DType*>(get_space_internal(shape.Size() * sizeof(DType))),
+        reinterpret_cast<DType*>(get_space_internal(
+          shape.Size() * sizeof(DType), name)),
         shape, shape[ndim - 1], stream);
   }
 #if MXNET_USE_CUDNN == 1
   /*!
-   * \brief Get cudnn dropout descriptor from shared state space.
+   * \brief Get cuDNN dropout descriptor from shared state space.
    *
-   * \param dropout_desc reference to previously created cudnn dropout descriptor.
-   * \param stream the stream of retruning tensor.
+   * \param dropout_desc  reference to previously created cuDNN dropout descriptor.
+   * \param stream  the stream of returning tensor.
+   * \param name    the name of the operator requesting the resource.
    * \return the mshadow tensor requested.
    */
   void get_cudnn_dropout_desc(
-      cudnnDropoutDescriptor_t* dropout_desc,
+      cudnnDropoutDescriptor_t *dropout_desc,
       mshadow::Stream<gpu> *stream,
-      const float dropout,
-      uint64_t seed) const;
+      const float dropout, uint64_t seed,
+      const std::string &name = MXNET_RESOURCE_DEFAULT_NAME_FARG("cudnn_dropout_state")) const;
 #endif  // MXNET_USE_CUDNN == 1
 
   /*!
@@ -195,10 +224,11 @@ struct Resource {
   }
   /*!
    * \brief internal function to get space from resources.
-   * \param size The size of the space.
+   * \param size the size of the space.
+   * \param name the name of the operator requesting the resource.
    * \return The allocated space.
    */
-  void* get_space_internal(size_t size) const;
+  void* get_space_internal(size_t size, const std::string &name) const;
   /*!
    * \brief internal function to get cpu space from resources.
    * \param size The size of space.
diff --git a/include/mxnet/storage.h b/include/mxnet/storage.h
index 4d1fc3d..0e90d17 100644
--- a/include/mxnet/storage.h
+++ b/include/mxnet/storage.h
@@ -26,10 +26,14 @@
 #define MXNET_STORAGE_H_
 
 #include <memory>
+#include <string>
 #include "./base.h"
 
 namespace mxnet {
 
+#define MXNET_STORAGE_DEFAULT_PROFILER_SCOPE_CSTR  "<unk>:"
+#define MXNET_STORAGE_DEFAULT_NAME_CSTR  "unknown"
+
 /*!
  * \brief Storage manager across multiple devices.
  */
@@ -55,7 +59,12 @@ class Storage {
      * \brief Id for IPC shared memory
      */
     int shared_pid{-1};
-    int shared_id{-1};
+    int shared_id {-1};
+    /*!
+     * \brief Attributes for tracking storage allocations.
+     */
+    std::string profiler_scope{MXNET_STORAGE_DEFAULT_PROFILER_SCOPE_CSTR};
+    std::string name{MXNET_STORAGE_DEFAULT_NAME_CSTR};
   };
   /*!
    * \brief Allocate a new contiguous memory for a given size.
diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py
index e925b31..a944880 100644
--- a/python/mxnet/gluon/block.py
+++ b/python/mxnet/gluon/block.py
@@ -32,6 +32,7 @@ from .. import symbol, ndarray, initializer, np_symbol
 from ..symbol import Symbol
 from ..ndarray import NDArray
 from .. import name as _name
+from .. import profiler as _profiler
 from .parameter import Parameter, ParameterDict, DeferredInitializationError
 from .utils import _indent, _brief_print_list, HookHandle
 from .utils import _check_same_symbol_type, _check_all_np_ndarrays
@@ -53,18 +54,24 @@ class _BlockScope(object):
 
     @staticmethod
     def create(prefix, params, hint):
-        """Creates prefix and params for new `Block`."""
+        """
+        Creates prefix, params, and profiler scope name for new `Block`.
+        The profiler scope name is used by the GPU memory profiler.
+        """
         current = getattr(_BlockScope._current, "value", None)
         if current is None:
             if prefix is None:
                 if not hasattr(_name.NameManager._current, "value"):
                     _name.NameManager._current.value = _name.NameManager()
                 prefix = _name.NameManager._current.value.get(None, hint) + '_'
+            # replace the trailing underscore with colon
+            profiler_scope_name = (prefix[:-1] if prefix.endswith('_') \
+                                   else prefix) + ":"
             if params is None:
                 params = ParameterDict(prefix)
             else:
                 params = ParameterDict(params.prefix, params)
-            return prefix, params
+            return prefix, params, profiler_scope_name
 
         if prefix is None:
             count = current._counter.get(hint, 0)
@@ -75,7 +82,11 @@ class _BlockScope(object):
             params = ParameterDict(parent.prefix+prefix, parent._shared)
         else:
             params = ParameterDict(params.prefix, params)
-        return current._block.prefix+prefix, params
+        # replace the trailing underscore with colon
+        profiler_scope_name = (prefix[:-1] if prefix.endswith('_') \
+                               else prefix) + ":"
+        return current._block.prefix + prefix, params, \
+               current._block._profiler_scope_name + profiler_scope_name
 
     def __enter__(self):
         if self._block._empty_prefix:
@@ -84,6 +95,8 @@ class _BlockScope(object):
         _BlockScope._current.value = self
         self._name_scope = _name.Prefix(self._block.prefix)
         self._name_scope.__enter__()
+        self._profiler_scope = _profiler.Scope(self._block._profiler_scope_name)
+        self._profiler_scope.__enter__()
         return self
 
     def __exit__(self, ptype, value, trace):
@@ -91,6 +104,8 @@ class _BlockScope(object):
             return
         self._name_scope.__exit__(ptype, value, trace)
         self._name_scope = None
+        self._profiler_scope.__exit__(ptype, value, trace)
+        self._profiler_scope = None
         _BlockScope._current.value = self._old_scope
 
 
@@ -274,7 +289,8 @@ class Block(object):
     """
     def __init__(self, prefix=None, params=None):
         self._empty_prefix = prefix == ''
-        self._prefix, self._params = _BlockScope.create(prefix, params, self._alias())
+        self._prefix, self._params, self._profiler_scope_name = \
+                _BlockScope.create(prefix, params, self._alias())
         self._name = self._prefix[:-1] if self._prefix.endswith('_') else self._prefix
         self._scope = _BlockScope(self)
         self._children = OrderedDict()
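
For Gluon, the profiler scope is entered together with the block's name scope, so no
explicit profiler.Scope is needed. A brief sketch, assuming the usual auto-generated
prefixes (the scope name in the comment follows the trailing-underscore-to-colon rule
in _BlockScope.create above and is only illustrative):

    from mxnet import gluon

    net = gluon.nn.HybridSequential()       # prefix e.g. "hybridsequential0_"
    with net.name_scope():
        # Entering the name scope also enters the matching profiler scope
        # (e.g. "hybridsequential0:"), so symbols and arrays created here
        # are tagged with it rather than with the default "<unk>:".
        net.add(gluon.nn.Dense(128, activation='relu'))
        net.add(gluon.nn.Dense(10))
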
diff --git a/python/mxnet/operator.py b/python/mxnet/operator.py
index eff7320..04efbee 100644
--- a/python/mxnet/operator.py
+++ b/python/mxnet/operator.py
@@ -733,7 +733,13 @@ def register(reg_name):
         def creator(op_type, argc, keys, vals, ret):
             """internal function"""
             assert py_str(op_type) == reg_name
-            kwargs = dict([(py_str(keys[i]), py_str(vals[i])) for i in range(argc)])
+            kwargs = {}
+            for i in range(argc):
+                key = py_str(keys[i])
+                if key not in ['__ctx_group__', '__lr_mult__', '__wd_mult__',
+                               '__force_mirroring__',
+                               '__mirror_stage__', '__profiler_scope__']:
+                    kwargs[key] = py_str(vals[i])
             op_prop = prop_cls(**kwargs)
 
             def infer_shape_entry(num_tensor, tensor_dims,
diff --git a/python/mxnet/optimizer/updater.py b/python/mxnet/optimizer/updater.py
index 62b7004..a969614 100644
--- a/python/mxnet/optimizer/updater.py
+++ b/python/mxnet/optimizer/updater.py
@@ -21,6 +21,7 @@ import pickle
 import numpy
 from ..base import py_str
 from ..ndarray import NDArray
+from ..profiler import Scope
 from ..util import is_np_array
 from .utils import _as_classic
 
@@ -54,7 +55,8 @@ class Updater(object):
                 indices[i] = py_str(idx)
                 idx = indices[i]
             if idx not in self.states:
-                self.states[idx] = self.optimizer.create_state_multi_precision(idx, weights[i])
+                with Scope("updater:optimizer_state"):
+                    self.states[idx] = self.optimizer.create_state_multi_precision(idx, weights[i])
                 self.states_synced[idx] = True
             elif not self.states_synced[idx]:
                 self.states[idx] = \
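
Optimizer state needs no user-side changes; as a rough sketch (the SGD setup below is
only an assumed usage), state arrays created lazily by the updater land in the
"updater:optimizer_state" scope:

    import mxnet as mx

    opt = mx.optimizer.SGD(learning_rate=0.1, momentum=0.9)
    updater = mx.optimizer.get_updater(opt)
    weight = mx.nd.ones((10,), ctx=mx.gpu(0))
    grad = mx.nd.ones((10,), ctx=mx.gpu(0))
    # The first call creates the momentum state inside
    # Scope("updater:optimizer_state"), so it is tagged accordingly.
    updater(0, grad, weight)
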
diff --git a/python/mxnet/profiler.py b/python/mxnet/profiler.py
index 1a12f6f..3b8830d 100644
--- a/python/mxnet/profiler.py
+++ b/python/mxnet/profiler.py
@@ -20,6 +20,7 @@
 # pylint: disable=too-many-branches, too-many-statements
 """Profiler setting methods."""
 import ctypes
+import threading
 import warnings
 from .base import _LIB, check_call, c_str, ProfileHandle, c_str_array, py_str, KVStoreHandle
 
@@ -36,6 +37,8 @@ def set_config(**kwargs):
     ----------
     filename : string,
         output file for profile data
+    gpu_memory_profile_filename_prefix : string
+        filename prefix for the GPU memory profile
     profile_all : boolean,
         all profile types enabled
     profile_symbolic : boolean,
@@ -497,3 +500,52 @@ class Marker(object):
             Default is `process`.
         """
         check_call(_LIB.MXProfileSetMarker(self.domain.handle, c_str(self.name), c_str(scope)))
+
+
+class Scope(object):
+    """
+    `profiler.Scope` assigns the profiler scope used by the GPU memory profiler
+    to tag memory allocations. It is entered implicitly when the Gluon API is used.
+
+    Parameters
+    ----------
+    name : str
+        Name of the profiler scope.
+    append_mode : bool
+        Whether to prepend the enclosing (old) profiler scope to this one.
+    """
+    _current = threading.local()
+
+    def __init__(self, name='<unk>:', append_mode=False):
+        self._name = name + ":" if not name.endswith(":") else name
+        self._old_scope = None
+        if append_mode:
+            if not hasattr(Scope._current, "value"):
+                Scope._current.value = Scope()
+            self._name = Scope._current.value.name + self._name
+
+    def __enter__(self):
+        if not hasattr(Scope._current, "value"):
+            Scope._current.value = Scope()
+        self._old_scope = Scope._current.value
+        Scope._current.value = self
+        # Invoke the C API to propagate the profiler scope information to the
+        # C++ backend.
+        check_call(_LIB.MXSetProfilerScope(c_str(self.name)))
+        return self
+
+    def __exit__(self, ptype, value, trace):
+        assert self._old_scope
+        Scope._current.value = self._old_scope
+        # If the old profiler scope is also of type `profiler.Scope`, invoke the
+        # C API once again to recover the previous scope information. Otherwise,
+        # the default scope `<unk>:` will be set.
+        if isinstance(self._old_scope, Scope):
+            check_call(_LIB.MXSetProfilerScope(c_str(self._old_scope.name)))
+        else:
+            check_call(_LIB.MXSetProfilerScope(c_str("<unk>:")))
+
+    @property
+    def name(self):
+        return self._name
+
+# initialize the default profiler scope
+Scope._current.value = Scope()
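
As a rough sketch of how the new entry points could be used together (assuming a CUDA
build; the scope string "model:conv1:" and the file names are illustrative):

    import mxnet as mx
    from mxnet import profiler

    profiler.set_config(profile_all=True,
                        filename='profile.json',
                        gpu_memory_profile_filename_prefix='gpu_memory_profile')
    profiler.set_state('run')

    # Allocations made while the scope is active are tagged "model:conv1:"
    # instead of the default "<unk>:".
    with profiler.Scope("model:conv1:"):
        x = mx.nd.zeros((256, 256), ctx=mx.gpu(0))
    x.wait_to_read()

    profiler.set_state('stop')
    profiler.dump()   # also writes the GPU memory profile on CUDA builds
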
diff --git a/python/mxnet/symbol/register.py b/python/mxnet/symbol/register.py
index 6b02e6d..e1ce4d4 100644
--- a/python/mxnet/symbol/register.py
+++ b/python/mxnet/symbol/register.py
@@ -28,6 +28,7 @@ from ..base import mx_uint, check_call, _LIB, py_str
 from ..symbol_doc import _build_doc
 from ..base import _Null, _init_op_module, _is_np_op, _output_is_list
 from ..name import NameManager
+from ..profiler import Scope
 # pylint: enable=unused-import
 
 
@@ -195,6 +196,11 @@ def %s(*%s, **kwargs):"""%(func_name, arr_name))
             key_var_num_args, key_var_num_args))
 
             code.append("""
+    if 'profiler_scope' not in keys:
+        keys.append('profiler_scope')
+        if not hasattr(Scope._current, "value"):
+            Scope._current.value = Scope()
+        vals.append(Scope._current.value.name)
     return _symbol_creator(%d, sym_args, sym_kwargs, keys, vals, name, %s, %s)"""%(
                 handle.value, str(is_np_op), str(output_is_list)))
     else:
@@ -252,6 +258,11 @@ def %s(%s):"""%(func_name, ', '.join(signature)))
     if not hasattr(NameManager._current, "value"):
         NameManager._current.value = NameManager()
     name = NameManager._current.value.get(name, '%s')
+    if 'profiler_scope' not in _keys:
+        _keys.append('profiler_scope')
+        if not hasattr(Scope._current, "value"):
+            Scope._current.value = Scope()
+        _vals.append(Scope._current.value.name)
     return _symbol_creator(%d, None, sym_kwargs, _keys, _vals, name, %s, %s)"""%(
         func_name.lower(), handle.value, str(is_np_op), str(output_is_list)))
 
diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py
index a4599c8..706152f 100644
--- a/python/mxnet/symbol/symbol.py
+++ b/python/mxnet/symbol/symbol.py
@@ -44,6 +44,7 @@ from . import _internal
 from . import op
 from ._internal import SymbolBase, _set_symbol_class
 from ..util import is_np_shape
+from ..profiler import Scope
 
 __all__ = ["Symbol", "var", "Variable", "Group", "load", "load_json",
            "pow", "power", "maximum", "minimum", "hypot", "eye", "zeros",
@@ -2742,7 +2743,7 @@ class Symbol(SymbolBase):
         raise NotImplementedForSymbol(self.backward, None)
 
 def var(name, attr=None, shape=None, lr_mult=None, wd_mult=None, dtype=None,
-        init=None, stype=None, **kwargs):
+        init=None, stype=None, profiler_scope=None, **kwargs):
     """Creates a symbolic variable with specified name.
 
     Example
@@ -2777,6 +2778,8 @@ def var(name, attr=None, shape=None, lr_mult=None, wd_mult=None, dtype=None,
         Initializer for this variable to (optionally) override the default initializer.
     stype : str
         The storage type of the variable, such as 'row_sparse', 'csr', 'default', etc
+    profiler_scope : str
+        The profiler scope for the input variable.
     kwargs : Additional attribute variables
         Additional attributes must start and end with double underscores.
 
@@ -2812,6 +2815,12 @@ def var(name, attr=None, shape=None, lr_mult=None, wd_mult=None, dtype=None,
         attr['__init__'] = init
     if stype is not None:
         attr['__storage_type__'] = str(_STORAGE_TYPE_STR_TO_ID[stype])
+    if profiler_scope is not None:
+        attr['__profiler_scope__'] = profiler_scope
+    else:
+        if not hasattr(Scope._current, "value"):
+            Scope._current.value = Scope()
+        attr['__profiler_scope__'] = Scope._current.value.name
     for k, v in kwargs.items():
         if k.startswith('__') and k.endswith('__'):
             attr[k] = str(v)
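
A short sketch of the new profiler_scope argument on the symbolic side; the scope
strings are illustrative:

    import mxnet as mx
    from mxnet import profiler

    # Scope given explicitly on the variable...
    data = mx.sym.var('data', shape=(32, 100), profiler_scope='model:input:')
    # ...or picked up from the ambient profiler.Scope when omitted.
    with profiler.Scope("model:fc1:"):
        weight = mx.sym.var('fc1_weight')
    fc = mx.sym.FullyConnected(data=data, weight=weight, num_hidden=10)
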
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 31b9d84..ad30350 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -56,6 +56,7 @@
 #include "../operator/subgraph/partitioner/custom_subgraph_property.h"
 #include "../operator/subgraph/subgraph_property.h"
 #include "../common/utils.h"
+#include "../profiler/profiler.h"
 #include "nnvm/pass_functions.h"
 
 using namespace mxnet;
@@ -989,9 +990,12 @@ void CreateNDArray(const DataType* shape,
               "[CreateNDArray] Size of tensor you are trying to allocate is larger than "
               "2^31 elements. Please build with flag USE_INT64_TENSOR_SIZE=1";
   }
-  *out = new NDArray(requested_shape,
-                     Context::Create(static_cast<Context::DeviceType>(dev_type), dev_id),
-                     delay_alloc != 0, dtype);
+  NDArray* nd = new NDArray(requested_shape,
+                            Context::Create(static_cast<Context::DeviceType>(dev_type), dev_id),
+                            delay_alloc != 0, dtype);
+  nd->AssignStorageInfo(profiler::ProfilerScope::Get()->GetCurrentProfilerScope(),
+                        MXNET_STORAGE_DEFAULT_NAME_CSTR);
+  *out = nd;
 }
 
 int MXNDArrayCreate(const uint32_t *shape,
@@ -1001,9 +1005,12 @@ int MXNDArrayCreate(const uint32_t *shape,
                     int delay_alloc,
                     NDArrayHandle *out) {
   API_BEGIN();
-  *out = new NDArray(mxnet::TShape(shape, shape + ndim),
-                     Context::Create(static_cast<Context::DeviceType>(dev_type), dev_id),
-                     delay_alloc != 0);
+  NDArray* nd = new NDArray(mxnet::TShape(shape, shape + ndim),
+                            Context::Create(static_cast<Context::DeviceType>(dev_type), dev_id),
+                            delay_alloc != 0);
+  nd->AssignStorageInfo(profiler::ProfilerScope::Get()->GetCurrentProfilerScope(),
+                        MXNET_STORAGE_DEFAULT_NAME_CSTR);
+  *out = nd;
   API_END();
 }
 
@@ -1054,12 +1061,15 @@ void CreateSparseNDArray(int storage_type,
     aux_shapes.emplace_back(shape_start, shape_start + aux_ndims[i]);
     shape_start += aux_ndims[i];
   }
-  *out = new NDArray(
+  NDArray* nd = new NDArray(
       NDArrayStorageType(storage_type),
       mxnet::TShape(shape, shape + ndim),
       Context::Create(static_cast<Context::DeviceType>(dev_type), dev_id),
       delay_alloc != 0,
       dtype, aux_types, aux_shapes);
+  nd->AssignStorageInfo(profiler::ProfilerScope::Get()->GetCurrentProfilerScope(),
+                        MXNET_STORAGE_DEFAULT_NAME_CSTR);
+  *out = nd;
 }
 
 int MXNDArrayCreateSparseEx(int storage_type,
@@ -2462,14 +2472,20 @@ int MXNDArrayGetSharedMemHandle(NDArrayHandle handle, int* shared_pid, int* shar
 int MXNDArrayCreateFromSharedMem(int shared_pid, int shared_id, const uint32_t *shape,
                                  uint32_t ndim, int dtype, NDArrayHandle *out) {
   API_BEGIN();
-  *out = new NDArray(shared_pid, shared_id, mxnet::TShape(shape, shape + ndim), dtype);
+  NDArray* nd = new NDArray(shared_pid, shared_id, mxnet::TShape(shape, shape + ndim), dtype);
+  nd->AssignStorageInfo(profiler::ProfilerScope::Get()->GetCurrentProfilerScope(),
+                        MXNET_STORAGE_DEFAULT_NAME_CSTR);
+  *out = nd;
   API_END();
 }
 
 int MXNDArrayCreateFromSharedMemEx(int shared_pid, int shared_id, const int *shape,
                                    int ndim, int dtype, NDArrayHandle *out) {
   API_BEGIN();
-  *out = new NDArray(shared_pid, shared_id, mxnet::TShape(shape, shape + ndim), dtype);
+  NDArray* nd = new NDArray(shared_pid, shared_id, mxnet::TShape(shape, shape + ndim), dtype);
+  nd->AssignStorageInfo(profiler::ProfilerScope::Get()->GetCurrentProfilerScope(),
+                        MXNET_STORAGE_DEFAULT_NAME_CSTR);
+  *out = nd;
   API_END();
 }
 
diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc
index b88eea4..ef03fe6 100644
--- a/src/c_api/c_api_ndarray.cc
+++ b/src/c_api/c_api_ndarray.cc
@@ -38,6 +38,7 @@
 #include "../imperative/imperative_utils.h"
 #include "../imperative/cached_op.h"
 #include "../imperative/cached_op_threadsafe.h"
+#include "../profiler/profiler.h"
 
 using namespace mxnet;
 
@@ -98,6 +99,10 @@ void MXImperativeInvokeImpl(AtomicSymbolCreator creator,
 
   nnvm::NodeAttrs attrs = imperative::ParseAttrs(op, num_inputs, num_params,
                                                  param_keys, param_vals);
+  attrs.dict["__profiler_scope__"] = profiler::ProfilerScope::Get()->GetCurrentProfilerScope();
+  if (attrs.op) {
+    attrs.name = attrs.op->name;
+  }
 
   int infered_num_outputs;
   int num_visible_outputs;
diff --git a/src/c_api/c_api_profile.cc b/src/c_api/c_api_profile.cc
index 5eb219a..d0ad6a1 100644
--- a/src/c_api/c_api_profile.cc
+++ b/src/c_api/c_api_profile.cc
@@ -32,6 +32,7 @@
 #include <mxnet/kvstore.h>
 #include <stack>
 #include "./c_api_common.h"
+#include "../profiler/storage_profiler.h"
 #include "../profiler/profiler.h"
 
 namespace mxnet {
@@ -209,6 +210,7 @@ struct ProfileConfigParam : public dmlc::Parameter<ProfileConfigParam> {
   bool profile_memory;
   bool profile_api;
   std::string filename;
+  std::string gpu_memory_profile_filename_prefix;
   bool continuous_dump;
   float dump_period;
   bool aggregate_stats;
@@ -226,6 +228,10 @@ struct ProfileConfigParam : public dmlc::Parameter<ProfileConfigParam> {
       .describe("Profile C API.  Default is True.");
     DMLC_DECLARE_FIELD(filename).set_default("profile.json")
       .describe("File name to write profiling info.");
+#if MXNET_USE_CUDA
+    DMLC_DECLARE_FIELD(gpu_memory_profile_filename_prefix).set_default("gpu_memory_profile")
+      .describe("File name prefix to write GPU memory profile info.");
+#endif  // MXNET_USE_CUDA
     DMLC_DECLARE_FIELD(continuous_dump).set_default(true)
       .describe("Periodically dump (and append) profiling data to file while running. "
                 "Default is True.");
@@ -298,6 +304,10 @@ int MXSetProcessProfilerConfig(int num_params, const char* const* keys, const ch
                                            param.continuous_dump,
                                            param.dump_period,
                                            param.aggregate_stats);
+#if MXNET_USE_CUDA
+      profiler::GpuDeviceStorageProfiler::Get()->SetConfig(
+          param.gpu_memory_profile_filename_prefix);
+#endif  // MXNET_USE_CUDA
     }
   API_END();
 }
@@ -354,6 +364,9 @@ int MXDumpProcessProfile(int finished, int profile_process, KVStoreHandle kvStor
     CHECK(profiler->IsEnableOutput())
       << "Profiler hasn't been run. Config and start profiler first";
     profiler->DumpProfile(finished != 0);
+#if MXNET_USE_CUDA
+    profiler::GpuDeviceStorageProfiler::Get()->DumpProfile();
+#endif  // MXNET_USE_CUDA
   }
   API_END()
 }
@@ -362,6 +375,12 @@ int MXSetProfilerState(int state) {
   return MXSetProcessProfilerState(state, static_cast<int>(ProfileProcess::kWorker), nullptr);
 }
 
+int MXSetProfilerScope(const char* const scope) {
+  API_BEGIN();
+  profiler::ProfilerScope::Get()->SetCurrentProfilerScope(scope);
+  API_END();
+}
+
 int MXSetProcessProfilerState(int state, int profile_process, KVStoreHandle kvStoreHandle) {
   mxnet::IgnoreProfileCallScope ignore;
   // state, kNotRunning: 0, kRunning: 1
diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc
index 8f78fc1..9042dfa 100644
--- a/src/c_api/c_api_symbolic.cc
+++ b/src/c_api/c_api_symbolic.cc
@@ -41,10 +41,20 @@ void RegisterLegacyOpProp();
 void RegisterLegacyNDFunc();
 }
 const std::vector<std::string> kHiddenKeys = {
-  "ctx_group", "lr_mult", "wd_mult", "force_mirroring", "mirror_stage"
+  "ctx_group",
+  "lr_mult",
+  "wd_mult",
+  "force_mirroring",
+  "mirror_stage",
+  "profiler_scope"
 };
 const std::vector<std::string> kReplacedHiddenKeys = {
-  "__ctx_group__", "__lr_mult__", "__wd_mult__", "__force_mirroring__", "__mirror_stage__"
+  "__ctx_group__",
+  "__lr_mult__",
+  "__wd_mult__",
+  "__force_mirroring__",
+  "__mirror_stage__",
+  "__profiler_scope__"
 };
 const char *kNamespaceSeparator = "$";
 
diff --git a/src/common/cuda_utils.h b/src/common/cuda_utils.h
index ccf0931..0971cfd 100644
--- a/src/common/cuda_utils.h
+++ b/src/common/cuda_utils.h
@@ -56,6 +56,9 @@ extern __cuda_fake_struct blockIdx;
 #include <cuda_runtime.h>
 #include <cublas_v2.h>
 #include <curand.h>
+#if MXNET_USE_NVML
+#include <nvml.h>
+#endif  // MXNET_USE_NVML
 
 #include <vector>
 
@@ -175,6 +178,22 @@ inline __device__ bool __is_supported_cuda_architecture() {
   }
 
 
+#if MXNET_USE_NVML
+/*!
+ * \brief Protected NVML call.
+ * \param func Expression to call.
+ *
+ * It checks for NVML errors after invocation of the expression.
+ */
+#define NVML_CALL(func)                                 \
+  {                                                     \
+    nvmlReturn_t result = (func);                       \
+    CHECK_EQ(result, NVML_SUCCESS)                      \
+      << #func " failed with error "                    \
+      << nvmlErrorString(result);                       \
+  }
+#endif  // MXNET_USE_NVML
+
 #if !defined(_MSC_VER)
 #define CUDA_UNROLL _Pragma("unroll")
 #define CUDA_NOUNROLL _Pragma("nounroll")
diff --git a/src/common/utils.h b/src/common/utils.h
index 31e6dea..40d5a55 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -31,6 +31,7 @@
 #include <nnvm/node.h>
 #include <mxnet/engine.h>
 #include <mxnet/ndarray.h>
+#include <mxnet/storage.h>
 #include <mxnet/op_attr_types.h>
 #include <mxnet/graph_attr_types.h>
 #include <nnvm/graph_attr_types.h>
@@ -731,8 +732,10 @@ inline NDArray InitZeros(const NDArrayStorageType stype, const mxnet::TShape &sh
 /*!
  * \brief Helper to add a NDArray of zeros to a std::vector.
  */
-inline void EmplaceBackZeros(const NDArrayStorageType stype, const mxnet::TShape &shape,
-                             const Context &ctx, const int dtype,
+inline void EmplaceBackZeros(const NDArrayStorageType stype,
+                             const mxnet::TShape &shape,
+                             const Context &ctx,
+                             const int dtype,
                              std::vector<NDArray> *vec) {
   // NDArray with default storage
   if (stype == kDefaultStorage) {
@@ -915,6 +918,19 @@ inline int np_binary_out_infer_type(const int type1, const int type2) {
   return get_more_precise_type(type1, type2);
 }
 
+inline const std::string
+NodeAttrsGetProfilerScope(const nnvm::NodeAttrs& attrs) {
+  // obtain the profiler scope name, if assigned previously
+  std::string profiler_scope = MXNET_STORAGE_DEFAULT_PROFILER_SCOPE_CSTR;
+  const std::unordered_map<std::string, std::string>& node_attrs_dict = attrs.dict;
+  const std::unordered_map<std::string, std::string>::const_iterator
+      profiler_scope_iter  = node_attrs_dict.find("__profiler_scope__");
+  if (profiler_scope_iter != node_attrs_dict.end()) {
+    profiler_scope = profiler_scope_iter->second;
+  }
+  return profiler_scope;
+}
+
 }  // namespace common
 }  // namespace mxnet
 #endif  // MXNET_COMMON_UTILS_H_
diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc
index 13bab2e..02ce818 100644
--- a/src/executor/graph_executor.cc
+++ b/src/executor/graph_executor.cc
@@ -342,7 +342,9 @@ nnvm::Graph GraphExecutor::InitFullGraph(nnvm::Symbol symbol,
   if (!need_grad_) return g;
   for (size_t i = 0; i < g.outputs.size(); ++i) {
     NodeEntry ngrad(nnvm::Node::Create(), 0, 0);
-    ngrad.node->attrs.name = "_head_grad_" + std::to_string(i);
+    const nnvm::NodeAttrs& attrs = g.outputs[i].node->attrs;
+    ngrad.node->attrs.name = attrs.name + "_head_grad";
+    ngrad.node->attrs.dict["__profiler_scope__"] = common::NodeAttrsGetProfilerScope(attrs);
     head_grad_entry_.emplace_back(AttrHint(ngrad, g.outputs[i]));
     head_grad_map_[ngrad.node.get()] = i;
   }
@@ -517,9 +519,11 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx,
     const int inferred_dtype = inferred_dtypes[eid];
     const NDArrayStorageType inferred_stype = (NDArrayStorageType) inferred_stypes[eid];
     const std::string& arg_name = idx[nid].source->attrs.name;
+    const std::string profiler_scope = common::NodeAttrsGetProfilerScope(idx[nid].source->attrs);
     if (mutable_nodes.count(nid)) {  // aux_states
       EmplaceBackZeros(inferred_stype, inferred_shape, aux_state_ctxes[aux_top],
                        inferred_dtype, aux_state_vec);
+      aux_state_vec->back().AssignStorageInfo(profiler_scope + "aux_state:", arg_name);
       data_entry_[eid] = aux_state_vec->back();
       aux_state_map_.emplace(arg_name, aux_state_vec->back());
       ++aux_top;
@@ -530,6 +534,7 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx,
     } else {  // in_args
       EmplaceBackZeros(inferred_stype, inferred_shape, in_arg_ctxes[arg_top],
                        inferred_dtype, in_arg_vec);
+      in_arg_vec->back().AssignStorageInfo(profiler_scope + "in_arg:", arg_name);
       data_entry_[eid] = in_arg_vec->back();
       if (log_verbose_) {
         LOG(INFO) << "\tassign data entry\t" << eid << "\tas "
@@ -545,6 +550,7 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx,
         auto grad_stype = (NDArrayStorageType) inferred_stypes[grad_eid];
         EmplaceBackZeros(grad_stype, inferred_shape, arg_grad_ctxes[arg_top],
                          inferred_dtype, arg_grad_vec);
+        arg_grad_vec->back().AssignStorageInfo(profiler_scope + "arg_grad:", arg_name);
         if (log_verbose_) {
           LOG(INFO) << "\tassign grad entry\t" << grad_eid << "\tas "
                     << common::stype_string(grad_stype);
@@ -589,6 +595,7 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx,
     const int inferred_dtype = inferred_dtypes[eid];
     const NDArrayStorageType inferred_stype = (NDArrayStorageType) inferred_stypes[eid];
     const std::string& arg_name = idx[nid].source->attrs.name;
+    const std::string profiler_scope = common::NodeAttrsGetProfilerScope(idx[nid].source->attrs);
     // aux_states
     if (mutable_nodes.count(nid)) {
       if (nullptr != shared_exec) {
@@ -611,6 +618,7 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx,
       } else {
         EmplaceBackZeros(inferred_stype, inferred_shape, aux_state_ctxes[aux_top],
                          inferred_dtype, aux_state_vec);
+        aux_state_vec->back().AssignStorageInfo(profiler_scope + "aux_state:", arg_name);
       }  // if (has_shared_exec)
       data_entry_[eid] = aux_state_vec->back();
       aux_state_map_.emplace(arg_name, aux_state_vec->back());
@@ -648,6 +656,7 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx,
           // doesn't have shared_exec, or non-default storage
           EmplaceBackZeros(inferred_stype, inferred_shape, in_arg_ctxes[arg_top],
                            inferred_dtype, in_arg_vec);
+          in_arg_vec->back().AssignStorageInfo(profiler_scope + "in_arg:", arg_name);
         }
         // gradient for model parameter
         if (kNullOp == grad_req_types[arg_top]) {
@@ -664,6 +673,7 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx,
             // no need to reuse memory from shared_exec for gradient of non-default storage
             EmplaceBackZeros(grad_stype, inferred_shape, arg_grad_ctxes[arg_top],
                              inferred_dtype, arg_grad_vec);
+            arg_grad_vec->back().AssignStorageInfo(profiler_scope + "arg_grad:", arg_name);
           }
           grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back());
         }
@@ -673,6 +683,7 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx,
         in_arg_vec->emplace_back(ReshapeOrCreate(arg_name, inferred_shape, inferred_dtype,
                                                  inferred_stype, in_arg_ctxes[arg_top],
                                                  shared_buffer, enable_row_sparse_sharing));
+        in_arg_vec->back().AssignStorageInfo(profiler_scope + "in_arg:", arg_name);
         // gradient for model parameter, row_sparse ndarray sharing disabled
         if (kNullOp == grad_req_types[arg_top]) {
           arg_grad_vec->emplace_back();
@@ -685,6 +696,7 @@ void GraphExecutor::InitArguments(const nnvm::IndexedGraph& idx,
                                                      inferred_dtype, grad_stype,
                                                      arg_grad_ctxes[arg_top], shared_buffer,
                                                      enable_row_sparse_sharing));
+          arg_grad_vec->back().AssignStorageInfo(profiler_scope + "arg_grad:", arg_name);
           grad_store_.emplace_back(grad_req_types[arg_top], arg_grad_vec->back());
         }  // if (kNullOp == grad_req_types[arg_top])
       }  // if (shared_arg_names.count(arg_name))
@@ -1077,12 +1089,17 @@ void GraphExecutor::InitDataEntryMemory(std::vector<NDArray>* shared_pool) {
   CHECK_EQ(data_entry_.size(), vshape.size());
   std::vector<Context> data_context(idx.num_node_entries());
   std::vector<NDArrayStorageType> data_storage_type(idx.num_node_entries(), kUndefinedStorage);
+  std::vector<std::string> data_storage_profiler_scope(idx.num_node_entries());
+  std::vector<std::string> data_storage_name(idx.num_node_entries());
   for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
+    const std::string profiler_scope = common::NodeAttrsGetProfilerScope(idx[nid].source->attrs);
     for (uint32_t i = 0; i < idx[nid].source->num_outputs(); ++i) {
       auto eid = idx.entry_id(nid, i);
       data_context[eid] = vctx[nid];
       CHECK_NE(vstorage_type[eid], kUndefinedStorage);
       data_storage_type[eid] = (NDArrayStorageType) vstorage_type[eid];
+      data_storage_profiler_scope[eid] = profiler_scope;
+      data_storage_name[eid] = idx[nid].source->attrs.name;
     }
   }
 
@@ -1091,6 +1108,8 @@ void GraphExecutor::InitDataEntryMemory(std::vector<NDArray>* shared_pool) {
     Context ctx;
     size_t bytes;
     NDArrayStorageType stype;
+    std::string profiler_scope;
+    std::string name;
   };
   std::vector<PoolEntry> pool_info;
 
@@ -1111,6 +1130,8 @@ void GraphExecutor::InitDataEntryMemory(std::vector<NDArray>* shared_pool) {
     } else {
       data_entry_[data_eid] = NDArray(data_context[eid], vdtype[eid]);
     }
+    data_entry_[data_eid].AssignStorageInfo(data_storage_profiler_scope[data_eid],
+                                            data_storage_name[data_eid]);
     if (log_verbose_) {
       LOG(INFO) << "\tinit head_grad entry\t" << data_eid << "\tas "
                 << common::stype_string(stype);
@@ -1129,11 +1150,14 @@ void GraphExecutor::InitDataEntryMemory(std::vector<NDArray>* shared_pool) {
     if (storage_id < 0) continue;
     size_t sid = static_cast<size_t>(storage_id);
     if (sid >= pool_info.size()) {
-      pool_info.resize(sid + 1, PoolEntry{Context::CPU(), size_t(0), kUndefinedStorage});
+      pool_info.resize(sid + 1, PoolEntry{Context::CPU(), size_t(0), kUndefinedStorage,
+                                          MXNET_STORAGE_DEFAULT_PROFILER_SCOPE_CSTR,
+                                          MXNET_STORAGE_DEFAULT_NAME_CSTR});
     }
     PoolEntry& info = pool_info[sid];
     if (info.bytes == 0) {
-      info = PoolEntry{data_context[i], bytes, data_storage_type[i]};
+      info = PoolEntry{data_context[i], bytes, data_storage_type[i],
+                       data_storage_profiler_scope[i], data_storage_name[i]};
     } else {
       info.bytes = std::max(info.bytes, bytes);
     }
@@ -1183,6 +1207,8 @@ void GraphExecutor::InitDataEntryMemory(std::vector<NDArray>* shared_pool) {
       // TODO(junwu): adding delay_alloc=true to create nd
       // is a temporary solution.
       NDArray nd(shape, ctx, true);
+      nd.AssignStorageInfo(pool_info[i].profiler_scope,
+                           pool_info[i].name);
       data_pool_[i] = nd;
       // put the new allocated arrays to shared pool
       if (shared_pool != nullptr)  {
@@ -1201,6 +1227,8 @@ void GraphExecutor::InitDataEntryMemory(std::vector<NDArray>* shared_pool) {
     if (storage_type == kDefaultStorage) {
       if (!shape_is_known(vshape[i])) {
         data_entry_[i] = NDArray(data_context[i], vdtype[i]);
+        data_entry_[i].AssignStorageInfo(data_storage_profiler_scope[i],
+                                         data_storage_name[i]);
       } else {
         CHECK_GE(storage_id, 0) << "Do not support runtime shape op yet";
         const NDArray& src = data_pool_.at(storage_id);
@@ -1209,6 +1237,8 @@ void GraphExecutor::InitDataEntryMemory(std::vector<NDArray>* shared_pool) {
     } else {
       data_entry_[i] = NDArray(storage_type, vshape[i], data_context[i],
                                true, vdtype[i]);
+      data_entry_[i].AssignStorageInfo(data_storage_profiler_scope[i],
+                                       data_storage_name[i]);
     }
     if (log_verbose_) {
       LOG(INFO) << "\tinit data entry\t" << i << "\tas " << common::stype_string(storage_type);
diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc
index 89dabac..0f8b86a 100644
--- a/src/imperative/cached_op.cc
+++ b/src/imperative/cached_op.cc
@@ -683,6 +683,9 @@ OpStatePtr CachedOp::StaticForward(
     if (!outputs[i]->is_none()) continue;
     *outputs[i] = NDArray(static_cast<NDArrayStorageType>(stypes[eid]),
                           shapes[eid], default_ctx, true, dtypes[eid]);
+    const nnvm::NodeAttrs& attrs = idx[idx.outputs()[i].node_id].source->attrs;
+    outputs[i]->AssignStorageInfo(common::NodeAttrsGetProfilerScope(attrs),
+                                  attrs.name);
   }
 
   StaticRunOps(default_ctx, g, state_ptr, arrays, 0, idx.num_nodes());
@@ -766,6 +769,22 @@ OpStatePtr CachedOp::Forward(
   static const auto cached_op = nnvm::Op::Get("_CachedOp");
 
   CHECK_EQ(inputs.size(), num_inputs());
+  // Assign the storage information for the input arguments. Similar to the
+  // implementation in `graph_executor.cc`, we use `mutable_input_nodes()` to
+  // distinguish between weight parameters and auxiliary states.
+  const auto& fwd_idx = fwd_graph_.indexed_graph();
+  const auto& mutable_input_nodes = fwd_idx.mutable_input_nodes();
+  for (size_t i = 0; i < fwd_idx.input_nodes().size(); ++i) {
+    const uint32_t nid = fwd_idx.input_nodes().at(i);
+    const nnvm::NodeAttrs& attrs = fwd_idx[nid].source->attrs;
+    const std::string& arg_name = attrs.name;
+    const std::string profiler_scope = common::NodeAttrsGetProfilerScope(attrs);
+    if (mutable_input_nodes.count(nid)) {
+      inputs[i]->AssignStorageInfo(profiler_scope + "aux_state:", arg_name);
+    } else {
+      inputs[i]->AssignStorageInfo(profiler_scope + "in_arg:", arg_name);
+    }
+  }
 
   Context default_ctx = inputs[0]->ctx();
   {
@@ -993,6 +1012,27 @@ void CachedOp::Backward(
     const std::vector<NDArray*>& inputs,
     const std::vector<OpReqType>& reqs,
     const std::vector<NDArray*>& outputs) {
+  const auto& fwd_idx = fwd_graph_.indexed_graph();
+  const auto& full_idx = full_graph_.indexed_graph();
+  const auto& mutable_input_nodes = fwd_idx.mutable_input_nodes();
+  for (size_t i = 0, j = 0; i < fwd_idx.input_nodes().size(); ++i) {
+    const uint32_t nid = fwd_idx.input_nodes().at(i);
+    const std::string& arg_name = fwd_idx[nid].source->attrs.name;
+    const std::string profiler_scope =
+        common::NodeAttrsGetProfilerScope(fwd_idx[nid].source->attrs);
+    if (mutable_input_nodes.count(nid)) {
+      continue;
+    }
+    outputs[j++]->AssignStorageInfo(profiler_scope + "arg_grad:", arg_name);
+  }
+  for (size_t i = fwd_idx.input_nodes().size(), j = 0;
+       i < full_idx.input_nodes().size(); ++i) {
+    const nnvm::NodeAttrs& attrs = full_idx[full_idx.input_nodes().at(i)].source->attrs;
+    const std::string& entry_name = attrs.name;
+    const std::string profiler_scope = common::NodeAttrsGetProfilerScope(attrs);
+    inputs[j++]->AssignStorageInfo(profiler_scope, entry_name);
+  }
+
   using namespace imperative;
   CHECK(!Imperative::Get()->is_recording())
       << "CachedOp does not support higher order gradients. "
diff --git a/src/imperative/cached_op.h b/src/imperative/cached_op.h
index c56d8cf..16a5d30 100644
--- a/src/imperative/cached_op.h
+++ b/src/imperative/cached_op.h
@@ -96,6 +96,10 @@ void CreateGraphNDs(const nnvm::Graph& g,
       continue;
     *((*arrays)[eid]) = NDArray(static_cast<NDArrayStorageType>(stypes[eid]),
                                 shapes[eid], default_ctx, true, dtypes[eid]);
+    const nnvm::NodeAttrs& attrs = idx[idx.outputs()[i].node_id].source->attrs;
+    (*arrays)[eid]->AssignStorageInfo(
+        common::NodeAttrsGetProfilerScope(attrs),
+        attrs.name);
   }
 }
 
@@ -136,7 +140,9 @@ void CreateBackwardGraph(nnvm::Graph* fwd_graph,
   ograd_entries->reserve(fwd_graph->outputs.size());
   for (size_t i = 0; i < fwd_graph->outputs.size(); ++i) {
     nnvm::ObjectPtr np = Node::Create();
-    np->attrs.name = "_head_grad_" + std::to_string(i);
+    const nnvm::NodeAttrs& attrs = fwd_graph->outputs[i].node->attrs;
+    np->attrs.name = attrs.name + "_head_grad";
+    np->attrs.dict["__profiler_scope__"] = common::NodeAttrsGetProfilerScope(attrs);
     ograd_entries->emplace_back(np);
   }
 
diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h
index 1560138..21d5298 100644
--- a/src/imperative/imperative_utils.h
+++ b/src/imperative/imperative_utils.h
@@ -189,7 +189,7 @@ inline void SetShapeType(const Context& ctx,
   } else {
     // if infer storage attr is not present, apply the default infer storage function
     infer_stype_success = common::DefaultStorageType(attrs, ctx.dev_mask(), dispatch_mode,
-                                                   &in_storage_types, &out_storage_types);
+                                                     &in_storage_types, &out_storage_types);
   }
   CHECK(infer_stype_success) << "Operator not implemented: "
      << common::operator_stype_string(attrs, ctx.dev_mask(), in_storage_types, out_storage_types);
@@ -206,10 +206,13 @@ inline void SetShapeType(const Context& ctx,
       if (is_dynamic_shape_existing) {
         // once there is dynamic shape somewhere, we could not pre-determine the shape.
         *outputs[i] = NDArray(ctx, out_types[i]);
+        outputs[i]->AssignStorageInfo(common::NodeAttrsGetProfilerScope(attrs), attrs.name);
       } else if (storage_type == kDefaultStorage) {
         *outputs[i] = NDArray(out_shapes[i], ctx, true, out_types[i]);
+        outputs[i]->AssignStorageInfo(common::NodeAttrsGetProfilerScope(attrs), attrs.name);
       } else {
         *outputs[i] = NDArray(storage_type, out_shapes[i], ctx, true, out_types[i]);
+        outputs[i]->AssignStorageInfo(common::NodeAttrsGetProfilerScope(attrs), attrs.name);
       }
     } else {
       CHECK_EQ(outputs[i]->shape(), out_shapes[i])
@@ -894,15 +897,31 @@ inline std::multimap<size_t, NDArray> AllocateMemory(
   const auto& dtypes = g.GetAttr<DTypeVector>("dtype");
   const auto& shapes = g.GetAttr<mxnet::ShapeVector>("shape");
   const auto& stypes = g.GetAttr<StorageTypeVector>("storage_type");
+  std::vector<std::string> data_entry_profiler_scopes(entry_end - entry_start);
+  std::vector<std::string> data_entry_names(entry_end - entry_start);
 
   std::multimap<size_t, NDArray> new_pool;
 
+  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
+    const std::string profiler_scope = common::NodeAttrsGetProfilerScope(idx[nid].source->attrs);
+    for (uint32_t i = 0; i < idx[nid].source->num_outputs(); ++i) {
+      uint32_t eid = idx.entry_id(nid, i);
+      if (eid < entry_start || eid >= entry_end) {
+        continue;
+      }
+      data_entry_profiler_scopes[eid - entry_start] = profiler_scope;
+      data_entry_names[eid - entry_start] = idx[nid].source->attrs.name;
+    }
+  }
+
   for (uint32_t i = entry_start; i < entry_end; ++i) {
     if (mem_plan[i].storage_id == exec::kExternalStorageID) continue;
     CHECK(arrays[i]->is_none());
     if (mem_plan[i].storage_id == exec::kDynamicStorageID) {
       *arrays[i] = NDArray(static_cast<NDArrayStorageType>(stypes[i]),
                            shapes[i], default_ctx, true, dtypes[i]);
+      arrays[i]->AssignStorageInfo(data_entry_profiler_scopes[i - entry_start],
+                                   data_entry_names[i - entry_start]);
       continue;
     }
     CHECK_EQ(stypes[i], kDefaultStorage);
@@ -915,6 +934,8 @@ inline std::multimap<size_t, NDArray> AllocateMemory(
       } else {
         NDArray buff(mxnet::TShape({static_cast<nnvm::dim_t>(mem_plan[i].size)}),
                      default_ctx, true, mshadow::kUint8);
+        buff.AssignStorageInfo(data_entry_profiler_scopes[i - entry_start],
+                               data_entry_names[i - entry_start]);
         *arrays[i] = buff.AsArray(shapes[i], dtypes[i]);
         new_pool.insert({mem_plan[i].size, buff});
       }
diff --git a/src/io/iter_image_recordio_2.cc b/src/io/iter_image_recordio_2.cc
index 565b671..ad6e2a1 100644
--- a/src/io/iter_image_recordio_2.cc
+++ b/src/io/iter_image_recordio_2.cc
@@ -42,6 +42,7 @@
 #include "./image_iter_common.h"
 #include "./inst_vector.h"
 #include "../common/utils.h"
+#include "../profiler/profiler.h"
 
 namespace mxnet {
 
@@ -296,10 +297,16 @@ inline bool ImageRecordIOParser2<DType>::ParseNext(DataBatch *out) {
     if (dev_id != -1) {
       ctx = Context::CPUPinned(dev_id);
     }
+
+    const std::string profiler_scope =
+        profiler::ProfilerScope::Get()->GetCurrentProfilerScope() + "image_io:";
+
     out->data.at(0) = NDArray(data_shape, ctx, false,
       mshadow::DataType<DType>::kFlag);
+    out->data.at(0).AssignStorageInfo(profiler_scope, "data");
     out->data.at(1) = NDArray(label_shape, ctx, false,
       mshadow::DataType<real_t>::kFlag);
+    out->data.at(1).AssignStorageInfo(profiler_scope, "label");
     unit_size_[0] = param_.data_shape.Size();
     unit_size_[1] = param_.label_width;
   }
diff --git a/src/kvstore/comm.h b/src/kvstore/comm.h
index 88e363b..b03b74c 100644
--- a/src/kvstore/comm.h
+++ b/src/kvstore/comm.h
@@ -34,6 +34,7 @@
 #include "gradient_compression.h"
 #include "../ndarray/ndarray_function.h"
 #include "../operator/tensor/sparse_retain-inl.h"
+#include "../profiler/profiler.h"
 #include "./kvstore_utils.h"
 namespace mxnet {
 namespace kvstore {
@@ -532,9 +533,12 @@ class CommDevice : public Comm {
         // NDArray.Slice or gpu direct memory access. for the latter, we need to
         // remove some ctx check, and also it reduces 20% perf
         buf.copy_buf.resize(src.size()-1);
+        const std::string profiler_scope =
+            profiler::ProfilerScope::Get()->GetCurrentProfilerScope() + "comm_dev:";
         for (size_t i = 0; i < src.size()-1; ++i) {
           buf.copy_buf[i] = NDArray(
             buf_merged.shape(), buf_merged.ctx(), false, buf_merged.dtype());
+          buf.copy_buf[i].AssignStorageInfo(profiler_scope, "copy_buf");
         }
       }
       for (size_t i = 0; i < src.size()-1; ++i) {
@@ -560,18 +564,23 @@ class CommDevice : public Comm {
       buf.compressed_recv_buf.resize(src.size());
       buf.compressed_send_buf.resize(src.size());
       buf.residual.resize(src.size());
-
+      const std::string profiler_scope =
+          profiler::ProfilerScope::Get()->GetCurrentProfilerScope() + "comm_dev:";
       for (size_t i = 0; i < src.size(); ++i) {
         buf.copy_buf[i] = NDArray(buf.merged.shape(), buf.merged.ctx(),
                                   false, buf.merged.dtype());
+        buf.copy_buf[i].AssignStorageInfo(profiler_scope, "copy_buf");
         buf.residual[i] = NDArray(buf.merged.shape(), src[i].ctx(),
                                   false, buf.merged.dtype());
+        buf.residual[i].AssignStorageInfo(profiler_scope, "residual");
         buf.residual[i] = 0;
         int64_t small_size = gc_->GetCompressedSize(buf.merged.shape().Size());
         buf.compressed_recv_buf[i] = NDArray(mxnet::TShape{small_size}, buf.merged.ctx(),
-                                        false, buf.merged.dtype());
+                                             false, buf.merged.dtype());
+        buf.compressed_recv_buf[i].AssignStorageInfo(profiler_scope, "compressed_recv_buf");
         buf.compressed_send_buf[i] = NDArray(mxnet::TShape{small_size}, src[i].ctx(),
-                                        false, buf.merged.dtype());
+                                             false, buf.merged.dtype());
+        buf.compressed_send_buf[i].AssignStorageInfo(profiler_scope, "compressed_send_buf");
       }
     }
 
@@ -686,6 +695,9 @@ class CommDevice : public Comm {
       ctx_info[d.dev_id] = std::make_pair(d, 0);
     }
 
+    const std::string profiler_scope =
+        profiler::ProfilerScope::Get()->GetCurrentProfilerScope() + "kvstore:comm_dev:";
+
     for (auto& sorted_key_attr : sorted_key_attrs_) {
       const int key  = std::get<0>(sorted_key_attr);
       const mxnet::TShape& shape = std::get<1>(sorted_key_attr);
@@ -705,6 +717,7 @@ class CommDevice : public Comm {
       if (buf.merged.is_none()) {
         bool delay_alloc = true;
         buf.merged = NDArray(shape, ctx, delay_alloc, type);
+        buf.merged.AssignStorageInfo(profiler_scope, "merge_buf_" + std::to_string(key));
       }
       ctx_info[ctx.dev_id].second += shape.Size();
     }
diff --git a/src/kvstore/comm_tree.h b/src/kvstore/comm_tree.h
index 11ca2d6..07ffba5 100644
--- a/src/kvstore/comm_tree.h
+++ b/src/kvstore/comm_tree.h
@@ -400,6 +400,9 @@ class CommDeviceTree : public CommDevice {
     bool delay_alloc = true;
     std::map<int, int> key_dist;
 
+    const std::string profiler_scope =
+        profiler::ProfilerScope::Get()->GetCurrentProfilerScope() + "comm_dev_tree:";
+
     for (auto& tree_sorted_key_attr : tree_sorted_key_attrs_) {
       const int key  = std::get<0>(tree_sorted_key_attr);
       const mxnet::TShape& shape = std::get<1>(tree_sorted_key_attr);
@@ -457,6 +460,8 @@ class CommDeviceTree : public CommDevice {
               if (row == devs_.size()-1)
                 shape_copy[0] = last_slice;
               buf.merged[row] = NDArray(shape_copy, ctx, delay_alloc, type);
+              buf.merged[row].AssignStorageInfo(
+                  profiler_scope, "merged_" + std::to_string(key));
               buf.copy_buf.emplace_back();
               if (buf.copy_buf[row].empty()) {
                 buf.copy_buf[row].resize(kBranch-1);
@@ -465,18 +470,23 @@ class CommDeviceTree : public CommDevice {
                                                    buf.merged[row].ctx(),
                                                    delay_alloc,
                                                    buf.merged[row].dtype());
+                  buf.copy_buf[row][col].AssignStorageInfo(profiler_scope, "copy_buf");
                 }
               }
             }
           } else {
             buf.merged.emplace_back(shape, ctx, false, type);
+            buf.merged.back().AssignStorageInfo(
+                profiler_scope, "merged_" + std::to_string(key));
             if (buf.copy_buf.empty()) {
               buf.copy_buf.emplace_back();
               buf.copy_buf[0].resize(kBranch-1);
               for (size_t col = 0; col < buf.copy_buf[0].size(); ++col) {
                 buf.copy_buf[0][col] = NDArray(buf.merged[0].shape(),
-                                               buf.merged[0].ctx(), delay_alloc,
+                                               buf.merged[0].ctx(),
+                                               delay_alloc,
                                                buf.merged[0].dtype());
+                buf.copy_buf[0][col].AssignStorageInfo(profiler_scope, "copy_buf");
               }
             }
           }
diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h
index 38bd7d9..bc4e933 100644
--- a/src/kvstore/kvstore_local.h
+++ b/src/kvstore/kvstore_local.h
@@ -37,6 +37,7 @@
 #include "./comm_tree.h"
 #include "./kvstore_utils.h"
 #include "../ndarray/ndarray_function.h"
+#include "../profiler/profiler.h"
 
 namespace mxnet {
 namespace kvstore {
@@ -246,6 +247,15 @@ class KVStoreLocal : public KVStore {
       int key = uniq_keys[i];
       const NDArray& merged = comm_->Reduce(key, grouped_vals[i], priority);
       NDArray& local = local_[key];
+      if (key_type_ == kStringKey) {
+        local.AssignStorageInfo(
+            profiler::ProfilerScope::Get()->GetCurrentProfilerScope() + "kvstore:push:",
+            reverse_str_key_dict_[key]);
+      } else {
+        local.AssignStorageInfo(
+            profiler::ProfilerScope::Get()->GetCurrentProfilerScope() + "kvstore:push:",
+            "local_" + std::to_string(key));
+      }
       if (updater_ != nullptr) {
         CHECK(!local.is_none()) << "key " << key << " has not been inited";
         // if merged is on gpu, we may need copy weight from cpu to gpu
@@ -288,6 +298,18 @@ class KVStoreLocal : public KVStore {
       const NDArray& local = local_[key];
       CHECK(!local.is_none()) << "key " << key << " has not been inited";
       comm_->Broadcast(key, local, grouped_vals[i], priority);
+      for (std::vector<NDArray*>::iterator iter = grouped_vals[i].begin();
+           iter != grouped_vals[i].end(); ++iter) {
+        if (key_type_ == kStringKey) {
+          (*iter)->AssignStorageInfo(
+              profiler::ProfilerScope::Get()->GetCurrentProfilerScope() + "kvstore:pull:",
+              reverse_str_key_dict_[key]);
+        } else {
+          (*iter)->AssignStorageInfo(
+              profiler::ProfilerScope::Get()->GetCurrentProfilerScope() + "kvstore:pull:",
+              "grouped_vals_" + std::to_string(key));
+        }
+      }
     }
   }
 
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index d16b38e..f851383 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -36,6 +36,7 @@
 #include "../operator/tensor/matrix_op-inl.h"
 #include "../operator/tensor/init_op.h"
 #include "../operator/nn/mkldnn/mkldnn_base-inl.h"
+#include "../profiler/storage_profiler.h"
 
 #if MXNET_USE_OPENCV
 #include <opencv2/opencv.hpp>
@@ -93,6 +94,25 @@ NDArray::NDArray(const NDArrayStorageType stype, const mxnet::TShape &shape, Con
         dtype, aux_types, aux_shapes);
 }
 
+void NDArray::AssignStorageInfo(const std::string& profiler_scope,
+                                const std::string& name) {
+  if (is_none()) {
+    return;
+  }
+  ptr_->shandle.profiler_scope = profiler_scope;
+  ptr_->shandle.name = name;
+#if MXNET_USE_CUDA
+  profiler::GpuDeviceStorageProfiler::Get()->UpdateStorageInfo(ptr_->shandle);
+#endif  // MXNET_USE_CUDA
+  for (Storage::Handle& aux_handle : ptr_->aux_handles) {
+    aux_handle.profiler_scope = profiler_scope;
+    aux_handle.name = name + "_aux_data";
+#if MXNET_USE_CUDA
+    profiler::GpuDeviceStorageProfiler::Get()->UpdateStorageInfo(aux_handle);
+#endif  // MXNET_USE_CUDA
+  }
+}
+
 void NDArray::SetShapeFromChunk() {
   if (Imperative::Get()->is_np_shape() ||
       !(ptr_->storage_shape.ndim() == 1 && ptr_->storage_shape[0] == 0)) {
@@ -148,7 +168,8 @@ void NDArray::Chunk::CheckAndAllocData(const mxnet::TShape &shape, int dtype) {
     // free storage
     Storage::Get()->Free(shandle);
     // init storage
-    shandle = Storage::Get()->Alloc(dbytes, ctx);
+    shandle.size = dbytes;
+    Storage::Get()->Alloc(&shandle);
 #if MXNET_USE_MKLDNN == 1
     mkl_mem_ = nullptr;
 #endif
@@ -1869,9 +1890,9 @@ void NDArray::Load(dmlc::Stream* fi,
 NDArray NDArray::Copy(Context ctx) const {
   NDArray ret;
   if (kDefaultStorage == storage_type()) {
-    ret = NDArray(shape(), ctx, true, dtype_);
+    ret = NDArray(shape(), ctx, false, dtype_);
   } else if (kUndefinedStorage != storage_type()) {
-    ret = NDArray(storage_type(), shape(), ctx, true, dtype_,
+    ret = NDArray(storage_type(), shape(), ctx, false, dtype_,
                   ptr_->aux_types, ptr_->aux_shapes, storage_shape());
   } else {
     LOG(FATAL) << "NDArray::Copy cannot copy undefined storage-type ndarray to ctx.dev_type="
diff --git a/src/operator/linalg_impl.h b/src/operator/linalg_impl.h
index d83eb0d..e441e27 100644
--- a/src/operator/linalg_impl.h
+++ b/src/operator/linalg_impl.h
@@ -39,6 +39,13 @@ inline void linalg_check_batch_size(int A, int B, int C) {
   CHECK_GT(A, 0) << "Zero batch size for arguments to linear algebra operator";
 }
 
+#ifdef __CUDACC__
+#define EPHEMERAL_GPU_STORAGE_ALLOC(func, var, dtype, size) \
+  Storage::Handle var = Storage::Get()->Alloc(sizeof(dtype) * size, Context::GPU()); \
+  var.profiler_scope = "<ephemeral>:"; \
+  var.name = #func"_"#var;
+#endif
+
 //////////////////////////////// GEMM ////////////////////////////////////////////
 
 // CPU/GPU-versions of BLAS3 function "gemm". Please refer to the BLAS3-documentation
@@ -725,8 +732,9 @@ void linalg_potrf<gpu, DType>(const Tensor<gpu, 2, DType>& A, bool lower, Stream
   CHECK_NOTNULL(s); \
   check_potrf(A, lower); \
   int buffsize(linalg_potrf_buffsize(A, lower, s)); \
-  Storage::Handle buffer = Storage::Get()->Alloc(sizeof(DType)*buffsize, Context::GPU()); \
-  Storage::Handle info = Storage::Get()->Alloc(sizeof(int), Context::GPU()); \
+  EPHEMERAL_GPU_STORAGE_ALLOC(linalg_potrf, buffer, \
+      DType, buffsize); \
+  EPHEMERAL_GPU_STORAGE_ALLOC(linalg_potrf, info, int, 1); \
   CUSOLVER_CALL(cusolver##fname(Stream<gpu>::GetSolverHandle(s), \
                 (lower ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER), \
                 A.size(0), A.dptr_, A.stride_, static_cast<DType *>(buffer.dptr), buffsize, \
@@ -746,8 +754,9 @@ void linalg_batch_potrf<gpu, DType>(const Tensor<gpu, 3, DType>& A, bool lower,
   CHECK_GT(A.size(0), 0); \
   check_potrf(A[0], lower); \
   int buffsize(linalg_potrf_buffsize(A[0], lower, s)); \
-  Storage::Handle buffer = Storage::Get()->Alloc(sizeof(DType)*buffsize, Context::GPU()); \
-  Storage::Handle info = Storage::Get()->Alloc(sizeof(int), Context::GPU()); \
+  EPHEMERAL_GPU_STORAGE_ALLOC(linalg_batch_potrf, buffer, \
+      DType, buffsize); \
+  EPHEMERAL_GPU_STORAGE_ALLOC(linalg_batch_potrf, info, int, 1); \
   for (mshadow::index_t i = 0; i < A.size(0); ++i) { \
     CUSOLVER_CALL(cusolver##fname(Stream<gpu>::GetSolverHandle(s), \
                  (lower ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER), \
@@ -818,7 +827,8 @@ void linalg_potri<gpu, DType>(const Tensor<gpu, 2, DType>& A, bool lower, Stream
   using namespace mxnet; \
   CHECK_NOTNULL(s); \
   check_potri(A, lower); \
-  Storage::Handle buffer = Storage::Get()->Alloc(sizeof(DType)*A.MSize(), Context::GPU()); \
+  EPHEMERAL_GPU_STORAGE_ALLOC(linalg_potri, buffer, \
+      DType, A.MSize()); \
   using namespace mshadow::cuda; \
   int ngrid = std::min(kMaxGridNum, \
                        static_cast<int>((A.MSize() + kBaseThreadNum - 1) / kBaseThreadNum)); \
@@ -842,7 +852,8 @@ void linalg_batch_potri<gpu, DType>(const Tensor<gpu, 3, DType>& A, bool lower,
   CHECK_NOTNULL(s); \
   CHECK_GT(A.size(0), 0); \
   check_potri(A[0], lower); \
-  Storage::Handle buffer = Storage::Get()->Alloc(sizeof(DType)*A.MSize(), Context::GPU()); \
+  EPHEMERAL_GPU_STORAGE_ALLOC(linalg_batch_potri, buffer, \
+      DType, A.MSize()); \
   using namespace mshadow::cuda; \
   int ngrid = std::min(kMaxGridNum, \
                        static_cast<int>((A.MSize() + kBaseThreadNum - 1) / kBaseThreadNum)); \
@@ -1024,7 +1035,7 @@ void linalg_gelqf<gpu, DType>(const Tensor<gpu, 2, DType>& A, \
   check_gelqf(A, work); \
   int m(A.size(0)); \
   int lwork(work.size(0) - m); \
-  Storage::Handle info = Storage::Get()->Alloc(sizeof(int), Context::GPU()); \
+  EPHEMERAL_GPU_STORAGE_ALLOC(linalg_gelqf, info, int, 1); \
   CUSOLVER_CALL(cusolver##fname(Stream<gpu>::GetSolverHandle(s), \
                 A.size(1), m, A.dptr_ , A.stride_, work.dptr_, \
                 work.dptr_ + m, lwork, static_cast<int *>(info.dptr))); \
@@ -1048,7 +1059,7 @@ void linalg_orglq<gpu, DType>(const Tensor<gpu, 2, DType>& A, \
   check_gelqf(A, work); \
   int m(A.size(0)); \
   int lwork(work.size(0) - m); \
-  Storage::Handle info = Storage::Get()->Alloc(sizeof(int), Context::GPU()); \
+  EPHEMERAL_GPU_STORAGE_ALLOC(linalg_orglq, info, int, 1); \
   CUSOLVER_CALL(cusolver##fname(Stream<gpu>::GetSolverHandle(s), \
                 A.size(1), m, m, A.dptr_ , A.stride_, work.dptr_, \
                 work.dptr_ + m, lwork, static_cast<int *>(info.dptr))); \
@@ -1084,7 +1095,8 @@ int linalg_gelqf_workspace_query<gpu, DType>(const Tensor<gpu, 2, DType>& A, \
   CUSOLVER_CALL(cusolverDn##prefix##geqrf_bufferSize(Stream<gpu>::GetSolverHandle(s), \
                 A.size(1), m, A.dptr_ , A.stride_, &work1)); \
   int work2(0);  \
-  Storage::Handle tau = Storage::Get()->Alloc(sizeof(DType), Context::GPU()); \
+  EPHEMERAL_GPU_STORAGE_ALLOC(linalg_gelqf_workspace_query, \
+      tau, DType, 1); \
   CUSOLVER_CALL(cusolverDn##prefix##orgqr_bufferSize(Stream<gpu>::GetSolverHandle(s), \
                 A.size(1), m, m, A.dptr_ , A.stride_, static_cast<DType *>(tau.dptr), &work2)); \
   Storage::Get()->Free(tau); \
@@ -1182,7 +1194,7 @@ void linalg_syevd<gpu, DType>(const Tensor<gpu, 2, DType>& A, \
   using mshadow::gpu; \
   CHECK_NOTNULL(s); \
   check_syevd(A, L); \
-  Storage::Handle info = Storage::Get()->Alloc(sizeof(int), Context::GPU()); \
+  EPHEMERAL_GPU_STORAGE_ALLOC(linalg_syevd, info, int, 1); \
   CUSOLVER_CALL(cusolver##fname(Stream<gpu>::GetSolverHandle(s), \
                 CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, \
                 A.size(0), A.dptr_ , A.stride_, L.dptr_, work.dptr_, \
@@ -1302,7 +1314,7 @@ void linalg_gesvd<gpu, DType>(const Tensor<gpu, 2, DType>& UT, \
   using namespace mxnet; \
   using mshadow::gpu; \
   check_gesvd(UT, L, V); \
-  Storage::Handle info = Storage::Get()->Alloc(sizeof(int), Context::GPU()); \
+  EPHEMERAL_GPU_STORAGE_ALLOC(linalg_gesvd, info, int, 1); \
   CUSOLVER_CALL(cusolver##fname(Stream<gpu>::GetSolverHandle(s), \
                 'O', 'S', V.size(1), V.size(0), V.dptr_, V.stride_, L.dptr_, V.dptr_, V.stride_, \
                 UT.dptr_, UT.stride_, work.dptr_, work.size(0), \
@@ -1422,8 +1434,9 @@ void linalg_batch_getrf<gpu, DType>(const Tensor<gpu, 3, DType>& A, \
   using namespace mxnet; \
   using namespace mxnet::op::mxnet_op; \
   CHECK_NOTNULL(s); \
-  Storage::Handle info = Storage::Get()->Alloc(sizeof(int) * A.size(0), Context::GPU()); \
-  Storage::Handle A_ptr_buf = Storage::Get()->Alloc(sizeof(DType *) * A.size(0), Context::GPU()); \
+  EPHEMERAL_GPU_STORAGE_ALLOC(linalg_batch_getrf, info, int, A.size(0)); \
+  EPHEMERAL_GPU_STORAGE_ALLOC(linalg_batch_getrf, A_ptr_buf, \
+      DType *, A.size(0)); \
   DType **A_ptr = static_cast<DType **>(A_ptr_buf.dptr); \
   Kernel<set_matrix, gpu>::Launch(s, A.size(0), \
                                   A_ptr, A.dptr_, \
@@ -1511,10 +1524,12 @@ void linalg_batch_getri<gpu, DType>(const Tensor<gpu, 3, DType>& A, \
   using namespace mxnet; \
   using namespace mxnet::op::mxnet_op; \
   CHECK_NOTNULL(s); \
-  Storage::Handle info = Storage::Get()->Alloc(sizeof(int) * A.size(0), Context::GPU()); \
-  Storage::Handle A_ptr_buf = Storage::Get()->Alloc(sizeof(DType *) * A.size(0), Context::GPU()); \
+  EPHEMERAL_GPU_STORAGE_ALLOC(linalg_batch_getri, info, int, A.size(0)); \
+  EPHEMERAL_GPU_STORAGE_ALLOC(linalg_batch_getri, \
+      A_ptr_buf, DType *, A.size(0)); \
   DType **A_ptr = static_cast<DType **>(A_ptr_buf.dptr); \
-  Storage::Handle LU_ptr_buf = Storage::Get()->Alloc(sizeof(DType *) * A.size(0), Context::GPU()); \
+  EPHEMERAL_GPU_STORAGE_ALLOC(linalg_batch_getri, \
+      LU_ptr_buf, DType *, A.size(0)); \
   DType **LU_ptr = static_cast<DType **>(LU_ptr_buf.dptr); \
   Kernel<set_matrix, gpu>::Launch(s, A.size(0), \
                                   A_ptr, A.dptr_, \
diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h
index ddd0729..056f93b 100644
--- a/src/operator/nn/cudnn/cudnn_convolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h
@@ -760,8 +760,11 @@ class CuDNNConvolutionOp {
   // cudaMalloc() calls by (say) cudnnFind().  `elements` spec the alloc size in DTypes, not bytes.
   void ReserveElements(const std::vector<size_t> &elements) {
     std::vector<Storage::Handle> handles;
-    for (size_t alloc_element : elements)
+    for (size_t alloc_element : elements) {
         handles.push_back(Storage::Get()->Alloc(alloc_element * sizeof(DType), Context::GPU()));
+        handles.back().profiler_scope = "<ephemeral>:";
+        handles.back().name = "reserve_elements";
+    }
     for (auto &handle : handles)
         Storage::Get()->DirectFree(handle);
   }
diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
index 9783adc..b701883 100644
--- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
+++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h
@@ -763,8 +763,11 @@ class CuDNNDeconvolutionOp {
   // cudaMalloc() calls by (say) cudnnFind().  `elements` spec the alloc size in DTypes, not bytes.
   void ReserveElements(const std::vector<size_t> &elements) {
     std::vector<Storage::Handle> handles;
-    for (size_t alloc_element : elements)
+    for (size_t alloc_element : elements) {
         handles.push_back(Storage::Get()->Alloc(alloc_element * sizeof(DType), Context::GPU()));
+        handles.back().profiler_scope = "<ephemeral>:";
+        handles.back().name = "reserve_elements";
+    }
     for (auto &handle : handles)
         Storage::Get()->DirectFree(handle);
   }
diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h
index d41b5b4..461f90b 100644
--- a/src/operator/rnn-inl.h
+++ b/src/operator/rnn-inl.h
@@ -41,6 +41,7 @@
 #include "./math_functions-inl.h"
 #include "./operator_common.h"
 #include "./rnn_impl.h"
+#include "../profiler/storage_profiler.h"
 
 #if MXNET_USE_CUDNN == 1
 STATIC_ASSERT_CUDNN_VERSION_GE(7000);
@@ -1400,6 +1401,9 @@ class RNNOp {
       workspace_size_ = workspace_byte_ / sizeof(DType);
       // Allocate the reserve space
       reserve_space_ = Storage::Get()->Alloc(reserve_space_byte_, Context::GPU(s->dev_id));
+      reserve_space_.profiler_scope = "cudnn_rnn:";
+      reserve_space_.name = "reserve_space";
+      profiler::GpuDeviceStorageProfiler::Get()->UpdateStorageInfo(reserve_space_);
       // Check that number of params are correct
       size_t cudnn_param_size;
       CUDNN_CALL(cudnnGetRNNParamsSize(s->dnn_handle_,
diff --git a/src/profiler/profiler.cc b/src/profiler/profiler.cc
index c429661..13ab462 100644
--- a/src/profiler/profiler.cc
+++ b/src/profiler/profiler.cc
@@ -294,5 +294,23 @@ void Profiler::SetContinuousProfileDump(bool continuous_dump, float delay_in_sec
   }
 }
 
+ProfilerScope* ProfilerScope::Get() {
+  static std::mutex mtx;
+  static std::shared_ptr<ProfilerScope> prof_scope = nullptr;
+  std::unique_lock<std::mutex> lk(mtx);
+  if (!prof_scope) {
+    prof_scope = std::make_shared<ProfilerScope>();
+  }
+  return prof_scope.get();
+}
+
+void ProfilerScope::SetCurrentProfilerScope(const std::string& scope) {
+  current_profiler_scope_ = scope;
+}
+
+std::string ProfilerScope::GetCurrentProfilerScope() const {
+  return current_profiler_scope_;
+}
+
 }  // namespace profiler
 }  // namespace mxnet
diff --git a/src/profiler/profiler.h b/src/profiler/profiler.h
index f9f997f..132a9f9 100644
--- a/src/profiler/profiler.h
+++ b/src/profiler/profiler.h
@@ -601,7 +601,7 @@ struct ProfileCounter : public ProfileObject {
   }
   /*! \brief operator: object -= v */
   inline uint64_t operator -=(int64_t v) {
-    CHECK_GE(value_, v);
+    CHECK_GE(value_, static_cast<uint64_t>(v));
     if (v >= 0) {
       return DecrementValue(static_cast<uint64_t>(v));
     } else {
@@ -1314,6 +1314,18 @@ inline void Profiler::AddProfileStat<ProfileOperator::OprExecStat>(
 
 #undef VTUNE_ONLY_CODE  // This macro not meant to be used outside of this file
 
+class ProfilerScope {
+ public:
+  /*! \brief Get the profiler scope instance */
+  static ProfilerScope* Get();
+  /*! \brief Set the current profiler scope */
+  void SetCurrentProfilerScope(const std::string& scope);
+  /*! \brief Get the current profiler scope */
+  std::string GetCurrentProfilerScope() const;
+ private:
+  std::string current_profiler_scope_ = MXNET_STORAGE_DEFAULT_PROFILER_SCOPE_CSTR;
+};
+
 }  // namespace profiler
 }  // namespace mxnet
 #endif  // MXNET_PROFILER_PROFILER_H_
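The ProfilerScope singleton above is the C++ side of the storage-tagging mechanism: whatever scope string is current when a storage handle is created becomes the prefix of its "Attribute Name" in the GPU memory profile. A minimal usage sketch from the Python side, mirroring the symbolic unit test added later in this commit (it assumes a CUDA build and at least one visible GPU):

    import mxnet as mx
    from mxnet import profiler

    profiler.set_config(profile_all=True, aggregate_stats=True,
                        filename='memory_profile_demo.json')
    profiler.set_state('run')

    # Storage allocated while this scope is active is tagged "tensordot:<name>".
    with profiler.Scope("tensordot"):
        A = mx.sym.Variable('A')
        B = mx.sym.Variable('B')
        C = mx.sym.dot(A, B, name='dot')

    executor = C.simple_bind(mx.gpu(), 'write', A=(1024, 1024), B=(1024, 1024))
    executor.forward()
    mx.nd.waitall()

    profiler.set_state('stop')
    profiler.dump(True)  # also emits gpu_memory_profile-pid_<pid>.csv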
diff --git a/src/profiler/storage_profiler.cc b/src/profiler/storage_profiler.cc
new file mode 100644
index 0000000..873cc67
--- /dev/null
+++ b/src/profiler/storage_profiler.cc
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "./storage_profiler.h"
+
+#if MXNET_USE_NVML
+#include <nvml.h>
+#endif  // MXNET_USE_NVML
+#include <fstream>
+#include <map>
+#include <unordered_map>
+#include <vector>
+#include "./profiler.h"
+#include "../common/utils.h"
+#include "../common/cuda_utils.h"
+
+namespace mxnet {
+namespace profiler {
+
+#if MXNET_USE_CUDA
+
+GpuDeviceStorageProfiler* GpuDeviceStorageProfiler::Get() {
+  static std::mutex mtx;
+  static std::shared_ptr<GpuDeviceStorageProfiler> gpu_dev_storage_profiler = nullptr;
+  std::unique_lock<std::mutex> lk(mtx);
+  if (!gpu_dev_storage_profiler) {
+    gpu_dev_storage_profiler = std::make_shared<GpuDeviceStorageProfiler>();
+  }
+  return gpu_dev_storage_profiler.get();
+}
+
+void GpuDeviceStorageProfiler::DumpProfile() const {
+  size_t current_pid = common::current_process_id();
+  std::ofstream fout((filename_prefix_ + "-pid_" + std::to_string(current_pid)
+                      + ".csv").c_str());
+  if (!fout.is_open()) {
+    return;
+  }
+  struct AllocEntryDumpFmt {
+    size_t requested_size;
+    int dev_id;
+    size_t actual_size;
+    bool reuse;
+  };
+  // order the GPU memory allocation entries by their attribute name
+  std::multimap<std::string, AllocEntryDumpFmt> gpu_mem_ordered_alloc_entries;
+  // map the GPU device ID to the total amount of allocations
+  std::unordered_map<int, size_t> gpu_dev_id_total_alloc_map;
+  for (const std::pair<void*, AllocEntry>& alloc_entry :
+       gpu_mem_alloc_entries_) {
+    gpu_mem_ordered_alloc_entries.emplace(
+        alloc_entry.second.profiler_scope +
+        alloc_entry.second.name, AllocEntryDumpFmt{
+          alloc_entry.second.requested_size,
+          alloc_entry.second.dev_id,
+          alloc_entry.second.actual_size,
+          alloc_entry.second.reuse});
+    gpu_dev_id_total_alloc_map[alloc_entry.second.dev_id] = 0;
+  }
+  fout << "\"Attribute Name\",\"Requested Size\","
+          "\"Device\",\"Actual Size\",\"Reuse?\"" << std::endl;
+  for (const std::pair<std::string, AllocEntryDumpFmt>& alloc_entry :
+       gpu_mem_ordered_alloc_entries) {
+    fout << "\"" << alloc_entry.first << "\","
+         << "\"" << alloc_entry.second.requested_size << "\","
+         << "\"" << alloc_entry.second.dev_id << "\","
+         << "\"" << alloc_entry.second.actual_size << "\","
+         << "\"" << alloc_entry.second.reuse << "\"" << std::endl;
+    gpu_dev_id_total_alloc_map[alloc_entry.second.dev_id] +=
+        alloc_entry.second.actual_size;
+  }
+#if MXNET_USE_NVML
+  // If NVML has been enabled, add an amendment term to the GPU memory profile.
+  nvmlDevice_t nvml_device;
+
+  NVML_CALL(nvmlInit());
+  for (std::pair<const int, size_t>& dev_id_total_alloc_pair :
+       gpu_dev_id_total_alloc_map) {
+    unsigned info_count = 0;
+    std::vector<nvmlProcessInfo_t> infos(info_count);
+
+    NVML_CALL(nvmlDeviceGetHandleByIndex(dev_id_total_alloc_pair.first, &nvml_device));
+    // The first call to `nvmlDeviceGetComputeRunningProcesses` is to set the
+    // size of info. Since `NVML_ERROR_INSUFFICIENT_SIZE` will always be
+    // returned, we do not wrap the function call with `NVML_CALL`.
+    nvmlDeviceGetComputeRunningProcesses(nvml_device, &info_count, infos.data());
+    infos = std::vector<nvmlProcessInfo_t>(info_count);
+    NVML_CALL(nvmlDeviceGetComputeRunningProcesses(nvml_device, &info_count, infos.data()));
+
+    bool amend_made = false;
+
+    for (unsigned i = 0; i < info_count; ++i) {
+      if (current_pid == infos[i].pid) {
+        amend_made = true;
+        fout << "\"" << "nvml_amend" << "\","
+             << "\"" << infos[i].usedGpuMemory - dev_id_total_alloc_pair.second << "\","
+             << "\"" << dev_id_total_alloc_pair.first << "\","
+             << "\"" << infos[i].usedGpuMemory - dev_id_total_alloc_pair.second << "\","
+             << "\"0\"" << std::endl;
+        break;
+      }
+    }
+    if (!amend_made) {
+      LOG(INFO) << "NVML is unable to amend the GPU memory profile "
+                   "because it cannot locate the current process ID. "
+                   "Are you running in Docker without setting --pid=host?";
+    }
+  }  // for (dev_id_total_alloc_pair : gpu_dev_id_total_alloc_map)
+#endif  // MXNET_USE_NVML
+}
+
+#endif  // MXNET_USE_CUDA
+
+}  // namespace profiler
+}  // namespace mxnet
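DumpProfile() above writes one CSV row per recorded allocation, ordered by attribute name, and, when NVML support is compiled in, appends an "nvml_amend" row per device that accounts for GPU memory the storage managers never saw (e.g., the CUDA context and cuDNN workspaces). A hedged post-processing sketch, assuming the default "gpu_memory_profile-pid_<pid>.csv" output name:

    import csv
    import os
    from collections import defaultdict

    def summarize(path):
        """Sum the 'Actual Size' column per device from the dumped profile."""
        per_device = defaultdict(int)
        with open(path, newline='') as f:
            for row in csv.DictReader(f):
                per_device[row['Device']] += int(row['Actual Size'])
        return dict(per_device)

    csv_path = 'gpu_memory_profile-pid_%d.csv' % os.getpid()
    print(summarize(csv_path))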
diff --git a/src/profiler/storage_profiler.h b/src/profiler/storage_profiler.h
index 5ab5983..ad87bf1 100644
--- a/src/profiler/storage_profiler.h
+++ b/src/profiler/storage_profiler.h
@@ -19,13 +19,16 @@
 #ifndef MXNET_PROFILER_STORAGE_PROFILER_H_
 #define MXNET_PROFILER_STORAGE_PROFILER_H_
 
+#include <mxnet/libinfo.h>
 #include <mxnet/storage.h>
 #include <string>
+#include <tuple>
 #include <vector>
+#include <unordered_map>
 #include "./profiler.h"
 
 namespace mxnet {
-namespace storage {
+namespace profiler {
 
 /*!
  * \brief Storage allocation/deallocation profiling via ProfileCounters
@@ -106,7 +109,95 @@ class DeviceStorageProfiler {
   std::vector<std::shared_ptr<profiler::ProfileCounter>> mem_counters_;
 };
 
-}  // namespace storage
+#if MXNET_USE_CUDA
+
+/*!
+ * \brief GPU storage allocation/deallocation profiling
+ */
+class GpuDeviceStorageProfiler {
+ public:
+  /*! \brief get the global instance to record an allocation entry */
+  static GpuDeviceStorageProfiler* Get();
+  /*!
+   * \brief Similar to the `DeviceStorageProfiler` methods above. However, the
+   *        `GpuDeviceStorageProfiler` additionally records the actual
+   *        allocation size and whether the allocation reuses a pooled block.
+   */
+  void OnAlloc(const Storage::Handle &handle,
+               const size_t actual_size, const bool reuse) {
+    if (handle.size > 0) {
+      profiler::Profiler *prof = profiler::Profiler::Get();
+      if (prof->IsProfiling(profiler::Profiler::kMemory)) {
+#ifdef _MSC_VER
+        gpu_mem_alloc_entries_[handle.dptr] = AllocEntry{
+            handle.profiler_scope,
+            handle.name,
+            handle.size,
+            handle.ctx.dev_id,
+            actual_size, reuse};
+#else
+        gpu_mem_alloc_entries_[handle.dptr] = {
+            handle.profiler_scope,
+            handle.name,
+            handle.size,
+            handle.ctx.dev_id,
+            actual_size, reuse};
+#endif
+      }
+    }
+  }
+
+  void OnFree(const Storage::Handle &handle) {
+    if (handle.size > 0) {
+      profiler::Profiler *prof = profiler::Profiler::Get();
+      if (prof->IsProfiling(profiler::Profiler::kMemory)) {
+        // In case of a bug which tries to free before the allocation was recorded
+        if (gpu_mem_alloc_entries_.find(handle.dptr) !=
+            gpu_mem_alloc_entries_.end()) {
+          gpu_mem_alloc_entries_.erase(handle.dptr);
+        }
+      }
+    }
+  }
+
+  void UpdateStorageInfo(const Storage::Handle &handle) {
+    if (handle.size > 0) {
+      profiler::Profiler *prof = profiler::Profiler::Get();
+      if (prof->IsProfiling(profiler::Profiler::kMemory)) {
+        auto entry_iter = gpu_mem_alloc_entries_.find(handle.dptr);
+        if (entry_iter != gpu_mem_alloc_entries_.end()) {
+          entry_iter->second.profiler_scope = handle.profiler_scope;
+          entry_iter->second.name = handle.name;
+        }
+      }
+    }
+  }
+
+  /*! \brief set the dumping filename */
+  void SetConfig(const std::string& filename_prefix) {
+    filename_prefix_ = filename_prefix;
+  }
+  /*! \brief dump the allocation entries to file */
+  void DumpProfile() const;
+
+ private:
+  std::string filename_prefix_ = "gpu_memory_profile";
+  /*! \brief Dynamically-sized dictionary of memory profile counters */
+  struct AllocEntry {
+    std::string profiler_scope;  // profiler scope of the storage handle
+    std::string name;            // name of the storage handle
+    size_t requested_size;       // requested size of the storage handle
+    int dev_id;                  // device ID of the storage handle
+    size_t actual_size;          // actual allocation size
+    bool reuse;                  // whether the allocation is a reuse
+  };
+  std::unordered_map<void*, AllocEntry> gpu_mem_alloc_entries_;
+};
+
+#endif  // MXNET_USE_CUDA
+
+}  // namespace profiler
 }  // namespace mxnet
 
 #endif  // MXNET_PROFILER_STORAGE_PROFILER_H_
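OnAlloc() keys the entries by device pointer and records, next to the requested size, the actual (possibly rounded) allocation size and whether the pointer was served from the reuse pool; OnFree() drops the entry once the block is actually released. A small sketch, under the same CSV-name assumption as above, that separates fresh cudaMalloc allocations from pool reuses and filters out the "<ephemeral>:" scratch buffers tagged elsewhere in this commit:

    import csv

    def split_by_reuse(path):
        """Partition dumped rows by the 'Reuse?' flag recorded in OnAlloc()."""
        fresh, reused = [], []
        with open(path, newline='') as f:
            for row in csv.DictReader(f):
                name = row['Attribute Name']
                if name.startswith('<ephemeral>:'):
                    continue  # short-lived workspace allocations
                (reused if row['Reuse?'] == '1' else fresh).append(name)
        return fresh, reused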
diff --git a/src/resource.cc b/src/resource.cc
index 3f46124..65af53d 100644
--- a/src/resource.cc
+++ b/src/resource.cc
@@ -29,12 +29,12 @@
 #include <mxnet/engine.h>
 #include <mxnet/random_generator.h>
 #include <mxnet/resource.h>
-#include <mxnet/storage.h>
 #include <limits>
 #include <atomic>
 #include "./common/lazy_alloc_array.h"
 #include "./common/utils.h"
 #include "./common/cuda_utils.h"
+#include "./profiler/storage_profiler.h"
 
 namespace mxnet {
 namespace resource {
@@ -65,11 +65,16 @@ struct SpaceAllocator {
     host_handle.size = 0;
   }
 
-  inline void* GetSpace(size_t size) {
+  inline void* GetSpace(size_t size, const std::string &name) {
     if (handle.size >= size) return handle.dptr;
 
     Storage::Get()->DirectFree(handle);
     handle = Storage::Get()->Alloc(size, ctx);
+    handle.profiler_scope = "resource:";
+    handle.name = name;
+#if MXNET_USE_CUDA
+    profiler::GpuDeviceStorageProfiler::Get()->UpdateStorageInfo(handle);
+#endif  // MXNET_USE_CUDA
     return handle.dptr;
   }
 
@@ -410,8 +415,9 @@ class ResourceManagerImpl : public ResourceManager {
 };
 }  // namespace resource
 
-void* Resource::get_space_internal(size_t size) const {
-  return static_cast<resource::SpaceAllocator*>(ptr_)->GetSpace(size);
+void* Resource::get_space_internal(size_t size,
+    const std::string &name) const {
+  return static_cast<resource::SpaceAllocator*>(ptr_)->GetSpace(size, name);
 }
 
 void* Resource::get_host_space_internal(size_t size) const {
@@ -420,27 +426,25 @@ void* Resource::get_host_space_internal(size_t size) const {
 
 #if MXNET_USE_CUDNN == 1
 void Resource::get_cudnn_dropout_desc(
-    cudnnDropoutDescriptor_t* dropout_desc,
+    cudnnDropoutDescriptor_t *dropout_desc,
     mshadow::Stream<gpu> *stream,
-    const float dropout,
-    uint64_t seed) const {
+    const float dropout, uint64_t seed,
+    const std::string &name) const {
 
   CHECK_EQ(req.type, ResourceRequest::kCuDNNDropoutDesc);
   auto state_space = static_cast<resource::SpaceAllocator*>(ptr_);
   CHECK_EQ(state_space->ctx.dev_id, stream->dev_id)
-    << "The device id of cudnn dropout state space doesn't match that from stream.";
+    << "The device id of cuDNN dropout state space doesn't match that from stream.";
   if (!state_space->handle.size) {
     // not initialized yet.
     size_t dropout_state_size;
     CUDNN_CALL(cudnnDropoutGetStatesSize(stream->dnn_handle_, &dropout_state_size));
     // reserve GPU space
-    Storage::Get()->DirectFree(
-      Storage::Get()->Alloc(dropout_state_size, state_space->ctx));
+    Storage::Get()->DirectFree(Storage::Get()->Alloc(dropout_state_size, state_space->ctx));
     CUDNN_CALL(cudnnSetDropoutDescriptor(*dropout_desc, stream->dnn_handle_,
                                          dropout,
-                                         state_space->GetSpace(dropout_state_size),
-                                         dropout_state_size,
-                                         seed));
+                                         state_space->GetSpace(dropout_state_size, name),
+                                         dropout_state_size, seed));
   } else {
     // cudnnRestoreDropoutDescriptor() introduced with cuDNN v7
     STATIC_ASSERT_CUDNN_VERSION_GE(7000);
diff --git a/src/storage/gpu_device_storage.h b/src/storage/gpu_device_storage.h
index 5e09561..3eabe1b 100644
--- a/src/storage/gpu_device_storage.h
+++ b/src/storage/gpu_device_storage.h
@@ -28,6 +28,7 @@
 #include "mxnet/base.h"
 #include "mxnet/storage.h"
 #include "../common/cuda_utils.h"
+#include "../profiler/storage_profiler.h"
 #if MXNET_USE_CUDA
 #include <cuda_runtime.h>
 #endif  // MXNET_USE_CUDA
@@ -64,8 +65,11 @@ inline void GPUDeviceStorage::Alloc(Storage::Handle* handle) {
   std::lock_guard<std::mutex> l(Storage::Get()->GetMutex(Context::kGPU));
 #endif  // MXNET_USE_NCCL
   cudaError_t e = cudaMalloc(&handle->dptr, size);
-  if (e != cudaSuccess && e != cudaErrorCudartUnloading)
+  if (e != cudaSuccess && e != cudaErrorCudartUnloading) {
     LOG(FATAL) << "CUDA: " << cudaGetErrorString(e);
+  }
+  // record the allocation event in the memory profiler
+  profiler::GpuDeviceStorageProfiler::Get()->OnAlloc(*handle, size, false);
 #else   // MXNET_USE_CUDA
   LOG(FATAL) << "Please compile with CUDA enabled";
 #endif  // MXNET_USE_CUDA
@@ -83,6 +87,8 @@ inline void GPUDeviceStorage::Free(Storage::Handle handle) {
   if (err != cudaSuccess && err != cudaErrorCudartUnloading) {
     LOG(FATAL) << "CUDA: " << cudaGetErrorString(err);
   }
+  // record the deallocation event in the memory profiler
+  profiler::GpuDeviceStorageProfiler::Get()->OnFree(handle);
 #else   // MXNET_USE_CUDA
   LOG(FATAL) << "Please compile with CUDA enabled";
 #endif  // MXNET_USE_CUDA
diff --git a/src/storage/pinned_memory_storage.h b/src/storage/pinned_memory_storage.h
index 13573d9..5d03fd1 100644
--- a/src/storage/pinned_memory_storage.h
+++ b/src/storage/pinned_memory_storage.h
@@ -30,6 +30,7 @@
 #include "mxnet/base.h"
 #include "mxnet/storage.h"
 #include "../common/cuda_utils.h"
+#include "../profiler/storage_profiler.h"
 
 namespace mxnet {
 namespace storage {
@@ -60,6 +61,8 @@ inline void PinnedMemoryStorage::Alloc(Storage::Handle* handle) {
   mxnet::common::cuda::DeviceStore device_store(handle->ctx.real_dev_id(), true);
   // make the memory available across all devices
   CUDA_CALL(cudaHostAlloc(&handle->dptr, size, cudaHostAllocPortable));
+  // record the allocation event in the memory profiler
+  profiler::GpuDeviceStorageProfiler::Get()->OnAlloc(*handle, size, false);
 }
 
 inline void PinnedMemoryStorage::Free(Storage::Handle handle) {
@@ -72,6 +75,8 @@ inline void PinnedMemoryStorage::Free(Storage::Handle handle) {
   if (err != cudaSuccess && err != cudaErrorCudartUnloading) {
     LOG(FATAL) << "CUDA: " << cudaGetErrorString(err);
   }
+  // record the deallocation event in the memory profiler
+  profiler::GpuDeviceStorageProfiler::Get()->OnFree(handle);
 }
 
 }  // namespace storage
diff --git a/src/storage/pooled_storage_manager.h b/src/storage/pooled_storage_manager.h
index d9d7277..80358fc 100644
--- a/src/storage/pooled_storage_manager.h
+++ b/src/storage/pooled_storage_manager.h
@@ -39,6 +39,7 @@
 #include "./storage_manager.h"
 #include "../common/cuda_utils.h"
 #include "../common/utils.h"
+#include "../profiler/storage_profiler.h"
 
 
 namespace mxnet {
@@ -97,6 +98,7 @@ class GPUPooledStorageManager final : public StorageManager {
       LOG(FATAL) << "CUDA: " << cudaGetErrorString(err);
     }
     used_memory_ -= size;
+    profiler::GpuDeviceStorageProfiler::Get()->OnFree(handle);
   }
 
   // Round a value 'x' up to the next multiple of 'multiple'
@@ -166,11 +168,15 @@ void GPUPooledStorageManager::Alloc(Storage::Handle* handle) {
     }
     used_memory_ += size;
     handle->dptr = ret;
+    // record the allocation event in the memory profiler
+    profiler::GpuDeviceStorageProfiler::Get()->OnAlloc(*handle, size, false);
   } else {
     auto&& reuse_pool = reuse_it->second;
     auto ret = reuse_pool.back();
     reuse_pool.pop_back();
     handle->dptr = ret;
+    // record the allocation event in the memory profiler
+    profiler::GpuDeviceStorageProfiler::Get()->OnAlloc(*handle, size, true);
   }
 }
 
@@ -292,6 +298,7 @@ class GPUPooledRoundedStorageManager final : public StorageManager {
       LOG(FATAL) << "CUDA: " << cudaGetErrorString(err);
     }
     used_memory_ -= size;
+    profiler::GpuDeviceStorageProfiler::Get()->OnFree(handle);
   }
 
  private:
@@ -349,10 +356,14 @@ void GPUPooledRoundedStorageManager::Alloc(Storage::Handle* handle) {
     }
     used_memory_ += size;
     handle->dptr = ret;
+    // record the allocation event in the memory profiler
+    profiler::GpuDeviceStorageProfiler::Get()->OnAlloc(*handle, size, false);
   } else {
     auto ret = reuse_pool.back();
     reuse_pool.pop_back();
     handle->dptr = ret;
+    // record the allocation event in the memory profiler
+    profiler::GpuDeviceStorageProfiler::Get()->OnAlloc(*handle, size, true);
   }
 }
 
diff --git a/src/storage/storage.cc b/src/storage/storage.cc
index 7a59a77..c0903d2 100644
--- a/src/storage/storage.cc
+++ b/src/storage/storage.cc
@@ -53,7 +53,7 @@ class StorageImpl : public Storage {
   // internal storage managers
   std::array<common::LazyAllocArray<storage::StorageManager>,
              kMaxNumberOfDevices> storage_managers_;
-  storage::DeviceStorageProfiler profiler_;
+  profiler::DeviceStorageProfiler profiler_;
 };  // struct Storage::Impl
 #if MXNET_USE_CUDA
 int StorageImpl::num_gpu_device = 0;
diff --git a/tests/python/unittest/test_profiler.py b/tests/python/unittest/test_profiler.py
index 8e8209f..5a0baca 100644
--- a/tests/python/unittest/test_profiler.py
+++ b/tests/python/unittest/test_profiler.py
@@ -18,12 +18,15 @@
 from __future__ import print_function
 import time
 import os
+import csv
 import json
 import unittest
+import numpy as np
 from collections import OrderedDict
 
 import mxnet as mx
 from mxnet import profiler
+from mxnet.gluon import nn
 from common import run_in_spawned_process
 
 
@@ -34,8 +37,7 @@ def enable_profiler(profile_filename, run=True, continuous_dump=False, aggregate
                         profile_api=True,
                         filename=profile_filename,
                         continuous_dump=continuous_dump,
-                        aggregate_stats=aggregate_stats
-                        )
+                        aggregate_stats=aggregate_stats)
     if run is True:
         profiler.set_state('run')
 
@@ -459,6 +461,156 @@ def test_custom_operator_profiling_naive_engine():
             'test_custom_operator_profiling_multiple_custom_ops_symbolic_naive.json')
 
 
+@unittest.skipIf(mx.context.num_gpus() == 0, "GPU memory profiler records allocation on GPUs only")
+def test_gpu_memory_profiler_symbolic():
+    iter_num = 5
+
+    enable_profiler('test_profiler.json', False, False)
+    profiler.set_state('run')
+
+    with profiler.Scope("tensordot"):
+        A = mx.sym.Variable('A')
+        B = mx.sym.Variable('B')
+        C = mx.symbol.dot(A, B, name='dot')
+
+    executor = C.simple_bind(mx.gpu(), 'write', A=(4096, 4096), B=(4096, 4096))
+
+    a = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096))
+    b = mx.random.uniform(-1.0, 1.0, shape=(4096, 4096))
+
+    a.copyto(executor.arg_dict['A'])
+    b.copyto(executor.arg_dict['B'])
+
+    for i in range(iter_num):
+        executor.forward()
+        c = executor.outputs[0]
+        mx.nd.waitall()
+    profiler.set_state('stop')
+    profiler.dump(True)
+
+    expected_alloc_entries = [
+            {'Attribute Name' : 'tensordot:in_arg:A',
+             'Requested Size' : str(4 * a.size)},
+            {'Attribute Name' : 'tensordot:in_arg:B',
+             'Requested Size' : str(4 * b.size)},
+            {'Attribute Name' : 'tensordot:arg_grad:A',
+             'Requested Size' : str(4 * a.size)},
+            {'Attribute Name' : 'tensordot:arg_grad:B',
+             'Requested Size' : str(4 * b.size)},
+            {'Attribute Name' : 'tensordot:dot',
+             'Requested Size' : str(4 * c.size)},
+            {'Attribute Name' : 'tensordot:dot_head_grad',
+             'Requested Size' : str(4 * c.size)}]
+
+    # Sample gpu_memory_profile.csv:
+    # "Attribute Name","Requested Size","Device","Actual Size","Reuse?"
+    # "tensordot:arg_grad:A","67108864","0","67108864","0"
+    # "tensordot:arg_grad:B","67108864","0","67108864","0"
+    # "tensordot:dot","67108864","0","67108864","0"
+    # "tensordot:dot_head_grad","67108864","0","67108864","0"
+    # "tensordot:in_arg:A","67108864","0","67108864","0"
+    # "tensordot:in_arg:B","67108864","0","67108864","0"
+
+    with open('gpu_memory_profile-pid_%d.csv' % (os.getpid()), mode='r') as csv_file:
+        csv_reader = csv.DictReader(csv_file)
+        for expected_alloc_entry in expected_alloc_entries:
+            csv_file.seek(0)
+            entry_found = False
+            for row in csv_reader:
+                if row['Attribute Name'] == expected_alloc_entry['Attribute Name']:
+                    assert row['Requested Size'] == expected_alloc_entry['Requested Size'], \
+                           "requested size={} is not equal to the expected size={}" \
+                           .format(row['Requested Size'],
+                                   expected_alloc_entry['Requested Size'])
+                    entry_found = True
+                    break
+            assert entry_found, \
+                   "Entry for attr_name={} has not been found" \
+                   .format(expected_alloc_entry['Attribute Name'])
+
+
+@unittest.skipIf(mx.context.num_gpus() == 0, "GPU memory profiler records allocation on GPUs only")
+def test_gpu_memory_profiler_gluon():
+    enable_profiler(profile_filename='test_profiler.json',
+                    run=True, continuous_dump=True)
+    profiler.set_state('run')
+
+    model = nn.HybridSequential(prefix='net_')
+    with model.name_scope():
+        model.add(nn.Dense(128, activation='tanh'))
+        model.add(nn.Dropout(0.5))
+        model.add(nn.Dense(64, activation='tanh'),
+                  nn.Dense(32, in_units=64))
+        model.add(nn.Activation('relu'))
+    model.initialize(ctx=mx.gpu())
+    model.hybridize()
+
+    inputs = mx.sym.var('data')
+
+    with mx.autograd.record():
+        out = model(mx.nd.zeros((16, 10), ctx=mx.gpu()))
+    out.backward()
+    mx.nd.waitall()
+    profiler.set_state('stop')
+    profiler.dump(True)
+
+    # Sample gpu_memory_profile.csv
+    # "Attribute Name","Requested Size","Device","Actual Size","Reuse?"
+    # "<unk>:in_arg:data","640","0","4096","0"
+    # "net:arg_grad:net_dense0_bias","512","0","4096","0"
+    # "net:arg_grad:net_dense0_weight","5120","0","8192","0"
+    # "net:arg_grad:net_dense1_bias","256","0","4096","0"
+    # "net:arg_grad:net_dense1_weight","32768","0","32768","0"
+    # "net:arg_grad:net_dense2_bias","128","0","4096","0"
+    # "net:arg_grad:net_dense2_weight","8192","0","8192","0"
+    # "net:dense0:net_dense0_fwd","8192","0","8192","0"
+    # "net:dense0:tanh:net_dense0_tanh_fwd","8192","0","8192","0"
+    # "net:dense1:net_dense1_fwd","4096","0","4096","0"
+    # "net:dense1:tanh:net_dense1_tanh_fwd","4096","0","4096","0"
+    # "net:dense2:net_dense2_fwd","2048","0","4096","0"
+    # "net:dense2:net_dense2_fwd_backward","4096","0","4096","0"
+    # "net:dropout0:net_dropout0_fwd","8192","0","8192","0"
+    # "net:dropout0:net_dropout0_fwd","8192","0","8192","0"
+    # "net:in_arg:net_dense0_bias","512","0","4096","0"
+    # "net:in_arg:net_dense0_weight","5120","0","8192","0"
+    # "net:in_arg:net_dense1_bias","256","0","4096","0"
+    # "net:in_arg:net_dense1_weight","32768","0","32768","0"
+    # "net:in_arg:net_dense2_bias","128","0","4096","0"
+    # "net:in_arg:net_dense2_weight","8192","0","8192","0"
+    # "net:relu0:net_relu0_fwd","2048","0","4096","0"
+    # "net:relu0:net_relu0_fwd_backward","8192","0","8192","0"
+    # "net:relu0:net_relu0_fwd_head_grad","2048","0","4096","0"
+    # "resource:cudnn_dropout_state (dropout-inl.h +258)","1671168","0","1671168","0"
+    # "resource:temp_space (fully_connected-inl.h +316)","34816","0","36864","0"
+
+    # We are only checking for weight parameters here, while also making sure
+    # that there are no unknown entries in the memory profile.
+    with open('gpu_memory_profile-pid_%d.csv' % (os.getpid()), mode='r') as csv_file:
+        csv_reader = csv.DictReader(csv_file)
+        for scope in ['in_arg', 'arg_grad']:
+            for key, nd in model.collect_params().items():
+                expected_arg_name = "net:%s:" % scope + key
+                expected_arg_size = str(4 * np.prod(nd.shape))
+                csv_file.seek(0)
+                entry_found = False
+                for row in csv_reader:
+                    if row['Attribute Name'] == expected_arg_name:
+                        assert row['Requested Size'] == expected_arg_size, \
+                            "requested size={} is not equal to the expected size={}" \
+                            .format(row['Requested Size'], expected_arg_size)
+                        entry_found = True
+                        break
+                assert entry_found, \
+                    "Entry for attr_name={} has not been found" \
+                    .format(expected_arg_name)
+        # Make sure that there is no unknown allocation entry.
+        csv_file.seek(0)
+        for row in csv_reader:
+            if row['Attribute Name'] == "<unk>:unknown" or \
+               row['Attribute Name'] == "<unk>:":
+                assert False, "Unknown allocation entry has been encountered"
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()
diff --git a/tests/python/unittest/test_symbol.py b/tests/python/unittest/test_symbol.py
index 8e4fe11..c913f5c 100644
--- a/tests/python/unittest/test_symbol.py
+++ b/tests/python/unittest/test_symbol.py
@@ -296,7 +296,8 @@ def test_load_000800():
         assert k in attr2, k
         v2 = attr2[k]
         for kk, vv1 in v1.items():
-            if kk.startswith('__') and kk.endswith('__'):
+            if kk.startswith('__') and kk.endswith('__') and \
+               kk != '__profiler_scope__':
                 assert kk in v2 and v2[kk] == vv1, k + str(v1) + str(v2)
 
     check_symbol_consistency(sym1, sym2,
diff --git a/tests/python/unittest/test_thread_local.py b/tests/python/unittest/test_thread_local.py
index f0e3c66..63d97f1 100644
--- a/tests/python/unittest/test_thread_local.py
+++ b/tests/python/unittest/test_thread_local.py
@@ -120,6 +120,7 @@ def test_blockscope():
         def __init__(self, prefix):
             self.prefix = prefix
             self._empty_prefix = False
+            self._profiler_scope_name = '<unk>:'
     blockscope_list = []
     status = [False]
     event = threading.Event()