Posted to commits@singa.apache.org by wa...@apache.org on 2015/05/03 16:04:14 UTC

[09/12] incubator-singa git commit: Transfer code from nusinga repo to singa apache repo. New communication framework is implemented to unify the frameworks of existing distributed deep learning systems. Communication is now implemented using ZeroMQ. API

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/gtest/gtest_main.cc
----------------------------------------------------------------------
diff --git a/include/gtest/gtest_main.cc b/include/gtest/gtest_main.cc
new file mode 100644
index 0000000..f302822
--- /dev/null
+++ b/include/gtest/gtest_main.cc
@@ -0,0 +1,38 @@
+// Copyright 2006, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <stdio.h>
+
+#include "gtest/gtest.h"
+
+GTEST_API_ int main(int argc, char **argv) {
+  printf("Running main() from gtest_main.cc\n");
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/cuda/cuda_reduce.cuh
----------------------------------------------------------------------
diff --git a/include/mshadow/cuda/cuda_reduce.cuh b/include/mshadow/cuda/cuda_reduce.cuh
new file mode 100644
index 0000000..b7808a6
--- /dev/null
+++ b/include/mshadow/cuda/cuda_reduce.cuh
@@ -0,0 +1,117 @@
+#ifndef MSHADOW_CUDA_REDUCE_CUH
+#define MSHADOW_CUDA_REDUCE_CUH
+/*!
+ * \file cuda_reduce.cuh
+ * \brief helper functions to do reduction
+ * \author Tianqi Chen
+ */
+namespace mshadow{
+    namespace cuda{
+        /*
+         * \brief reduce over the dimension x
+         * \tparam Reducer reducer
+         * \tparam x_bits dimension = 1<<x_bits
+         */
+        template<typename Reducer,int x_bits>
+        inline __device__ void Reduce1D( volatile real_t buf[1<<x_bits] );
+        /*
+         * \brief reduce over the dimension x
+         * \tparam Reducer reducer
+         * \tparam xmax_bits maximum size of buffer
+         * \param xsize size of the x dimension, which may not be aligned
+         */
+        template<typename Reducer, int xmax_bits>
+        inline __device__ void Reduce1DNotAlign( volatile real_t buf[1<<xmax_bits], int xsize );
+    };
+};
+
+// ==================================================
+//  implementations follow;
+//  no need to read further if you only use the functions above
+// --------------------------------------------------
+#ifdef  __DEVICE_EMULATION__
+#define __MSHADOW_EMUSYNC__ __syncthreads()
+#else
+#define __MSHADOW_EMUSYNC__ 
+#endif
+
+namespace mshadow{
+    namespace cuda{        
+        template<typename Reducer, int x_bits>
+        inline __device__ void ReduceX( volatile real_t buf[], int tid ){
+            if( x_bits >= 10 ){
+                if( tid < 512 ) Reducer::Reduce( buf[tid] , buf[tid + 512] );
+                __syncthreads(); 
+            }
+            if( x_bits >= 9 ){
+                if( tid < 256 ) Reducer::Reduce( buf[tid] , buf[tid + 256] );
+                __syncthreads(); 
+            }
+            if( x_bits >= 8 ){
+                if( tid < 128 ) Reducer::Reduce( buf[tid] , buf[tid + 128] );
+                __syncthreads(); 
+            }
+            if( x_bits >= 7 ){
+                if( tid < 64  ) Reducer::Reduce( buf[tid] , buf[tid + 64 ] );
+                __syncthreads(); 
+            }            
+            if( x_bits >= 6 ){
+                if( tid < 32 ) Reducer::Reduce( buf[tid] , buf[tid + 32] );
+                __syncthreads();
+            }
+            // in-warp optimization
+            if( x_bits >= 5 ){
+                if( tid < 16 ) Reducer::Reduce( buf[tid] , buf[tid + 16] );
+                __MSHADOW_EMUSYNC__;
+            }
+            if( x_bits >= 4 ){
+                if( tid < 8 ) Reducer::Reduce( buf[tid] , buf[tid + 8 ] );
+                __MSHADOW_EMUSYNC__;            
+            }
+            if( x_bits >= 3 ){
+                if( tid < 4 ) Reducer::Reduce( buf[tid] , buf[tid + 4 ] );
+                __MSHADOW_EMUSYNC__;
+            }
+            if( x_bits >= 2 ){
+                if( tid < 2 ) Reducer::Reduce( buf[tid] , buf[tid + 2 ] );
+                __MSHADOW_EMUSYNC__;
+            }
+            if( x_bits >= 1 ){
+                if( tid < 1 ) Reducer::Reduce( buf[tid] , buf[tid + 1 ] );
+                __MSHADOW_EMUSYNC__;
+            }  
+        };
+        
+        template<typename Reducer,int x_bits>
+        inline __device__ void Reduce1D( volatile real_t buf[1<<x_bits] ){
+            ReduceX<Reducer,x_bits>( buf, threadIdx.x );
+        }
+
+        // reduce with an upper bound
+        #define __RD_NON_ALIGN(els,x_bits)                              \
+            els                                                         \
+            if( xmax_bits >= x_bits && x_size >= (1 << x_bits) ){       \
+                if( tid < (1 << x_bits) && tid + (1<<x_bits) < x_size ){ \
+                    Reducer::Reduce( buf[tid] , buf[tid + (1<<x_bits)] ); \
+                }                                                       \
+                __syncthreads();                                        \
+                ReduceX<Reducer, x_bits>( buf, tid );                   \
+            }                                                           \
+            
+        template<typename Reducer, int xmax_bits>
+        inline __device__ void Reduce1DNotAlign( volatile real_t buf[], int x_size ){
+            int tid = threadIdx.x;
+            __RD_NON_ALIGN(, 8)
+            __RD_NON_ALIGN(else, 7)
+            __RD_NON_ALIGN(else, 6)
+            __RD_NON_ALIGN(else, 5) 
+            __RD_NON_ALIGN(else, 4) 
+            __RD_NON_ALIGN(else, 3) 
+            __RD_NON_ALIGN(else, 2) 
+            __RD_NON_ALIGN(else, 1)                     
+        }
+    };
+};
+
+#endif // MSHADOW_CUDA_REDUCE_CUH
+

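For illustration, a minimal sketch of how these reduction helpers are typically invoked from a kernel. This snippet is not part of the commit; it assumes single-precision real_t, the red::sum reducer from tensor_base.h, and a launch with blockDim.x == (1 << x_bits):

    // hypothetical kernel: each block sums one row of src (length len) into dst[blockIdx.x]
    template<int x_bits>
    __global__ void RowSumKernel( mshadow::real_t *dst, const mshadow::real_t *src, int len ){
        __shared__ volatile mshadow::real_t buf[ 1 << x_bits ];
        mshadow::real_t res = 0.0f;
        // strided load: every thread accumulates its share of the row
        for( int i = threadIdx.x; i < len; i += (1 << x_bits) ){
            res += src[ blockIdx.x * len + i ];
        }
        buf[ threadIdx.x ] = res;
        __syncthreads();
        // tree reduction over the x dimension; buf[0] holds the row sum afterwards
        mshadow::cuda::Reduce1D<mshadow::red::sum, x_bits>( buf );
        if( threadIdx.x == 0 ) dst[ blockIdx.x ] = buf[0];
    }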
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/cuda/tensor_gpu-inl.cuh
----------------------------------------------------------------------
diff --git a/include/mshadow/cuda/tensor_gpu-inl.cuh b/include/mshadow/cuda/tensor_gpu-inl.cuh
new file mode 100644
index 0000000..61e477c
--- /dev/null
+++ b/include/mshadow/cuda/tensor_gpu-inl.cuh
@@ -0,0 +1,231 @@
+#ifndef MSHADOW_TENSOR_GPU_INL_CUH
+#define MSHADOW_TENSOR_GPU_INL_CUH
+/*!
+ * \file tensor_gpu-inl.cuh
+ * \brief implementation of GPU code using CUDA
+ * \author Bing Xu, Tianqi Chen
+ */
+#include "../tensor.h"
+#include "cuda_reduce.cuh"
+
+namespace mshadow{
+    namespace cuda{
+        #ifndef __CUDA_ARCH__
+        #warning "__CUDA_ARCH__ is not defined, I will assume compiling with CUDA verion greater than 2.0"
+        #endif
+        /* load unit for memory access */
+        #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200
+        const int kMemUnitBits = 5;
+        const int kMaxThreadsPerBlock = 1024;
+        #else
+        const int kMemUnitBits = 4;
+        const int kMaxThreadsPerBlock = 512;
+        #endif
+        /*! \brief number of units that can do synchronized update, half warp size */
+        const int kMemUnit     = 1 << kMemUnitBits;
+        /*! \brief mask that could be helpful sometime */
+        const int kMemUnitMask = kMemUnit - 1;
+        /*! \brief suggested thread number(logscale) for mapping kernel */
+        const int kBaseThreadBits = 8;
+        /*! \brief suggested thread number for mapping kernel */
+        const int kBaseThreadNum  = 1 << kBaseThreadBits;
+        /*! \brief maximum value of grid */
+        const int kMaxGridNum     = 65535;
+        /*! \brief suggested grid number for mapping kernel */
+        const int kBaseGridNum    = 1024;
+        
+        /*! \brief get align stride for given size in x dimension */
+        inline index_t GetAlignStride( index_t xsize, index_t xstride ){ 
+            if( (xstride & (kMemUnit-1)) == 0 ){
+                return ( (xsize  + kMemUnit - 1) >> kMemUnitBits) << kMemUnitBits;
+            }else{
+                // if the original space is not aligned, there is no need to use aligned thread allocation
+                return xsize;
+            }
+        }
+        inline void CheckLaunchParam( dim3 dimGrid, dim3 dimBlock, const char *estr = "" ){
+            if( dimBlock.x*dimBlock.y*dimBlock.z > (unsigned)kMaxThreadsPerBlock ||
+                dimGrid.x > 65535 || dimGrid.y > 65535 ){
+                fprintf( stderr, "%s[%u,%u,%u]:", estr, dimBlock.x, dimBlock.y, dimBlock.z );
+                utils::Error( "too large launch parameter\n");
+            } 
+        }        
+    };
+
+    namespace cuda {
+        template<typename Saver, typename Plan, int block_dim_bits>
+        __device__ void MapPlanProc( Tensor<gpu,2> dst, const index_t xstride, const Plan exp, int block_idx ){
+            const index_t tid = (block_idx << block_dim_bits) + threadIdx.x;
+            const int y   = tid / xstride;
+            const int x   = tid % xstride;
+            if (y < dst.shape[1] && x < dst.shape[0]) {
+                Saver::Save(dst[y][x], exp.Eval(y,x));
+            }
+        }
+        template<typename Saver, typename Plan, int block_dim_bits>
+        __global__ void MapPlanKernel( Tensor<gpu,2> dst, const index_t xstride, const Plan exp ){
+            MapPlanProc<Saver, Plan,block_dim_bits>( dst, xstride, exp, blockIdx.x );
+        }
+        template<typename Saver, typename Plan, int block_dim_bits, int grid_size>
+        __global__ void MapPlanLargeKernel( Tensor<gpu,2> dst, const index_t xstride, const Plan exp, int repeat ){
+            for( int i = 0; i < repeat; ++i ){
+                MapPlanProc<Saver, Plan,block_dim_bits>( dst, xstride, exp, blockIdx.x + i*grid_size );
+            }
+        }        
+        
+        template<typename Saver, typename E>
+        inline void MapPlan( Tensor<gpu,2> dst, const expr::Plan<E> &plan ){
+            const index_t xstride = GetAlignStride( dst.shape[0], dst.shape.stride_ );
+            const int num_block = ( dst.shape[1]*xstride + kBaseThreadNum-1) / kBaseThreadNum;
+            dim3 dimBlock(kBaseThreadNum, 1, 1);
+
+            if (num_block < kMaxGridNum) {
+                dim3 dimGrid(num_block, 1, 1);
+                MapPlanKernel<Saver, expr::Plan<E>, kBaseThreadBits>   \
+                    <<<dimGrid,dimBlock>>>(dst, xstride, plan);
+            } else {
+                int repeat = (num_block + kBaseGridNum-1) / kBaseGridNum;
+                dim3 dimGrid( kBaseGridNum, 1 , 1 );
+                MapPlanLargeKernel<Saver,expr::Plan<E>, kBaseThreadBits, kBaseGridNum> \
+                    <<<dimGrid,dimBlock>>>(dst, xstride, plan, repeat );
+            }
+        }        
+    }; // namespace cuda
+    
+    namespace cuda{
+        template<typename Saver,typename Reducer, int warp_bits, typename Plan>
+        __global__ void MapRedKeepLowestKernel( Tensor<gpu,1> dst, Plan plan, real_t scale, Shape<2> eshape ){
+            const unsigned warp_size = 1 << warp_bits;
+            const unsigned x = (blockIdx.x<<warp_bits) + threadIdx.x;
+            // to avoid bank conflict
+            __shared__ real_t s_res[ warp_size ][ warp_size + 1 ];
+
+            // note: reverse store [y][x], so that we can reduce over threadIdx.x, use warp optimization
+            if( threadIdx.y < eshape[1] && x < eshape[0] ){
+                s_res[ threadIdx.x ][ threadIdx.y ] = plan.Eval( threadIdx.y, x );
+            }
+            for( unsigned y = warp_size; y < eshape[1]; y += warp_size ){
+                if( threadIdx.y + y < eshape[1] && x < eshape[0] ){
+                    Reducer::Reduce( s_res[ threadIdx.x ][ threadIdx.y ], plan.Eval( threadIdx.y + y, x ) );
+                }
+            } 
+            __syncthreads();
+            if( eshape[1] >= warp_size ){
+                Reduce1D<Reducer,warp_bits>( s_res[ threadIdx.y ] );
+            }else{
+                Reduce1DNotAlign<Reducer,warp_bits>( s_res[ threadIdx.y ], eshape[1] );
+            }
+            __syncthreads();            
+            
+            if( threadIdx.y == 0 && x < eshape[0] ){
+                Saver::Save( dst[x],  s_res[ threadIdx.x ][ 0 ] * scale );
+            } 
+        }        
+        
+        template<typename Saver, typename Reducer, typename E>
+        inline void MapReduceKeepLowest( Tensor<gpu,1> dst, const expr::Plan<E> &plan, real_t scale, Shape<2> eshape ){
+            dim3 dimBlock( kMemUnit, kMemUnit );
+            dim3 dimGrid ( (eshape[0]+kMemUnit-1) >> kMemUnitBits );
+            CheckLaunchParam( dimGrid, dimBlock, "MapRedKeepLowestKernel" );
+            MapRedKeepLowestKernel<Saver,Reducer,kMemUnitBits><<<dimGrid,dimBlock>>>( dst, plan, scale, eshape );
+        } 
+    }; // namespace cuda
+    
+    namespace cuda{
+        template<typename Saver,typename Reducer, int block_dim_bits, typename Plan>
+        __global__ void MapReduceKeepDim2Kernel( Tensor<gpu,1> dst, Plan plan, real_t scale, Shape<4> pshape ){
+            const int block_size = 1 << block_dim_bits;
+            __shared__ real_t s_rec[ block_size ];
+            const int c = blockIdx.x;            
+            const index_t tot = pshape[0]*pshape[1]*pshape[3];
+
+            real_t res = Reducer::kInitV;
+            for( index_t i_offset = 0; i_offset < tot; i_offset += block_size ){
+                index_t i = i_offset + threadIdx.x;
+                if( i< tot ){
+                    const index_t x = i % pshape[0];
+                    i /= pshape[0]; 
+                    const index_t y = i % pshape[1];
+                    const index_t n = i / pshape[1];
+                    Reducer::Reduce( res, plan.Eval( (n*pshape[2] + c) * pshape[1] + y, x ) );
+                }
+            }                
+            s_rec[ threadIdx.x ] = res;
+            __syncthreads();
+            Reduce1D<Reducer,block_dim_bits>( s_rec );
+            if( threadIdx.x == 0 ){
+                Saver::Save( dst[c], s_rec[0]*scale );
+            }
+        }
+
+        template<typename Saver, typename Reducer, typename Plan>
+        inline void MapReduceKeepDim2( Tensor<gpu,1> dst, const Plan &plan, real_t scale, Shape<4> pshape ){  
+            dim3 dimBlock( kBaseThreadNum );
+            dim3 dimGrid ( dst.shape[0] );
+            CheckLaunchParam( dimGrid, dimBlock, "MapReduceKeepDim2" );
+            MapReduceKeepDim2Kernel<Saver,Reducer,kBaseThreadBits>
+                <<<dimGrid,dimBlock>>>( dst, plan, scale, pshape );
+        }
+    };
+    
+    namespace cuda{
+        template<int x_bits>        
+        __global__ void SoftmaxKernel( Tensor<gpu,2> dst, Tensor<gpu,2> src ){
+            const unsigned x_size = 1 << x_bits;  
+            const int y = blockIdx.x;
+            __shared__ real_t s_rec[ x_size ];
+            
+            // step 1: get max
+            if( threadIdx.x < dst.shape[ 0 ] ){
+                s_rec[ threadIdx.x ] = src[ y ][ threadIdx.x ] ; 
+            }
+            for( unsigned x = x_size; x < dst.shape[0]; x += x_size ){
+                if( x + threadIdx.x < dst.shape[0] ){
+                    real_t a = src[ y ][ x + threadIdx.x ];
+                    s_rec[ threadIdx.x ] = max( a, s_rec[ threadIdx.x] );
+                }
+            }
+            __syncthreads();
+            if( threadIdx.x >= dst.shape[0] ){
+                s_rec[ threadIdx.x ] = s_rec[0];
+            }
+            __syncthreads();
+            Reduce1D<red::maximum,x_bits>( s_rec );
+            __syncthreads();
+            real_t smax = s_rec[0];            
+            __syncthreads();
+            s_rec[ threadIdx.x ] = 0.0f;
+            __syncthreads();
+
+            // calculate normalizer, with writeback
+            for( unsigned x = 0; x < dst.shape[0]; x += x_size ){
+                if( x + threadIdx.x < dst.shape[0] ){
+                    real_t p = expf( src[ y ][ x + threadIdx.x ] - smax );
+                    s_rec[ threadIdx.x ] += p;
+                    // write back first, will fetch later
+                    dst[ y ][ x + threadIdx.x ] = p;
+                }
+            }
+            // calculate normalizer
+            __syncthreads();
+            Reduce1D<red::sum,x_bits>( s_rec );
+            __syncthreads();
+            real_t ssum = s_rec[0];
+
+            for( unsigned x = 0; x < dst.shape[0]; x += x_size ){
+                if( x + threadIdx.x < dst.shape[0] ){
+                    dst[ y ][ x + threadIdx.x ] /= ssum;
+                }
+            }
+        }
+    
+        inline void Softmax( Tensor<gpu,2> &dst, const Tensor<gpu,2> &src ){
+            dim3 dimBlock( kBaseThreadNum );
+            dim3 dimGrid ( dst.shape[1] );
+            utils::Assert( dst.shape == src.shape, "Softmax: shape mismatch" );
+            CheckLaunchParam( dimGrid, dimBlock, "Softmax" );
+            SoftmaxKernel<kBaseThreadBits><<<dimGrid,dimBlock>>>( dst, src );
+        }
+    }; // namespace cuda
+}; // namespace mshadow
+#endif // MSHADOW_TENSOR_GPU_INL_CUH

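For reference, the SoftmaxKernel above implements the usual numerically stable scheme: find the per-row maximum, exponentiate the shifted values while accumulating the normalizer, then divide. A plain CPU sketch of the same computation (a standalone illustration, not part of the commit):

    #include <algorithm>
    #include <cmath>

    // dst and src are row-major nrow x ncol buffers; each row is normalized independently
    inline void SoftmaxRowsRef( float *dst, const float *src, int nrow, int ncol ){
        for( int y = 0; y < nrow; ++y ){
            float smax = src[ y*ncol ];
            for( int x = 1; x < ncol; ++x ) smax = std::max( smax, src[ y*ncol + x ] );
            float ssum = 0.0f;
            for( int x = 0; x < ncol; ++x ){
                dst[ y*ncol + x ] = std::exp( src[ y*ncol + x ] - smax );
                ssum += dst[ y*ncol + x ];
            }
            for( int x = 0; x < ncol; ++x ) dst[ y*ncol + x ] /= ssum;
        }
    }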
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/cxxnet_op.h
----------------------------------------------------------------------
diff --git a/include/mshadow/cxxnet_op.h b/include/mshadow/cxxnet_op.h
new file mode 100644
index 0000000..930caf2
--- /dev/null
+++ b/include/mshadow/cxxnet_op.h
@@ -0,0 +1,116 @@
+#ifndef CXXNET_OP_H
+#define CXXNET_OP_H
+#pragma once
+/*!
+ * \file cxxnet_op.h
+ * \brief extra mshadow operation for cxxnet
+ * \author Bing Xu
+ */
+#include "mshadow/tensor.h"
+
+namespace mshadow {
+    /*! \brief operations for algorithm */
+    namespace op {
+        struct sigmoid {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return 1.0f / (1.0f + expf(-a));
+            }
+        };
+        struct sigmoid_grad {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return a * ( 1.0f - a );
+            }
+        };
+
+        /*! \brief Rectified Linear Operation */
+        struct relu {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                using namespace std;
+                return max( a, 0.0f );
+            }
+        };
+        struct relu_grad {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return a > 0.0f ? 1.0f : 0.0f;
+            }
+        };
+
+        struct tanh {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return tanhf( a );
+            }
+        };
+        struct tanh_grad {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return 1.0f - a * a;
+            }
+        };
+        struct softplus {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return logf(1 + expf(a));
+            }
+        };
+        struct softplus_grad {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return 1.0f / (1.0f + expf(-a));
+            }
+        };
+        struct bnll {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return a > 0.0f ? a + logf(1.0f + expf(-a)) : logf(1.0f + expf(a));
+            }
+        };
+        struct bnll_grad {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                real_t expval = a > 50.0f ? 50.0f : a; // kBNLL_THRESHOLD = 50.0f
+                expval = expf(-expval);
+                return 1.0f / (1.0f + expval);
+            }
+        };
+
+        struct square {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return a * a;
+            }
+        };
+        /*! \brief scaled tanh, with hard-coded scale factors */
+        struct stanh {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+              return  1.7159047*tanhf(0.66666667 *a);
+            }
+        };
+        /*! \brief backprop for scaled tanh */
+        struct stanh_grad {
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return 0.66666667*1.7159047 -0.66666667/1.7159047*a*a;
+            }
+        };
+
+    }; //namespace op
+
+}; //namespace mshadow
+
+namespace mshadow {
+    namespace op {
+        /*! \brief used for generate Bernoulli mask */
+        struct threshold {
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return a < b ? 1.0f : 0.0f;
+            }
+        };
+
+        /*! \brief used for generate element of power */
+        struct power {
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return powf( a, b );
+            }
+        };
+        struct sqrtop {
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return sqrt(a+b);
+            }
+        };
+    }; // namespace op
+}; // namespace mshadow
+
+#endif // CXXNET_OP_H

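These functors are meant to be composed with mshadow's expression templates (the F<OP>() wrapper lives in tensor_expr.h, which is not part of this hunk). A hedged usage sketch under that assumption; the sizes are illustrative:

    using namespace mshadow;
    using namespace mshadow::expr;
    TensorContainer<cpu,2> in( Shape2(4, 10), 0.5f ), out( Shape2(4, 10) ), grad( Shape2(4, 10) );
    // forward pass: apply the sigmoid element-wise
    out = F<op::sigmoid>( in );
    // backward pass: sigmoid_grad expects the activated value a, since it computes a * (1 - a)
    grad = F<op::sigmoid_grad>( out );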
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor.h
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor.h b/include/mshadow/tensor.h
new file mode 100644
index 0000000..42d13d3
--- /dev/null
+++ b/include/mshadow/tensor.h
@@ -0,0 +1,472 @@
+#ifndef MSHADOW_TENSOR_H
+#define MSHADOW_TENSOR_H
+/*!
+ * \file tensor.h
+ * \brief header file of tensor data structure and functions
+ *        convention: this lib requires explicit memory allocation and de-allocation
+ *                   all the data structure Tensor<cpu,1>, Tensor<gpu,1> are like handles(pointers),
+ *                   no memory allocation is happening during calculation
+ * \author Bing Xu, Tianqi Chen
+ */
+#include "tensor_base.h"
+#include "tensor_expr.h"
+
+namespace mshadow {
+    /*!
+     * \brief shape of a tensor
+     *       IMPORTANT NOTE: this shape is different from numpy.shape
+     *       shape[0] gives the lowest dimension, shape[dimension-1] gives the highest dimension
+     *       shape[k] corresponds to k-th dimension of tensor
+     * \tparam dimension dimension of tensor
+     */
+    template<int dimension>
+    struct Shape {
+    public:
+        /*! \brief maximum dimension of tensor */
+        const static int kMaxShape = dimension;
+        /*! \brief maximum dimension minus 1 */
+        const static int kSubShape = dimension - 1;
+    public:
+        /*! \brief default constructor, do nothing */
+        MSHADOW_XINLINE Shape(void) {}
+        /*! \brief constructor */
+        MSHADOW_XINLINE Shape( const Shape<dimension> &s ){
+            #pragma unroll
+            for( int i = 0; i < kMaxShape; ++i ){
+                this->shape_[i] = s[i];
+            }
+            this->stride_ = s.stride_;
+        }
+        /*!
+         * \brief get corresponding index
+         * \param idx dimension index
+         * \return the corresponding dimension size
+         */
+        MSHADOW_XINLINE index_t& operator[](index_t idx) {
+            return shape_[ idx ];
+        }
+        /*!
+         * \brief get corresponding index
+         * \param idx dimension index
+         * \return the corresponding dimension size
+         */
+        MSHADOW_XINLINE const index_t& operator[](index_t idx) const {
+            return shape_[ idx ];
+        }
+        /*! \return whether two shapes are equal */
+        MSHADOW_XINLINE bool operator==(const Shape<kMaxShape> &s) const {
+            #pragma unroll
+            for ( int i = 0; i < kMaxShape; ++i ) {
+                if (s.shape_[i] != this->shape_[i]) return false;
+            }
+            return true;
+        }
+        /*!
+         * flatten the higher dimension to second dimension, return a 2D shape
+         * \return the flat 2d shape
+         */
+        MSHADOW_XINLINE Shape<2> FlatTo2D(void) const {
+            Shape<2> s;
+            s.stride_ = this->stride_;
+            s.shape_[ 0 ] = this->shape_[ 0 ];
+            index_t ymax = 1;
+
+            #pragma unroll
+            for (int i = 1; i < kMaxShape; ++i) {
+                ymax *= this->shape_[ i ];
+            }
+            s.shape_[1] = ymax;
+            return s;
+        }
+        /*! \return number of valid elements */
+        MSHADOW_XINLINE size_t Size(void) const{
+            size_t memsz = this->shape_[ 0 ];
+            #pragma unroll
+            for (int i = 1; i < kMaxShape; ++i) {
+                memsz *= this->shape_[ i ];
+            }
+            return memsz;
+        }
+        /*! \return memory size, including the aligned x dimension */
+        MSHADOW_XINLINE size_t MSize(void) const {
+            size_t memsz = this->stride_;
+            #pragma unroll
+            for (int i = 1; i < kMaxShape; ++i) {
+                memsz *= this->shape_[ i ];
+            }
+            return memsz;
+        }
+        /*!
+         * \return product shape in [dimstart,dimend)
+         * \param dimstart start dimension
+         * \param dimend   end dimension
+         */
+        MSHADOW_XINLINE index_t ProdShape( int dimstart, int dimend ) const{
+            index_t num = 1;
+            #pragma unroll
+            for (int i = dimstart; i < dimend; ++i) {
+                num *= this->shape_[ i ];
+            }
+            return num;
+        }
+        /*!
+         * \brief get subshape
+         * \return subshape
+         */
+        MSHADOW_XINLINE Shape<kSubShape> SubShape(void) const {
+            Shape<kSubShape> s;
+            s.stride_ = this->stride_;
+            // for cuda
+            #pragma unroll
+            for (int i = 0; i < kSubShape; ++i) {
+                s.shape_[ i ] = this->shape_[ i ];
+            }
+            return s;
+        }
+
+    public:
+        /*! \brief storing the dimension information */
+        index_t shape_[ kMaxShape ];
+        /*!
+         * \brief storing the stride information in x dimension
+         *    this is used to deal with pitch allocation in gpu or sse(align x dimension to 64bit) for efficiency
+         */
+        index_t stride_;
+    };
+    // useful construction functions to generate shape
+    /*!
+     * \brief construct a one dimension shape, stride will equal s0
+     * \param s0 size of dimension 0
+     * \return the shape construction
+     */
+    MSHADOW_XINLINE Shape<1> Shape1( index_t s0 ){
+        Shape<1> s; s[0] = s0; s.stride_ = s0;
+        return s;
+    }
+    /*!
+     * \brief construct a two dimension shape, stride will equal s0
+     * \param s1 size of dimension 1
+     * \param s0 size of dimension 0
+     * \return the shape construction
+     */
+    MSHADOW_XINLINE Shape<2> Shape2( index_t s1, index_t s0 ){
+        Shape<2> s; s[0] = s0; s[1] = s1; s.stride_ = s0;
+        return s;
+    }
+    /*!
+     * \brief construct a three dimension shape, stride will equal s0
+     * \param s2 size of dimension 2
+     * \param s1 size of dimension 1
+     * \param s0 size of dimension 0
+     * \return the shape construction
+     */
+    MSHADOW_XINLINE Shape<3> Shape3( index_t s2, index_t s1, index_t s0 ){
+        Shape<3> s;
+        s[0] = s0; s[1] = s1; s[2] = s2; s.stride_ = s0;
+        return s;
+    }
+    /*!
+     * \brief construct a four dimension shape, stride will equal s0
+     * \param s3 size of dimension 3
+     * \param s2 size of dimension 2
+     * \param s1 size of dimension 1
+     * \param s0 size of dimension 0
+     * \return the shape construction
+     */
+    MSHADOW_XINLINE Shape<4> Shape4( index_t s3, index_t s2, index_t s1, index_t s0 ){
+        Shape<4> s;
+        s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; s.stride_ = s0;
+        return s;
+    }
+}; // namespace mshadow
+
+namespace mshadow {
+    /*! \brief device name CPU */
+    struct cpu {
+        /*! \brief whether this device is CPU or not */
+        const static bool kDevCPU = true;
+        /*! \brief device flag number, identifies this device */
+        const static int kDevMask = 1<<0;
+    };
+    /*! \brief device name GPU */
+    struct gpu {
+        /*! \brief whether this device is CPU or not */
+        const static bool kDevCPU = false;
+        /*! \brief device flag number, identifies this device */
+        const static int kDevMask = 1<<1;
+    };
+
+    // more compact template
+    /*!
+     * \brief general tensor
+     * \tparam Device which device the tensor is on
+     * \tparam dimension dimension of the tensor
+     */
+    template<typename Device, int dimension>
+    struct Tensor: public expr::ContainerExp< Tensor<Device,dimension> >{
+    public:
+        /*! \brief whether current type lies in cpu */
+        const static bool kDevCPU = Device::kDevCPU;
+        /*! \brief dimension of subtype */
+        const static int  kSubdim = dimension - 1;
+
+    public:
+        /*! \brief pointer to the data */
+        real_t *dptr;
+        /*! \brief shape of the tensor */
+        Shape<dimension> shape;
+    public:
+        /*! \brief default constructor */
+        MSHADOW_XINLINE Tensor(void) {}
+        /*! \brief constructor from shape  */
+        MSHADOW_XINLINE Tensor(const Shape<dimension> &shape): shape(shape) {}
+        /*! \brief constructor from data pointer and shape  */
+        MSHADOW_XINLINE Tensor(real_t *dptr, const Shape<dimension> &shape): dptr((real_t*)dptr), shape(shape) {}
+        /*!
+         * \brief flatten the tensor to 2 dimension, collapse the higher dimensions together
+         * \return tensor after flatten
+         */
+        MSHADOW_XINLINE Tensor<Device, 2> FlatTo2D(void) const {
+            return Tensor<Device, 2>(reinterpret_cast<real_t*> \
+                                     (dptr), shape.FlatTo2D());
+        }
+        /*!
+         * \brief get a element of dimension - 1
+         * \param idx index
+         * \return the result tensor
+         */
+        MSHADOW_XINLINE Tensor<Device, kSubdim> operator[](index_t idx) const {
+            Shape<kSubdim> s = shape.SubShape();
+            return Tensor<Device, kSubdim>(reinterpret_cast<real_t*> \
+                                           (dptr) + s.MSize() * idx, s);
+        }
+        /*!
+         * \brief slice the tensor in highest dimension [begin,end)
+         * \param begin begin position of slice
+         * \param end end position of slice
+         * \return tensor after slice
+         */
+        MSHADOW_XINLINE Tensor<Device, dimension> Slice(index_t begin, index_t end) const {
+            Shape<dimension> s = this->shape;
+            s[ dimension - 1 ] = end - begin;
+            return Tensor<Device, dimension>(reinterpret_cast<real_t*>\
+                                             (dptr) + s.SubShape().MSize() * begin, s);
+        }
+    public:
+        /*!\brief functions to fit expression template */
+        inline Tensor<Device,dimension>& operator=( real_t s ){
+            return this->__assign( s );
+        }
+        /*!\brief functions to fit expression template */
+        template<typename E>
+        inline Tensor<Device,dimension>& operator=( const expr::Exp<E,expr::type::kMapper> &exp ){
+            return this->__assign( exp );
+        }
+        /*!\brief functions to fit expression template */
+        template<typename E>
+        inline Tensor<Device,dimension>& operator=( const expr::Exp<E,expr::type::kComplex> &exp ){
+            return this->__assign( exp );
+        }
+    };
+
+    /*
+     *  specialized Tensor<Device,1>; this is needed because operator[] has a different implementation
+     */
+    template<typename Device>
+    struct Tensor<Device,1>: public expr::ContainerExp< Tensor<Device,1> >{
+    public:
+        real_t *dptr;
+        Shape<1> shape;
+    public:
+        MSHADOW_XINLINE Tensor(void) {}
+        MSHADOW_XINLINE Tensor(const Shape<1> &shape): shape(shape) {}
+        MSHADOW_XINLINE Tensor(real_t *dptr, Shape<1> shape) :dptr(dptr), shape(shape) {}
+
+        MSHADOW_XINLINE Tensor<Device, 2> FlatTo2D(void) const {
+            return Tensor<Device, 2>(reinterpret_cast<real_t*> \
+                                     (dptr), shape.FlatTo2D());
+        }
+        MSHADOW_XINLINE Tensor<Device, 1> Slice(index_t begin, index_t end) const {
+            Shape<1> s;
+            s[0] = s.stride_ = end  - begin;
+            return Tensor<Device, 1>(reinterpret_cast<real_t*> \
+                                     (dptr) + begin, s);
+        }
+        MSHADOW_XINLINE real_t &operator[](index_t idx) { return dptr[ idx ]; }
+        MSHADOW_XINLINE const real_t &operator[](index_t idx)const { return dptr[ idx ]; }
+    public:
+        // functions to fit expression template
+        inline Tensor<Device,1>& operator=( double s ){
+            return this->__assign( s );
+        }
+        template<typename E>
+        inline Tensor<Device,1>& operator=( const expr::Exp<E,expr::type::kMapper> &exp ){
+            return this->__assign( exp );
+        }
+        template<typename E>
+        inline Tensor<Device,1>& operator=( const expr::Exp<E,expr::type::kComplex> &exp ){
+            return this->__assign( exp );
+        }
+    };
+}; // namespace mshadow
+
+// add unroll loops for the shape
+namespace mshadow {
+    // function declarations
+    /*!
+     * \brief initialize tensor engine, used to call initialization functions of dependent libs
+     *        this function should be called before all GPU tensor operations,
+     *        for using tensors in CPU, this call is actually not needed
+     * \param device_id GPU device id to be chosen
+     */
+    inline void InitTensorEngine( int device_id=0 );
+    /*!
+     * \brief Shutdown tensor engine,
+     *        this function should be called after all GPU tensor operations,
+     *        for using tensors in CPU, this call is actually not needed
+     */
+    inline void ShutdownTensorEngine( void );
+
+    /*!
+     * \brief CPU/GPU: allocate space for the tensor, according to the shape in the obj
+     *        this function is responsible to set the stride_ in each obj.shape
+     * \tparam dim specify the dim of tensor
+     * \param obj the tensor object, with shape specified
+     * \param pad whether padding dimension 0, to make last dimension aligned,
+     *            padding may help improve efficiency of matrix multiplications
+     *            if true, will allocate space with a stride_ that may not equal shape[0]
+     *            if false, will allocate contiguous space
+     */
+    template<int dim>
+    inline void AllocSpace(Tensor<cpu,dim> &obj, bool pad = MSHADOW_ALLOC_PAD);
+    /*! \brief refer to comment of cpu ver \sa AllocSpace */
+    template<int dim>
+    inline void AllocSpace(Tensor<gpu,dim> &obj, bool pad = MSHADOW_ALLOC_PAD);
+
+    /*!
+     * \brief CPU/GPU: free the space of tensor, will set obj.dptr to NULL
+     * \tparam dim specify the dim of tensor
+     * \param obj the tensor object
+     */
+    template<int dim>
+    inline void FreeSpace(Tensor<cpu,dim> &obj);
+    /*! \brief refer to comment of cpu ver \sa FreeSpace */
+    template<int dim>
+    inline void FreeSpace(Tensor<gpu,dim> &obj);
+
+    /*!
+     * \brief CPU/GPU: shortcut to allocate and initialize a Tensor
+     * \tparam Device device of tensor
+     * \tparam dim dimension of tensor
+     * \param shape: shape of tensor
+     * \param initv: initialization value
+     * \param pad : padding option
+     * \sa AllocSpace
+     */
+    template<typename Device, int dim>
+    inline Tensor<Device,dim> NewTensor(const Shape<dim> &shape, real_t initv, bool pad = MSHADOW_ALLOC_PAD);
+
+    /*!
+     * \brief copy data from one tensor to another, with same shape
+     * \tparam dim specify the dim of tensor
+     * \param dst target tensor
+     * \param src source tensor
+     */
+    template<int dim>
+    inline void Copy(Tensor<cpu,dim> dst, const Tensor<cpu,dim> &src );
+    /*! \brief refer to comment of cpu ver \sa Copy */
+    template<int dim>
+    inline void Copy(Tensor<cpu,dim> dst, const Tensor<gpu,dim> &src );
+    /*! \brief refer to comment of cpu ver \sa Copy */
+    template<int dim>
+    inline void Copy(Tensor<gpu,dim> dst, const Tensor<cpu,dim> &src );
+    /*! \brief refer to comment of cpu ver \sa Copy */
+    template<int dim>
+    inline void Copy(Tensor<gpu,dim> dst, const Tensor<gpu,dim> &src );
+
+
+    /*!
+     * \brief CPU/GPU: normalize softmax: dst[i][j] = exp( energy[i][j] ) /( sum_j exp( energy[i][j] ) )
+     * \param dst destination
+     * \param energy input energy
+     */
+    inline void Softmax( Tensor<cpu,2> dst, const Tensor<cpu,2> &energy );
+    /*! \brief refer to comment of cpu ver \sa Softmax */
+    inline void Softmax( Tensor<gpu,2> dst, const Tensor<gpu,2> &energy );
+
+}; // namespace mshadow
+
+
+namespace mshadow{
+    // function declarations to support expression, no need to understand them
+    // these functions do not need to be directly used
+
+    /*!
+     * \brief CPU/GPU: map an expression to a tensor; this function calls MapPlan
+     * \tparam Saver specify storage method
+     * \tparam dim dim of the tensor, during usage, there is no need to specify this parameter
+     * \tparam E specifies the expression type, not need to specify this parameter during usage
+     * \tparam etype expression type
+     * \param dst destination
+     * \param exp expression
+     * \sa namespace mshadow::sv, mshadow::op, mshadow::expr
+     */
+    template<typename Saver, int dim, typename E, int etype>
+    inline void MapExp(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp );
+    /*! \brief refer to comment of cpu ver \sa MapExp */
+    template<typename Saver, int dim, typename E, int etype>
+    inline void MapExp(Tensor<gpu,dim> dst, const expr::Exp<E,etype> &exp );
+
+    /*!
+     * \brief CPU/GPU: map an expression and reduce it to a 1D Tensor, keeping the lowest dimension (dimension 0)
+     * \tparam Saver specify storage method
+     * \tparam Reducer specify a reducer method
+     * \tparam E specifies the expression type, not need to specify this parameter during usage
+     * \tparam etype expression type
+     * \param dst destination
+     * \param exp expression
+     * \param scale scale the result before save
+     * \sa namespace mshadow::sv, mshadow::op, mshadow::red, mshadow::expr
+     */
+    template<typename Saver, typename Reducer, typename E, int etype>
+    inline void MapReduceKeepLowest( Tensor<cpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f );
+    /*! \brief refer to comment of cpu ver \sa MapReduceKeepLowest */
+    template<typename Saver, typename Reducer, typename E, int etype>
+    inline void MapReduceKeepLowest( Tensor<gpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f );
+
+
+    /*!
+     * \brief CPU/GPU: map an expression and reduce it to a 1D Tensor, keeping the third dimension (dimension 2)
+     * \tparam Saver specify storage method
+     * \tparam Reducer specify a reducer method
+     * \tparam E specifies the expression type, not need to specify this parameter during usage
+     * \tparam dimkeep the target dimension to be kept, should be larger than 0, for 0, use MapReduceKeepLowest
+     * \tparam etype expression type
+     * \param dst destination
+     * \param exp expression
+     * \param scale scale the result before save
+     * \sa namespace mshadow::sv, mshadow::op, mshadow::red, mshadow::expr
+     */
+    template<typename Saver, typename Reducer, int dimkeep, typename E, int etype>
+    inline void MapReduceKeepHighDim( Tensor<cpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f );
+    /*! \brief refer to comment of cpu ver \sa MapReduceKeepHighDim */
+    template<typename Saver, typename Reducer, int dimkeep, typename E, int etype>
+    inline void MapReduceKeepHighDim( Tensor<gpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f );
+
+};// namespace mshadow
+
+// execution implementation of expression evaluations
+#include "tensor_expr_engine-inl.hpp"
+// cpu implementation of functions
+#include "tensor_cpu-inl.hpp"
+// gpu implementation of functions
+#include "tensor_gpu-inl.hpp"
+// extension of expressions
+#include "tensor_expr_ext.h"
+// io 
+#include "tensor_io.h"
+// container
+#include "tensor_container.h"
+// random number generator
+#include "tensor_random.h"
+#endif // MSHADOW_TENSOR_H

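A short sketch of the lifecycle these declarations describe: explicit allocation, the reversed shape convention (shape[0] is the lowest, contiguous dimension), host/device copies, and explicit release. Not part of the commit; the sizes are illustrative:

    using namespace mshadow;
    InitTensorEngine();                                    // required before any GPU tensor operation
    // Shape2(s1, s0): s0 = 10 is the lowest (contiguous) dimension, so this is a 4 x 10 matrix
    Tensor<cpu,2> host   = NewTensor<cpu>( Shape2(4, 10), 1.0f );
    Tensor<gpu,2> energy = NewTensor<gpu>( Shape2(4, 10), 0.0f );
    Tensor<gpu,2> prob   = NewTensor<gpu>( Shape2(4, 10), 0.0f );
    Copy( energy, host );                                  // host -> device
    Softmax( prob, energy );                               // row-wise softmax on the GPU
    Copy( host, prob );                                    // device -> host
    FreeSpace( host ); FreeSpace( energy ); FreeSpace( prob );
    ShutdownTensorEngine();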
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_base.h
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor_base.h b/include/mshadow/tensor_base.h
new file mode 100644
index 0000000..b251cba
--- /dev/null
+++ b/include/mshadow/tensor_base.h
@@ -0,0 +1,298 @@
+#ifndef MSHADOW_TENSOR_BASE_H
+#define MSHADOW_TENSOR_BASE_H
+/*!
+ * \file tensor_base.h
+ * \brief definitions of base types, macros and functions
+ *
+ * \author Bing Xu, Tianqi Chen
+ */
+#include <cmath>
+#include <cstdio>
+#include <cfloat>
+#include <climits>
+#include <algorithm>
+// macro definitions
+
+/*!\brief if this macro is defined to be 1, mshadow should compile without any other libs */
+#ifndef MSHADOW_STAND_ALONE
+    #define MSHADOW_STAND_ALONE 0
+#endif
+
+/*! \brief whether do padding during allocation */
+#ifndef MSHADOW_ALLOC_PAD
+    #define MSHADOW_ALLOC_PAD true
+#endif
+
+/*! 
+ * \brief the x dimension of the data must be bigger than pad_size * ratio for padded memory to be allocated, otherwise tight allocation is used
+ *        for example, if pad_ratio=2 and the GPU memory alignment size is 32, padded memory is only allocated when the x dimension > 64
+ *        set it to 0 to always allocate padded memory
+ */
+#ifndef MSHADOW_MIN_PAD_RATIO
+    #define MSHADOW_MIN_PAD_RATIO 2
+#endif
+
+#if MSHADOW_STAND_ALONE
+   #define MSHADOW_USE_CBLAS 0
+   #define MSHADOW_USE_MKL   0
+   #define MSHADOW_USE_CUDA  0
+#endif
+
+/*! \brief use CBLAS for BLAS */
+#ifndef MSHADOW_USE_CBLAS
+   #define MSHADOW_USE_CBLAS 0
+#endif
+/*! \brief use MKL for BLAS */
+#ifndef MSHADOW_USE_MKL
+   #define MSHADOW_USE_MKL   1
+#endif
+/*! \brief use CUDA support, must ensure that the cuda include path is correct, or directly compile using nvcc */
+#ifndef MSHADOW_USE_CUDA
+  #define MSHADOW_USE_CUDA   1
+#endif
+/*! \brief use single precision float */
+#ifndef MSHADOW_SINGLE_PRECISION
+  #define MSHADOW_SINGLE_PRECISION 1
+#endif
+/*! \brief whether use SSE */
+#ifndef MSHADOW_USE_SSE
+  #define MSHADOW_USE_SSE 1
+#endif
+/*! \brief whether use NVML to get dynamic info */
+#ifndef MSHADOW_USE_NVML
+  #define MSHADOW_USE_NVML 0
+#endif
+// SSE conflicts with CUDA compilation (__CUDACC__)
+#ifdef __CUDACC__
+  #undef MSHADOW_USE_SSE
+  #define MSHADOW_USE_SSE 0
+#endif
+
+#if MSHADOW_USE_CBLAS
+extern "C"{
+    #include <cblas.h>
+}
+#elif MSHADOW_USE_MKL
+  #include <mkl.h>
+  #include <mkl_cblas.h>
+  #include <mkl_vsl.h>
+  #include <mkl_vsl_functions.h>
+#endif
+
+#if MSHADOW_USE_CUDA
+  #include <cublas.h>
+  #include <curand.h>
+#endif
+
+#if MSHADOW_USE_NVML
+  #include <nvml.h>
+#endif
+// --------------------------------
+// MSHADOW_XINLINE is used for inlining template code for both CUDA and CPU code.
+#ifdef MSHADOW_XINLINE
+  #error "MSHADOW_XINLINE must not be defined"
+#endif
+#ifdef __CUDACC__
+  #define MSHADOW_XINLINE inline __attribute__((always_inline)) __device__ __host__
+#else
+  #define MSHADOW_XINLINE inline __attribute__((always_inline))
+#endif
+/*! \brief cpu force inline */
+#define MSHADOW_CINLINE inline __attribute__((always_inline))
+
+#if defined(__GXX_EXPERIMENTAL_CXX0X) || defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L
+  #define MSHADOW_CONSTEXPR constexpr
+#else
+  #define MSHADOW_CONSTEXPR const
+#endif
+
+/*! \brief namespace for mshadow */
+namespace mshadow {
+    /*! \brief buffer size for each random number generator */
+    const unsigned kRandBufferSize = 1000000;
+    /*! \brief pi  */
+    const float kPi = 3.1415926f;
+
+#if MSHADOW_SINGLE_PRECISION
+    /*! \brief type that will be used for content */
+    typedef float real_t;
+#else
+    typedef double real_t;
+#endif
+    /*! \brief type that will be used for index */
+    typedef unsigned index_t;
+}; // namespace mshadow
+
+namespace mshadow {
+    /*! \brief namespace for operators */
+    namespace op {
+        // binary operator
+        /*! \brief mul operator */
+        struct mul{
+            /*! \brief map a, b to result using defined operation */
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return a * b;
+            }
+        };
+        /*! \brief plus operator */
+        struct plus {
+            /*! \brief map a, b to result using defined operation */
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return a + b;
+            }
+        };
+        /*! \brief minus operator */
+        struct minus {
+            /*! \brief map a, b to result using defined operation */
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return a - b;
+            }
+        };
+        /*! \brief divide operator */
+        struct div {
+            /*! \brief map a, b to result using defined operation */
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return a / b;
+            }
+        };
+        /*! \brief get rhs */
+        struct right {
+            /*! \brief map a, b to result using defined operation */
+            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+                return b;
+            }
+        };
+    }; // namespace op
+
+    /*! \brief namespace for savers */
+    namespace sv {
+        /*! \brief save to saver: = */
+        struct saveto {
+            /*! \brief save b to a using save method */
+            MSHADOW_XINLINE static void Save(real_t& a, real_t b) {
+                a  = b;
+            }
+            /*! \brief helper constant to use BLAS, alpha */
+            MSHADOW_CONSTEXPR static real_t kAlphaBLAS = 1.0f;
+            /*! \brief helper constant to use BLAS, beta */
+            MSHADOW_CONSTEXPR static real_t kBetaBLAS  = 0.0f;
+            /*! \brief corresponding binary operator type */
+            typedef op::right OPType;
+        };
+        /*! \brief save to saver: += */
+        struct plusto {
+            /*! \brief save b to a using save method */
+            MSHADOW_XINLINE static void Save(real_t& a, real_t b) {
+                a += b;
+            }
+            /*! \brief helper constant to use BLAS, alpha */
+            MSHADOW_CONSTEXPR static real_t kAlphaBLAS = 1.0f;
+            /*! \brief helper constant to use BLAS, beta */
+            MSHADOW_CONSTEXPR static real_t kBetaBLAS  = 1.0f;
+            /*! \brief corresponding binary operator type */
+            typedef op::plus OPType;
+        };
+        /*! \brief minus to saver: -= */
+        struct minusto {
+            /*! \brief save b to a using save method */
+            MSHADOW_XINLINE static void Save(real_t& a, real_t b) {
+                a -= b;
+            }
+            /*! \brief helper constant to use BLAS, alpha */
+            MSHADOW_CONSTEXPR static real_t kAlphaBLAS = -1.0f;
+            /*! \brief helper constant to use BLAS, beta */
+            MSHADOW_CONSTEXPR static real_t kBetaBLAS  = 1.0f;
+            /*! \brief corresponding binary operator type */
+            typedef op::minus OPType;
+        };
+        /*! \brief multiply to saver: *= */
+        struct multo {
+            /*! \brief save b to a using save method */
+            MSHADOW_XINLINE static void Save(real_t& a, real_t b) {
+                a *= b;
+            }
+            /*! \brief corresponding binary operator type */
+            typedef op::mul OPType;
+        };
+        /*! \brief divide to saver: /= */
+        struct divto {
+            /*! \brief save b to a using save method */
+            MSHADOW_XINLINE static void Save(real_t& a, real_t b) {
+                a /= b;
+            }
+            /*! \brief corresponding binary operator type */
+            typedef op::div OPType;
+        };
+    }; // namespace sv
+
+
+    namespace op {
+        // unary operator/ function: example
+        // these operators can be defined by user, in the same style as binary and unary operator
+        // to use, simply write F<op::identity>( src )
+        /*! \brief identity function that maps a real number to it self */
+        struct identity{
+            /*! \brief map a to result using defined operation */
+            MSHADOW_XINLINE static real_t Map(real_t a) {
+                return a;
+            }
+        };
+    }; // namespace op
+
+    /*! \brief namespace for potential reducer operations */
+    namespace red {
+        /*! \brief sum reducer */
+        struct sum {
+            /*! \brief do reduction into dst */
+            MSHADOW_XINLINE static void Reduce( volatile real_t& dst,  volatile real_t src ) {
+                dst += src;
+            }
+            /*! \brief calculate gradient of redres with respect to redsrc; redres: reduced result, redsrc: one of the reduced elements */
+            MSHADOW_XINLINE static real_t PartialGrad( real_t redres, real_t redsrc ) {
+                return 1.0f;
+            }
+            /*! \brief the initial value of the reducer */
+            MSHADOW_CONSTEXPR static real_t kInitV = 0.0f;
+        };
+        /*! \brief maximum reducer */
+        struct maximum {
+            /*! \brief do reduction into dst */
+            MSHADOW_XINLINE static void Reduce( volatile real_t& dst,  volatile real_t src ) {
+                using namespace std;
+                dst = max( dst, src );
+            }
+            /*! \brief calculate gradient of redres with respect to redsrc,  redres: reduced result, redsrc: one of reduction element */
+            MSHADOW_XINLINE static real_t PartialGrad( real_t redres, real_t redsrc ) {
+                return redres == redsrc ? 1.0f: 0.0f;
+            }
+            /*! \brief the initial value of the reducer */
+#if MSHADOW_SINGLE_PRECISION
+            MSHADOW_CONSTEXPR static real_t kInitV = -FLT_MAX;
+#else
+            MSHADOW_CONSTEXPR static real_t kInitV = -DBL_MAX;
+#endif
+        };
+    };
+
+    /*! \brief namespace for helper utils of the project */
+    namespace utils{
+        /*! \brief send error message then exit */
+        inline void Error( const char *msg ){
+            fprintf( stderr, "Error:%s\n",msg );
+            exit( -1 );
+        }
+        /*! \brief assert an expression is true */
+        inline void Assert( bool exp ){
+            if( !exp ) Error( "AssertError" );
+        }
+        /*! \brief assert an expression is true */
+        inline void Assert( bool exp, const char *msg ){
+            if( !exp ) Error( msg );
+        }
+        /*! \brief warning */
+        inline void Warning( const char *msg ){
+            fprintf( stderr, "warning:%s\n",msg );
+        }
+    }; // namespace utils
+}; // namespace mshadow
+#endif // MSHADOW_TENSOR_BASE_H

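The op, sv and red structs above are plain stateless functors that the expression engine composes at compile time; a tiny standalone illustration of their semantics (not part of the commit):

    using namespace mshadow;
    real_t acc = 0.0f;
    sv::plusto::Save( acc, op::mul::Map( 2.0f, 3.0f ) );   // acc += 2 * 3   -> acc == 6
    sv::multo::Save( acc, 0.5f );                          // acc *= 0.5     -> acc == 3
    real_t m = red::maximum::kInitV;                        // -FLT_MAX in single precision
    red::maximum::Reduce( m, 1.5f );
    red::maximum::Reduce( m, -2.0f );                       // m stays 1.5f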
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_container.h
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor_container.h b/include/mshadow/tensor_container.h
new file mode 100644
index 0000000..f0699e7
--- /dev/null
+++ b/include/mshadow/tensor_container.h
@@ -0,0 +1,152 @@
+#ifndef MSHADOW_TENSOR_CONTAINER_H
+#define MSHADOW_TENSOR_CONTAINER_H
+/*!
+ * \file tensor_container.h
+ * \brief tensor container that does memory allocation and resize like STL
+ * \author Tianqi Chen
+ */
+#include "tensor.h"
+#include "tensor_io.h"
+
+namespace mshadow{
+    /*!
+     * \brief tensor container that does memory allocation and resize like STL,
+     *        use it to avoid writing explicit FreeSpace calls in classes.
+     *        Do not abuse it; efficiency comes from pre-allocation and avoiding re-allocation
+     *
+     * \tparam Device which device the tensor is on
+     * \tparam dimension dimension of the tensor
+     */
+    template<typename Device, int dimension>
+    class TensorContainer: public Tensor<Device,dimension>{
+    public:
+        /*! 
+         * \brief constructor 
+         * \param pad whether to use padding alignment in space allocation
+         */
+        TensorContainer( bool pad = MSHADOW_ALLOC_PAD ){
+            this->pad_ = pad;
+            this->dptr = data_.dptr = NULL;
+            this->shape[0] = 0;
+            this->shape.stride_ = 0;
+            this->data_.shape.stride_ = 0;
+            this->data_.shape[1] = 0;
+        }
+        /*! 
+         * \brief constructor 
+         * \param shape initial shape
+         */
+        TensorContainer( const Shape<dimension> &shape ){
+            this->pad_ = MSHADOW_ALLOC_PAD;
+            data_.dptr = NULL;
+            this->AllocByShape( shape );
+        }
+        /*! 
+         * \brief constructor 
+         * \param shape initial shape
+         * \param initv initial value
+         */
+        TensorContainer( const Shape<dimension> &shape, real_t initv ){
+            this->pad_ = MSHADOW_ALLOC_PAD;
+            data_.dptr = NULL;
+            this->AllocByShape( shape );
+            (*this) = initv;
+        }
+        ~TensorContainer( void ){
+            this->FreeSpace();
+        }
+        /*! 
+         * \brief resize the container to given shape, content is NOT preserved
+         * \param shape target shape
+         */
+        inline void Resize( const Shape<dimension> &shape ){
+            Shape<2> s2 = shape.FlatTo2D();            
+            if( s2.shape_[0] > data_.shape.stride_ || s2.shape_[1] > data_.shape[1] ){
+                this->AllocByShape( shape );
+            }else{
+                this->shape = shape;
+                if( this->pad_ ){
+                    this->shape.stride_ = data_.shape.stride_;
+                }else{
+                    this->shape.stride_ = this->shape[ 0 ];
+                }
+            }
+        }
+        /*! 
+         * \brief resize the container to given shape, and initialize, content is NOT preserved
+         * \param shape target shape
+         * \param initv initialization value
+         */
+        inline void Resize( const Shape<dimension> &shape, real_t initv ){
+            this->Resize( shape );
+            (*this) = initv;
+        }
+        /*! \brief set whether padding is allowed in tensor */
+        inline void set_pad( bool pad ){
+            this->pad_ = pad;
+        }
+        /*! 
+         * \brief save by binary format
+         * \param fo output binary stream
+         * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream.
+         */
+        template<typename TStream>
+        inline void SaveBinary( TStream &fo ) const{
+            mshadow::SaveBinary( fo, *this );
+        }
+        /*! 
+         * \brief load by binary format, a temp Tensor<cpu,dim> storage will be allocated
+         * \param fi input binary stream
+         * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream.
+         */
+        template<typename TStream>
+        inline void LoadBinary( TStream &fi ) {
+            Tensor<cpu,dimension> tmp;
+            mshadow::LoadBinary( fi, tmp, false );
+            this->Resize( tmp.shape );
+            Copy( *this, tmp );
+            mshadow::FreeSpace( tmp );
+        }
+    public:
+        // functions to fit exp template
+        inline Tensor<Device,dimension>& operator=( real_t s ){
+            return this->__assign( s );
+        }
+        template<typename E>
+        inline Tensor<Device,dimension>& operator=( const expr::Exp<E,expr::type::kMapper> &exp ){
+            return this->__assign( exp );
+        }
+        template<typename E>
+        inline Tensor<Device,dimension>& operator=( const expr::Exp<E,expr::type::kComplex> &exp ){
+            return this->__assign( exp );
+        }
+    private:
+        /*! \brief whether we do padding in the space */
+        bool pad_;
+        /*! \brief the shape of data_ is actually current data space */
+        Tensor<Device, 2> data_;
+    private:
+        inline void FreeSpace (void){
+            if( data_.dptr != NULL ){
+                mshadow::FreeSpace( data_ );
+                data_.dptr = this->dptr = NULL;
+            }
+        }
+        inline void AllocByShape (const Shape<dimension>& shape){
+            if( data_.dptr != NULL ){
+                this->FreeSpace();
+            }
+            data_.shape = shape.FlatTo2D();
+            mshadow::AllocSpace( data_, pad_ );
+            this->dptr  = data_.dptr;
+            this->shape = shape;
+            if( this->pad_ ){
+                this->shape.stride_ = data_.shape.stride_;
+            }else{
+                this->shape.stride_ = shape[0];
+            }
+        }
+    };
+};// namespace mshadow
+
+#endif
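
For context, a minimal sketch of how the TensorContainer added above is meant to be used. It assumes the rest of this commit supplies mshadow/tensor.h (which defines Tensor, Shape2 and the cpu device tag); the shapes and values here are illustrative only.

    #include "mshadow/tensor.h"
    using namespace mshadow;

    void container_demo( void ){
        // the container owns its buffer; it is freed in the destructor
        TensorContainer<cpu,2> mat( Shape2(3, 5), 0.0f );
        mat = 1.0f;                        // fill through the exp-template assignment
        mat.Resize( Shape2(2, 4), 0.5f );  // shrink and re-initialize; content is NOT preserved
    }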

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_cpu-inl.hpp
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor_cpu-inl.hpp b/include/mshadow/tensor_cpu-inl.hpp
new file mode 100644
index 0000000..0fa3cfa
--- /dev/null
+++ b/include/mshadow/tensor_cpu-inl.hpp
@@ -0,0 +1,168 @@
+#ifndef MSHADOW_TENSOR_CPU_INL_HPP
+#define MSHADOW_TENSOR_CPU_INL_HPP
+/*!
+ * \file tensor_cpu-inl.hpp
+ * \brief implementation of CPU host code
+ * \author Bing Xu, Tianqi Chen
+ */
+#include <cstring>
+#include "tensor_base.h"
+#include "tensor_sse-inl.hpp"
+
+namespace mshadow {
+    template<int dim>
+    inline void AllocSpace(Tensor<cpu,dim> &obj, bool pad ){
+        size_t pitch;
+        if( pad ){
+            obj.dptr = (real_t*)sse2::AlignedMallocPitch
+                ( pitch, obj.shape[0] * sizeof(real_t), obj.FlatTo2D().shape[1] );
+            obj.shape.stride_ = static_cast<index_t>( pitch / sizeof(real_t) );
+        }else{
+            obj.shape.stride_ = obj.shape[0];
+            obj.dptr = (real_t*)sse2::AlignedMallocPitch
+                ( pitch, obj.shape.Size() * sizeof(real_t), 1 );
+        }
+    }
+
+    template<typename Device, int dim>
+    inline Tensor<Device,dim> NewTensor(const Shape<dim> &shape, real_t initv, bool pad ){
+        Tensor<Device, dim> obj( shape );
+        AllocSpace( obj, pad );
+        MapExp<sv::saveto>( obj, expr::ScalarExp( initv ) );
+        return obj;
+    }
+
+    template<int dim>
+    inline void FreeSpace(Tensor<cpu,dim> &obj){
+        sse2::AlignedFree( obj.dptr );
+        obj.dptr = NULL;
+    }
+
+    template<int dim>
+    inline void Copy(Tensor<cpu,dim> _dst, const Tensor<cpu,dim> &_src ){
+        utils::Assert( _dst.shape == _src.shape, "Copy:shape mismatch" );
+        Tensor<cpu,2> dst = _dst.FlatTo2D();
+        Tensor<cpu,2> src = _src.FlatTo2D();
+        for (index_t y = 0; y < dst.shape[1]; ++y ) {
+            memcpy( dst[y].dptr, src[y].dptr, sizeof(real_t) * dst.shape[0] );
+        }
+    }
+
+    template<typename Saver, typename E, int dim>
+    inline void MapPlan(Tensor<cpu,dim> _dst, const expr::Plan<E> &plan){
+        Tensor<cpu,2> dst = _dst.FlatTo2D();
+        for (index_t y = 0; y < dst.shape[1]; ++y ) {
+            for (index_t x = 0; x < dst.shape[0]; ++x ) {
+                // trust your compiler! -_- they will optimize it
+                Saver::Save(dst[y][x], plan.Eval( y, x ) );
+            }
+        }
+    }
+
+    // code to handle SSE optimization
+    template<bool pass_check,typename Saver, int dim, typename E, int etype>
+    struct MapExpCPUEngine;
+    template<typename SV, int dim, typename E, int etype>
+    struct MapExpCPUEngine<false,SV,dim,E,etype>{
+        inline static void Map(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp ){
+            MapPlan<SV>( dst, MakePlan( exp.self() ) );
+        }
+    };
+
+    #if MSHADOW_USE_SSE
+    template<typename SV, int dim, typename E, int etype>
+    struct MapExpCPUEngine<true,SV,dim,E,etype>{
+        inline static void Map(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp ){
+            using namespace expr;
+            if( SSEAlignCheck<dim,E>::Check( exp.self() ) && SSEAlignCheck< dim,Tensor<cpu,dim> >::Check(dst) ){
+                MapSSEPlan<SV>( dst, MakeSSEPlan( exp.self() ) );
+            }else{
+                MapPlan<SV>( dst, MakePlan( exp.self() ) );
+            }
+        }
+    };
+    #endif
+
+    template<typename Saver, int dim, typename E, int etype>
+    inline void MapExp(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp ){
+        using namespace expr;
+        TypeCheckPass< TypeCheck<cpu,dim,E>::kMapPass >::Error_All_Tensor_in_Exp_Must_Have_Same_Type();
+        Shape<dim> eshape = ShapeCheck<dim,E>::Check( exp.self() );
+        utils::Assert( eshape[0] == 0 || eshape == dst.shape, "Assignment: Shape of Tensors in expression is not consistent with target" );
+        #if MSHADOW_USE_SSE
+        MapExpCPUEngine< SSECheck<E>::kPass,Saver,dim,E,etype >::Map( dst, exp );
+        #else
+        MapExpCPUEngine< false,Saver,dim,E,etype >::Map( dst, exp );
+        #endif
+    }
+
+    template<typename Saver, typename Reducer, typename E, int etype>
+    inline void MapReduceKeepLowest( Tensor<cpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale ){
+        using namespace expr;
+        TypeCheckPass< TypeCheck<cpu,1,E>::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
+        Shape<2> eshape = ShapeCheck< ExpInfo<E>::kDim, E >::Check( exp.self() ).FlatTo2D();
+
+        utils::Assert( eshape[0] == dst.shape[0], "reduction dimension does not match" );
+        utils::Assert( eshape[1] != 0, "can not reduce over empty tensor" );
+        // execution
+        expr::Plan<E> plan = MakePlan( exp.self() );
+        for( index_t x = 0; x < eshape[0]; ++x ){
+            real_t res = plan.Eval( 0, x );
+            for( index_t y = 1; y < eshape[1]; ++y ){
+                Reducer::Reduce( res, plan.Eval( y, x ) );
+            }
+            Saver::Save( dst[x], res*scale );
+        }
+    }
+
+    template<typename Saver, typename Reducer, int dimkeep, typename E, int etype>
+    inline void MapReduceKeepHighDim( Tensor<cpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale ){
+        using namespace expr;
+        TypeCheckPass< TypeCheck<cpu,dimkeep,E>::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
+        typedef Shape< ExpInfo<E>::kDim > EShape;
+        EShape eshape = ShapeCheck< ExpInfo<E>::kDim, E >::Check( exp.self() );
+        utils::Assert( eshape[dimkeep] == dst.shape[0], "reduction dimension does not match" );
+        // use an equivalent 4D view: (outer dims, kept dim, middle dims, lowest dim)
+        Shape<4> pshape = Shape4( eshape.ProdShape(dimkeep+1,EShape::kMaxShape), eshape[dimkeep], 
+                                  eshape.ProdShape(1,dimkeep), eshape[0] );
+
+        // execution
+        expr::Plan<E> plan = MakePlan( exp.self() );
+
+        for( index_t c = 0; c < pshape[2]; ++c ){
+            real_t res = Reducer::kInitV;
+            for( index_t n = 0; n < pshape[3]; ++n ){
+                real_t tres = Reducer::kInitV;
+                for( index_t y = 0; y < pshape[1]; ++y ){
+                    for( index_t x = 0; x < pshape[0]; ++x ){
+                        Reducer::Reduce( tres, plan.Eval( (n*pshape[2] + c) * pshape[1] + y, x ) );
+                    }
+                }
+                Reducer::Reduce( res, tres );
+            }
+            Saver::Save( dst[c], res*scale );
+        }
+    }
+
+    inline void Softmax( Tensor<cpu,1> dst, const Tensor<cpu,1>& energy ){
+        real_t mmax = energy[0];
+        for( index_t x = 1; x < dst.shape[0]; ++x )
+            if( mmax < energy[x] ) mmax = energy[x];
+        real_t sum = 0.0f;
+        for( index_t x = 0; x < dst.shape[0]; ++x ){
+            dst[x] = std::exp( energy[x] - mmax );
+            sum += dst[x];
+        }
+        for( index_t x = 0; x < dst.shape[0]; ++x ){
+            dst[x] /= sum;
+        }
+    }
+    inline void Softmax( Tensor<cpu,2> dst, const Tensor<cpu,2>& energy ){
+        utils::Assert( dst.shape == energy.shape, "Softmax: shape mismatch" );
+        for( index_t y = 0; y < dst.shape[1]; ++y ){
+            Softmax( dst[y], energy[y] );
+        }
+    }
+}; // namespace mshadow
+
+#endif // TENSOR_CPU_INL_HPP
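
As a quick orientation for the CPU helpers above, a hedged usage sketch: it assumes tensor.h from this commit is on the include path and the default CPU build flags; names and values are illustrative.

    #include "mshadow/tensor.h"
    using namespace mshadow;

    void cpu_demo( void ){
        // NewTensor allocates (optionally padded) space and fills it via MapExp
        Tensor<cpu,2> a = NewTensor<cpu>( Shape2(2, 4), 0.0f, true );
        Tensor<cpu,2> b = NewTensor<cpu>( Shape2(2, 4), 1.0f, true );
        Copy( a, b );        // row-wise memcpy; shapes must match
        Softmax( a, b );     // numerically stable softmax along each row
        FreeSpace( a );
        FreeSpace( b );
    }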

http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/b2dc51d2/include/mshadow/tensor_expr.h
----------------------------------------------------------------------
diff --git a/include/mshadow/tensor_expr.h b/include/mshadow/tensor_expr.h
new file mode 100644
index 0000000..ac8fde7
--- /dev/null
+++ b/include/mshadow/tensor_expr.h
@@ -0,0 +1,367 @@
+#ifndef MSHADOW_TENSOR_EXPR_H
+#define MSHADOW_TENSOR_EXPR_H
+/*!
+ * \file tensor_expr.h
+ * \brief definitions of abstract expressions and expressions template
+ * \author Tianqi Chen, Bing Xu
+ */
+#include "tensor_base.h"
+
+namespace mshadow{
+    /*!
+     * \brief namespace for abstract expressions and expression templates;
+     *        it has no dependency on tensor.h.
+     *        These data structures perform no computation themselves;
+     *        they only define operations and represent expressions symbolically
+     */
+    namespace expr{
+
+        /*! \brief type of expressions */
+        namespace type{
+            /*! \brief this expression directly corresponds to a data class */
+            const int kContainer = 0;
+            /*! \brief this only contains element-wise vector operations */
+            const int kMapper    = 1;
+            /*! \brief other cases, e.g. dot product */
+            const int kComplex   = 3;
+        };
+
+        /*!
+         * \brief expression engine that actually interprets these expressions
+         *        this is a template that needs to be implemented for specific expressions
+         */
+        template<typename Saver,typename Container>
+        struct ExpEngine{
+            template<typename EType>
+            inline static void Eval( Container& dst, const EType &exp );
+        };
+
+        template<typename Container>
+        class ContainerExp;
+        class ScalarExp;
+
+        /*!
+         * \brief base class for expression
+         * \tparam SubType the inheriting class must pass its own type as this parameter
+         * \tparam exp_type expression type, see namespace type
+         */
+        template<typename SubType, int exp_type>
+        struct Exp{
+        public:
+            /*! \return  subtype instance of current class */
+            inline const SubType& self( void ) const{
+                return *static_cast<const SubType*>(this);
+            }
+            /*! \return reference of subtype instance of current class */
+            inline SubType& refself( void ){
+                return *static_cast<SubType*>(this);
+            }
+        };
+
+        /*! \brief scalar expression */
+        struct ScalarExp: public Exp<ScalarExp, type::kMapper>{
+            /*! \brief scalar value */
+            real_t scalar_;
+            /*! \brief constructor */
+            ScalarExp( real_t scalar ):scalar_(scalar){}
+        };
+
+        /*! \brief represent a transpose expression of a container */
+        template<typename EType>
+        struct TransposeExp: public Exp< TransposeExp<EType>, type::kComplex >{
+        public:
+            /*! \brief expression to be transposed */
+            const EType &exp;
+            /*! \brief constructor */
+            TransposeExp( const EType &e ):exp(e){}
+            /*! \brief transpose expression */
+            inline const EType & T( void ) const{
+                return exp;
+            }
+        };
+        
+        /*!
+         * \brief base class of all variables, that can be assigned to values
+         * \tparam Container the actual class of the data container, e.g. CTensor1D
+         */
+        template<typename Container>
+        class ContainerExp: public Exp< Container, type::kContainer >{
+        public:
+            /*!
+             *\brief transpose of a matrix
+             *\return transpose of current expression
+             */
+            inline const TransposeExp<Container> T( void ) const{
+                return TransposeExp<Container>( this->self() );
+            }
+        public:
+            /*! \brief operator overload */
+            inline Container &operator+=( real_t s ){
+                ExpEngine<sv::plusto,Container>::Eval( this->refself(), ScalarExp(s) );
+                return this->refself();
+            }
+            /*! \brief operator overload */
+            inline Container &operator-=( real_t s ){
+                ExpEngine<sv::minusto,Container>::Eval( this->refself(), ScalarExp(s) );
+                return this->refself();
+            }
+            /*! \brief operator overload */
+            inline Container &operator*=( real_t s ){
+                ExpEngine<sv::multo,Container>::Eval( this->refself(), ScalarExp(s) );
+                return this->refself();
+            }
+            /*! \brief operator overload */
+            inline Container &operator/=( real_t s ){
+                ExpEngine<sv::divto,Container>::Eval( this->refself(), ScalarExp(s) );
+                return this->refself();
+            }
+            /*! \brief operator overload */
+            inline Container &__assign( real_t s ){
+                ExpEngine<sv::saveto,Container>::Eval( this->refself(), ScalarExp(s) );
+                return this->refself();
+            }
+        public:
+            /*! \brief implementation of operator=, note that we can not define container = container */
+            template<typename E>
+            inline Container &__assign( const Exp<E,type::kMapper> &exp ){
+                ExpEngine<sv::saveto,Container>::Eval( this->refself(), exp.self() );
+                return this->refself();
+            }
+            /*! \brief implementation of operator=, note that we can not define container = container */
+            template<typename E>
+            inline Container &__assign( const Exp<E,type::kComplex> &exp ){
+                ExpEngine<sv::saveto,Container>::Eval( this->refself(), exp.self() );
+                return this->refself();
+            }
+            /*! \brief implementation of operator+= */
+            template<typename E,int etype>
+            inline Container &operator+=( const Exp<E,etype> &exp ){
+                ExpEngine<sv::plusto,Container>::Eval( this->refself(), exp.self() );
+                return this->refself();
+            }
+            /*! \brief implementation of operator-= */
+            template<typename E,int etype>
+            inline Container &operator-=( const Exp<E,etype> &exp ){
+                ExpEngine<sv::minusto,Container>::Eval( this->refself(), exp.self() );
+                return this->refself();
+            }
+            /*! \brief implementation of operator*= */
+            template<typename E,int etype>
+            inline Container &operator*=( const Exp<E,etype> &exp ){
+                ExpEngine<sv::multo,Container>::Eval( this->refself(), exp.self() );
+                return this->refself();
+            }
+            /*! \brief implementation of operator/= */
+            template<typename E,int etype>
+            inline Container &operator/=( const Exp<E,etype> &exp ){
+                ExpEngine<sv::divto,Container>::Eval( this->refself(), exp.self() );
+                return this->refself();
+            }
+        };
+    }; // namespace expr
+
+    namespace expr{
+        /*!
+         * \brief matrix multiplication expression dot( lhs[.T], rhs[.T] )
+         * \tparam TA type of lhs
+         * \tparam TB type of rhs
+         * \tparam ltrans whether lhs is transposed
+         * \tparam rtrans whether rhs is transposed
+         */
+        template<typename TA,typename TB,bool ltrans,bool rtrans>
+        struct DotExp: public Exp< DotExp<TA,TB,ltrans,rtrans>, type::kComplex >{
+            /*! \brief left operand */
+            const TA& lhs_;
+            /*! \brief right operand */
+            const TB& rhs_;
+            /*! \brief scale over result */
+            real_t scale_;
+            /*! \brief constructor */
+            DotExp( const TA &lhs, const TB &rhs, real_t scale )
+                :lhs_(lhs),rhs_(rhs),scale_(scale){}
+        };
+
+        /*! \brief dot operator def */
+        template<typename TA, typename TB>
+        inline DotExp<TA,TB,false,false> dot( const ContainerExp<TA> &lhs, const ContainerExp<TB> &rhs ){
+            return DotExp<TA,TB,false,false>( lhs.self(), rhs.self(), 1.0f );
+        }
+        /*! \brief dot operator def */
+        template<typename TA, typename TB>
+        inline DotExp<TA,TB,true,false> dot( const TransposeExp<TA> &lhs, const ContainerExp<TB> &rhs ){
+            return DotExp<TA,TB,true,false>( lhs.exp, rhs.self(), 1.0f );
+        }
+        /*! \brief dot operator def */
+        template<typename TA, typename TB>
+        inline DotExp<TA,TB,false,true> dot( const ContainerExp<TA> &lhs, const TransposeExp<TB> &rhs ){
+            return DotExp<TA,TB,false,true>( lhs.self(), rhs.exp, 1.0f );
+        }
+        /*! \brief dot operator def */
+        template<typename TA, typename TB>
+        inline DotExp<TA,TB,true,true> dot( const TransposeExp<TA> &lhs, const TransposeExp<TB> &rhs ){
+            return DotExp<TA,TB,true,true>( lhs.exp, rhs.exp, 1.0f );
+        }
+        /*! \brief scale of dot operation */
+        template<typename TA, typename TB, bool ltrans, bool rtrans >
+        inline DotExp<TA,TB,ltrans,rtrans> operator*( const DotExp<TA,TB,ltrans,rtrans> &lhs, real_t rhs ){
+            return DotExp<TA,TB,ltrans,rtrans>( lhs.lhs_, lhs.rhs_, lhs.scale_ * rhs );
+        }
+        /*! \brief scale of dot operation */
+        template<typename TA, typename TB, bool ltrans, bool rtrans >
+        inline DotExp<TA,TB,ltrans,rtrans> operator*( real_t lhs, const DotExp<TA,TB,ltrans,rtrans> &rhs ){
+            return DotExp<TA,TB,ltrans,rtrans>( rhs.lhs_, rhs.rhs_, rhs.scale_ * lhs );
+        }
+    }; // namespace expr
+
+    namespace expr{
+        /*!
+         * \brief binary map expression lhs [op] rhs
+         * \tparam OP operator
+         * \tparam TA type of lhs
+         * \tparam TB type of rhs
+         * \tparam etype expression type, see namespace type
+         */
+        template<typename OP, typename TA, typename TB, int etype >
+        struct BinaryMapExp: public Exp< BinaryMapExp<OP,TA,TB,etype>, etype >{
+            /*! \brief left operand */
+            const TA& lhs_;
+            /*! \brief right operand */
+            const TB& rhs_;
+            /*! \brief constructor */
+            BinaryMapExp( const TA &lhs, const TB &rhs )
+                :lhs_(lhs), rhs_(rhs){}
+        };
+
+        /*! \brief make expression */
+        template<typename OP,typename TA, typename TB, int ta, int tb>
+        inline BinaryMapExp<OP,TA,TB, (ta|tb|type::kMapper) > MakeExp( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
+            return BinaryMapExp<OP,TA,TB, (ta|tb|type::kMapper) >( lhs.self(), rhs.self() );
+        }
+
+        /*! 
+         * \brief shorthand for MakeExp; usage F<op>(lhs, rhs) creates a binary operation expression
+         * \param lhs left operand
+         * \param rhs right operand
+         * \tparam OP binary operator
+         * \tparam TA lhs expression
+         * \tparam ta lhs expression type
+         * \tparam TB rhs expression
+         * \tparam tb rhs expression type
+         * \sa mshadow::op
+         */
+        template<typename OP,typename TA, typename TB, int ta, int tb>
+        inline BinaryMapExp<OP,TA,TB, (ta|tb|type::kMapper) > F( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
+            return MakeExp<OP>( lhs, rhs );
+        }
+        /*! \brief operator overload for const */
+        template<typename OP,typename TA, int ta>
+        inline BinaryMapExp<OP,TA,ScalarExp, (ta|type::kMapper) > F( const Exp<TA,ta> &lhs, const ScalarExp &rhs ){
+            return MakeExp<OP>( lhs, rhs );
+        }
+        /*! \brief operator overload for const */
+        template<typename OP,typename TB, int tb>
+        inline BinaryMapExp<OP,ScalarExp,TB, (tb|type::kMapper) > F( const ScalarExp &lhs, const Exp<TB,tb>& rhs ){
+            return MakeExp<OP>( lhs, rhs );
+        }
+
+        // operator rules
+        /*! \brief operator overload */
+        template<typename TA, typename TB, int ta, int tb>
+        inline BinaryMapExp<op::plus,TA,TB, (ta|tb|type::kMapper) > operator+( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
+            return MakeExp<op::plus>( lhs, rhs );
+        }
+        /*! \brief operator overload */
+        template<typename TA, typename TB, int ta, int tb>
+        inline BinaryMapExp<op::minus,TA,TB, (ta|tb|type::kMapper) > operator-( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
+            return MakeExp<op::minus>( lhs, rhs );
+        }
+        /*! \brief operator overload */
+        template<typename TA, typename TB, int ta, int tb>
+        inline BinaryMapExp<op::mul,TA,TB, (ta|tb|type::kMapper) > operator*( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
+            return MakeExp<op::mul>( lhs, rhs );
+        }
+        /*! \brief operator overload */
+        template<typename TA, typename TB, int ta, int tb>
+        inline BinaryMapExp<op::div,TA,TB, (ta|tb|type::kMapper) > operator/( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
+            return MakeExp<op::div>( lhs, rhs );
+        }
+        // constant operators
+        /*! \brief operator overload */
+        template<typename TA, int ta>
+        inline BinaryMapExp<op::plus, TA, ScalarExp, (ta|type::kMapper) > operator+( const Exp<TA,ta>& lhs,  const ScalarExp& rhs ){
+            return MakeExp<op::plus>( lhs, rhs );
+        }
+        /*! \brief operator overload */
+        template<typename TA, int ta>
+        inline BinaryMapExp<op::minus, TA, ScalarExp, (ta|type::kMapper) > operator-( const Exp<TA,ta>& lhs,  const ScalarExp& rhs ){
+            return MakeExp<op::minus>( lhs, rhs );
+        }
+        /*! \brief operator overload */
+        template<typename TA, int ta>
+        inline BinaryMapExp<op::mul, TA, ScalarExp, (ta|type::kMapper) > operator*( const Exp<TA,ta>& lhs,  const ScalarExp& rhs ){
+            return MakeExp<op::mul>( lhs, rhs );
+        }
+        /*! \brief operator overload */
+        template<typename TA, int ta>
+        inline BinaryMapExp<op::div, TA, ScalarExp, (ta|type::kMapper) > operator/( const Exp<TA,ta>& lhs,  const ScalarExp& rhs ){
+            return MakeExp<op::div>( lhs, rhs );
+        }
+        // constant operators 2
+        /*! \brief operator overload */
+        template<typename TB, int tb>
+        inline BinaryMapExp<op::plus, ScalarExp, TB, (tb|type::kMapper) > operator+( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){
+            return MakeExp<op::plus>( lhs, rhs );
+        }
+        /*! \brief operator overload */
+        template<typename TB, int tb>
+        inline BinaryMapExp<op::minus, ScalarExp, TB, (tb|type::kMapper) > operator-( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){
+            return MakeExp<op::minus>( lhs, rhs );
+        }
+        /*! \brief operator overload */
+        template<typename TB, int tb>
+        inline BinaryMapExp<op::mul, ScalarExp, TB, (tb|type::kMapper) > operator*( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){
+            return MakeExp<op::mul>( lhs, rhs );
+        }
+        /*! \brief operator overload */
+        template<typename TB, int tb>
+        inline BinaryMapExp<op::div, ScalarExp, TB, (tb|type::kMapper) > operator/( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){
+            return MakeExp<op::div>( lhs, rhs );
+        }
+    };
+
+    namespace expr{
+        /*!
+         * \brief unary map expression op(src)
+         * \tparam OP operator
+         * \tparam TA type of src
+         * \tparam etype expression type, see namespace type
+         */
+        template<typename OP, typename TA, int etype >
+        struct UnaryMapExp: public Exp< UnaryMapExp<OP,TA,etype>, etype >{
+            /*! \brief source expression */
+            const TA& src_;
+            /*! \brief constructor */
+            UnaryMapExp( const TA &src ):src_(src){}
+        };
+
+        /*! \brief make expression */
+        template<typename OP,typename TA, int ta>
+        inline UnaryMapExp<OP,TA,(ta|type::kMapper) > MakeExp( const Exp<TA,ta> &src ){
+            return UnaryMapExp<OP,TA, (ta|type::kMapper) >( src.self() );
+        }
+
+        /*! 
+         * \brief shorthand for MakeExp; usage F<op>(src) creates a unary operation expression
+         * \param src source expression
+         * \tparam OP unary operator
+         * \tparam TA source expression
+         * \tparam ta source expression type
+         * \sa mshadow::op
+         */
+        template<typename OP,typename TA, int ta>
+        inline UnaryMapExp<OP,TA,(ta|type::kMapper) > F( const Exp<TA,ta> &src ){
+            return MakeExp<OP>(src);
+        }
+    };
+};
+#endif
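
To show how these expression templates compose, a sketch under the assumption that tensor.h, tensor_cpu-inl.hpp and the expression-evaluation engine from this commit are available, and that a BLAS backend is enabled for dot(); the square functor is defined locally for illustration and is not part of the library.

    #include "mshadow/tensor.h"
    using namespace mshadow;
    using namespace mshadow::expr;

    // user-defined element-wise operator, plugged in through F<op>()
    struct square{
        MSHADOW_XINLINE static real_t Map( real_t a ){
            return a * a;
        }
    };

    void expr_demo( void ){
        TensorContainer<cpu,2> A( Shape2(3, 4), 1.0f ), B( Shape2(3, 4), 2.0f );
        TensorContainer<cpu,2> C( Shape2(3, 3) );
        // kMapper expression: element-wise, built lazily and evaluated in one pass
        A = 0.5f * B + F<square>( B );
        // kComplex expression: matrix product A * B^T, handled by the dot engine
        C = dot( A, B.T() );
    }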