You are viewing a plain text version of this content. The canonical link for it is here.
Posted to by on 2016/06/03 07:48:20 UTC

[15/60] incubator-singa git commit: SINGA-163 - Reorganize the project folder layout
diff --git a/include/gtest/ b/include/gtest/
deleted file mode 100644
index f302822..0000000
--- a/include/gtest/
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright 2006, Google Inc.
-// All rights reserved.
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//     * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//     * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following disclaimer
-// in the documentation and/or other materials provided with the
-// distribution.
-//     * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-#include <stdio.h>
-#include "gtest/gtest.h"
-GTEST_API_ int main(int argc, char **argv) {
-  printf("Running main() from\n");
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
diff --git a/include/mshadow/cuda/cuda_reduce.cuh b/include/mshadow/cuda/cuda_reduce.cuh
deleted file mode 100644
index b7808a6..0000000
--- a/include/mshadow/cuda/cuda_reduce.cuh
+++ /dev/null
@@ -1,117 +0,0 @@
- * \file cuda_reduce.cuh
- * \brief helper functions to do reduction
- * \author Tianqi Chen
- */
-namespace mshadow{
-    namespace cuda{
-        /*
-         * \brief reduce a buffer over the dimension x (indexed by threadIdx.x),
-         *        the reduced value is left in buf[0];
-         *        the implementation below may execute __syncthreads(), so call it
-         *        from control flow that is uniform across the thread block
-         * \tparam Reducer reducer, used through its static Reduce( dst, src )
-         * \tparam x_bits dimension = 1<<x_bits
-         * \param buf buffer of 1<<x_bits elements, reduced in place
-         */
-        template<typename Reducer,int x_bits>
-        inline __device__ void Reduce1D( volatile real_t buf[1<<x_bits] );
-        /*
-         * \brief reduce a buffer over the dimension x when the number of valid
-         *        elements need not be a power of two, the reduced value is left in buf[0]
-         * \tparam Reducer reducer, used through its static Reduce( dst, src )
-         * \tparam xmax_bits maximum size of buffer is 1<<xmax_bits
-         * \param buf buffer to reduce in place
-         * \param xsize size of x dimension, not sure if aligned
-         */
-        template<typename Reducer, int xmax_bits>
-        inline __device__ void Reduce1DNotAlign( volatile real_t buf[1<<xmax_bits], int xsize );
-    };
-// ===============================================x===
-//  implementations afterwards, 
-//  no need to read if only use the functions
-// --------------------------------------------------
-// Barrier placed between the intra-warp steps of ReduceX.
-// The two definitions must be alternatives, not consecutive: defining the macro
-// twice with different bodies is a redefinition and leaves only the empty one.
-//  - device emulation: threads of a warp do not run in lock step, use a block barrier
-//  - CUDA 9 and later: warps are no longer guaranteed to be implicitly synchronous
-//    (independent thread scheduling), so a warp barrier is required; every call
-//    site in ReduceX sits outside the per-thread `if( tid < ... )`
-//  - older toolchains: keep the historical warp-synchronous behaviour (no barrier)
-#if defined(__DEVICE_EMULATION__)
-#define __MSHADOW_EMUSYNC__ __syncthreads()
-#elif defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 9
-#define __MSHADOW_EMUSYNC__ __syncwarp()
-#else
-#define __MSHADOW_EMUSYNC__
-#endif
-namespace mshadow{
-    namespace cuda{        
-        /*
-         * \brief tree reduction of buf[0 .. (1<<x_bits)-1] into buf[0]
-         *        step k folds buf[tid + (1<<k)] into buf[tid] for tid < (1<<k),
-         *        for k = x_bits-1 down to 0; buffers of up to 1<<10 elements are handled
-         * \tparam Reducer reducer, used through its static Reduce( dst, src )
-         * \tparam x_bits log2 of the number of elements to reduce
-         * \param buf buffer reduced in place
-         * \param tid index of the calling thread inside the reduction
-         */
-        template<typename Reducer, int x_bits>
-        inline __device__ void ReduceX( volatile real_t buf[], int tid ){
-            // steps whose partner offset is 512 .. 32: followed by a block-wide barrier
-            #pragma unroll
-            for( int step_bits = 9; step_bits >= 5; --step_bits ){
-                if( x_bits > step_bits ){
-                    const int offset = 1 << step_bits;
-                    if( tid < offset ) Reducer::Reduce( buf[tid] , buf[tid + offset] );
-                    __syncthreads();
-                }
-            }
-            // in warp optimization: partner offset 16 .. 1, barrier chosen by __MSHADOW_EMUSYNC__
-            #pragma unroll
-            for( int step_bits = 4; step_bits >= 0; --step_bits ){
-                if( x_bits > step_bits ){
-                    const int offset = 1 << step_bits;
-                    if( tid < offset ) Reducer::Reduce( buf[tid] , buf[tid + offset] );
-                    __MSHADOW_EMUSYNC__;
-                }
-            }
-        }
-        // definition of Reduce1D: every thread takes part under its threadIdx.x
-        template<typename Reducer,int x_bits>
-        inline __device__ void Reduce1D( volatile real_t buf[1<<x_bits] ){
-            const int lane = threadIdx.x;
-            ReduceX<Reducer,x_bits>( buf, lane );
-        }
-        // reduce with an upper bound
-        // __RD_NON_ALIGN(els,x_bits) emits one link of an if / else-if chain:
-        // it is taken when 1<<x_bits is the largest power of two not exceeding x_size
-        // (and the buffer can be that large); the elements above 1<<x_bits are folded
-        // into the lower part first, then the aligned ReduceX finishes the job.
-        // The last line of the macro must NOT end with a backslash, otherwise the
-        // template header that follows becomes part of the macro body.
-        #define __RD_NON_ALIGN(els,x_bits)                              \
-            els                                                         \
-            if( xmax_bits >= x_bits && x_size >= (1 << x_bits) ){       \
-                if( tid < (1 << x_bits) && tid + (1<<x_bits) < x_size ){ \
-                    Reducer::Reduce( buf[tid] , buf[tid + (1<<x_bits)] ); \
-                }                                                       \
-                __syncthreads();                                        \
-                ReduceX<Reducer, x_bits>( buf, tid );                   \
-            }
-        /*
-         * \brief reduce buf[0 .. x_size-1] into buf[0], x_size need not be a power of two
-         *        contains __syncthreads(), so every thread of the block must reach the call
-         * \tparam Reducer reducer, used through its static Reduce( dst, src )
-         * \tparam xmax_bits the buffer holds at most 1<<xmax_bits elements
-         * \param buf buffer reduced in place
-         * \param x_size number of valid elements in buf
-         */
-        template<typename Reducer, int xmax_bits>
-        inline __device__ void Reduce1DNotAlign( volatile real_t buf[], int x_size ){
-            int tid = threadIdx.x;
-            // the 9-bit link covers 512 <= x_size < 1024 (only reachable when xmax_bits >= 9);
-            // without it elements from index 512 on were silently left out of the result
-            __RD_NON_ALIGN(, 9)
-            __RD_NON_ALIGN(else, 8)
-            __RD_NON_ALIGN(else, 7)
-            __RD_NON_ALIGN(else, 6)
-            __RD_NON_ALIGN(else, 5)
-            __RD_NON_ALIGN(else, 4)
-            __RD_NON_ALIGN(else, 3)
-            __RD_NON_ALIGN(else, 2)
-            __RD_NON_ALIGN(else, 1)
-        }
-        // the helper macro is an implementation detail, do not leak it to includers
-        #undef __RD_NON_ALIGN
-    };
diff --git a/include/mshadow/cuda/tensor_gpu-inl.cuh b/include/mshadow/cuda/tensor_gpu-inl.cuh
deleted file mode 100644
index 61e477c..0000000
--- a/include/mshadow/cuda/tensor_gpu-inl.cuh
+++ /dev/null
@@ -1,231 +0,0 @@
- * \file tensor_gpu-inl.cuh
- * \brief implementation of GPU code using CUDA
- * \author Bing Xu, Tianqi Chen
- */
-#include "../tensor.h"
-#include "cuda_reduce.cuh"
-namespace mshadow{
-    namespace cuda{
-        #ifndef __CUDA_ARCH__
-        #warning "__CUDA_ARCH__ is not defined, I will assume compiling with CUDA verion greater than 2.0"
-        #endif
-        /* load unit for memory access: 5 bits and 1024 threads per block when __CUDA_ARCH__ is undefined or >= 200, otherwise 4 bits and 512 threads per block */
-        #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 200
-        const int kMemUnitBits = 5;
-        const int kMaxThreadsPerBlock = 1024;
-        #else
-        const int kMemUnitBits = 4;
-        const int kMaxThreadsPerBlock = 512;
-        #endif
-        /*! \brief number of units that can do synchronized update, 1 << kMemUnitBits (described by the original authors as half warp size) */
-        const int kMemUnit     = 1 << kMemUnitBits;
-        /*! \brief mask of the low kMemUnitBits bits, i.e. kMemUnit - 1 */
-        const int kMemUnitMask = kMemUnit - 1;
-        /*! \brief suggested thread number(logscale) for mapping kernel */
-        const int kBaseThreadBits = 8;
-        /*! \brief suggested thread number for mapping kernel, 1 << kBaseThreadBits */
-        const int kBaseThreadNum  = 1 << kBaseThreadBits;
-        /*! \brief maximum number of blocks in one grid dimension */
-        const int kMaxGridNum     = 65535;
-        /*! \brief suggested grid number for mapping kernel */
-        const int kBaseGridNum    = 1024;
-        /*! \brief get the aligned stride for a given size in x dimension: xsize rounded up to a multiple of kMemUnit when xstride is itself a multiple of kMemUnit, xsize unchanged otherwise */
-        inline index_t GetAlignStride( index_t xsize, index_t xstride ){
-            const bool stride_is_aligned = ( xstride & kMemUnitMask ) == 0;
-            if( !stride_is_aligned ){
-                // the space is not aligned to begin with, so there is no need to align the thread allocation
-                return xsize;
-            }
-            const index_t num_units = ( xsize + kMemUnit - 1 ) >> kMemUnitBits;
-            return num_units << kMemUnitBits;
-        }
-        /*! \brief report an error through utils::Error when the block holds more than kMaxThreadsPerBlock threads or grid x / y exceeds 65535; estr is printed in front of the block dimensions */
-        inline void CheckLaunchParam( dim3 dimGrid, dim3 dimBlock, const char *estr = "" ){
-            const unsigned threads_in_block = dimBlock.x*dimBlock.y*dimBlock.z;
-            const bool block_fits = threads_in_block <= (unsigned)kMaxThreadsPerBlock;
-            const bool grid_fits  = dimGrid.x <= 65535 && dimGrid.y <= 65535;
-            if( block_fits && grid_fits ) return;
-            fprintf( stderr, "%s[%u,%u,%u]:", estr, dimBlock.x, dimBlock.y, dimBlock.z );
-            utils::Error( "too large launch parameter\n");
-        }
-    };
-    namespace cuda {
-        /*! \brief evaluate one element of the plan and save it into dst; the element is picked from the flat index (block_idx << block_dim_bits) + threadIdx.x, split into row / column by xstride */
-        template<typename Saver, typename Plan, int block_dim_bits>
-        __device__ void MapPlanProc( Tensor<gpu,2> dst, const index_t xstride, const Plan exp, int block_idx ){
-            const index_t flat = (block_idx << block_dim_bits) + threadIdx.x;
-            const int row = flat / xstride;
-            const int col = flat % xstride;
-            // flat indices that fall into the stride padding or past the last row do nothing
-            const bool inside = row < dst.shape[1] && col < dst.shape[0];
-            if( !inside ) return;
-            Saver::Save(dst[row][col], exp.Eval(row,col));
-        }
-        /*! \brief kernel used when one launch covers the whole tensor: each block processes the chunk selected by its own blockIdx.x */
-        template<typename Saver, typename Plan, int block_dim_bits>
-        __global__ void MapPlanKernel( Tensor<gpu,2> dst, const index_t xstride, const Plan exp ){
-            const int block_idx = blockIdx.x;
-            MapPlanProc<Saver, Plan,block_dim_bits>( dst, xstride, exp, block_idx );
-        }
-        /*! \brief kernel used when the tensor needs more chunks than one grid provides: each block processes `repeat` chunks that are grid_size apart, starting at blockIdx.x */
-        template<typename Saver, typename Plan, int block_dim_bits, int grid_size>
-        __global__ void MapPlanLargeKernel( Tensor<gpu,2> dst, const index_t xstride, const Plan exp, int repeat ){
-            int block_idx = blockIdx.x;
-            for( int round = 0; round < repeat; ++round, block_idx += grid_size ){
-                MapPlanProc<Saver, Plan,block_dim_bits>( dst, xstride, exp, block_idx );
-            }
-        }
-        /*!
-         * \brief evaluate plan for every element of dst and store it through Saver
-         *        launches MapPlanKernel when the number of blocks fits into one grid,
-         *        MapPlanLargeKernel (several chunks per block) otherwise
-         * \param dst destination tensor
-         * \param plan expression plan evaluated at (y,x)
-         */
-        template<typename Saver, typename E>
-        inline void MapPlan( Tensor<gpu,2> dst, const expr::Plan<E> &plan ){
-            const index_t xstride = GetAlignStride( dst.shape[0], dst.shape.stride_ );
-            const int num_block = ( dst.shape[1]*xstride + kBaseThreadNum-1) / kBaseThreadNum;
-            // an empty tensor needs no work; launching a grid of 0 blocks would be an
-            // invalid configuration and leave an error behind for later CUDA calls
-            if( num_block == 0 ) return;
-            dim3 dimBlock(kBaseThreadNum, 1, 1);
-            if (num_block < kMaxGridNum) {
-                dim3 dimGrid(num_block, 1, 1);
-                MapPlanKernel<Saver, expr::Plan<E>, kBaseThreadBits>
-                    <<<dimGrid,dimBlock>>>(dst, xstride, plan);
-            } else {
-                // too many blocks for one grid: every block handles `repeat` chunks
-                int repeat = (num_block + kBaseGridNum-1) / kBaseGridNum;
-                dim3 dimGrid( kBaseGridNum, 1 , 1 );
-                MapPlanLargeKernel<Saver,expr::Plan<E>, kBaseThreadBits, kBaseGridNum>
-                    <<<dimGrid,dimBlock>>>(dst, xstride, plan, repeat );
-            }
-        }
-    }; // namespace cuda
-    namespace cuda{
-        template<typename Saver,typename Reducer, int warp_bits, typename Plan>
-        __global__ void MapRedKeepLowestKernel( Tensor<gpu,1> dst, Plan plan, real_t scale, Shape<2> eshape ){  // reduces plan over y for each x and saves the scaled result to dst[x]
-            const unsigned warp_size = 1 << warp_bits;
-            const unsigned x = (blockIdx.x<<warp_bits) + threadIdx.x;  // x position handled by this thread
-            // one extra column of padding, to avoid bank conflict
-            __shared__ real_t s_res[ warp_size ][ warp_size + 1 ];
-            // note: reverse store [y][x], so that we can reduce over threadIdx.x, use warp optimization
-            if( threadIdx.y < eshape[1] && x < eshape[0] ){
-                s_res[ threadIdx.x ][ threadIdx.y ] = plan.Eval( threadIdx.y, x );
-            }
-            for( unsigned y = warp_size; y < eshape[1]; y += warp_size ){  // fold the remaining rows, warp_size rows at a time
-                if( threadIdx.y + y < eshape[1] && x < eshape[0] ){
-                    Reducer::Reduce( s_res[ threadIdx.x ][ threadIdx.y ], plan.Eval( threadIdx.y + y, x ) );
-                }
-            } 
-            __syncthreads();  // all partial results must be stored before rows are reduced by other threads
-            if( eshape[1] >= warp_size ){  // every slot of a row was filled: aligned reduction
-                Reduce1D<Reducer,warp_bits>( s_res[ threadIdx.y ] );
-            }else{  // only the first eshape[1] slots of a row were filled
-                Reduce1DNotAlign<Reducer,warp_bits>( s_res[ threadIdx.y ], eshape[1] );
-            }
-            __syncthreads();            
-            if( threadIdx.y == 0 && x < eshape[0] ){  // one thread per x writes the result found in slot 0 of its row
-                Saver::Save( dst[x],  s_res[ threadIdx.x ][ 0 ] * scale );
-            } 
-        }        
-        /*! \brief launch MapRedKeepLowestKernel with kMemUnit x kMemUnit threads per block and one block per kMemUnit entries of eshape[0] */
-        template<typename Saver, typename Reducer, typename E>
-        inline void MapReduceKeepLowest( Tensor<gpu,1> dst, const expr::Plan<E> &plan, real_t scale, Shape<2> eshape ){
-            // number of blocks: eshape[0] divided by kMemUnit, rounded up
-            dim3 dimGrid ( (eshape[0]+kMemUnit-1) >> kMemUnitBits );
-            dim3 dimBlock( kMemUnit, kMemUnit );
-            CheckLaunchParam( dimGrid, dimBlock, "MapRedKeepLowestKernel" );
-            MapRedKeepLowestKernel<Saver,Reducer,kMemUnitBits>
-                <<<dimGrid,dimBlock>>>( dst, plan, scale, eshape );
-        }
-    }; // namespace cuda
-    namespace cuda{
-        /*! \brief one block per index c of dimension 2: reduce plan over dimensions 0, 1 and 3 of pshape and save the scaled result to dst[c] */
-        template<typename Saver,typename Reducer, int block_dim_bits, typename Plan>
-        __global__ void MapReduceKeepDim2Kernel( Tensor<gpu,1> dst, Plan plan, real_t scale, Shape<4> pshape ){
-            const int block_size = 1 << block_dim_bits;
-            __shared__ real_t s_rec[ block_size ];
-            const int c = blockIdx.x;
-            // number of elements that share the same c
-            const index_t tot = pshape[0]*pshape[1]*pshape[3];
-            // per-thread partial result over the elements base + threadIdx.x
-            real_t partial = Reducer::kInitV;
-            for( index_t base = 0; base < tot; base += block_size ){
-                const index_t i = base + threadIdx.x;
-                if( i >= tot ) continue;
-                // decompose i into (n, y, x), x being the fastest changing index
-                const index_t x    = i % pshape[0];
-                const index_t rest = i / pshape[0];
-                const index_t y    = rest % pshape[1];
-                const index_t n    = rest / pshape[1];
-                Reducer::Reduce( partial, plan.Eval( (n*pshape[2] + c) * pshape[1] + y, x ) );
-            }
-            s_rec[ threadIdx.x ] = partial;
-            __syncthreads();
-            // combine the partial results of the block, the result ends up in s_rec[0]
-            Reduce1D<Reducer,block_dim_bits>( s_rec );
-            if( threadIdx.x == 0 ){
-                Saver::Save( dst[c], s_rec[0]*scale );
-            }
-        }
-        /*! \brief launch MapReduceKeepDim2Kernel with one block of kBaseThreadNum threads per element of dst */
-        template<typename Saver, typename Reducer, typename Plan>
-        inline void MapReduceKeepDim2( Tensor<gpu,1> dst, const Plan &plan, real_t scale, Shape<4> pshape ){
-            dim3 dimGrid ( dst.shape[0] );
-            dim3 dimBlock( kBaseThreadNum );
-            CheckLaunchParam( dimGrid, dimBlock, "MapReduceKeepDim2" );
-            MapReduceKeepDim2Kernel<Saver,Reducer,kBaseThreadBits><<<dimGrid,dimBlock>>>( dst, plan, scale, pshape );
-        }
-    };
-    namespace cuda{
-        template<int x_bits>        
-        __global__ void SoftmaxKernel( Tensor<gpu,2> dst, Tensor<gpu,2> src ){  // softmax of row blockIdx.x of src, written to the same row of dst
-            const unsigned x_size = 1 << x_bits;  
-            const int y = blockIdx.x;
-            __shared__ real_t s_rec[ x_size ];  // one slot per thread, reused for the max and for the sum
-            // step 1: get max, every thread first collects the max over its own strided elements
-            if( threadIdx.x < dst.shape[ 0 ] ){
-                s_rec[ threadIdx.x ] = src[ y ][ threadIdx.x ] ; 
-            }
-            for( unsigned x = x_size; x < dst.shape[0]; x += x_size ){
-                if( x + threadIdx.x < dst.shape[0] ){
-                    real_t a = src[ y ][ x + threadIdx.x ];
-                    s_rec[ threadIdx.x ] = max( a, s_rec[ threadIdx.x] );
-                }
-            }
-            __syncthreads();
-            if( threadIdx.x >= dst.shape[0] ){  // threads without any element copy slot 0 so that the reduction below only sees row values
-                s_rec[ threadIdx.x ] = s_rec[0];
-            }
-            __syncthreads();
-            Reduce1D<red::maximum,x_bits>( s_rec );
-            __syncthreads();
-            real_t smax = s_rec[0];            
-            __syncthreads();  // everybody must have read the max before the slots are reset
-            s_rec[ threadIdx.x ] = 0.0f;
-            __syncthreads();
-            // calculate normalizer, with writeback
-            for( unsigned x = 0; x < dst.shape[0]; x += x_size ){
-                if( x + threadIdx.x < dst.shape[0] ){
-                    real_t p = expf( src[ y ][ x + threadIdx.x ] - smax );
-                    s_rec[ threadIdx.x ] += p;
-                    // write back first, will fetch later
-                    dst[ y ][ x + threadIdx.x ] = p;
-                }
-            }
-            // calculate normalizer: sum of the per-thread partial sums
-            __syncthreads();
-            Reduce1D<red::sum,x_bits>( s_rec );
-            __syncthreads();
-            real_t ssum = s_rec[0];
-            for( unsigned x = 0; x < dst.shape[0]; x += x_size ){  // each thread normalizes the elements it wrote itself
-                if( x + threadIdx.x < dst.shape[0] ){
-                    dst[ y ][ x + threadIdx.x ] /= ssum;
-                }
-            }
-        }
-        /*! \brief launch SoftmaxKernel with one block of kBaseThreadNum threads per row (dst.shape[1] blocks); dst and src must have the same shape */
-        inline void Softmax( Tensor<gpu,2> &dst, const Tensor<gpu,2> &src ){
-            utils::Assert( dst.shape == src.shape, "Softmax: shape mismatch" );
-            dim3 dimGrid ( dst.shape[1] );
-            dim3 dimBlock( kBaseThreadNum );
-            CheckLaunchParam( dimGrid, dimBlock, "Softmax" );
-            SoftmaxKernel<kBaseThreadBits><<<dimGrid,dimBlock>>>( dst, src );
-        }
-    }; // namespace cuda
-}; // namespace mshadow
-#endif // TENSOR_GPU_INL_H
diff --git a/include/mshadow/cxxnet_op.h b/include/mshadow/cxxnet_op.h
deleted file mode 100644
index 1422070..0000000
--- a/include/mshadow/cxxnet_op.h
+++ /dev/null
@@ -1,127 +0,0 @@
-#ifndef CXXNET_OP_H
-#define CXXNET_OP_H
-#pragma once
- * \file cxxnet_op.h
- * \brief extra mshadow operation for cxxnet
- * \author Bing Xu
- */
-#include "mshadow/tensor.h"
-namespace mshadow {
-    /*! \brief operations for algorithm, each one maps a single value through a static Map */
-    namespace op {
-        /*! \brief logistic sigmoid: 1 / (1 + exp(-a)) */
-        struct sigmoid {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                return 1.0f / (1.0f + expf(-a));
-            }
-        };
-        /*! \brief a * (1 - a); presumably a is the sigmoid output, confirm against callers */
-        struct sigmoid_grad {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                return a * ( 1.0f - a );
-            }
-        };
-        /*! \brief Rectified Linear Operation */
-        struct relu {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                using namespace std;
-                return max( a, 0.0f );
-            }
-        };
-        /*! \brief 1 for positive a, 0 otherwise */
-        struct relu_grad {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                return a > 0.0f ? 1.0f : 0.0f;
-            }
-        };
-        /*! \brief hyperbolic tangent */
-        struct tanh {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                return tanhf( a );
-            }
-        };
-        /*! \brief 1 - a * a; presumably a is the tanh output, confirm against callers */
-        struct tanh_grad {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                return 1.0f - a * a;
-            }
-        };
-        /*! \brief softplus: log(1 + exp(a)) */
-        struct softplus {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                // log(1 + exp(a)) == a + log(1 + exp(-a)); using the second form for
-                // positive a keeps expf from overflowing to inf for large inputs
-                return a > 0.0f ? a + logf(1.0f + expf(-a)) : logf(1.0f + expf(a));
-            }
-        };
-        /*! \brief 1 / (1 + exp(-a)) */
-        struct softplus_grad {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                return 1.0f / (1.0f + expf(-a));
-            }
-        };
-        /*! \brief log(1 + exp(a)), evaluated without overflow for large a */
-        struct bnll {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                return a > 0.0f ? a + logf(1.0f + expf(-a)) : logf(1.0f + expf(a));
-            }
-        };
-        /*! \brief 1 / (1 + exp(-min(a, 50))) */
-        struct bnll_grad {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                real_t expval = a > 50.0f ? 50.0f : a; // kBNLL_THRESHOLD = 50.0f
-                expval = expf(-expval);
-                return 1.0f / (1.0f + expval);
-            }
-        };
-        /*! \brief a * a */
-        struct square {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                return a * a;
-            }
-        };
-        /*! \brief scaled tanh, hard code the scale factor */
-        struct stanh {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                // float literals: keep the whole expression in single precision
-                return 1.7159047f * tanhf( 0.66666667f * a );
-            }
-        };
-        /*! \brief back prop for scaled tanh */
-        struct stanh_grad {
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                // float literals: keep the whole expression in single precision
-                return 0.66666667f*1.7159047f - 0.66666667f/1.7159047f*a*a;
-            }
-        };
-        /*! \brief absolute value */
-        struct abs{
-            MSHADOW_XINLINE static real_t Map(real_t a) {
-                return a < 0 ? -a : a;
-            }
-        };
-    }; //namespace op
-}; //namespace mshadow
-namespace mshadow {
-    /*! \brief operations that map a pair of values through a static Map */
-    namespace op {
-        /*! \brief used for generate Bernoulli mask: 1 when a < b, 0 otherwise */
-        struct threshold {
-            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
-                return a < b ? 1.0f : 0.0f;
-            }
-        };
-        /*! \brief used for generate element of power: a raised to b */
-        struct power {
-            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
-                return powf( a, b );
-            }
-        };
-        /*! \brief square root of a + b */
-        struct sqrtop {
-            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
-                // single precision variant, like expf / logf / powf / tanhf used by the other operations
-                return sqrtf(a+b);
-            }
-        };
-        /*! \brief the larger of a and b */
-        struct max {
-            MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
-                return a > b ? a : b;
-            }
-        };
-    }; // namespace op
-}; // namespace mshadow
-#endif // CXXNET_OP_H
diff --git a/include/mshadow/tensor.h b/include/mshadow/tensor.h
deleted file mode 100644
index 42d13d3..0000000
--- a/include/mshadow/tensor.h
+++ /dev/null
@@ -1,472 +0,0 @@
- * \file tensor.h
- * \brief header file of tensor data structure and functions
- *        convention: this lib requires explicit memory allocation and de-allocation;
- *                   all the data structures such as Tensor<cpu,1>, Tensor<gpu,1> are like handles (pointers),
- *                   no memory allocation happens during calculation
- * \author Bing Xu, Tianqi Chen
- */
-#include "tensor_base.h"
-#include "tensor_expr.h"
-namespace mshadow {
-    /*!
-     * \brief shape of a tensor
-     *       IMPORTANT NOTE: this shape is different from numpy.shape
-     *       shape[0] gives the lowest dimension, shape[dimension-1] gives the highest dimension
-     *       shape[k] corresponds to k-th dimension of tensor
-     * \tparam dimension dimension of tensor
-     */
-    template<int dimension>
-    struct Shape {
-    public:
-        /*! \brief number of dimensions held by this shape */
-        const static int kMaxShape = dimension;
-        /*! \brief number of dimensions of the shape returned by SubShape */
-        const static int kSubShape = dimension - 1;
-    public:
-        /*! \brief default constructor, shape_ and stride_ are left unset */
-        MSHADOW_XINLINE Shape(void) {}
-        /*! \brief copy constructor, copies every dimension and the stride */
-        MSHADOW_XINLINE Shape( const Shape<dimension> &s ){
-            this->stride_ = s.stride_;
-            #pragma unroll
-            for( int d = 0; d < kMaxShape; ++d ){
-                this->shape_[d] = s.shape_[d];
-            }
-        }
-        /*!
-         * \brief access the size of one dimension
-         * \param idx dimension index
-         * \return reference to the size of dimension idx
-         */
-        MSHADOW_XINLINE index_t& operator[](index_t idx) {
-            return this->shape_[ idx ];
-        }
-        /*!
-         * \brief read the size of one dimension
-         * \param idx dimension index
-         * \return the size of dimension idx
-         */
-        MSHADOW_XINLINE const index_t& operator[](index_t idx) const {
-            return this->shape_[ idx ];
-        }
-        /*! \return whether all dimensions of the two shapes are equal, stride_ is not compared */
-        MSHADOW_XINLINE bool operator==(const Shape<kMaxShape> &s) const {
-            bool all_equal = true;
-            #pragma unroll
-            for ( int d = 0; d < kMaxShape; ++d ) {
-                all_equal = all_equal && ( this->shape_[d] == s.shape_[d] );
-            }
-            return all_equal;
-        }
-        /*!
-         * collapse dimensions 1 and above into a single dimension
-         * \return 2D shape with the same dimension 0 and stride
-         */
-        MSHADOW_XINLINE Shape<2> FlatTo2D(void) const {
-            Shape<2> flat;
-            flat.shape_[ 0 ] = this->shape_[ 0 ];
-            flat.shape_[ 1 ] = this->ProdShape( 1, kMaxShape );
-            flat.stride_ = this->stride_;
-            return flat;
-        }
-        /*! \return number of valid elements, the product of all dimensions */
-        MSHADOW_XINLINE size_t Size(void) const{
-            size_t count = 1;
-            #pragma unroll
-            for (int d = 0; d < kMaxShape; ++d) {
-                count *= this->shape_[ d ];
-            }
-            return count;
-        }
-        /*! \return memory size in elements, with stride_ taking the place of dimension 0 */
-        MSHADOW_XINLINE size_t MSize(void) const {
-            size_t count = this->stride_;
-            #pragma unroll
-            for (int d = kMaxShape - 1; d >= 1; --d) {
-                count *= this->shape_[ d ];
-            }
-            return count;
-        }
-        /*!
-         * \return product of the dimensions in [dimstart,dimend)
-         * \param dimstart start dimension
-         * \param dimend   end dimension
-         */
-        MSHADOW_XINLINE index_t ProdShape( int dimstart, int dimend ) const{
-            index_t prod = 1;
-            int d = dimstart;
-            #pragma unroll
-            while ( d < dimend ) {
-                prod *= this->shape_[ d ];
-                ++d;
-            }
-            return prod;
-        }
-        /*!
-         * \brief drop the highest dimension
-         * \return shape made of dimensions 0 .. kSubShape-1, with the same stride
-         */
-        MSHADOW_XINLINE Shape<kSubShape> SubShape(void) const {
-            Shape<kSubShape> sub;
-            // for cuda
-            #pragma unroll
-            for (int d = 0; d < kSubShape; ++d) {
-                sub.shape_[ d ] = this->shape_[ d ];
-            }
-            sub.stride_ = this->stride_;
-            return sub;
-        }
-    public:
-        /*! \brief size of every dimension, shape_[0] is the lowest one */
-        index_t shape_[ kMaxShape ];
-        /*!
-         * \brief stride of the x dimension (dimension 0),
-         *    this is used to deal with pitch allocation in gpu or sse(align x dimension to 64bit) for efficiency
-         */
-        index_t stride_;
-    };
-    // useful construction functions to generate shape
-    /*!
-     * \brief construct a one dimension shape, stride will equal s0
-     * \param s0 size of dimension 0
-     * \return the shape construction
-     */
-    MSHADOW_XINLINE Shape<1> Shape1( index_t s0 ){
-        Shape<1> result;
-        result.shape_[0] = s0;
-        // no padding: the stride is the size of dimension 0
-        result.stride_ = s0;
-        return result;
-    }
-    /*!
-     * \brief construct a two dimension shape, stride will equal s0
-     * \param s1 size of dimension 1
-     * \param s0 size of dimension 0
-     * \return the shape construction
-     */
-    MSHADOW_XINLINE Shape<2> Shape2( index_t s1, index_t s0 ){
-        Shape<2> result;
-        result.shape_[0] = s0;
-        result.shape_[1] = s1;
-        // no padding: the stride is the size of dimension 0
-        result.stride_ = s0;
-        return result;
-    }
-    /*!
-     * \brief construct a three dimension shape, stride will equal s0
-     * \param s2 size of dimension 2
-     * \param s1 size of dimension 1
-     * \param s0 size of dimension 0
-     * \return the shape construction
-     */
-    MSHADOW_XINLINE Shape<3> Shape3( index_t s2, index_t s1, index_t s0 ){
-        Shape<3> result;
-        result.shape_[0] = s0;
-        result.shape_[1] = s1;
-        result.shape_[2] = s2;
-        // no padding: the stride is the size of dimension 0
-        result.stride_ = s0;
-        return result;
-    }
-    /*!
-     * \brief construct a four dimension shape, stride will equal s0
-     * \param s3 size of dimension 3
-     * \param s2 size of dimension 2
-     * \param s1 size of dimension 1
-     * \param s0 size of dimension 0
-     * \return the shape construction
-     */
-    MSHADOW_XINLINE Shape<4> Shape4( index_t s3, index_t s2, index_t s1, index_t s0 ){
-        Shape<4> result;
-        result.shape_[0] = s0;
-        result.shape_[1] = s1;
-        result.shape_[2] = s2;
-        result.shape_[3] = s3;
-        // no padding: the stride is the size of dimension 0
-        result.stride_ = s0;
-        return result;
-    }
-}; // namespace mshadow
-namespace mshadow {
-    /*! \brief device tag: CPU */
-    struct cpu {
-        /*! \brief true, this device is the CPU */
-        static const bool kDevCPU = true;
-        /*! \brief device flag number, identifies this device (bit 0) */
-        static const int kDevMask = 1<<0;
-    };
-    /*! \brief device tag: GPU */
-    struct gpu {
-        /*! \brief false, this device is not the CPU */
-        static const bool kDevCPU = false;
-        /*! \brief device flag number, identifies this device (bit 1) */
-        static const int kDevMask = 1<<1;
-    };
-    // more compact template
-    /*!
-     * \brief general tensor
-     * \tparam Device which device the tensor is on
-     * \tparam dimension dimension of the tensor
-     */
-    template<typename Device, int dimension>
-    struct Tensor: public expr::ContainerExp< Tensor<Device,dimension> >{
-    public:
-        /*! \brief whether this tensor type lives on the cpu, taken from Device */
-        const static bool kDevCPU = Device::kDevCPU;
-        /*! \brief dimension of the tensor returned by operator[] */
-        const static int  kSubdim = dimension - 1;
-    public:
-        /*! \brief pointer to the data */
-        real_t *dptr;
-        /*! \brief shape of the tensor */
-        Shape<dimension> shape;
-    public:
-        /*! \brief default constructor, dptr and shape are left unset */
-        MSHADOW_XINLINE Tensor(void) {}
-        /*! \brief constructor from shape, dptr is left unset */
-        MSHADOW_XINLINE Tensor(const Shape<dimension> &shape): shape(shape) {}
-        /*! \brief constructor from data pointer and shape, the data is not copied */
-        MSHADOW_XINLINE Tensor(real_t *dptr, const Shape<dimension> &shape): dptr(dptr), shape(shape) {}
-        /*!
-         * \brief view the same data as a 2 dimensional tensor, the higher dimensions are collapsed together
-         * \return tensor after flatten
-         */
-        MSHADOW_XINLINE Tensor<Device, 2> FlatTo2D(void) const {
-            return Tensor<Device, 2>( dptr, shape.FlatTo2D() );
-        }
-        /*!
-         * \brief get the idx-th sub tensor of dimension - 1
-         * \param idx index in the highest dimension
-         * \return the result tensor, sharing the data of this one
-         */
-        MSHADOW_XINLINE Tensor<Device, kSubdim> operator[](index_t idx) const {
-            Shape<kSubdim> sub_shape = shape.SubShape();
-            // consecutive sub tensors are MSize() elements apart
-            real_t *sub_ptr = dptr + sub_shape.MSize() * idx;
-            return Tensor<Device, kSubdim>( sub_ptr, sub_shape );
-        }
-        /*!
-         * \brief slice the tensor in highest dimension [begin,end)
-         * \param begin begin position of slice
-         * \param end end position of slice
-         * \return tensor after slice, sharing the data of this one
-         */
-        MSHADOW_XINLINE Tensor<Device, dimension> Slice(index_t begin, index_t end) const {
-            Shape<dimension> sliced = this->shape;
-            sliced[ dimension - 1 ] = end - begin;
-            real_t *first = dptr + sliced.SubShape().MSize() * begin;
-            return Tensor<Device, dimension>( first, sliced );
-        }
-    public:
-        /*!\brief assign a scalar, forwards to __assign of the expression base */
-        inline Tensor<Device,dimension>& operator=( real_t s ){
-            return this->__assign( s );
-        }
-        /*!\brief assign a mapper expression, forwards to __assign of the expression base */
-        template<typename E>
-        inline Tensor<Device,dimension>& operator=( const expr::Exp<E,expr::type::kMapper> &exp ){
-            return this->__assign( exp );
-        }
-        /*!\brief assign a complex expression, forwards to __assign of the expression base */
-        template<typename E>
-        inline Tensor<Device,dimension>& operator=( const expr::Exp<E,expr::type::kComplex> &exp ){
-            return this->__assign( exp );
-        }
-    };
-    /*
-     *  specialization of Tensor for one dimension; it is needed because
-     *  operator[] yields a scalar reference here instead of a sub tensor
-     */
-    template<typename Device>
-    struct Tensor<Device,1>: public expr::ContainerExp< Tensor<Device,1> >{
-    public:
-        /*! \brief pointer to the data */
-        real_t *dptr;
-        /*! \brief shape of the tensor */
-        Shape<1> shape;
-    public:
-        /*! \brief default constructor, dptr and shape are left unset */
-        MSHADOW_XINLINE Tensor(void) {}
-        /*! \brief constructor from shape, dptr is left unset */
-        MSHADOW_XINLINE Tensor(const Shape<1> &shape): shape(shape) {}
-        /*! \brief constructor from data pointer and shape, the data is not copied */
-        MSHADOW_XINLINE Tensor(real_t *dptr, Shape<1> shape) :dptr(dptr), shape(shape) {}
-        /*! \brief view the same data as a 2 dimensional tensor */
-        MSHADOW_XINLINE Tensor<Device, 2> FlatTo2D(void) const {
-            return Tensor<Device, 2>(reinterpret_cast<real_t*> \
-                                     (dptr), shape.FlatTo2D());
-        }
-        /*! \brief elements [begin,end) as a tensor sharing the data of this one */
-        MSHADOW_XINLINE Tensor<Device, 1> Slice(index_t begin, index_t end) const {
-            Shape<1> s;
-            s[0] = s.stride_ = end  - begin;
-            return Tensor<Device, 1>(reinterpret_cast<real_t*> \
-                                     (dptr) + begin, s);
-        }
-        /*! \brief reference to element idx */
-        MSHADOW_XINLINE real_t &operator[](index_t idx) { return dptr[ idx ]; }
-        /*! \brief read-only reference to element idx */
-        MSHADOW_XINLINE const real_t &operator[](index_t idx)const { return dptr[ idx ]; }
-    public:
-        // functions to fit expression template
-        /*! \brief assign a scalar; takes real_t like the generic Tensor template does */
-        inline Tensor<Device,1>& operator=( real_t s ){
-            return this->__assign( s );
-        }
-        /*! \brief assign a mapper expression */
-        template<typename E>
-        inline Tensor<Device,1>& operator=( const expr::Exp<E,expr::type::kMapper> &exp ){
-            return this->__assign( exp );
-        }
-        /*! \brief assign a complex expression */
-        template<typename E>
-        inline Tensor<Device,1>& operator=( const expr::Exp<E,expr::type::kComplex> &exp ){
-            return this->__assign( exp );
-        }
-    };
-}; // namespace mshadow
-// add unroll loops for the shape
-namespace mshadow {
-    // function declarations
-    /*!
-     * \brief initialize tensor engine, used to call initialization functions of dependent libs
-     *        this function should be called before all GPU tensor operations,
-     *        for using tensors in CPU, this call is actually not needed
-     * \param device_id GPU device id to be chosen
-     */
-    inline void InitTensorEngine( int device_id=0 );
-    /*!
-     * \brief Shutdown tensor engine,
-     *        this function should be called after all GPU tensor operations,
-     *        for using tensors in CPU, this call is actually not needed
-     */
-    inline void ShutdownTensorEngine( void );
-    /*!
-     * \brief CPU/GPU: allocate space for the tensor, according to the shape in the obj
-     *        this function is responsible for setting the stride_ in each obj.shape
-     * \tparam dim specify the dim of tensor
-     * \param obj the tensor object, with shape specified
-     * \param pad whether to pad dimension 0, to make last dimension aligned,
-     *            padding may help improve efficiency of matrix multiplications
-     *            if true, will allocate space with stride_ that may not equal shape[0]
-     *            if false, will allocate continuous space
-     */
-    template<int dim>
-    inline void AllocSpace(Tensor<cpu,dim> &obj, bool pad = MSHADOW_ALLOC_PAD);
-    /*! \brief refer to comment of cpu ver \sa AllocSpace */
-    template<int dim>
-    inline void AllocSpace(Tensor<gpu,dim> &obj, bool pad = MSHADOW_ALLOC_PAD);
-    /*!
-     * \brief CPU/GPU: free the space of tensor, will set obj.dptr to NULL
-     * \tparam dim specify the dim of tensor
-     * \param obj the tensor object
-     */
-    template<int dim>
-    inline void FreeSpace(Tensor<cpu,dim> &obj);
-    /*! \brief refer to comment of cpu ver \sa FreeSpace */
-    template<int dim>
-    inline void FreeSpace(Tensor<gpu,dim> &obj);
-    /*!
-     * \brief CPU/GPU: short cut to allocate and initialize a Tensor
-     * \tparam Device device of tensor
-     * \tparam dim dimension of tensor
-     * \param shape: shape of tensor
-     * \param initv: initialization value
-     * \param pad : padding option
-     * \sa AllocSpace
-     */
-    template<typename Device, int dim>
-    inline Tensor<Device,dim> NewTensor(const Shape<dim> &shape, real_t initv, bool pad = MSHADOW_ALLOC_PAD);
-    /*!
-     * \brief copy data from one tensor to another, with same shape
-     * \tparam dim specify the dim of tensor
-     * \param dst target tensor
-     * \param src source tensor
-     */
-    template<int dim>
-    inline void Copy(Tensor<cpu,dim> dst, const Tensor<cpu,dim> &src );
-    /*! \brief refer to comment of cpu ver \sa Copy */
-    template<int dim>
-    inline void Copy(Tensor<cpu,dim> dst, const Tensor<gpu,dim> &src );
-    /*! \brief refer to comment of cpu ver \sa Copy */
-    template<int dim>
-    inline void Copy(Tensor<gpu,dim> dst, const Tensor<cpu,dim> &src );
-    /*! \brief refer to comment of cpu ver \sa Copy */
-    template<int dim>
-    inline void Copy(Tensor<gpu,dim> dst, const Tensor<gpu,dim> &src );
-    /*!
-     * \brief CPU/GPU: normalize softmax: dst[i][j] = exp( energy[i][j] ) /( sum_j exp( energy[i][j] ) )
-     * \param dst destination
-     * \param energy input energy
-     */
-    inline void Softmax( Tensor<cpu,2> dst, const Tensor<cpu,2> &energy );
-    /*! \brief refer to comment of cpu ver \sa Softmax */
-    inline void Softmax( Tensor<gpu,2> dst, const Tensor<gpu,2> &energy );
-}; // namespace mshadow
-namespace mshadow{
-    // function declarations to support expression, no need to understand them
-    // these functions do not need to be directly used
-    /*!
-     * \brief CPU/GPU: map an expression to a tensor, this function calls MapPlan
-     * \tparam Saver specify storage method
-     * \tparam dim dim of the tensor, during usage, there is no need to specify this parameter
-     * \tparam E specifies the expression type, no need to specify this parameter during usage
-     * \tparam etype expression type
-     * \param dst destination
-     * \param exp expression
-     * \sa namespace mshadow::sv, mshadow::op, mshadow::expr
-     */
-    template<typename Saver, int dim, typename E, int etype>
-    inline void MapExp(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp );
-    /*! \brief refer to comment of cpu ver \sa MapExp */
-    template<typename Saver, int dim, typename E, int etype>
-    inline void MapExp(Tensor<gpu,dim> dst, const expr::Exp<E,etype> &exp );
-    /*!
-     * \brief CPU/GPU: map an expression, do reduction to 1D Tensor in lowest dimension (dimension 0)
-     * \tparam Saver specify storage method
-     * \tparam Reducer specify a reducer method
-     * \tparam E specifies the expression type, no need to specify this parameter during usage
-     * \tparam etype expression type
-     * \param dst destination
-     * \param exp expression
-     * \param scale scale the result before save
-     * \sa namespace mshadow::sv, mshadow::op, mshadow::red, mshadow::expr
-     */
-    template<typename Saver, typename Reducer, typename E, int etype>
-    inline void MapReduceKeepLowest( Tensor<cpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f );
-    /*! \brief refer to comment of cpu ver \sa MapReduceKeepLowest */
-    template<typename Saver, typename Reducer, typename E, int etype>
-    inline void MapReduceKeepLowest( Tensor<gpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f );
-    /*!
-     * \brief CPU/GPU: map an expression, do reduction to 1D Tensor in the dimension given by dimkeep
-     * \tparam Saver specify storage method
-     * \tparam Reducer specify a reducer method
-     * \tparam E specifies the expression type, no need to specify this parameter during usage
-     * \tparam dimkeep the target dimension to be kept, should be larger than 0, for 0, use MapReduceKeepLowest
-     * \tparam etype expression type
-     * \param dst destination
-     * \param exp expression
-     * \param scale scale the result before save
-     * \sa namespace mshadow::sv, mshadow::op, mshadow::red, mshadow::expr
-     */
-    template<typename Saver, typename Reducer, int dimkeep, typename E, int etype>
-    inline void MapReduceKeepHighDim( Tensor<cpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f );
-    /*! \brief refer to comment of cpu ver \sa MapReduceKeepHighDim */
-    template<typename Saver, typename Reducer, int dimkeep, typename E, int etype>
-    inline void MapReduceKeepHighDim( Tensor<gpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale = 1.0f );
-};// namespace mshadow
-// execution implementation of expression evaluations
-#include "tensor_expr_engine-inl.hpp"
-// cpu implementation of functions
-#include "tensor_cpu-inl.hpp"
-// gpu implementation of functions
-#include "tensor_gpu-inl.hpp"
-// extension of expressions
-#include "tensor_expr_ext.h"
-// io 
-#include "tensor_io.h"
-// container
-#include "tensor_container.h"
-// random number generator
-#include "tensor_random.h"
-#endif // TENSOR_H
diff --git a/include/mshadow/tensor_base.h b/include/mshadow/tensor_base.h
deleted file mode 100644
index b251cba..0000000
--- a/include/mshadow/tensor_base.h
+++ /dev/null
@@ -1,298 +0,0 @@
- * \file tensor_base.h
- * \brief definitions of base types, macros functions
- *
- * \author Bing Xu, Tianqi Chen
- */
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cfloat>
-#include <climits>
-#include <algorithm>
-// macro definitions
-/*!\brief if this macro is defined to be 1, mshadow should compile without any other libs */
-    #define MSHADOW_STAND_ALONE 0
-/*! \brief whether do padding during allocation */
-    #define MSHADOW_ALLOC_PAD true
- * \brief x dimension of data must be bigger than pad_size * ratio to be allocated padded memory, otherwise use tight allocation 
- *        for example, if pad_ratio=2, GPU memory alignment size is 32, then we will only allocate padded memory if x dimension > 64
- *        set it to 0 then we will always allocate padded memory
- */
-    #define MSHADOW_MIN_PAD_RATIO 2
-   #define MSHADOW_USE_CBLAS 0
-   #define MSHADOW_USE_MKL   0
-   #define MSHADOW_USE_CUDA  0
-/*! \brief use CBLAS for BLAS */
-   #define MSHADOW_USE_CBLAS 0
-/*! \brief use MKL for BLAS */
-   #define MSHADOW_USE_MKL   1
-/*! \brief use CUDA support, must ensure that the cuda include path is correct, or directly compile using nvcc */
-  #define MSHADOW_USE_CUDA   1
-/*! \brief use single precision float */
-/*! \brief whether use SSE */
-  #define MSHADOW_USE_SSE 1
-/*! \brief whether use NVML to get dynamic info */
-  #define MSHADOW_USE_NVML 0
-// SSE is conflict with cudacc
-#ifdef __CUDACC__
-  #define MSHADOW_USE_SSE 0
-extern "C"{
-    #include <cblas.h>
-  #include <mkl.h>
-  #include <mkl_cblas.h>
-  #include <mkl_vsl.h>
-  #include <mkl_vsl_functions.h>
-  #include <cublas.h>
-  #include <curand.h>
-  #include <nvml.h>
-// --------------------------------
-// MSHADOW_XINLINE is used for inlining template code for both CUDA and CPU code.
-  #error "MSHADOW_XINLINE must not be defined"
-#ifdef __CUDACC__
-  #define MSHADOW_XINLINE inline __attribute__((always_inline)) __device__ __host__
-  #define MSHADOW_XINLINE inline __attribute__((always_inline))
-/*! \brief cpu force inline */
-#define MSHADOW_CINLINE inline __attribute__((always_inline))
-#if defined(__GXX_EXPERIMENTAL_CXX0X) || defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L
-  #define MSHADOW_CONSTEXPR constexpr
-  #define MSHADOW_CONSTEXPR const
-/*! \brief namespace for mshadow */
-namespace mshadow {
-    /*! \brief buffer size for each random number generator */
-    const unsigned kRandBufferSize = 1000000;
-    /*! \brief pi  */
-    const float kPi = 3.1415926f;
-    /*! \brief type that will be used for content */
-    typedef float real_t;
-    typedef double real_t;
-    /*! \brief type that will be used for index */
-    typedef unsigned index_t;
-}; // namespace mshadow
-namespace mshadow {
-    /*! \brief namespace for operators */
-    namespace op {
-        // element-wise binary operators, each one exposes a static Map(lhs, rhs)
-        /*! \brief multiplication operator */
-        struct mul{
-            /*! \return lhs * rhs */
-            MSHADOW_XINLINE static real_t Map(real_t lhs, real_t rhs) {
-                return lhs * rhs;
-            }
-        };
-        /*! \brief addition operator */
-        struct plus {
-            /*! \return lhs + rhs */
-            MSHADOW_XINLINE static real_t Map(real_t lhs, real_t rhs) {
-                return lhs + rhs;
-            }
-        };
-        /*! \brief subtraction operator */
-        struct minus {
-            /*! \return lhs - rhs */
-            MSHADOW_XINLINE static real_t Map(real_t lhs, real_t rhs) {
-                return lhs - rhs;
-            }
-        };
-        /*! \brief division operator */
-        struct div {
-            /*! \return lhs / rhs */
-            MSHADOW_XINLINE static real_t Map(real_t lhs, real_t rhs) {
-                return lhs / rhs;
-            }
-        };
-        /*! \brief operator that selects the right-hand operand */
-        struct right {
-            /*! \return rhs, lhs is ignored */
-            MSHADOW_XINLINE static real_t Map(real_t lhs, real_t rhs) {
-                return rhs;
-            }
-        };
-    }; // namespace op
-    /*! \brief namespace for savers */
-    namespace sv {
-        /*! \brief assignment saver: dst = src */
-        struct saveto {
-            /*! \brief corresponding binary operator type */
-            typedef op::right OPType;
-            /*! \brief helper constant to use BLAS, alpha */
-            MSHADOW_CONSTEXPR static real_t kAlphaBLAS = 1.0f;
-            /*! \brief helper constant to use BLAS, beta */
-            MSHADOW_CONSTEXPR static real_t kBetaBLAS  = 0.0f;
-            /*! \brief overwrite dst with src */
-            MSHADOW_XINLINE static void Save(real_t& dst, real_t src) {
-                dst = src;
-            }
-        };
-        /*! \brief accumulating saver: dst += src */
-        struct plusto {
-            /*! \brief corresponding binary operator type */
-            typedef op::plus OPType;
-            /*! \brief helper constant to use BLAS, alpha */
-            MSHADOW_CONSTEXPR static real_t kAlphaBLAS = 1.0f;
-            /*! \brief helper constant to use BLAS, beta */
-            MSHADOW_CONSTEXPR static real_t kBetaBLAS  = 1.0f;
-            /*! \brief add src onto dst */
-            MSHADOW_XINLINE static void Save(real_t& dst, real_t src) {
-                dst += src;
-            }
-        };
-        /*! \brief subtracting saver: dst -= src */
-        struct minusto {
-            /*! \brief corresponding binary operator type */
-            typedef op::minus OPType;
-            /*! \brief helper constant to use BLAS, alpha */
-            MSHADOW_CONSTEXPR static real_t kAlphaBLAS = -1.0f;
-            /*! \brief helper constant to use BLAS, beta */
-            MSHADOW_CONSTEXPR static real_t kBetaBLAS  = 1.0f;
-            /*! \brief subtract src from dst */
-            MSHADOW_XINLINE static void Save(real_t& dst, real_t src) {
-                dst -= src;
-            }
-        };
-        /*! \brief multiplying saver: dst *= src, defines no BLAS constants */
-        struct multo {
-            /*! \brief corresponding binary operator type */
-            typedef op::mul OPType;
-            /*! \brief multiply dst by src */
-            MSHADOW_XINLINE static void Save(real_t& dst, real_t src) {
-                dst *= src;
-            }
-        };
-        /*! \brief dividing saver: dst /= src, defines no BLAS constants */
-        struct divto {
-            /*! \brief corresponding binary operator type */
-            typedef op::div OPType;
-            /*! \brief divide dst by src */
-            MSHADOW_XINLINE static void Save(real_t& dst, real_t src) {
-                dst /= src;
-            }
-        };
-    }; // namespace sv
-    namespace op {
-        // example of a unary operator / function
-        // users can define their own operators in the same style as the binary operators
-        // usage: F<op::identity>( src )
-        /*! \brief identity function, maps a real number to itself */
-        struct identity{
-            /*! \return the input value unchanged */
-            MSHADOW_XINLINE static real_t Map(real_t value) {
-                return value;
-            }
-        };
-    }; // namespace op
-    /*! \brief namespace for potential reducer operations */
-    namespace red {
-        /*! \brief sum reducer */
-        struct sum {
-            /*! \brief do reduction into dst: dst += src */
-            MSHADOW_XINLINE static void Reduce( volatile real_t& dst,  volatile real_t src ) {
-                dst += src;
-            }
-            /*! \brief calculate gradient of redres with respect to redsrc,  redres: reduced result, redsrc: one of reduction element */
-            MSHADOW_XINLINE static real_t PartialGrad( real_t redres, real_t redsrc ) {
-                // the derivative of a sum with respect to any of its terms is 1
-                return 1.0f;
-            }
-            /*! \brief an initial value of reducer */
-            MSHADOW_CONSTEXPR static real_t kInitV = 0.0f;
-        };
-        /*! \brief maximum reducer */
-        struct maximum {
-            /*! \brief do reduction into dst: dst = max( dst, src ) */
-            MSHADOW_XINLINE static void Reduce( volatile real_t& dst,  volatile real_t src ) {
-                using namespace std;
-                dst = max( dst, src );
-            }
-            /*! \brief calculate gradient of redres with respect to redsrc,  redres: reduced result, redsrc: one of reduction element */
-            MSHADOW_XINLINE static real_t PartialGrad( real_t redres, real_t redsrc ) {
-                // gradient is 1 for elements equal to the maximum, 0 otherwise
-                return redres == redsrc ? 1.0f: 0.0f;
-            }
-            /*!
-             * \brief an initial value of reducer: the lowest finite value of real_t
-             *        a single definition selected by the width of real_t replaces the
-             *        two conflicting definitions (-FLT_MAX and -DBL_MAX) of the same member
-             */
-            MSHADOW_CONSTEXPR static real_t kInitV = sizeof(real_t) == sizeof(float) ? -FLT_MAX : -DBL_MAX;
-        };
-    }; // namespace red
-    /*! \brief namespace for helper utils of the project */
-    namespace utils{
-        /*!
-         * \brief print the error message to stderr, then terminate the process with exit code -1
-         * \param msg message to print
-         */
-        inline void Error( const char *msg ){
-            std::fprintf( stderr, "Error:%s\n",msg );
-            // std::exit is declared in <cstdlib>
-            std::exit( -1 );
-        }
-        /*!
-         * \brief assert an expression is true, calls Error( "AssertError" ) otherwise
-         * \param exp condition that is expected to hold
-         */
-        inline void Assert( bool exp ){
-            if( !exp ) Error( "AssertError" );
-        }
-        /*!
-         * \brief assert an expression is true, calls Error( msg ) otherwise
-         * \param exp condition that is expected to hold
-         * \param msg message reported when the condition does not hold
-         */
-        inline void Assert( bool exp, const char *msg ){
-            if( !exp ) Error( msg );
-        }
-        /*!
-         * \brief print a warning message to stderr, execution continues
-         * \param msg message to print
-         */
-        inline void Warning( const char *msg ){
-            std::fprintf( stderr, "warning:%s\n",msg );
-        }
-    }; // namespace utils
-}; // namespace mshadow
-#endif // TENSOR_BASE_H
diff --git a/include/mshadow/tensor_container.h b/include/mshadow/tensor_container.h
deleted file mode 100644
index f0699e7..0000000
--- a/include/mshadow/tensor_container.h
+++ /dev/null
@@ -1,152 +0,0 @@
- * \file tensor_container.h
- * \brief tensor container that does memory allocation and resize like STL
- * \author Tianqi Chen
- */
-#include "tensor.h"
-#include "tensor_io.h"
-namespace mshadow{
-    /*!
-     * \brief tensor container that does memory allocation and resize like STL,
-     *        use it to save the lines of FreeSpace in class.
-     *        Do not abuse it, efficiency can come from pre-allocation and no re-allocation
-     *
-     * \tparam Device which device the tensor is on
-     * \tparam dimension dimension of the tensor
-     */
-    template<typename Device, int dimension>
-    class TensorContainer: public Tensor<Device,dimension>{
-    public:
-        /*! 
-         * \brief constructor, creates an empty container that owns no memory
-         * \param pad whether use padding alignment in space allocation
-         */
-        TensorContainer( bool pad = MSHADOW_ALLOC_PAD ){
-            this->pad_ = pad;
-            this->InitEmpty();
-        }
-        /*! 
-         * \brief constructor, allocates space for the given shape
-         * \param shape initial shape
-         */
-        TensorContainer( const Shape<dimension> &shape ){
-            this->pad_ = MSHADOW_ALLOC_PAD;
-            data_.dptr = NULL;
-            this->AllocByShape( shape );
-        }
-        /*! 
-         * \brief constructor, allocates space for the given shape and fills it
-         * \param shape initial shape
-         * \param initv initial value
-         */
-        TensorContainer( const Shape<dimension> &shape, real_t initv ){
-            this->pad_ = MSHADOW_ALLOC_PAD;
-            data_.dptr = NULL;
-            this->AllocByShape( shape );
-            (*this) = initv;
-        }
-        /*! 
-         * \brief copy constructor, makes a deep copy of src
-         *        without it the compiler-generated copy shares dptr between two containers
-         *        and both destructors free the same memory
-         * \param src container to copy from
-         */
-        TensorContainer( const TensorContainer<Device,dimension> &src )
-            : Tensor<Device,dimension>() {
-            this->pad_ = src.pad_;
-            this->InitEmpty();
-            this->CopyFrom( src );
-        }
-        /*! \brief destructor, releases the owned space */
-        ~TensorContainer( void ){
-            this->FreeSpace();
-        }
-        /*! 
-         * \brief resize the container to given shape, content is NOT preserved
-         *        space is reallocated only when the target does not fit in the current space
-         * \param shape target shape
-         */
-        inline void Resize( const Shape<dimension> &shape ){
-            Shape<2> s2 = shape.FlatTo2D();            
-            if( s2.shape_[0] > data_.shape.stride_ || s2.shape_[1] > data_.shape[1] ){
-                this->AllocByShape( shape );
-            }else{
-                // reuse the existing space, only the view (shape and stride) changes
-                this->shape = shape;
-                if( this->pad_ ){
-                    this->shape.stride_ = data_.shape.stride_;
-                }else{
-                    this->shape.stride_ = this->shape[ 0 ];
-                }
-            }
-        }
-        /*! 
-         * \brief resize the container to given shape, and initialize, content is NOT preserved
-         * \param shape target shape
-         * \param initv initialization value
-         */
-        inline void Resize( const Shape<dimension> &shape, real_t initv ){
-            this->Resize( shape );
-            (*this) = initv;
-        }
-        /*! \brief set whether padding is allowed in tensor */
-        inline void set_pad( bool pad ){
-            this->pad_ = pad;
-        }
-        /*! 
-         * \brief save by binary format
-         * \param fo output binary stream
-         * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream.
-         */
-        template<typename TStream>
-        inline void SaveBinary( TStream &fo ) const{
-            mshadow::SaveBinary( fo, *this );
-        }
-        /*! 
-         * \brief load by binary format, a temp Tensor<cpu,dim> storage will be allocated
-         * \param fi input binary stream
-         * \tparam TStream type of stream, need to support Read, Write, one example is utils::IStream.
-         */
-        template<typename TStream>
-        inline void LoadBinary( TStream &fi ) {
-            Tensor<cpu,dimension> tmp;
-            mshadow::LoadBinary( fi, tmp, false );
-            this->Resize( tmp.shape );
-            Copy( *this, tmp );
-            // tmp is a plain Tensor, its space must be released by hand
-            mshadow::FreeSpace( tmp );
-        }
-    public:
-        // functions to fit exp template
-        /*! \brief fill the container with a scalar */
-        inline Tensor<Device,dimension>& operator=( real_t s ){
-            return this->__assign( s );
-        }
-        /*! \brief assign the result of an element-wise (kMapper) expression */
-        template<typename E>
-        inline Tensor<Device,dimension>& operator=( const expr::Exp<E,expr::type::kMapper> &exp ){
-            return this->__assign( exp );
-        }
-        /*! \brief assign the result of a complex (kComplex) expression */
-        template<typename E>
-        inline Tensor<Device,dimension>& operator=( const expr::Exp<E,expr::type::kComplex> &exp ){
-            return this->__assign( exp );
-        }
-        /*! 
-         * \brief copy assignment, makes a deep copy of src (padding option included)
-         *        replaces the compiler-generated member-wise copy, which leaks the current
-         *        space and leads to a double free of the space of src
-         * \param src container to copy from
-         */
-        inline TensorContainer<Device,dimension>& operator=( const TensorContainer<Device,dimension> &src ){
-            if( this != &src ){
-                this->pad_ = src.pad_;
-                this->CopyFrom( src );
-            }
-            return *this;
-        }
-    private:
-        /*! \brief whether we do padding in the space */
-        bool pad_;
-        /*! \brief the shape of data_ is actually current data space */
-        Tensor<Device, 2> data_;
-    private:
-        /*! \brief put the container into the empty state: no memory owned, zero capacity */
-        inline void InitEmpty( void ){
-            this->dptr = data_.dptr = NULL;
-            this->shape[0] = 0;
-            this->shape.stride_ = 0;
-            this->data_.shape.stride_ = 0;
-            this->data_.shape[1] = 0;
-        }
-        /*! \brief release the owned space if there is any, and reset both pointers to NULL */
-        inline void FreeSpace (void){
-            if( data_.dptr != NULL ){
-                mshadow::FreeSpace( data_ );
-                data_.dptr = this->dptr = NULL;
-            }
-        }
-        /*! \brief drop the current space and allocate a new one that fits shape */
-        inline void AllocByShape (const Shape<dimension>& shape){
-            if( data_.dptr != NULL ){
-                this->FreeSpace();
-            }
-            data_.shape = shape.FlatTo2D();
-            mshadow::AllocSpace( data_, pad_ );
-            this->dptr  = data_.dptr;
-            this->shape = shape;
-            if( this->pad_ ){
-                this->shape.stride_ = data_.shape.stride_;
-            }else{
-                this->shape.stride_ = shape[0];
-            }
-        }
-        /*! \brief make this container hold a deep copy of the content of src */
-        inline void CopyFrom( const TensorContainer<Device,dimension> &src ){
-            if( src.data_.dptr == NULL ){
-                // src owns nothing: release our space and reset the capacity record,
-                // so that a later Resize allocates instead of reusing a freed buffer
-                this->FreeSpace();
-                this->InitEmpty();
-                this->shape = src.shape;
-                return;
-            }
-            this->Resize( src.shape );
-            // same Resize-then-Copy sequence as LoadBinary
-            mshadow::Copy( *this, static_cast< const Tensor<Device,dimension>& >( src ) );
-        }
-    };
-};// namespace mshadow
diff --git a/include/mshadow/tensor_cpu-inl.hpp b/include/mshadow/tensor_cpu-inl.hpp
deleted file mode 100644
index 0fa3cfa..0000000
--- a/include/mshadow/tensor_cpu-inl.hpp
+++ /dev/null
@@ -1,168 +0,0 @@
- * \file tensor_cpu-inl.hpp
- * \brief implementation of CPU host code
- * \author Bing Xu, Tianqi Chen
- */
-#include <cstring>
-#include "tensor_base.h"
-#include "tensor_sse-inl.hpp"
-namespace mshadow {
-    /*!
-     * \brief CPU: allocate aligned space for obj and set obj.shape.stride_
-     * \param obj the tensor object, with shape specified
-     * \param pad if true, rows are allocated with a pitch and stride_ is derived from it,
-     *            if false, stride_ equals shape[0] and one continuous chunk is allocated
-     */
-    template<int dim>
-    inline void AllocSpace(Tensor<cpu,dim> &obj, bool pad ){
-        size_t pitch;
-        if( !pad ){
-            // continuous layout: a single line holding all the elements
-            obj.shape.stride_ = obj.shape[0];
-            obj.dptr = (real_t*)sse2::AlignedMallocPitch
-                ( pitch, obj.shape.Size() * sizeof(real_t), 1 );
-            return;
-        }
-        // padded layout: one line per row of the 2D view, pitch is returned in bytes
-        obj.dptr = (real_t*)sse2::AlignedMallocPitch
-            ( pitch, obj.shape[0] * sizeof(real_t), obj.FlatTo2D().shape[1] );
-        obj.shape.stride_ = static_cast<index_t>( pitch / sizeof(real_t) );
-    }
-    /*!
-     * \brief allocate a tensor of the given shape and fill every element with initv
-     * \param shape shape of the tensor
-     * \param initv initialization value
-     * \param pad padding option passed on to AllocSpace
-     * \return the tensor, its space has to be released with FreeSpace
-     */
-    template<typename Device, int dim>
-    inline Tensor<Device,dim> NewTensor(const Shape<dim> &shape, real_t initv, bool pad ){
-        Tensor<Device, dim> result( shape );
-        AllocSpace( result, pad );
-        // fill through the expression engine so the same code serves every Device
-        MapExp<sv::saveto>( result, expr::ScalarExp( initv ) );
-        return result;
-    }
-    /*!
-     * \brief CPU: free the space of the tensor and set obj.dptr to NULL
-     * \param obj the tensor object
-     */
-    template<int dim>
-    inline void FreeSpace(Tensor<cpu,dim> &obj){
-        real_t *released = obj.dptr;
-        obj.dptr = NULL;
-        sse2::AlignedFree( released );
-    }
-    /*!
-     * \brief CPU to CPU copy between two tensors of the same shape, done row by row
-     * \param _dst target tensor
-     * \param _src source tensor
-     */
-    template<int dim>
-    inline void Copy(Tensor<cpu,dim> _dst, const Tensor<cpu,dim> &_src ){
-        utils::Assert( _dst.shape == _src.shape, "Copy:shape mismatch" );
-        Tensor<cpu,2> dst = _dst.FlatTo2D();
-        Tensor<cpu,2> src = _src.FlatTo2D();
-        // number of bytes of one row of the 2D view
-        const size_t row_bytes = sizeof(real_t) * dst.shape[0];
-        for (index_t row = 0; row < dst.shape[1]; ++row ) {
-            memcpy( dst[row].dptr, src[row].dptr, row_bytes );
-        }
-    }
-    /*!
-     * \brief evaluate plan at every position of the 2D view of _dst and store it with Saver
-     * \param _dst destination tensor
-     * \param plan execution plan, evaluated as plan.Eval( row, col )
-     */
-    template<typename Saver, typename E, int dim>
-    inline void MapPlan(Tensor<cpu,dim> _dst, const expr::Plan<E> &plan){
-        Tensor<cpu,2> out = _dst.FlatTo2D();
-        for (index_t row = 0; row < out.shape[1]; ++row ) {
-            for (index_t col = 0; col < out.shape[0]; ++col ) {
-                // the calls are expected to be inlined by the compiler
-                Saver::Save(out[row][col], plan.Eval( row, col ) );
-            }
-        }
-    }
-    // dispatch helper for the SSE optimization
-    /*! \brief engine that maps an expression on CPU, pass_check tells whether the SSE path may be taken */
-    template<bool pass_check,typename Saver, int dim, typename E, int etype>
-    struct MapExpCPUEngine;
-    /*! \brief engine without SSE: always evaluates through the plain plan */
-    template<typename SV, int dim, typename E, int etype>
-    struct MapExpCPUEngine<false,SV,dim,E,etype>{
-        inline static void Map(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp ){
-            MapPlan<SV>( dst, MakePlan( exp.self() ) );
-        }
-    };
-    /*! \brief engine with SSE: takes the SSE plan when both expression and destination pass the alignment check */
-    template<typename SV, int dim, typename E, int etype>
-    struct MapExpCPUEngine<true,SV,dim,E,etype>{
-        inline static void Map(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp ){
-            using namespace expr;
-            const bool aligned = SSEAlignCheck<dim,E>::Check( exp.self() )
-                              && SSEAlignCheck< dim,Tensor<cpu,dim> >::Check(dst);
-            if( !aligned ){
-                // fall back to the plain plan
-                MapPlan<SV>( dst, MakePlan( exp.self() ) );
-                return;
-            }
-            MapSSEPlan<SV>( dst, MakeSSEPlan( exp.self() ) );
-        }
-    };
-    #endif
-    /*!
-     * \brief CPU: check type and shape of the expression, then evaluate it into dst
-     * \param dst destination
-     * \param exp expression
-     */
-    template<typename Saver, int dim, typename E, int etype>
-    inline void MapExp(Tensor<cpu,dim> dst, const expr::Exp<E,etype> &exp ){
-        using namespace expr;
-        // compile time check of the tensor types inside the expression
-        TypeCheckPass< TypeCheck<cpu,dim,E>::kMapPass >::Error_All_Tensor_in_Exp_Must_Have_Same_Type();
-        Shape<dim> exp_shape = ShapeCheck<dim,E>::Check( exp.self() );
-        // a shape whose dimension 0 is 0 is accepted without comparison to dst
-        const bool shape_ok = exp_shape[0] == 0 || exp_shape == dst.shape;
-        utils::Assert( shape_ok, "Assignment: Shape of Tensors in expression is not consistent with target" );
-        #if MSHADOW_USE_SSE
-        MapExpCPUEngine< SSECheck<E>::kPass,Saver,dim,E,etype >::Map( dst, exp );
-        #else
-        MapExpCPUEngine< false,Saver,dim,E,etype >::Map( dst, exp );
-        #endif
-    }
-    /*!
-     * \brief CPU: reduce the 2D view of the expression over its rows, keeping dimension 0
-     * \param dst destination, one element per column of the expression
-     * \param exp expression
-     * \param scale the reduced value is multiplied by scale before it is saved
-     */
-    template<typename Saver, typename Reducer, typename E, int etype>
-    inline void MapReduceKeepLowest( Tensor<cpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale ){
-        using namespace expr;
-        TypeCheckPass< TypeCheck<cpu,1,E>::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
-        Shape<2> flat = ShapeCheck< ExpInfo<E>::kDim, E >::Check( exp.self() ).FlatTo2D();
-        utils::Assert( flat[0] == dst.shape[0], "reduction dimension do not match" );
-        utils::Assert( flat[1] != 0, "can not reduce over empty tensor" );
-        // execution
-        expr::Plan<E> plan = MakePlan( exp.self() );
-        for( index_t col = 0; col < flat[0]; ++col ){
-            // start from row 0, then fold in the remaining rows
-            real_t acc = plan.Eval( 0, col );
-            for( index_t row = 1; row < flat[1]; ++row ){
-                Reducer::Reduce( acc, plan.Eval( row, col ) );
-            }
-            Saver::Save( dst[col], acc*scale );
-        }
-    }
-    /*!
-     * \brief CPU: reduce the expression over every dimension except dimkeep
-     * \param dst destination, its length must equal the size of dimension dimkeep
-     * \param exp expression
-     * \param scale the reduced value is multiplied by scale before it is saved
-     */
-    template<typename Saver, typename Reducer, int dimkeep, typename E, int etype>
-    inline void MapReduceKeepHighDim( Tensor<cpu,1> dst, const expr::Exp<E,etype> &exp, real_t scale ){
-        using namespace expr;
-        TypeCheckPass< TypeCheck<cpu,dimkeep,E>::kRedPass >::Error_TypeCheck_Not_Pass_For_Reduce_Exp();
-        typedef Shape< ExpInfo<E>::kDim > EShape;
-        EShape eshape = ShapeCheck< ExpInfo<E>::kDim, E >::Check( exp.self() );
-        utils::Assert( eshape[dimkeep] == dst.shape[0], "reduction dimension do not match" );
-        // use equivalent 4D form; NOTE(review): the loops below treat view4[2] as the kept
-        // dimension, which presumably means Shape4 takes the highest dimension first - confirm
-        Shape<4> view4 = Shape4( eshape.ProdShape(dimkeep+1,EShape::kMaxShape), eshape[dimkeep], 
-                                 eshape.ProdShape(1,dimkeep), eshape[0] );
-        // execution
-        expr::Plan<E> plan = MakePlan( exp.self() );
-        for( index_t keep = 0; keep < view4[2]; ++keep ){
-            real_t total = Reducer::kInitV;
-            for( index_t outer = 0; outer < view4[3]; ++outer ){
-                // reduce one (outer, keep) slab first, then fold it into the total
-                real_t partial = Reducer::kInitV;
-                for( index_t row = 0; row < view4[1]; ++row ){
-                    for( index_t col = 0; col < view4[0]; ++col ){
-                        Reducer::Reduce( partial, plan.Eval( (outer*view4[2] + keep) * view4[1] + row, col ) );
-                    }
-                }
-                Reducer::Reduce( total, partial );
-            }
-            Saver::Save( dst[keep], total*scale );
-        }
-    }
-    /*!
-     * \brief CPU: softmax of a vector: dst[i] = exp( energy[i] ) / ( sum_j exp( energy[j] ) )
-     *        the maximum energy is subtracted before exponentiation
-     * \param dst destination, dst.shape[0] elements are written
-     * \param energy input energy, read at the same indices as dst
-     */
-    inline void Softmax( Tensor<cpu,1> dst, const Tensor<cpu,1>& energy ){
-        const index_t len = dst.shape[0];
-        // nothing to do for an empty tensor, this also avoids reading energy[0]
-        if( len == 0 ) return;
-        real_t mmax = energy[0];
-        // the counter must be an index_t: a floating point counter is not a valid
-        // index type and stops being exact for large lengths
-        for( index_t x = 1; x < len; ++x ){
-            if( mmax < energy[x] ) mmax = energy[x];
-        }
-        real_t sum = 0.0f;
-        for( index_t x = 0; x < len; ++x ){
-            dst[x] = std::exp( energy[x] - mmax );
-            sum += dst[x];
-        }
-        for( index_t x = 0; x < len; ++x ){
-            dst[x] /= sum;
-        }
-    }
-    /*!
-     * \brief CPU: apply the 1D Softmax to every row of energy
-     * \param dst destination, must have the same shape as energy
-     * \param energy input energy
-     */
-    inline void Softmax( Tensor<cpu,2> dst, const Tensor<cpu,2>& energy ){
-        utils::Assert( dst.shape == energy.shape, "Softmax: shape mismatch" );
-        const index_t num_rows = dst.shape[1];
-        for( index_t row = 0; row < num_rows; ++row ){
-            Softmax( dst[row], energy[row] );
-        }
-    }
-}; // namespace mshadow
diff --git a/include/mshadow/tensor_expr.h b/include/mshadow/tensor_expr.h
deleted file mode 100644
index ac8fde7..0000000
--- a/include/mshadow/tensor_expr.h
+++ /dev/null
@@ -1,367 +0,0 @@
- * \file tensor_expr.h
- * \brief definitions of abstract expressions and expressions template
- * \author Tianqi Chen, Bing Xu
- */
-#include "tensor_base.h"
-namespace mshadow{
-    /*!
-     * \brief namespace for abstract expressions and expression templates,
-     *        has no dependency on tensor.h.
-     *        These data structures take no part in the computation,
-     *        they are only used to define operations and to represent expressions in a symbolic way
-     */
-    namespace expr{
-        /*!
-         * \brief type of expressions
-         *        the values are combined with bitwise OR when map expressions are built
-         *        (see the (ta|tb|type::kMapper) return types of MakeExp), so an operand
-         *        of type kComplex keeps the combined expression kComplex
-         */
-        namespace type{
-            /*! \brief this expression directly corresponds to a data class */
-            const int kContainer = 0;
-            /*! \brief this only contains element-wise vector operations */
-            const int kMapper    = 1;
-            /*! \brief other cases, e.g. dot product */
-            const int kComplex   = 3;
-        };
-        /*!
-         * \brief expression engine that actually interprets these expressions
-         *        this is a function template that needs to be implemented for specific expressions
-         * \tparam Saver saver type chosen by the caller, ContainerExp passes e.g. sv::saveto or sv::plusto
-         * \tparam Container type of the destination handed to Eval
-         */
-        template<typename Saver,typename Container>
-        struct ExpEngine{
-            /*!
-             * \brief entry point used by ContainerExp; only declared here, no definition in this header section
-             * \param dst destination container
-             * \param exp expression to be interpreted
-             * \tparam EType type of the expression
-             */
-            template<typename EType>
-            inline static void Eval( Container& dst, const EType &exp );
-        };
-        // forward declarations; the class-key of each one matches its definition below
-        // (ScalarExp is defined as a struct, so it is declared as a struct here as well)
-        template<typename Container>
-        class ContainerExp;
-        struct ScalarExp;
-        /*!
-         * \brief root of all expression types; the inheriting class passes itself as SubType,
-         *        which lets the base hand back the concrete object through a static_cast
-         * \tparam SubType the inheriting class must put its own type into this parameter
-         * \tparam exp_type expression type, see namespace type
-         */
-        template<typename SubType, int exp_type>
-        struct Exp{
-            /*! \return const reference to this object seen as its SubType */
-            inline const SubType& self( void ) const{
-                const SubType &derived = static_cast<const SubType&>( *this );
-                return derived;
-            }
-            /*! \return mutable reference to this object seen as its SubType */
-            inline SubType& refself( void ){
-                SubType &derived = static_cast<SubType&>( *this );
-                return derived;
-            }
-        };
-        /*! \brief expression that wraps a single scalar value */
-        struct ScalarExp: public Exp<ScalarExp, type::kMapper>{
-            /*! \brief the wrapped scalar */
-            real_t scalar_;
-            /*!
-             * \brief constructor, not explicit, so a real_t converts to ScalarExp implicitly
-             * \param scalar value to wrap
-             */
-            ScalarExp( real_t scalar )
-                : scalar_( scalar ) {}
-        };
-        /*!
-         * \brief expression that marks its operand as transposed
-         * \tparam EType type of the expression being transposed
-         */
-        template<typename EType>
-        struct TransposeExp: public Exp< TransposeExp<EType>, type::kComplex >{
-            /*! \brief the operand, held by reference and not copied */
-            const EType &exp;
-            /*!
-             * \brief constructor
-             * \param e expression to be transposed
-             */
-            TransposeExp( const EType &e )
-                : exp( e ) {}
-            /*! \return the operand itself, i.e. the transpose of this transpose expression */
-            inline const EType & T( void ) const{
-                const EType &operand = exp;
-                return operand;
-            }
-        };
-        /*!
-         * \brief base class of all variables, i.e. expressions that can be assigned to
-         *        every update below forwards to ExpEngine<Saver,Container>::Eval with the
-         *        concrete container as destination and returns that container
-         * \tparam Container the actual class of the data container, e.g. CTensor1D
-         */
-        template<typename Container>
-        class ContainerExp: public Exp< Container, type::kContainer >{
-        public:
-            /*!
-             * \brief transpose of a matrix
-             * \return transpose expression that refers to this container
-             */
-            inline const TransposeExp<Container> T( void ) const{
-                const Container &me = this->self();
-                return TransposeExp<Container>( me );
-            }
-            // updates with a scalar operand: s is wrapped into a ScalarExp
-            /*! \brief operator overload, evaluated with sv::plusto */
-            inline Container &operator+=( real_t s ){
-                Container &me = this->refself();
-                ExpEngine<sv::plusto,Container>::Eval( me, ScalarExp(s) );
-                return me;
-            }
-            /*! \brief operator overload, evaluated with sv::minusto */
-            inline Container &operator-=( real_t s ){
-                Container &me = this->refself();
-                ExpEngine<sv::minusto,Container>::Eval( me, ScalarExp(s) );
-                return me;
-            }
-            /*! \brief operator overload, evaluated with sv::multo */
-            inline Container &operator*=( real_t s ){
-                Container &me = this->refself();
-                ExpEngine<sv::multo,Container>::Eval( me, ScalarExp(s) );
-                return me;
-            }
-            /*! \brief operator overload, evaluated with sv::divto */
-            inline Container &operator/=( real_t s ){
-                Container &me = this->refself();
-                ExpEngine<sv::divto,Container>::Eval( me, ScalarExp(s) );
-                return me;
-            }
-            /*! \brief implementation of operator= for a scalar, evaluated with sv::saveto */
-            inline Container &__assign( real_t s ){
-                Container &me = this->refself();
-                ExpEngine<sv::saveto,Container>::Eval( me, ScalarExp(s) );
-                return me;
-            }
-            // updates with an expression operand
-            /*! \brief implementation of operator= for mapper expressions, note that we can not define container = container */
-            template<typename E>
-            inline Container &__assign( const Exp<E,type::kMapper> &exp ){
-                Container &me = this->refself();
-                ExpEngine<sv::saveto,Container>::Eval( me, exp.self() );
-                return me;
-            }
-            /*! \brief implementation of operator= for complex expressions, note that we can not define container = container */
-            template<typename E>
-            inline Container &__assign( const Exp<E,type::kComplex> &exp ){
-                Container &me = this->refself();
-                ExpEngine<sv::saveto,Container>::Eval( me, exp.self() );
-                return me;
-            }
-            /*! \brief implementation of operator+= for any expression type */
-            template<typename E,int etype>
-            inline Container &operator+=( const Exp<E,etype> &exp ){
-                Container &me = this->refself();
-                ExpEngine<sv::plusto,Container>::Eval( me, exp.self() );
-                return me;
-            }
-            /*! \brief implementation of operator-= for any expression type */
-            template<typename E,int etype>
-            inline Container &operator-=( const Exp<E,etype> &exp ){
-                Container &me = this->refself();
-                ExpEngine<sv::minusto,Container>::Eval( me, exp.self() );
-                return me;
-            }
-            /*! \brief implementation of operator*= for any expression type */
-            template<typename E,int etype>
-            inline Container &operator*=( const Exp<E,etype> &exp ){
-                Container &me = this->refself();
-                ExpEngine<sv::multo,Container>::Eval( me, exp.self() );
-                return me;
-            }
-            /*! \brief implementation of operator/= for any expression type */
-            template<typename E,int etype>
-            inline Container &operator/=( const Exp<E,etype> &exp ){
-                Container &me = this->refself();
-                ExpEngine<sv::divto,Container>::Eval( me, exp.self() );
-                return me;
-            }
-        };
-    }; // namespace expr
-    namespace expr{
-        /*!
-         * \brief matrix multiplication expression dot( lhs[.T], rhs[.T] ) together with a scale factor
-         * \tparam TA type of lhs
-         * \tparam TB type of rhs
-         * \tparam ltrans whether lhs is transposed
-         * \tparam rtrans whether rhs is transposed
-         */
-        template<typename TA,typename TB,bool ltrans,bool rtrans>
-        struct DotExp: public Exp< DotExp<TA,TB,ltrans,rtrans>, type::kComplex >{
-            /*! \brief left operand, held by reference */
-            const TA& lhs_;
-            /*! \brief right operand, held by reference */
-            const TB& rhs_;
-            /*! \brief factor applied to the result */
-            real_t scale_;
-            /*!
-             * \brief constructor
-             * \param lhs left operand
-             * \param rhs right operand
-             * \param scale factor applied to the result
-             */
-            DotExp( const TA &lhs, const TB &rhs, real_t scale )
-                : lhs_( lhs ), rhs_( rhs ), scale_( scale ) {}
-        };
-        /*! \brief dot( lhs, rhs ): neither operand transposed, scale 1 */
-        template<typename TA, typename TB>
-        inline DotExp<TA,TB,false,false> dot( const ContainerExp<TA> &lhs, const ContainerExp<TB> &rhs ){
-            typedef DotExp<TA,TB,false,false> Result;
-            return Result( lhs.self(), rhs.self(), 1.0f );
-        }
-        /*! \brief dot( lhs.T(), rhs ): left operand transposed, scale 1 */
-        template<typename TA, typename TB>
-        inline DotExp<TA,TB,true,false> dot( const TransposeExp<TA> &lhs, const ContainerExp<TB> &rhs ){
-            typedef DotExp<TA,TB,true,false> Result;
-            return Result( lhs.exp, rhs.self(), 1.0f );
-        }
-        /*! \brief dot( lhs, rhs.T() ): right operand transposed, scale 1 */
-        template<typename TA, typename TB>
-        inline DotExp<TA,TB,false,true> dot( const ContainerExp<TA> &lhs, const TransposeExp<TB> &rhs ){
-            typedef DotExp<TA,TB,false,true> Result;
-            return Result( lhs.self(), rhs.exp, 1.0f );
-        }
-        /*! \brief dot( lhs.T(), rhs.T() ): both operands transposed, scale 1 */
-        template<typename TA, typename TB>
-        inline DotExp<TA,TB,true,true> dot( const TransposeExp<TA> &lhs, const TransposeExp<TB> &rhs ){
-            typedef DotExp<TA,TB,true,true> Result;
-            return Result( lhs.exp, rhs.exp, 1.0f );
-        }
-        /*! \brief scale of dot operation, scalar on the right: same operands, scale_ multiplied by rhs */
-        template<typename TA, typename TB, bool ltrans, bool rtrans >
-        inline DotExp<TA,TB,ltrans,rtrans> operator*( const DotExp<TA,TB,ltrans,rtrans> &lhs, real_t rhs ){
-            const real_t scaled = lhs.scale_ * rhs;
-            return DotExp<TA,TB,ltrans,rtrans>( lhs.lhs_, lhs.rhs_, scaled );
-        }
-        /*! \brief scale of dot operation, scalar on the left: same operands, scale_ multiplied by lhs */
-        template<typename TA, typename TB, bool ltrans, bool rtrans >
-        inline DotExp<TA,TB,ltrans,rtrans> operator*( real_t lhs, const DotExp<TA,TB,ltrans,rtrans> &rhs ){
-            const real_t scaled = rhs.scale_ * lhs;
-            return DotExp<TA,TB,ltrans,rtrans>( rhs.lhs_, rhs.rhs_, scaled );
-        }
-    }; // namespace expr
-    namespace expr{
-        /*!
-         * \brief binary map expression lhs [op] rhs, stores references to both operands
-         * \tparam OP operator
-         * \tparam TA type of lhs
-         * \tparam TB type of rhs
-         * \tparam etype expression type, see namespace type
-         */
-        template<typename OP, typename TA, typename TB, int etype >
-        struct BinaryMapExp: public Exp< BinaryMapExp<OP,TA,TB,etype>, etype >{
-            /*! \brief left operand, held by reference */
-            const TA& lhs_;
-            /*! \brief right operand, held by reference */
-            const TB& rhs_;
-            /*!
-             * \brief constructor
-             * \param lhs left operand
-             * \param rhs right operand
-             */
-            BinaryMapExp( const TA &lhs, const TB &rhs )
-                : lhs_( lhs ), rhs_( rhs ) {}
-        };
-        /*!
-         * \brief make a binary map expression; the result type is the bitwise OR of both
-         *        operand types and type::kMapper
-         */
-        template<typename OP,typename TA, typename TB, int ta, int tb>
-        inline BinaryMapExp<OP,TA,TB, (ta|tb|type::kMapper) > MakeExp( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
-            typedef BinaryMapExp<OP,TA,TB, (ta|tb|type::kMapper) > Result;
-            const TA &left  = lhs.self();
-            const TB &right = rhs.self();
-            return Result( left, right );
-        }
-        /*!
-         * \brief short hand for MakeExp, usage F<op>(lhs, rhs). create a binary operation expression
-         * \param lhs left operand
-         * \param rhs right operand
-         * \tparam OP binary operator
-         * \tparam TA lhs expression
-         * \tparam ta lhs expression type
-         * \tparam TB rhs expression
-         * \tparam tb rhs expression type
-         * \sa mshadow::op
-         */
-        template<typename OP,typename TA, typename TB, int ta, int tb>
-        inline BinaryMapExp<OP,TA,TB, (ta|tb|type::kMapper) > F( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
-            typedef BinaryMapExp<OP,TA,TB, (ta|tb|type::kMapper) > Result;
-            return Result( lhs.self(), rhs.self() );
-        }
-        /*! \brief F<op>(lhs, rhs) with a scalar expression as right operand */
-        template<typename OP,typename TA, int ta>
-        inline BinaryMapExp<OP,TA,ScalarExp, (ta|type::kMapper) > F( const Exp<TA,ta> &lhs, const ScalarExp &rhs ){
-            typedef BinaryMapExp<OP,TA,ScalarExp, (ta|type::kMapper) > Result;
-            return Result( lhs.self(), rhs.self() );
-        }
-        /*! \brief F<op>(lhs, rhs) with a scalar expression as left operand */
-        template<typename OP,typename TB, int tb>
-        inline BinaryMapExp<OP,ScalarExp,TB, (tb|type::kMapper) > F( const ScalarExp &lhs, const Exp<TB,tb>& rhs ){
-            typedef BinaryMapExp<OP,ScalarExp,TB, (tb|type::kMapper) > Result;
-            return Result( lhs.self(), rhs.self() );
-        }
-        // operator rules: expression [op] expression
-        /*! \brief lhs + rhs as a binary map expression with op::plus */
-        template<typename TA, typename TB, int ta, int tb>
-        inline BinaryMapExp<op::plus,TA,TB, (ta|tb|type::kMapper) > operator+( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
-            typedef BinaryMapExp<op::plus,TA,TB, (ta|tb|type::kMapper) > Result;
-            return Result( lhs.self(), rhs.self() );
-        }
-        /*! \brief lhs - rhs as a binary map expression with op::minus */
-        template<typename TA, typename TB, int ta, int tb>
-        inline BinaryMapExp<op::minus,TA,TB, (ta|tb|type::kMapper) > operator-( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
-            typedef BinaryMapExp<op::minus,TA,TB, (ta|tb|type::kMapper) > Result;
-            return Result( lhs.self(), rhs.self() );
-        }
-        /*! \brief lhs * rhs as a binary map expression with op::mul */
-        template<typename TA, typename TB, int ta, int tb>
-        inline BinaryMapExp<op::mul,TA,TB, (ta|tb|type::kMapper) > operator*( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
-            typedef BinaryMapExp<op::mul,TA,TB, (ta|tb|type::kMapper) > Result;
-            return Result( lhs.self(), rhs.self() );
-        }
-        /*! \brief lhs / rhs as a binary map expression with op::div */
-        template<typename TA, typename TB, int ta, int tb>
-        inline BinaryMapExp<op::div,TA,TB, (ta|tb|type::kMapper) > operator/( const Exp<TA,ta> &lhs, const Exp<TB,tb> &rhs ){
-            typedef BinaryMapExp<op::div,TA,TB, (ta|tb|type::kMapper) > Result;
-            return Result( lhs.self(), rhs.self() );
-        }
-        // constant operators: expression [op] scalar
-        /*! \brief lhs + scalar as a binary map expression with op::plus */
-        template<typename TA, int ta>
-        inline BinaryMapExp<op::plus, TA, ScalarExp, (ta|type::kMapper) > operator+( const Exp<TA,ta>& lhs,  const ScalarExp& rhs ){
-            typedef BinaryMapExp<op::plus, TA, ScalarExp, (ta|type::kMapper) > Result;
-            return Result( lhs.self(), rhs.self() );
-        }
-        /*! \brief lhs - scalar as a binary map expression with op::minus */
-        template<typename TA, int ta>
-        inline BinaryMapExp<op::minus, TA, ScalarExp, (ta|type::kMapper) > operator-( const Exp<TA,ta>& lhs,  const ScalarExp& rhs ){
-            typedef BinaryMapExp<op::minus, TA, ScalarExp, (ta|type::kMapper) > Result;
-            return Result( lhs.self(), rhs.self() );
-        }
-        /*! \brief lhs * scalar as a binary map expression with op::mul */
-        template<typename TA, int ta>
-        inline BinaryMapExp<op::mul, TA, ScalarExp, (ta|type::kMapper) > operator*( const Exp<TA,ta>& lhs,  const ScalarExp& rhs ){
-            typedef BinaryMapExp<op::mul, TA, ScalarExp, (ta|type::kMapper) > Result;
-            return Result( lhs.self(), rhs.self() );
-        }
-        /*! \brief lhs / scalar as a binary map expression with op::div */
-        template<typename TA, int ta>
-        inline BinaryMapExp<op::div, TA, ScalarExp, (ta|type::kMapper) > operator/( const Exp<TA,ta>& lhs,  const ScalarExp& rhs ){
-            typedef BinaryMapExp<op::div, TA, ScalarExp, (ta|type::kMapper) > Result;
-            return Result( lhs.self(), rhs.self() );
-        }
-        // constant operators 2: scalar [op] expression
-        /*! \brief scalar + rhs as a binary map expression with op::plus */
-        template<typename TB, int tb>
-        inline BinaryMapExp<op::plus, ScalarExp, TB, (tb|type::kMapper) > operator+( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){
-            typedef BinaryMapExp<op::plus, ScalarExp, TB, (tb|type::kMapper) > Result;
-            return Result( lhs.self(), rhs.self() );
-        }
-        /*! \brief scalar - rhs as a binary map expression with op::minus */
-        template<typename TB, int tb>
-        inline BinaryMapExp<op::minus, ScalarExp, TB, (tb|type::kMapper) > operator-( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){
-            typedef BinaryMapExp<op::minus, ScalarExp, TB, (tb|type::kMapper) > Result;
-            return Result( lhs.self(), rhs.self() );
-        }
-        /*! \brief scalar * rhs as a binary map expression with op::mul */
-        template<typename TB, int tb>
-        inline BinaryMapExp<op::mul, ScalarExp, TB, (tb|type::kMapper) > operator*( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){
-            typedef BinaryMapExp<op::mul, ScalarExp, TB, (tb|type::kMapper) > Result;
-            return Result( lhs.self(), rhs.self() );
-        }
-        /*! \brief scalar / rhs as a binary map expression with op::div */
-        template<typename TB, int tb>
-        inline BinaryMapExp<op::div, ScalarExp, TB, (tb|type::kMapper) > operator/( const ScalarExp& lhs, const Exp<TB,tb>& rhs ){
-            typedef BinaryMapExp<op::div, ScalarExp, TB, (tb|type::kMapper) > Result;
-            return Result( lhs.self(), rhs.self() );
-        }
-    };
-    namespace expr{
-        /*!
-         * \brief unary map expression op(src), stores a reference to its operand
-         * \tparam OP operator
-         * \tparam TA type of src
-         * \tparam etype expression type, see namespace type
-         */
-        template<typename OP, typename TA, int etype >
-        struct UnaryMapExp: public Exp< UnaryMapExp<OP,TA,etype>, etype >{
-            /*! \brief source expression, held by reference */
-            const TA& src_;
-            /*!
-             * \brief constructor
-             * \param src source expression
-             */
-            UnaryMapExp( const TA &src )
-                : src_( src ) {}
-        };
-        /*!
-         * \brief make a unary map expression; the result type is the bitwise OR of the
-         *        source type and type::kMapper
-         */
-        template<typename OP,typename TA, int ta>
-        inline UnaryMapExp<OP,TA,(ta|type::kMapper) > MakeExp( const Exp<TA,ta> &src ){
-            typedef UnaryMapExp<OP,TA, (ta|type::kMapper) > Result;
-            const TA &operand = src.self();
-            return Result( operand );
-        }
-        /*!
-         * \brief short hand for MakeExp, usage F<op>(src), create a unary operation expression
-         * \param src source expression
-         * \tparam OP unary operator
-         * \tparam TA source expression
-         * \tparam ta source expression type
-         * \sa mshadow::op
-         */
-        template<typename OP,typename TA, int ta>
-        inline UnaryMapExp<OP,TA,(ta|type::kMapper) > F( const Exp<TA,ta> &src ){
-            typedef UnaryMapExp<OP,TA, (ta|type::kMapper) > Result;
-            return Result( src.self() );
-        }
-    };