Posted to dev@singa.apache.org by "hacker99 (JIRA)" <ji...@apache.org> on 2016/09/26 08:30:21 UTC

[jira] [Comment Edited] (SINGA-249) Convolution BP

    [ https://issues.apache.org/jira/browse/SINGA-249?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15522446#comment-15522446 ] 

hacker99 edited comment on SINGA-249 at 9/26/16 8:29 AM:
---------------------------------------------------------

Thank you very much for the reply!

I just looked at the Caffe code for the BP algorithm, which matches the algorithm described at http://ufldl.stanford.edu/wiki/index.php/Backpropagation_Algorithm . Lines #62-#63 of the Caffe code quoted below show that, when computing the gradients for one layer, Caffe uses the input of the current layer together with the gradient coming from the next layer. In Singa, however, it looks to me as if the input of the current layer is combined with the gradient of the final layer. Is there anything I am missing about BP in Singa? Looking forward to your reply.
 
(code from https://github.com/apache/incubator-singa/blob/master/src/model/layer/convolution.cc, lines 162-163:
162     CopyDataToFrom(&grad_b, grad, grad_b.Size(), 0, b * grad_b.Size());
163     dw += Mult(grad_b, col_data.T());
)
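
For reference, here is a minimal sketch (my own illustration with assumed names and shapes, not code from Singa or Caffe) of the per-image weight-gradient GEMM that the two Singa lines above appear to perform, written out as plain loops. Here dY_b stands for grad_b, the per-image slice of the incoming gradient, and colX_b for col_data, the im2col expansion of that image's input:

// Sketch only (assumed shapes):
//   dY_b   : F x S  (F = num_filters_, S = conv_height_ * conv_width_)
//   colX_b : K x S  (K = channels_ * kernel_h_ * kernel_w_)
//   dW     : F x K, accumulated over the whole batch
#include <vector>

void AccumulateWeightGrad(const std::vector<float>& dY_b,    // size F * S
                          const std::vector<float>& colX_b,  // size K * S
                          std::vector<float>& dW,            // size F * K
                          int F, int K, int S) {
  for (int f = 0; f < F; ++f) {
    for (int k = 0; k < K; ++k) {
      float acc = 0.0f;
      for (int s = 0; s < S; ++s) {
        acc += dY_b[f * S + s] * colX_b[k * S + s];
      }
      dW[f * K + k] += acc;  // dW += dY_b * colX_b^T
    }
  }
}

If I read convolution.cc correctly, dw += Mult(grad_b, col_data.T()) is exactly this accumulation, with grad_b copied out of the incoming grad tensor at offset b * grad_b.Size().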


For comparison, the corresponding Caffe code (https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cpp):
 42 template <typename Dtype>
 43 void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
 44       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
 45   const Dtype* weight = this->blobs_[0]->cpu_data();
 46   Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
 47   for (int i = 0; i < top.size(); ++i) {
 48     const Dtype* top_diff = top[i]->cpu_diff();
 49     const Dtype* bottom_data = bottom[i]->cpu_data();
 50     Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
 51     // Bias gradient, if necessary.
 52     if (this->bias_term_ && this->param_propagate_down_[1]) {
 53       Dtype* bias_diff = this->blobs_[1]->mutable_cpu_diff();
 54       for (int n = 0; n < this->num_; ++n) {
 55         this->backward_cpu_bias(bias_diff, top_diff + n * this->top_dim_);
 56       }
 57     }
 58     if (this->param_propagate_down_[0] || propagate_down[i]) {
 59       for (int n = 0; n < this->num_; ++n) {
 60         // gradient w.r.t. weight. Note that we will accumulate diffs.
 61         if (this->param_propagate_down_[0]) {
 62           this->weight_cpu_gemm(bottom_data + n * this->bottom_dim_,
 63               top_diff + n * this->top_dim_, weight_diff);
 64         }
 65         // gradient w.r.t. bottom data, if necessary.
 66         if (propagate_down[i]) {
 67           this->backward_cpu_gemm(top_diff + n * this->top_dim_, weight,
 68               bottom_diff + n * this->bottom_dim_);
 69         }
 70       }
 71     }
 72   }
 73 }
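
And for completeness, a similar sketch (again my own, with assumed names) of the gradient with respect to the layer input, which Caffe computes via backward_cpu_gemm at lines #67-#68 above and Singa via Tensor dcol_b = Mult(weight_.T(), grad_b) followed by Col2im in the same Backward function (quoted in the issue description below):

// Sketch only: dcol_b = W^T * dY_b; a col2im step (not shown) then scatters
// dcol_b back to the input image shape to produce dx for image b.
//   W : F x K    dY_b : F x S    dcol_b : K x S
#include <vector>

std::vector<float> InputColGrad(const std::vector<float>& W,     // size F * K
                                const std::vector<float>& dY_b,  // size F * S
                                int F, int K, int S) {
  std::vector<float> dcol_b(K * S, 0.0f);
  for (int k = 0; k < K; ++k) {
    for (int s = 0; s < S; ++s) {
      float acc = 0.0f;
      for (int f = 0; f < F; ++f) {
        acc += W[f * K + k] * dY_b[f * S + s];  // (W^T * dY_b)[k][s]
      }
      dcol_b[k * S + s] = acc;
    }
  }
  return dcol_b;
}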






> Convolution BP
> --------------
>
>                 Key: SINGA-249
>                 URL: https://issues.apache.org/jira/browse/SINGA-249
>             Project: Singa
>          Issue Type: Wish
>         Environment: ubuntu 14.04, singa 1.0
>            Reporter: hacker99
>
> I'm curious about how the gradient is calculated in the back-propagation algorithm, e.g. for the convolution layer. Can anyone explain the details of the formula and how the code implements it? I would be very grateful for any documentation, or if someone could simply tell me why dw += Mult(grad_b, col_data.T()).
> #code from src/model/layer/convolution.cc
> const std::pair<Tensor, vector<Tensor>> Convolution::Backward(
>     int flag, const Tensor &grad) {
>   CHECK_EQ(grad.device()->lang(), kCpp);
>   CHECK_EQ(grad.nDim(), 4u);
>   CHECK(!buf_.empty());
>   Tensor src_data = buf_.top();
>   buf_.pop();
>   vector<Tensor> param_grad;
>   Tensor dx;
>   Tensor db, dw;
>   dx.ResetLike(src_data);
>   db.ResetLike(bias_);
>   dw.ResetLike(weight_);
>   dw.SetValue(0.0f);
>   size_t batchsize = grad.shape(0);
>   size_t imagesize = src_data.Size() / batchsize;
>   if (bias_term_) {
>     Tensor tmp1 =
>         Reshape(grad, Shape{batchsize * num_filters_,
>                             grad.Size() / (batchsize * num_filters_)});
>     Tensor tmp2(Shape{batchsize * num_filters_});
>     SumColumns(tmp1, &tmp2);
>     Tensor tmp3 = Reshape(tmp2, Shape{batchsize, num_filters_});
>     SumRows(tmp3, &db);
>   }
>   auto in_data = src_data.data<float>();
>   Tensor col_data(Shape{col_height_, col_width_});
>   float *data_col = new float[col_height_ * col_width_];
>   float *dx_b = new float[imagesize];
>   for (size_t b = 0; b < batchsize; b++) {
>     Im2col(in_data + b * imagesize, channels_, height_, width_, kernel_h_,
>            kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_, data_col);
>     col_data.CopyDataFromHostPtr(data_col, col_height_ * col_width_);
>     Tensor grad_b(Shape{num_filters_, conv_height_ * conv_width_});
>     CopyDataToFrom(&grad_b, grad, grad_b.Size(), 0, b * grad_b.Size());
>     dw += Mult(grad_b, col_data.T());
>     Tensor dcol_b = Mult(weight_.T(), grad_b);
>     auto dcol_data = dcol_b.data<float>();
>     Col2im(dcol_data, channels_, height_, width_, kernel_h_, kernel_w_, pad_h_,
>            pad_w_, stride_h_, stride_w_, dx_b);
>     dx.CopyDataFromHostPtr(dx_b, imagesize, b * imagesize);
>   }
>   param_grad.push_back(dw);
>   param_grad.push_back(db);
>   delete[] data_col;
>   delete[] dx_b;
>   return std::make_pair(dx, param_grad);
> }
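
For what it is worth, here is how I currently understand where dw += Mult(grad_b, col_data.T()) comes from, sketched under the usual im2col view of convolution (my own derivation with assumed names, not taken from Singa documentation). The forward pass for one image is a single GEMM, and differentiating that GEMM element-wise gives the two backward GEMMs seen in the code above:

// Sketch only: im2col turns the convolution for one image into a matrix multiply.
//   W      : F x K  (num_filters_ x channels_*kernel_h_*kernel_w_)
//   colX_b : K x S  (im2col of image b, S = conv_height_ * conv_width_)
//   Y_b    : F x S  (this layer's output for image b)
#include <vector>

void ForwardOneImage(const std::vector<float>& W,       // size F * K
                     const std::vector<float>& colX_b,  // size K * S
                     std::vector<float>& Y_b,           // size F * S
                     int F, int K, int S) {
  for (int f = 0; f < F; ++f) {
    for (int s = 0; s < S; ++s) {
      float acc = 0.0f;
      for (int k = 0; k < K; ++k) {
        acc += W[f * K + k] * colX_b[k * S + s];  // Y_b = W * colX_b
      }
      Y_b[f * S + s] = acc;
    }
  }
}
// Because Y_b[f][s] = sum_k W[f][k] * colX_b[k][s], the chain rule gives
//   dL/dW[f][k]      = sum_s dL/dY_b[f][s] * colX_b[k][s]  ->  dw += Mult(grad_b, col_data.T())
//   dL/dcolX_b[k][s] = sum_f W[f][k] * dL/dY_b[f][s]       ->  dcol_b = Mult(weight_.T(), grad_b)
// with Col2im scattering dcol_b back into dx for each image b.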



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)